Creating release candidate final from release_370 branch

git-svn-id: https://llvm.org/svn/llvm-project/openmp/tags/RELEASE_370@246256 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/final/CMakeLists.txt b/final/CMakeLists.txt
new file mode 100644
index 0000000..7011187
--- /dev/null
+++ b/final/CMakeLists.txt
@@ -0,0 +1,2 @@
+cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
+add_subdirectory(runtime)
diff --git a/final/CREDITS.txt b/final/CREDITS.txt
new file mode 100644
index 0000000..781da21
--- /dev/null
+++ b/final/CREDITS.txt
@@ -0,0 +1,53 @@
+This file is a partial list of people who have contributed to the LLVM/openmp
+project.  If you have contributed a patch or made some other contribution to
+LLVM/openmp, please submit a patch to this file to add yourself, and it will be
+done!
+
+The list is sorted by surname and formatted to allow easy grepping and
+beautification by scripts.  The fields are: name (N), email (E), web-address
+(W), PGP key ID and fingerprint (P), description (D), and snail-mail address
+(S).
+
+N: Carlo Bertolli
+W: http://ibm.com
+D: IBM contributor to PowerPC support in CMake files and elsewhere.
+
+N: Sunita Chandrasekaran
+D: Contributor to testsuite from OpenUH
+
+N: Barbara Chapman
+D: Contributor to testsuite from OpenUH
+
+N: University of Houston 
+W: http://web.cs.uh.edu/~openuh/download/
+D: OpenUH test suite
+
+N: Intel Corporation OpenMP runtime team
+W: http://openmprtl.org
+D: Created the runtime.
+
+N: John Mellor-Crummey and other members of the OpenMP Tools Working Group
+E: johnmc@rice.edu
+D: OpenMP Tools Interface (OMPT)
+
+N: Matthias Muller
+D: Contributor to testsuite from OpenUH
+
+N: Tal Nevo
+E: tal@scalemp.com
+D: ScaleMP contributor to improve runtime performance there.
+W: http://scalemp.com
+
+N: Pavel Neytchev
+D: Contributor to testsuite from OpenUH
+
+N: Steven Noonan
+E: steven@uplinklabs.net
+D: Patches for the ARM architecture and removal of several inconsistencies.
+
+N: Alp Toker
+E: alp@nuanti.com
+D: Making build work for FreeBSD.
+
+N: Cheng Wang 
+D: Contributor to testsuite from OpenUH
diff --git a/final/LICENSE.txt b/final/LICENSE.txt
new file mode 100644
index 0000000..97dc20c
--- /dev/null
+++ b/final/LICENSE.txt
@@ -0,0 +1,124 @@
+==============================================================================
+
+The software contained in this directory tree is dual licensed under both the
+University of Illinois "BSD-Like" license and the MIT license.  As a user of
+this code you may choose to use it under either license.  As a contributor,
+you agree to allow your code to be used under both.  The full text of the
+relevant licenses is included below.
+
+In addition, a license agreement from the copyright/patent holders of the
+software contained in this directory tree is included below.
+
+==============================================================================
+
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 1997-2015 Intel Corporation
+
+All rights reserved.
+
+Developed by:
+    OpenMP Runtime Team
+    Intel Corporation
+    http://www.openmprtl.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of Intel Corporation OpenMP Runtime Team nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this Software without specific prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+
+Copyright (c) 1997-2014 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+==============================================================================
+
+Intel Corporation
+
+Software Grant License Agreement ("Agreement")
+
+Except for the license granted herein to you, Intel Corporation ("Intel") reserves
+all right, title, and interest in and to the Software (defined below).
+
+Definition
+
+"Software" means the code and documentation as well as any original work of
+authorship, including any modifications or additions to an existing work, that
+is intentionally submitted by Intel to llvm.org (http://llvm.org) ("LLVM") for
+inclusion in, or documentation of, any of the products owned or managed by LLVM
+(the "Work"). For the purposes of this definition, "submitted" means any form of
+electronic, verbal, or written communication sent to LLVM or its
+representatives, including but not limited to communication on electronic
+mailing lists, source code control systems, and issue tracking systems that are
+managed by, or on behalf of, LLVM for the purpose of discussing and improving
+the Work, but excluding communication that is conspicuously marked otherwise.
+
+1. Grant of Copyright License. Subject to the terms and conditions of this
+   Agreement, Intel hereby grants to you and to recipients of the Software
+   distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+   royalty-free, irrevocable copyright license to reproduce, prepare derivative
+   works of, publicly display, publicly perform, sublicense, and distribute the
+   Software and such derivative works.
+
+2. Grant of Patent License. Subject to the terms and conditions of this
+   Agreement, Intel hereby grants you and to recipients of the Software
+   distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+   royalty-free, irrevocable (except as stated in this section) patent license
+   to make, have made, use, offer to sell, sell, import, and otherwise transfer
+   the Work, where such license applies only to those patent claims licensable
+   by Intel that are necessarily infringed by Intel's Software alone or by
+   combination of the Software with the Work to which such Software was
+   submitted. If any entity institutes patent litigation against Intel or any
+   other entity (including a cross-claim or counterclaim in a lawsuit) alleging
+   that Intel's Software, or the Work to which Intel has contributed constitutes
+   direct or contributory patent infringement, then any patent licenses granted
+   to that entity under this Agreement for the Software or Work shall terminate
+   as of the date such litigation is filed.
+
+Unless required by applicable law or agreed to in writing, the software is
+provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+either express or implied, including, without limitation, any warranties or
+conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE.
+
+==============================================================================
diff --git a/final/offload/Makefile b/final/offload/Makefile
new file mode 100755
index 0000000..75e3744
--- /dev/null
+++ b/final/offload/Makefile
@@ -0,0 +1,224 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# MAKEFILE PARAMETERS
+#
+# root_dir - path to root directory of liboffload
+# build_dir - path to build directory
+# mpss_dir - path to root directory of mpss
+# mpss_version - version of the mpss (e.g., version "3.3.x" would be "33")
+# libiomp_host_dir - path to host libiomp directory (unnecessary if compiler_host is icc)
+# libiomp_target_dir - path to target libiomp directory (unnecessary if compiler_target is icc)
+# omp_header_dir - path to omp.h (unnecessary if compiler_host and compiler_target are icc)
+# os_host - host operating system
+# os_target - target operating system
+# compiler_host - host compiler
+# compiler_target - target compiler
+# options_host - additional options for host compiler
+# options_target - additional options for target compiler
+#
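+#
+# For example, a gcc-based build against an MPSS 3.3 installation might be
+# invoked as follows (all paths below are illustrative; substitute the
+# locations on your system):
+#
+#   make compiler_host=gcc compiler_target=gcc \
+#        mpss_dir=/opt/mpss mpss_version=33 \
+#        libiomp_host_dir=/path/to/host/libiomp \
+#        libiomp_target_dir=/path/to/target/libiomp \
+#        omp_header_dir=/path/to/omp/include
+#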
+
+# Directories
+root_dir?=.
+build_dir?=$(root_dir)/build
+build_host_dir=$(build_dir)/host
+build_target_dir=$(build_dir)/target
+obj_host_dir=$(build_dir)/obj_host
+obj_target_dir=$(build_dir)/obj_target
+source_dir=$(root_dir)/src
+imported_dir=$(source_dir)/imported
+
+# OS
+os_host?=linux
+os_target?=linux
+ifneq ($(os_host)_$(os_target), linux_linux)
+  $(error "Only linux is supported")
+endif
+
+# Compilers
+compiler_host?=gcc
+compiler_target?=gcc
+
+# MPSS
+mpss_version?=30
+mpss_dir?=/
+mpss_present=$(shell if test -d $(mpss_dir); then echo OK; else echo KO; fi)
+ifneq ($(mpss_present), OK)
+  $(error "Cannot find MPSS directory $(mpss_dir)")
+endif
+
+ifeq ($(shell test $(mpss_version) -gt 33; echo $$?), 0)
+  coi_dir=$(mpss_dir)/sysroots/k1om-mpss-linux/usr
+  coi_include=$(coi_dir)/include/intel-coi
+  coi_lib_host=$(mpss_dir)/lib64
+  coi_lib_device=$(coi_dir)/lib64
+else
+  coi_dir=$(mpss_dir)/opt/intel/mic/coi
+  coi_include=$(coi_dir)/include
+  coi_lib_host=$(coi_dir)/host-linux-release/lib
+  coi_lib_device=$(coi_dir)/device-linux-release/lib
+endif
+myo_dir=$(mpss_dir)/opt/intel/mic/myo
+
+# Sources
+src_liboffload_common=dv_util.cpp liboffload_error.c liboffload_msg.c offload_common.cpp offload_table.cpp offload_trace.cpp offload_util.cpp
+
+src_liboffload_host=$(src_liboffload_common) cean_util.cpp coi/coi_client.cpp compiler_if_host.cpp offload_engine.cpp offload_env.cpp offload_host.cpp offload_omp_host.cpp offload_timer_host.cpp offload_orsl.cpp orsl-lite/lib/orsl-lite.c offload_myo_host.cpp
+src_liboffload_host:=$(foreach file,$(src_liboffload_host),$(source_dir)/$(file))
+
+src_liboffload_target=$(src_liboffload_common) coi/coi_server.cpp compiler_if_target.cpp offload_omp_target.cpp offload_target.cpp offload_timer_target.cpp offload_myo_target.cpp
+src_liboffload_target:=$(foreach file,$(src_liboffload_target),$(source_dir)/$(file))
+
+src_ofld=ofldbegin.cpp ofldend.cpp
+src_ofld:=$(foreach file,$(src_ofld),$(source_dir)/$(file))
+
+headers=$(wildcard $(source_dir)/*.h) $(wildcard $(source_dir)/coi/*.h) $(wildcard $(source_dir)/orsl-lite/include/*.h)
+ifneq ($(omp_header_dir), )
+  headers+=$(imported_dir)/omp.h
+endif
+
+# Objects
+obj_liboffload_host=$(notdir $(src_liboffload_host))
+obj_liboffload_host:=$(obj_liboffload_host:.cpp=.o)
+obj_liboffload_host:=$(obj_liboffload_host:.c=.o)
+obj_liboffload_host:=$(foreach file,$(obj_liboffload_host),$(obj_host_dir)/$(file))
+
+obj_liboffload_target=$(notdir $(src_liboffload_target))
+obj_liboffload_target:=$(obj_liboffload_target:.cpp=.o)
+obj_liboffload_target:=$(obj_liboffload_target:.c=.o)
+obj_liboffload_target:=$(foreach file,$(obj_liboffload_target),$(obj_target_dir)/$(file))
+
+obj_ofld=$(notdir $(src_ofld))
+obj_ofld:=$(obj_ofld:.cpp=.o)
+obj_ofld_host=$(foreach file,$(obj_ofld),$(build_host_dir)/$(file))
+obj_ofld_target=$(foreach file,$(obj_ofld),$(build_target_dir)/$(file))
+
+# Options
+opts_common=-O2 -w -fpic -c -DCOI_LIBRARY_VERSION=2 -DMYO_SUPPORT -DOFFLOAD_DEBUG=1 -DSEP_SUPPORT -DTIMING_SUPPORT -I$(coi_include) -I$(myo_dir)/include -I$(source_dir)
+ifneq ($(omp_header_dir), )
+  opts_common+=-I$(imported_dir)
+endif
+
+opts_liboffload=-shared -Wl,-soname,liboffload.so.5 -ldl -lstdc++ -liomp5
+
+opts_liboffload_host=$(opts_liboffload) -L$(coi_lib_host) -lcoi_host -L$(myo_dir)/lib -lmyo-client
+ifneq ($(libiomp_host_dir), )
+  opts_liboffload_host+=-L$(libiomp_host_dir)
+endif
+
+opts_liboffload_target=$(opts_liboffload) -L$(coi_lib_device) -lcoi_device -L$(myo_dir)/lib -lmyo-service
+ifneq ($(libiomp_target_dir), )
+  opts_liboffload_target+=-L$(libiomp_target_dir)
+endif
+
+options_host?=
+opts_host=$(options_host) -DHOST_LIBRARY=1 -DMPSS_VERSION=$(mpss_version)
+ifeq ($(os_host), linux)
+  opts_host+=-DLINUX
+endif
+
+options_target?=
+opts_target=$(options_target) -DHOST_LIBRARY=0
+ifeq ($(os_target), linux)
+  opts_target+=-DLINUX
+endif
+ifeq ($(compiler_target), icc)
+  opts_target+=-mmic
+endif
+
+# Make targets
+.PHONY: all clean info
+
+all: info $(build_host_dir)/liboffload.so $(build_target_dir)/liboffload.so $(obj_ofld_host) $(obj_ofld_target)
+
+
+$(build_host_dir)/liboffload.so: $(build_host_dir)/liboffload.so.5 | $(build_host_dir)
+	ln -f $< $@
+
+$(build_host_dir)/liboffload.so.5: $(obj_liboffload_host) | $(build_host_dir)
+	$(compiler_host) $(opts_liboffload_host) $(opts_host) $^ -o $@
+
+$(obj_host_dir)/%.o: $(source_dir)/%.c $(headers) | $(obj_host_dir)
+	$(compiler_host) $(opts_common) $(opts_host) $< -o $@
+
+$(obj_host_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(obj_host_dir)
+	$(compiler_host) $(opts_common) $(opts_host) $< -o $@
+
+$(obj_host_dir)/%.o: $(source_dir)/coi/%.cpp $(headers) | $(obj_host_dir)
+	$(compiler_host) $(opts_common) $(opts_host) $< -o $@
+
+$(obj_host_dir)/%.o: $(source_dir)/orsl-lite/lib/%.c $(headers) | $(obj_host_dir)
+	$(compiler_host) $(opts_common) $(opts_host) $< -o $@
+
+
+$(build_target_dir)/liboffload.so: $(build_target_dir)/liboffload.so.5 | $(build_target_dir)
+	ln -f $< $@
+
+$(build_target_dir)/liboffload.so.5: $(obj_liboffload_target) | $(build_target_dir)
+	$(compiler_target) $(opts_liboffload_target) $(opts_target) $^ -o $@
+
+$(obj_target_dir)/%.o: $(source_dir)/%.c $(headers) | $(obj_target_dir)
+	$(compiler_target) $(opts_common) $(opts_target) $< -o $@
+
+$(obj_target_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(obj_target_dir)
+	$(compiler_target) $(opts_common) $(opts_target) $< -o $@
+
+$(obj_target_dir)/%.o: $(source_dir)/coi/%.cpp $(headers) | $(obj_target_dir)
+	$(compiler_target) $(opts_common) $(opts_target) $< -o $@
+
+$(obj_target_dir)/%.o: $(source_dir)/orsl-lite/lib/%.c $(headers) | $(obj_target_dir)
+	$(compiler_target) $(opts_common) $(opts_target) $< -o $@
+
+
+$(build_host_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(build_host_dir)
+	$(compiler_host) $(opts_common) $(opts_host) $< -o $@
+
+$(build_target_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(build_target_dir)
+	$(compiler_target) $(opts_common) $(opts_target) $< -o $@
+
+
+$(imported_dir)/omp.h: $(omp_header_dir)/omp.h | $(imported_dir)
+	cp $< $@
+
+
+$(build_host_dir) $(build_target_dir) $(obj_host_dir) $(obj_target_dir): | $(build_dir)
+	$(shell mkdir -p $@ >/dev/null 2>/dev/null)
+	@echo "Created $@ directory"
+
+$(build_dir):
+	$(shell mkdir -p $@ >/dev/null 2>/dev/null)
+	@echo "Created $@ directory"
+
+$(imported_dir):
+	$(shell mkdir -p $@ >/dev/null 2>/dev/null)
+	@echo "Created $@ directory"
+
+
+clean:
+	$(shell rm -rf $(build_dir))
+	@echo "Remove $(build_dir) directory"
+
+
+info:
+	@echo "root_dir = $(root_dir)"
+	@echo "build_dir = $(build_dir)"
+	@echo "mpss_dir = $(mpss_dir)"
+	@echo "mpss_version = $(mpss_version)"
+	@echo "libiomp_host_dir = $(libiomp_host_dir)"
+	@echo "libiomp_target_dir = $(libiomp_target_dir)"
+	@echo "omp_header_dir = $(omp_header_dir)"
+	@echo "os_host = $(os_host)"
+	@echo "os_target = $(os_target)"
+	@echo "compiler_host = $(compiler_host)"
+	@echo "compiler_target = $(compiler_target)"
+	@echo "options_host = $(options_host)"
+	@echo "options_target = $(options_target)"
+
diff --git a/final/offload/README.txt b/final/offload/README.txt
new file mode 100755
index 0000000..eb9fb1d
--- /dev/null
+++ b/final/offload/README.txt
@@ -0,0 +1,129 @@
+
+	       README for Intel(R) Offload Runtime Library
+	       ===========================================
+
+How to Build Documentation
+==========================
+
+The main documentation is in Doxygen* format, and this distribution
+should come with pre-built PDF documentation in doc/Reference.pdf.
+However, an HTML version can be built by executing:
+
+% doxygen doc/doxygen/config
+
+in this directory.
+
+That will produce HTML documentation in the doc/doxygen/generated
+directory, which can be accessed by pointing a web browser at the
+index.html file there.
+
+If you don't have Doxygen installed, you can download it from
+www.doxygen.org.
+
+
+Software Requirements
+=====================
+
+Intel(R) Offload Runtime Library requires additional software:
+
+1) Intel(R) OpenMP* Runtime Library.  You can either download the source
+code for that (from openmprtl.org or openmp.llvm.org) or simply use the
+compiled version distributed with the Intel compilers.
+2) Intel(R) COI Runtime Library and Intel(R) MYO Runtime Library.  These
+libraries are part of Intel(R) Manycore Platform Software Stack (MPSS).  You
+can download MPSS source code or binaries from
+software.intel.com/en-us/articles/intel-manycore-platform-software-stack-mpss.
+Binaries include host libraries for Intel(R) 64 Architecture and target
+libraries for Intel(R) Many Integrated Core Architecture.
+
+You will also need all of the libraries that enable the target code to run
+on the device.  If you target the Intel(R) Xeon Phi (TM) coprocessor, these
+libraries can be taken from MPSS too.
+
+
+How to Build the Intel(R) Offload Runtime Library
+=================================================
+
+The top-level Makefile will attempt to detect what it needs to
+build the Intel(R) Offload Runtime Library.  To see the default settings,
+type:
+
+make info
+
+You can change the Makefile's behavior with the following options:
+
+root_dir:	      The path to the top-level directory containing the
+		      top-level Makefile.  By default, this will take on the
+		      value of the current working directory.
+
+build_dir:	      The path to the build directory.  By default, this will
+		      take on the value [root_dir]/build.
+
+mpss_dir:	      The path to the Intel(R) Manycore Platform Software
+		      Stack install directory.  By default, this will take on
+		      the value of the operating system's root directory.
+
+libiomp_host_dir:     The path to the host Intel(R) OpenMP* Runtime Library.
+		      This option is required when the host compiler is other
+		      than icc.
+
+libiomp_target_dir:   The path to the target Intel(R) OpenMP* Runtime
+		      Library.  This option is required when the target
+		      compiler is other than icc.
+
+omp_header_dir:       The path to the header file <omp.h> of Intel(R) OpenMP*
+		      Runtime Library.  This option is required if either host
+		      or target compiler is other than icc.
+
+os_host:	      Operating system on host.  Currently supports only
+		      "linux", which is set by default.
+
+os_target:	      Operating system on target device.  Currently supports
+		      only "linux", which is set by default.
+
+compiler_host:	      Which compiler to use for the build of the host part.
+		      Defaults to "gcc"*.  Also supports "icc" and "clang"*.
+		      You should provide the full path to the compiler or it
+		      should be in the user's path.
+
+compiler_target:      Which compiler to use for the build of the target part.
+		      Defaults to "gcc"*.  Also supports "icc" and "clang"*.
+		      You should provide the full path to the compiler or it
+		      should be in the user's path.
+
+options_host:	      Additional options for the host compiler.
+
+options_target:       Additional options for the target compiler.
+
+To use any of the options above, simply add <option_name>=<value>.  For
+example, if you want to build with icc instead of gcc, type:
+
+make compiler_host=icc compiler_target=icc
+
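+If the host or target compiler is anything other than icc, the Makefile must
+also be told where to find an Intel(R) OpenMP* Runtime Library and its
+<omp.h> header, as described above.  A gcc build might therefore look like
+this (the paths are placeholders; substitute the locations on your system):
+
+make compiler_host=gcc compiler_target=gcc \
+     libiomp_host_dir=/path/to/host/libiomp \
+     libiomp_target_dir=/path/to/target/libiomp \
+     omp_header_dir=/path/to/omp/include
+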
+
+Supported RTL Build Configurations
+==================================
+
+Supported Architectures: Intel(R) 64 and Intel(R) Many Integrated
+Core Architecture
+
+	      ---------------------------------------------
+	      |   icc/icl     |    gcc      |    clang    |
+--------------|---------------|-------------|-------------|
+| Linux* OS   |      Yes      |     Yes(1)  |     Yes(1)  |
+| OS X*       |       No      |      No     |      No     |
+| Windows* OS |       No      |      No     |      No     |
+-----------------------------------------------------------
+
+(1) Liboffload requires the _rdtsc intrinsic, which may be unsupported by
+    some compiler versions.  In this case you need to include src/rdtsc.h
+    manually by using the Makefile options options_host and options_target:
+
+    make options_host="-include src/rdtsc.h" options_target="-include src/rdtsc.h"
+
+-----------------------------------------------------------------------
+
+Notices
+=======
+
+*Other names and brands may be claimed as the property of others.
diff --git a/final/offload/doc/Reference.pdf b/final/offload/doc/Reference.pdf
new file mode 100644
index 0000000..b9176f0
--- /dev/null
+++ b/final/offload/doc/Reference.pdf
Binary files differ
diff --git a/final/offload/doc/doxygen/config b/final/offload/doc/doxygen/config
new file mode 100755
index 0000000..d45b696
--- /dev/null
+++ b/final/offload/doc/doxygen/config
@@ -0,0 +1,2328 @@
+# Doxyfile 1.8.6
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = "Intel&reg;&nbsp;Offload Runtime Library"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         = 
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          = 
+
+# With the PROJECT_LOGO tag one can specify an logo or icon that is included in
+# the documentation. The maximum height of the logo should not exceed 55 pixels
+# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo
+# to the output directory.
+
+PROJECT_LOGO           = 
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = doc/doxygen/generated
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise causes
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity): The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        = src/
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    = src/
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a
+# new page for each member. If set to NO, the documentation of a member will be
+# part of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES                = 
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST              = 
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C.
+#
+# Note For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      = 
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word
+# or globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = YES
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = YES
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO these classes will be included in the various overviews. This option has
+# no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES       = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the
+# todo list. This list is created by putting \todo commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the
+# test list. This list is created by putting \test commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       = 
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES the list
+# will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    = 
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            = 
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. Do not use file names with spaces, bibtex cannot handle them. See
+# also \cite for info how to create references.
+
+CITE_BIB_FILES         = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO doxygen will only warn about wrong or incomplete parameter
+# documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE           = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces.
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = src
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank the
+# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
+# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
+# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
+# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
+# *.qsf, *.as and *.js.
+
+FILE_PATTERNS          = *.c *.h *.cpp *.f90
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                = src/imported src/rdtsc.h
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file's absolute path, so to
+# exclude all test directories, for example, use the pattern */test/*
+
+EXCLUDE_PATTERNS       = 
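+
+# For illustration only (hypothetical patterns, not used by this build):
+# excluding all test directories and editor backup files might look like
+# EXCLUDE_PATTERNS = */test/* *~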
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file's absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS        = 
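+
+# A hypothetical example (not part of this configuration): hiding everything in
+# an internal namespace could be done with
+# EXCLUDE_SYMBOLS = internal::*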
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           = 
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             = 
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER           = 
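+
+# For illustration only (a hypothetical command, not used here): stripping a
+# project-specific export macro before parsing might look like
+# INPUT_FILTER = "sed -e s/MY_EXPORT_API//"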
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        = 
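+
+# For example (a hypothetical filter script, not used by this build), Python
+# sources could be routed through their own filter with
+# FILTER_PATTERNS = *.py=./py_filter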
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER ) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS = 
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project hosted on, for
+# instance, GitHub and want to reuse the introduction page for the doxygen
+# output as well.
+
+USE_MDFILE_AS_MAINPAGE = 
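+
+# A hypothetical example (not used by this configuration): if the input
+# contained a README.md, it could serve as the main page with
+# USE_MDFILE_AS_MAINPAGE = README.md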
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# VERBATIM_HEADERS is also set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES, then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# compiled with the --with-libclang option.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          = 
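+
+# For illustration only (hypothetical prefixes, not used here): if most symbols
+# shared a common prefix it could be ignored in the index with
+# IGNORE_PREFIX = kmp_ __kmp_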
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output.
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file must include any scripts and style sheets
+# that doxygen needs, which depend on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header generated with
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            = 
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            = 
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        = 
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user-
+# defined cascading style sheet that is included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet file to the output directory. For an example
+# see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  = 
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       = 
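+
+# A hypothetical example (not part of this configuration): shipping an extra
+# image next to the generated pages might look like
+# HTML_EXTRA_FILES = doc/doxygen/logo.png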
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the stylesheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# is purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               = 
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           = 
+
+# The GENERATE_CHI flag controls whether a separate .chi index file is
+# generated (YES) or included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     = 
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               = 
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   = 
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  = 
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  = 
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           = 
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated; together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files need
+# to be copied into the plugins directory of Eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying, Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes take effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using prerendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     = 
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       = 
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow; in that
+# case enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavours of web server based searching depending on the
+# EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for
+# searching and an index file used by the script. When EXTERNAL_SEARCH is
+# enabled the indexing and searching needs to be provided by external tools. See
+# the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When the EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       = 
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     = 
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id to
+# a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate the index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. To get the times font for
+# instance you can specify
+# EXTRA_PACKAGES=times
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         = 
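+
+# For illustration only (hypothetical packages, not used by this build): extra
+# math support could be pulled in with
+# EXTRA_PACKAGES = amsmath amssymb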
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber. Doxygen will
+# replace them by respectively the title of the page, the current date and time,
+# only the current date, the version number of doxygen, the project name (see
+# PROJECT_NAME), or the project number (see PROJECT_NUMBER).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           = doc/doxygen/header.tex
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           = 
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      = 
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements;
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    = 
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema, which can be used by
+# a validating XML parser to check the syntax of the XML files.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_SCHEMA             = 
+
+# The XML_DTD tag can be used to specify an XML DTD, which can be used by a
+# validating XML parser to check the syntax of the XML files.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_DTD                = 
+
+# If the XML_PROGRAMLISTING tag is set to YES doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES doxygen will generate an AutoGen
+# Definitions (see http://autogen.sf.net) file that captures the structure of
+# the code including all documentation. Note that this feature is still
+# experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES doxygen will expand all macro names
+# in the source code. If set to NO only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = YES
+
+# If the SEARCH_INCLUDES tag is set to YES the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           = 
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  = 
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             = COI_LIBRARY_VERSION=2 MYO_SUPPORT OFFLOAD_DEBUG=1 SEP_SUPPORT TIMING_SUPPORT
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      = 
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have an
+# all uppercase name, and do not end with a semicolon. Such function macros are
+# typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               = 
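+
+# A hypothetical example (not used here): linking against another project's tag
+# file and its published HTML documentation might look like
+# TAGFILES = other_project.tag=../other_project/html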
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       = 
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed in
+# the class index. If set to NO only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed in
+# the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            = 
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH               = 
+
+# If set to YES, the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO.
+# The default value is: NO.
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           = 
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will show a
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot.
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif and svg.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
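+
+# Illustrative only (commented out, not part of the shipped configuration):
+# per the notes above, zoomable SVG graphs could be requested with a
+# combination like the following, switching HTML_FILE_EXTENSION (defined in
+# the HTML section of this file) to .xhtml for IE 9+ compatibility.
+#
+# DOT_IMAGE_FORMAT     = svg
+# INTERACTIVE_SVG      = YES
+# HTML_FILE_EXTENSION  = .xhtml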
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               = 
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           = 
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           = 
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           = 
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES, the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lie
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP            = YES
diff --git a/final/offload/doc/doxygen/header.tex b/final/offload/doc/doxygen/header.tex
new file mode 100755
index 0000000..5e963c2
--- /dev/null
+++ b/final/offload/doc/doxygen/header.tex
@@ -0,0 +1,90 @@
+% Latex header for doxygen 1.8.3.1
+\documentclass{book}
+\usepackage[a4paper,top=2.5cm,bottom=2.5cm,left=2.5cm,right=2.5cm]{geometry}
+\usepackage{makeidx}
+\usepackage{natbib}
+\usepackage{graphicx}
+\usepackage{multicol}
+\usepackage{float}
+\usepackage{listings}
+\usepackage{color}
+\usepackage{ifthen}
+\usepackage[table]{xcolor}
+\usepackage{textcomp}
+\usepackage{alltt}
+\usepackage{ifpdf}
+\ifpdf
+\usepackage[pdftex,
+            pagebackref=true,
+            colorlinks=true,
+            linkcolor=blue,
+            unicode
+           ]{hyperref}
+\else
+\usepackage[ps2pdf,
+            pagebackref=true,
+            colorlinks=true,
+            linkcolor=blue,
+            unicode
+           ]{hyperref}
+\usepackage{pspicture}
+\fi
+\usepackage[utf8]{inputenc}
+\usepackage{mathptmx}
+\usepackage[scaled=.90]{helvet}
+\usepackage{courier}
+\usepackage{sectsty}
+\usepackage{amssymb}
+\usepackage[titles]{tocloft}
+\usepackage{doxygen}
+\usepackage{fancyhdr}
+\pagestyle{fancy}
+\lstset{language=C++,inputencoding=utf8,basicstyle=\footnotesize,breaklines=true,breakatwhitespace=true,tabsize=4,numbers=left }
+\makeindex
+\setcounter{tocdepth}{3}
+\renewcommand{\footrulewidth}{0.4pt}
+\renewcommand{\familydefault}{\sfdefault}
+\hfuzz=15pt
+\setlength{\emergencystretch}{15pt}
+\hbadness=750
+\tolerance=750
+\begin{document}
+\hypersetup{pageanchor=false,citecolor=blue}
+\begin{titlepage}
+\vspace*{7cm}
+\begin{center}
+{\Large Intel\textsuperscript{\textregistered} Offload Runtime Library }\\
+\vspace*{1cm}
+{\large Generated by Doxygen $doxygenversion }\\
+\vspace*{0.5cm}
+{\small $datetime }\\
+\end{center}
+\end{titlepage}
+
+{\bf FTC Optimization Notice}
+
+Intel's compilers may or may not optimize to the same degree for non-Intel microprocessors for
+optimizations that are not unique to Intel microprocessors. These optimizations include SSE2,
+SSE3, and SSSE3 instruction sets and other optimizations. Intel does not guarantee the
+availability, functionality, or effectiveness of any optimization on microprocessors not
+manufactured by Intel.
+
+Microprocessor-dependent optimizations in this product are intended for use with Intel
+microprocessors. Certain optimizations not specific to Intel microarchitecture are reserved for
+Intel microprocessors. Please refer to the applicable product User and Reference Guides for
+more information regarding the specific instruction sets covered by this notice.
+
+Notice revision \#20110804
+
+\vspace*{0.5cm}
+
+{\bf Trademarks}
+
+Intel, Xeon, and Intel Xeon Phi are trademarks of Intel Corporation in the U.S. and/or other countries.
+
+This document is Copyright \textcopyright 2014, Intel Corporation. All rights reserved.
+
+\pagenumbering{roman}
+\tableofcontents
+\pagenumbering{arabic}
+\hypersetup{pageanchor=true,citecolor=blue}
diff --git a/final/offload/src/cean_util.cpp b/final/offload/src/cean_util.cpp
new file mode 100644
index 0000000..fe1890b
--- /dev/null
+++ b/final/offload/src/cean_util.cpp
@@ -0,0 +1,344 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "cean_util.h"
+#include "offload_common.h"
+
+// 1. Allocate an element of CeanReadRanges type.
+// 2. Initialize it for sequentially reading the contiguous ranges
+//    described by the "ap" argument.
+CeanReadRanges * init_read_ranges_arr_desc(const arr_desc *ap)
+{
+    CeanReadRanges * res;
+
+    // find the max contiguous range
+    int64_t rank = ap->rank - 1;
+    int64_t length = ap->dim[rank].size;
+    for (; rank >= 0; rank--) {
+        if (ap->dim[rank].stride == 1) {
+            length *= (ap->dim[rank].upper - ap->dim[rank].lower + 1);
+            if (rank > 0 && length != ap->dim[rank - 1].size) {
+                break;
+            }
+        }
+        else {
+            break;
+        }
+    }
+
+    res = (CeanReadRanges *)malloc(sizeof(CeanReadRanges) +
+                                  (ap->rank - rank) * sizeof(CeanReadDim));
+    res->current_number = 0;
+    res->range_size = length;
+    res->last_noncont_ind = rank;
+
+    // calculate number of contiguous ranges inside noncontiguous dimensions
+    int count = 1;
+    bool prev_is_cont = true;
+    int64_t offset = 0;
+
+    for (; rank >= 0; rank--) {
+        res->Dim[rank].count = count;
+        res->Dim[rank].size = ap->dim[rank].stride * ap->dim[rank].size;
+        count *= (prev_is_cont && ap->dim[rank].stride == 1? 1 :
+            (ap->dim[rank].upper - ap->dim[rank].lower +
+            ap->dim[rank].stride) / ap->dim[rank].stride);
+        prev_is_cont = false;
+        offset +=(ap->dim[rank].lower - ap->dim[rank].lindex) *
+                 ap->dim[rank].size;
+    }
+    res->range_max_number = count;
+    res->ptr = (void*)ap->base;
+    res->init_offset = offset;
+    return res;
+}
+
+// Check whether the ranges described by the first argument can be
+// transferred into the ranges described by the second one: true when either
+// range size evenly divides the other (a NULL descriptor always matches).
+bool cean_ranges_match(
+    CeanReadRanges * read_rng1,
+    CeanReadRanges * read_rng2
+)
+{
+    return ( read_rng1 == NULL || read_rng2 == NULL ||
+            (read_rng1->range_size % read_rng2->range_size == 0 ||
+            read_rng2->range_size % read_rng1->range_size == 0));
+}
+
+// Sets the offset of the next contiguous range and returns true.
+// Returns false once all ranges have been read.
+bool get_next_range(
+    CeanReadRanges * read_rng,
+    int64_t *offset
+)
+{
+    if (++read_rng->current_number > read_rng->range_max_number) {
+        read_rng->current_number = 0;
+        return false;
+    }
+    int rank = 0;
+    int num = read_rng->current_number - 1;
+    int64_t cur_offset = 0;
+    int num_loc;
+    for (; rank <= read_rng->last_noncont_ind; rank++) {
+        num_loc = num / read_rng->Dim[rank].count;
+        cur_offset += num_loc * read_rng->Dim[rank].size;
+        num = num % read_rng->Dim[rank].count;
+    }
+    *offset = cur_offset + read_rng->init_offset;
+    return true;
+}
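+
+// Illustrative only (an assumed usage pattern, not code from this file):
+// the two routines above are meant to be used together -- build the range
+// descriptor once, then walk every contiguous chunk of the array section:
+//
+//     CeanReadRanges *rng = init_read_ranges_arr_desc(ap);
+//     int64_t offset;
+//     while (get_next_range(rng, &offset)) {
+//         // transfer rng->range_size bytes starting at
+//         // (char*)rng->ptr + offset
+//     }
+//     free(rng);  // the descriptor was malloc'ed by init_read_ranges_arr_desc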
+
+bool is_arr_desc_contiguous(const arr_desc *ap)
+{
+    int64_t rank = ap->rank - 1;
+    int64_t length = ap->dim[rank].size;
+    for (; rank >= 0; rank--) {
+        if (ap->dim[rank].stride > 1 &&
+            ap->dim[rank].upper - ap->dim[rank].lower != 0) {
+                return false;
+        }
+        else if (length != ap->dim[rank].size) {
+            for (; rank >= 0; rank--) {
+                if (ap->dim[rank].upper - ap->dim[rank].lower != 0) {
+                    return false;
+                }
+            }
+            return true;
+        }
+        length *= (ap->dim[rank].upper - ap->dim[rank].lower + 1);
+    }
+    return true;
+}
+
+int64_t cean_get_transf_size(CeanReadRanges * read_rng)
+{
+    return(read_rng->range_max_number * read_rng->range_size);
+}
+
+static uint64_t last_left, last_right;
+typedef void (*fpp)(const char *spaces, uint64_t low, uint64_t high, int esize);
+
+static void generate_one_range(
+    const char *spaces,
+    uint64_t lrange,
+    uint64_t rrange,
+    fpp fp,
+    int esize
+)
+{
+    OFFLOAD_TRACE(3,
+        "%s    generate_one_range(lrange=%p, rrange=%p, esize=%d)\n",
+        spaces, (void*)lrange, (void*)rrange, esize);
+    if (last_left == -1) {
+        // First range
+        last_left = lrange;
+    }
+    else {
+        if (lrange == last_right+1) {
+            // Extend previous range, don't print
+        }
+        else {
+            (*fp)(spaces, last_left, last_right, esize);
+            last_left = lrange;
+        }
+    }
+    last_right = rrange;
+}
+
+static void generate_mem_ranges_one_rank(
+    const char *spaces,
+    uint64_t base,
+    uint64_t rank,
+    const struct dim_desc *ddp,
+    fpp fp,
+    int esize
+)
+{
+    uint64_t lindex = ddp->lindex;
+    uint64_t lower = ddp->lower;
+    uint64_t upper = ddp->upper;
+    uint64_t stride = ddp->stride;
+    uint64_t size = ddp->size;
+    OFFLOAD_TRACE(3,
+        "%s    "
+        "generate_mem_ranges_one_rank(base=%p, rank=%lld, lindex=%lld, "
+        "lower=%lld, upper=%lld, stride=%lld, size=%lld, esize=%d)\n",
+        spaces, (void*)base, rank, lindex, lower, upper, stride, size, esize);
+    if (rank == 1) {
+        uint64_t lrange, rrange;
+        if (stride == 1) {
+            lrange = base + (lower-lindex)*size;
+            rrange = lrange + (upper-lower+1)*size - 1;
+            generate_one_range(spaces, lrange, rrange, fp, esize);
+        }
+        else {
+            for (int i=lower-lindex; i<=upper-lindex; i+=stride) {
+                lrange = base + i*size;
+                rrange = lrange + size - 1;
+                generate_one_range(spaces, lrange, rrange, fp, esize);
+            }
+        }
+    }
+    else {
+        for (int i=lower-lindex; i<=upper-lindex; i+=stride) {
+            generate_mem_ranges_one_rank(
+                spaces, base+i*size, rank-1, ddp+1, fp, esize);
+
+        }
+    }
+}
+
+static void generate_mem_ranges(
+    const char *spaces,
+    const arr_desc *adp,
+    bool deref,
+    fpp fp
+)
+{
+    uint64_t esize;
+
+    OFFLOAD_TRACE(3,
+        "%s    "
+        "generate_mem_ranges(adp=%p, deref=%d, fp)\n",
+        spaces, adp, deref);
+    last_left = -1;
+    last_right = -2;
+
+    // Element size is derived from last dimension
+    esize = adp->dim[adp->rank-1].size;
+
+    generate_mem_ranges_one_rank(
+        // For c_cean_var the base addr is the address of the data
+        // For c_cean_var_ptr the base addr is dereferenced to get to the data
+        spaces, deref ? *((uint64_t*)(adp->base)) : adp->base,
+        adp->rank, &adp->dim[0], fp, esize);
+    (*fp)(spaces, last_left, last_right, esize);
+}
+
+// returns offset and length of the data to be transferred
+void __arr_data_offset_and_length(
+    const arr_desc *adp,
+    int64_t &offset,
+    int64_t &length
+)
+{
+    int64_t rank = adp->rank - 1;
+    int64_t size = adp->dim[rank].size;
+    int64_t r_off = 0; // offset from right boundary
+
+    // Find the rightmost dimension that uses only part of its range.
+    // It is detected when the byte size of the enclosing (left) rank does
+    // not equal the length spanned between the lower and upper boundaries.
+    while (rank > 0) {
+        size *= (adp->dim[rank].upper - adp->dim[rank].lower + 1);
+        if (size != adp->dim[rank - 1].size) {
+            break;
+        }
+        rank--;
+    }
+
+    offset = (adp->dim[rank].lower - adp->dim[rank].lindex) *
+             adp->dim[rank].size;
+
+    // find gaps both from the left - offset and from the right - r_off
+    for (rank--; rank >= 0; rank--) {
+        offset += (adp->dim[rank].lower - adp->dim[rank].lindex) *
+                  adp->dim[rank].size;
+        r_off += adp->dim[rank].size -
+                 (adp->dim[rank + 1].upper - adp->dim[rank + 1].lindex + 1) *
+                 adp->dim[rank + 1].size;
+    }
+    length = (adp->dim[0].upper - adp->dim[0].lindex + 1) *
+             adp->dim[0].size - offset - r_off;
+}
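+
+// Worked example (illustrative, not from the original source): a 10x10 array
+// of 4-byte ints with a section covering rows 2..4 and columns 1..4 would be
+// described by rank = 2, dim[0] = {size=40, lindex=0, lower=2, upper=4,
+// stride=1} and dim[1] = {size=4, lindex=0, lower=1, upper=4, stride=1}:
+//     offset = 2*40 + 1*4         = 84   (first element: row 2, column 1)
+//     r_off  = 40 - (4+1)*4       = 20   (right gap inside a row)
+//     length = (4+1)*40 - 84 - 20 = 96
+// i.e. the transferred region spans bytes [84, 180), including the interior
+// row gaps but excluding the margins before the first and after the last
+// element of the section.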
+
+#if OFFLOAD_DEBUG > 0
+
+void print_range(
+    const char *spaces,
+    uint64_t low,
+    uint64_t high,
+    int esize
+)
+{
+    char buffer[1024];
+    char number[32];
+
+    OFFLOAD_TRACE(3, "%s        print_range(low=%p, high=%p, esize=%d)\n",
+        spaces, (void*)low, (void*)high, esize);
+
+    if (console_enabled < 4) {
+        return;
+    }
+    OFFLOAD_TRACE(4, "%s            values:\n", spaces);
+    int count = 0;
+    buffer[0] = '\0';
+    while (low <= high)
+    {
+        switch (esize)
+        {
+        case 1:
+            sprintf(number, "%d ", *((char *)low));
+            low += 1;
+            break;
+        case 2:
+            sprintf(number, "%d ", *((short *)low));
+            low += 2;
+            break;
+        case 4:
+            sprintf(number, "%d ", *((int *)low));
+            low += 4;
+            break;
+        default:
+            sprintf(number, "0x%016x ", *((uint64_t *)low));
+            low += 8;
+            break;
+        }
+        strcat(buffer, number);
+        count++;
+        if (count == 10) {
+            OFFLOAD_TRACE(4, "%s            %s\n", spaces, buffer);
+            count = 0;
+            buffer[0] = '\0';
+        }
+    }
+    if (count != 0) {
+        OFFLOAD_TRACE(4, "%s            %s\n", spaces, buffer);
+    }
+}
+
+void __arr_desc_dump(
+    const char *spaces,
+    const char *name,
+    const arr_desc *adp,
+    bool deref
+)
+{
+    OFFLOAD_TRACE(2, "%s%s CEAN expression %p\n", spaces, name, adp);
+
+    if (adp != 0) {
+        OFFLOAD_TRACE(2, "%s    base=%llx, rank=%lld\n",
+            spaces, adp->base, adp->rank);
+
+        for (int i = 0; i < adp->rank; i++) {
+            OFFLOAD_TRACE(2,
+                          "%s    dimension %d: size=%lld, lindex=%lld, "
+                          "lower=%lld, upper=%lld, stride=%lld\n",
+                          spaces, i, adp->dim[i].size, adp->dim[i].lindex,
+                          adp->dim[i].lower, adp->dim[i].upper,
+                          adp->dim[i].stride);
+        }
+        // For c_cean_var the base addr is the address of the data
+        // For c_cean_var_ptr the base addr is dereferenced to get to the data
+        generate_mem_ranges(spaces, adp, deref, &print_range);
+    }
+}
+#endif // OFFLOAD_DEBUG
diff --git a/final/offload/src/cean_util.h b/final/offload/src/cean_util.h
new file mode 100644
index 0000000..d0debcc
--- /dev/null
+++ b/final/offload/src/cean_util.h
@@ -0,0 +1,101 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef CEAN_UTIL_H_INCLUDED
+#define CEAN_UTIL_H_INCLUDED
+
+#if MPSS_VERSION > 33
+#include <source/COIBuffer_source.h>
+#endif
+#include <stdint.h>
+
+#if MPSS_VERSION <= 33
+// CEAN expression representation
+struct dim_desc {
+    int64_t size;       // Length of data type
+    int64_t lindex;     // Lower index
+    int64_t lower;      // Lower section bound
+    int64_t upper;      // Upper section bound
+    int64_t stride;     // Stride
+};
+
+struct arr_desc {
+    int64_t base;       // Base address
+    int64_t rank;       // Rank of array
+    dim_desc dim[1];
+};
+#endif
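+
+// Illustrative only (an assumed example, not part of the original header):
+// the section covering elements 10..29 of "int a[100]" with stride 1 would
+// be described as:
+//
+//     int a[100];
+//     arr_desc desc;
+//     desc.base          = (int64_t)a;
+//     desc.rank          = 1;
+//     desc.dim[0].size   = sizeof(int); // element size in bytes
+//     desc.dim[0].lindex = 0;
+//     desc.dim[0].lower  = 10;
+//     desc.dim[0].upper  = 29;
+//     desc.dim[0].stride = 1;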
+
+struct CeanReadDim {
+    int64_t count; // The number of elements in this dimension
+    int64_t size;  // The number of bytes between successive
+                   // elements in this dimension.
+};
+
+struct CeanReadRanges {
+    void *  ptr;
+    int64_t current_number;   // the number of ranges read
+    int64_t range_max_number; // number of contiguous ranges
+    int64_t range_size;       // size of max contiguous range
+    int     last_noncont_ind; // size of Dim array
+    int64_t init_offset;      // offset of 1-st element from array left bound
+    CeanReadDim Dim[1];
+};
+
+// array descriptor length
+#define __arr_desc_length(rank) \
+    (sizeof(int64_t) + sizeof(dim_desc) * (rank))
+
+// returns offset and length of the data to be transferred
+void __arr_data_offset_and_length(const arr_desc *adp,
+                                  int64_t &offset,
+                                  int64_t &length);
+
+// Determine whether the data array described by the argument is contiguous.
+bool is_arr_desc_contiguous(const arr_desc *ap);
+
+// Allocate an element of CeanReadRanges type, initialized for sequentially
+// reading the contiguous ranges described by the "ap" argument.
+CeanReadRanges * init_read_ranges_arr_desc(const arr_desc *ap);
+
+// Check whether the ranges described by the first argument can be
+// transferred into the ranges described by the second one.
+bool cean_ranges_match(
+    CeanReadRanges * read_rng1,
+    CeanReadRanges * read_rng2
+);
+
+// The first argument is the value returned by init_read_ranges_arr_desc.
+// Returns true if the offset of the next range was set successfully.
+// Returns false once all ranges have been read.
+bool get_next_range(
+    CeanReadRanges * read_rng,
+    int64_t *offset
+);
+
+// returns number of transferred bytes
+int64_t cean_get_transf_size(CeanReadRanges * read_rng);
+
+#if OFFLOAD_DEBUG > 0
+// prints array descriptor contents to stderr
+void    __arr_desc_dump(
+    const char *spaces,
+    const char *name,
+    const arr_desc *adp,
+    bool dereference);
+#else
+#define __arr_desc_dump( \
+    spaces,              \
+    name,                \
+    adp,                 \
+    dereference)
+#endif // OFFLOAD_DEBUG
+
+#endif // CEAN_UTIL_H_INCLUDED
diff --git a/final/offload/src/coi/coi_client.cpp b/final/offload/src/coi/coi_client.cpp
new file mode 100644
index 0000000..ab8c7f5
--- /dev/null
+++ b/final/offload/src/coi/coi_client.cpp
@@ -0,0 +1,350 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+// The COI host interface
+
+#include "coi_client.h"
+#include "../offload_common.h"
+
+namespace COI {
+
+#define COI_VERSION1    "COI_1.0"
+#define COI_VERSION2    "COI_2.0"
+
+bool            is_available;
+static void*    lib_handle;
+
+// pointers to functions from COI library
+COIRESULT (*EngineGetCount)(COI_ISA_TYPE, uint32_t*);
+COIRESULT (*EngineGetHandle)(COI_ISA_TYPE, uint32_t, COIENGINE*);
+
+COIRESULT (*ProcessCreateFromMemory)(COIENGINE, const char*, const void*,
+                                     uint64_t, int, const char**, uint8_t,
+                                     const char**, uint8_t, const char*,
+                                     uint64_t, const char*, const char*,
+                                     uint64_t, COIPROCESS*);
+COIRESULT (*ProcessDestroy)(COIPROCESS, int32_t, uint8_t, int8_t*, uint32_t*);
+COIRESULT (*ProcessGetFunctionHandles)(COIPROCESS, uint32_t, const char**,
+                                       COIFUNCTION*);
+COIRESULT (*ProcessLoadLibraryFromMemory)(COIPROCESS, const void*, uint64_t,
+                                          const char*, const char*,
+                                          const char*, uint64_t, uint32_t,
+                                          COILIBRARY*);
+COIRESULT (*ProcessRegisterLibraries)(uint32_t, const void**, const uint64_t*,
+                                      const char**, const uint64_t*);
+
+COIRESULT (*PipelineCreate)(COIPROCESS, COI_CPU_MASK, uint32_t, COIPIPELINE*);
+COIRESULT (*PipelineDestroy)(COIPIPELINE);
+COIRESULT (*PipelineRunFunction)(COIPIPELINE, COIFUNCTION, uint32_t,
+                                 const COIBUFFER*, const COI_ACCESS_FLAGS*,
+                                 uint32_t, const COIEVENT*, const void*,
+                                 uint16_t, void*, uint16_t, COIEVENT*);
+
+COIRESULT (*BufferCreate)(uint64_t, COI_BUFFER_TYPE, uint32_t, const void*,
+                          uint32_t, const COIPROCESS*, COIBUFFER*);
+COIRESULT (*BufferCreateFromMemory)(uint64_t, COI_BUFFER_TYPE, uint32_t,
+                                    void*, uint32_t, const COIPROCESS*,
+                                    COIBUFFER*);
+COIRESULT (*BufferDestroy)(COIBUFFER);
+COIRESULT (*BufferMap)(COIBUFFER, uint64_t, uint64_t, COI_MAP_TYPE, uint32_t,
+                       const COIEVENT*, COIEVENT*, COIMAPINSTANCE*, void**);
+COIRESULT (*BufferUnmap)(COIMAPINSTANCE, uint32_t, const COIEVENT*, COIEVENT*);
+COIRESULT (*BufferWrite)(COIBUFFER, uint64_t, const void*, uint64_t,
+                         COI_COPY_TYPE, uint32_t, const COIEVENT*, COIEVENT*);
+COIRESULT (*BufferRead)(COIBUFFER, uint64_t, void*, uint64_t, COI_COPY_TYPE,
+                        uint32_t, const COIEVENT*, COIEVENT*);
+COIRESULT (*BufferCopy)(COIBUFFER, COIBUFFER, uint64_t, uint64_t, uint64_t,
+                        COI_COPY_TYPE, uint32_t, const COIEVENT*, COIEVENT*);
+COIRESULT (*BufferGetSinkAddress)(COIBUFFER, uint64_t*);
+COIRESULT (*BufferSetState)(COIBUFFER, COIPROCESS, COI_BUFFER_STATE,
+                            COI_BUFFER_MOVE_FLAG, uint32_t,
+                            const   COIEVENT*, COIEVENT*);
+
+COIRESULT (*EventWait)(uint16_t, const COIEVENT*, int32_t, uint8_t, uint32_t*,
+                       uint32_t*);
+
+uint64_t  (*PerfGetCycleFrequency)(void);
+
+bool init(void)
+{
+#ifndef TARGET_WINNT
+    const char *lib_name = "libcoi_host.so.0";
+#else // TARGET_WINNT
+    const char *lib_name = "coi_host.dll";
+#endif // TARGET_WINNT
+
+    OFFLOAD_DEBUG_TRACE(2, "Loading COI library %s ...\n", lib_name);
+    lib_handle = DL_open(lib_name);
+    if (lib_handle == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to load the library\n");
+        return false;
+    }
+
+    EngineGetCount =
+        (COIRESULT (*)(COI_ISA_TYPE, uint32_t*))
+            DL_sym(lib_handle, "COIEngineGetCount", COI_VERSION1);
+    if (EngineGetCount == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIEngineGetCount");
+        fini();
+        return false;
+    }
+
+    EngineGetHandle =
+        (COIRESULT (*)(COI_ISA_TYPE, uint32_t, COIENGINE*))
+            DL_sym(lib_handle, "COIEngineGetHandle", COI_VERSION1);
+    if (EngineGetHandle == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIEngineGetHandle");
+        fini();
+        return false;
+    }
+
+    ProcessCreateFromMemory =
+        (COIRESULT (*)(COIENGINE, const char*, const void*, uint64_t, int,
+                       const char**, uint8_t, const char**, uint8_t,
+                       const char*, uint64_t, const char*, const char*,
+                       uint64_t, COIPROCESS*))
+            DL_sym(lib_handle, "COIProcessCreateFromMemory", COI_VERSION1);
+    if (ProcessCreateFromMemory == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIProcessCreateFromMemory");
+        fini();
+        return false;
+    }
+
+    ProcessDestroy =
+        (COIRESULT (*)(COIPROCESS, int32_t, uint8_t, int8_t*,
+                       uint32_t*))
+            DL_sym(lib_handle, "COIProcessDestroy", COI_VERSION1);
+    if (ProcessDestroy == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIProcessDestroy");
+        fini();
+        return false;
+    }
+
+    ProcessGetFunctionHandles =
+        (COIRESULT (*)(COIPROCESS, uint32_t, const char**, COIFUNCTION*))
+            DL_sym(lib_handle, "COIProcessGetFunctionHandles", COI_VERSION1);
+    if (ProcessGetFunctionHandles == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIProcessGetFunctionHandles");
+        fini();
+        return false;
+    }
+
+    ProcessLoadLibraryFromMemory =
+        (COIRESULT (*)(COIPROCESS, const void*, uint64_t, const char*,
+                       const char*, const char*, uint64_t, uint32_t,
+                       COILIBRARY*))
+            DL_sym(lib_handle, "COIProcessLoadLibraryFromMemory", COI_VERSION2);
+    if (ProcessLoadLibraryFromMemory == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIProcessLoadLibraryFromMemory");
+        fini();
+        return false;
+    }
+
+    ProcessRegisterLibraries =
+        (COIRESULT (*)(uint32_t, const void**, const uint64_t*, const char**,
+                       const uint64_t*))
+            DL_sym(lib_handle, "COIProcessRegisterLibraries", COI_VERSION1);
+    if (ProcessRegisterLibraries == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIProcessRegisterLibraries");
+        fini();
+        return false;
+    }
+
+    PipelineCreate =
+        (COIRESULT (*)(COIPROCESS, COI_CPU_MASK, uint32_t, COIPIPELINE*))
+            DL_sym(lib_handle, "COIPipelineCreate", COI_VERSION1);
+    if (PipelineCreate == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIPipelineCreate");
+        fini();
+        return false;
+    }
+
+    PipelineDestroy =
+        (COIRESULT (*)(COIPIPELINE))
+            DL_sym(lib_handle, "COIPipelineDestroy", COI_VERSION1);
+    if (PipelineDestroy == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIPipelineDestroy");
+        fini();
+        return false;
+    }
+
+    PipelineRunFunction =
+        (COIRESULT (*)(COIPIPELINE, COIFUNCTION, uint32_t, const COIBUFFER*,
+                       const COI_ACCESS_FLAGS*, uint32_t, const COIEVENT*,
+                       const void*, uint16_t, void*, uint16_t, COIEVENT*))
+            DL_sym(lib_handle, "COIPipelineRunFunction", COI_VERSION1);
+    if (PipelineRunFunction == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIPipelineRunFunction");
+        fini();
+        return false;
+    }
+
+    BufferCreate =
+        (COIRESULT (*)(uint64_t, COI_BUFFER_TYPE, uint32_t, const void*,
+                       uint32_t, const COIPROCESS*, COIBUFFER*))
+            DL_sym(lib_handle, "COIBufferCreate", COI_VERSION1);
+    if (BufferCreate == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIBufferCreate");
+        fini();
+        return false;
+    }
+
+    BufferCreateFromMemory =
+        (COIRESULT (*)(uint64_t, COI_BUFFER_TYPE, uint32_t, void*,
+                       uint32_t, const COIPROCESS*, COIBUFFER*))
+            DL_sym(lib_handle, "COIBufferCreateFromMemory", COI_VERSION1);
+    if (BufferCreateFromMemory == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIBufferCreateFromMemory");
+        fini();
+        return false;
+    }
+
+    BufferDestroy =
+        (COIRESULT (*)(COIBUFFER))
+            DL_sym(lib_handle, "COIBufferDestroy", COI_VERSION1);
+    if (BufferDestroy == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIBufferDestroy");
+        fini();
+        return false;
+    }
+
+    BufferMap =
+        (COIRESULT (*)(COIBUFFER, uint64_t, uint64_t, COI_MAP_TYPE, uint32_t,
+                       const COIEVENT*, COIEVENT*, COIMAPINSTANCE*,
+                       void**))
+            DL_sym(lib_handle, "COIBufferMap", COI_VERSION1);
+    if (BufferMap == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIBufferMap");
+        fini();
+        return false;
+    }
+
+    BufferUnmap =
+        (COIRESULT (*)(COIMAPINSTANCE, uint32_t, const COIEVENT*,
+                       COIEVENT*))
+            DL_sym(lib_handle, "COIBufferUnmap", COI_VERSION1);
+    if (BufferUnmap == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIBufferUnmap");
+        fini();
+        return false;
+    }
+
+    BufferWrite =
+        (COIRESULT (*)(COIBUFFER, uint64_t, const void*, uint64_t,
+                       COI_COPY_TYPE, uint32_t, const COIEVENT*,
+                       COIEVENT*))
+            DL_sym(lib_handle, "COIBufferWrite", COI_VERSION1);
+    if (BufferWrite == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIBufferWrite");
+        fini();
+        return false;
+    }
+
+    BufferRead =
+        (COIRESULT (*)(COIBUFFER, uint64_t, void*, uint64_t,
+                                     COI_COPY_TYPE, uint32_t,
+                                     const COIEVENT*, COIEVENT*))
+            DL_sym(lib_handle, "COIBufferRead", COI_VERSION1);
+    if (BufferRead == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIBufferRead");
+        fini();
+        return false;
+    }
+
+    BufferCopy =
+        (COIRESULT (*)(COIBUFFER, COIBUFFER, uint64_t, uint64_t, uint64_t,
+                       COI_COPY_TYPE, uint32_t, const COIEVENT*,
+                       COIEVENT*))
+            DL_sym(lib_handle, "COIBufferCopy", COI_VERSION1);
+    if (BufferCopy == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIBufferCopy");
+        fini();
+        return false;
+    }
+
+    BufferGetSinkAddress =
+        (COIRESULT (*)(COIBUFFER, uint64_t*))
+            DL_sym(lib_handle, "COIBufferGetSinkAddress", COI_VERSION1);
+    if (BufferGetSinkAddress == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIBufferGetSinkAddress");
+        fini();
+        return false;
+    }
+
+    BufferSetState =
+        (COIRESULT(*)(COIBUFFER, COIPROCESS, COI_BUFFER_STATE,
+                      COI_BUFFER_MOVE_FLAG, uint32_t, const COIEVENT*,
+                      COIEVENT*))
+            DL_sym(lib_handle, "COIBufferSetState", COI_VERSION1);
+    if (BufferSetState == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIBufferSetState");
+        fini();
+        return false;
+    }
+
+    EventWait =
+        (COIRESULT (*)(uint16_t, const COIEVENT*, int32_t, uint8_t,
+                       uint32_t*, uint32_t*))
+            DL_sym(lib_handle, "COIEventWait", COI_VERSION1);
+    if (EventWait == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIEventWait");
+        fini();
+        return false;
+    }
+
+    PerfGetCycleFrequency =
+        (uint64_t (*)(void))
+            DL_sym(lib_handle, "COIPerfGetCycleFrequency", COI_VERSION1);
+    if (PerfGetCycleFrequency == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in COI library\n",
+                            "COIPerfGetCycleFrequency");
+        fini();
+        return false;
+    }
+
+    is_available = true;
+
+    return true;
+}
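+
+// Illustrative only (an assumption about the caller, not code from this
+// file): the runtime is expected to probe the library once and then use the
+// resolved entry points through this namespace, e.g.:
+//
+//     if (COI::init()) {
+//         uint32_t count = 0;
+//         COI::EngineGetCount(COI_ISA_MIC, &count);
+//         // ... create processes/pipelines on the engines found
+//     }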
+
+void fini(void)
+{
+    is_available = false;
+
+    if (lib_handle != 0) {
+#ifndef TARGET_WINNT
+        DL_close(lib_handle);
+#endif // TARGET_WINNT
+        lib_handle = 0;
+    }
+}
+
+} // namespace COI
diff --git a/final/offload/src/coi/coi_client.h b/final/offload/src/coi/coi_client.h
new file mode 100644
index 0000000..4775a8b
--- /dev/null
+++ b/final/offload/src/coi/coi_client.h
@@ -0,0 +1,118 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+// The interface between the offload library and the COI API on the host
+
+#ifndef COI_CLIENT_H_INCLUDED
+#define COI_CLIENT_H_INCLUDED
+
+#include <common/COIPerf_common.h>
+#include <source/COIEngine_source.h>
+#include <source/COIProcess_source.h>
+#include <source/COIPipeline_source.h>
+#include <source/COIBuffer_source.h>
+#include <source/COIEvent_source.h>
+
+#include <string.h>
+
+#include "../liboffload_error_codes.h"
+#include "../offload_util.h"
+
+#define MIC_ENGINES_MAX     128
+
+#if MIC_ENGINES_MAX < COI_MAX_ISA_MIC_DEVICES
+#error MIC_ENGINES_MAX needs to be increased
+#endif
+
+// COI library interface
+namespace COI {
+
+extern bool init(void);
+extern void fini(void);
+
+extern bool is_available;
+
+// pointers to functions from COI library
+extern COIRESULT (*EngineGetCount)(COI_ISA_TYPE, uint32_t*);
+extern COIRESULT (*EngineGetHandle)(COI_ISA_TYPE, uint32_t, COIENGINE*);
+
+extern COIRESULT (*ProcessCreateFromMemory)(COIENGINE, const char*,
+                                           const void*, uint64_t, int,
+                                           const char**, uint8_t,
+                                           const char**, uint8_t,
+                                           const char*, uint64_t,
+                                           const char*,
+                                           const char*, uint64_t,
+                                           COIPROCESS*);
+extern COIRESULT (*ProcessDestroy)(COIPROCESS, int32_t, uint8_t,
+                                  int8_t*, uint32_t*);
+extern COIRESULT (*ProcessGetFunctionHandles)(COIPROCESS, uint32_t,
+                                             const char**,
+                                             COIFUNCTION*);
+extern COIRESULT (*ProcessLoadLibraryFromMemory)(COIPROCESS,
+                                                const void*,
+                                                uint64_t,
+                                                const char*,
+                                                const char*,
+                                                const char*,
+                                                uint64_t,
+                                                uint32_t,
+                                                COILIBRARY*);
+extern COIRESULT (*ProcessRegisterLibraries)(uint32_t,
+                                            const void**,
+                                            const uint64_t*,
+                                            const char**,
+                                            const uint64_t*);
+
+extern COIRESULT (*PipelineCreate)(COIPROCESS, COI_CPU_MASK, uint32_t,
+                                  COIPIPELINE*);
+extern COIRESULT (*PipelineDestroy)(COIPIPELINE);
+extern COIRESULT (*PipelineRunFunction)(COIPIPELINE, COIFUNCTION,
+                                       uint32_t, const COIBUFFER*,
+                                       const COI_ACCESS_FLAGS*,
+                                       uint32_t, const COIEVENT*,
+                                       const void*, uint16_t, void*,
+                                       uint16_t, COIEVENT*);
+
+extern COIRESULT (*BufferCreate)(uint64_t, COI_BUFFER_TYPE, uint32_t,
+                                const void*, uint32_t,
+                                const COIPROCESS*, COIBUFFER*);
+extern COIRESULT (*BufferCreateFromMemory)(uint64_t, COI_BUFFER_TYPE,
+                                          uint32_t, void*,
+                                          uint32_t, const COIPROCESS*,
+                                          COIBUFFER*);
+extern COIRESULT (*BufferDestroy)(COIBUFFER);
+extern COIRESULT (*BufferMap)(COIBUFFER, uint64_t, uint64_t,
+                             COI_MAP_TYPE, uint32_t, const COIEVENT*,
+                             COIEVENT*, COIMAPINSTANCE*, void**);
+extern COIRESULT (*BufferUnmap)(COIMAPINSTANCE, uint32_t,
+                               const COIEVENT*, COIEVENT*);
+extern COIRESULT (*BufferWrite)(COIBUFFER, uint64_t, const void*,
+                               uint64_t, COI_COPY_TYPE, uint32_t,
+                               const COIEVENT*, COIEVENT*);
+extern COIRESULT (*BufferRead)(COIBUFFER, uint64_t, void*, uint64_t,
+                              COI_COPY_TYPE, uint32_t,
+                              const COIEVENT*, COIEVENT*);
+extern COIRESULT (*BufferCopy)(COIBUFFER, COIBUFFER, uint64_t, uint64_t,
+                              uint64_t, COI_COPY_TYPE, uint32_t,
+                              const COIEVENT*, COIEVENT*);
+extern COIRESULT (*BufferGetSinkAddress)(COIBUFFER, uint64_t*);
+extern COIRESULT (*BufferSetState)(COIBUFFER, COIPROCESS, COI_BUFFER_STATE,
+                                   COI_BUFFER_MOVE_FLAG, uint32_t,
+                                   const   COIEVENT*, COIEVENT*);
+
+extern COIRESULT (*EventWait)(uint16_t, const COIEVENT*, int32_t,
+                           uint8_t, uint32_t*, uint32_t*);
+
+extern uint64_t  (*PerfGetCycleFrequency)(void);
+
+} // namespace COI
+
+#endif // COI_CLIENT_H_INCLUDED
diff --git a/final/offload/src/coi/coi_server.cpp b/final/offload/src/coi/coi_server.cpp
new file mode 100644
index 0000000..73e6c2d
--- /dev/null
+++ b/final/offload/src/coi/coi_server.cpp
@@ -0,0 +1,130 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+// The COI interface on the target
+
+#include "coi_server.h"
+
+#include "../offload_target.h"
+#include "../offload_timer.h"
+#ifdef MYO_SUPPORT
+#include "../offload_myo_target.h"      // for __offload_myoLibInit/Fini
+#endif // MYO_SUPPORT
+
+COINATIVELIBEXPORT
+void server_compute(
+    uint32_t  buffer_count,
+    void**    buffers,
+    uint64_t* buffers_len,
+    void*     misc_data,
+    uint16_t  misc_data_len,
+    void*     return_data,
+    uint16_t  return_data_len
+)
+{
+    OffloadDescriptor::offload(buffer_count, buffers,
+                               misc_data, misc_data_len,
+                               return_data, return_data_len);
+}
+
+COINATIVELIBEXPORT
+void server_init(
+    uint32_t  buffer_count,
+    void**    buffers,
+    uint64_t* buffers_len,
+    void*     misc_data,
+    uint16_t  misc_data_len,
+    void*     return_data,
+    uint16_t  return_data_len
+)
+{
+    struct init_data {
+        int  device_index;
+        int  devices_total;
+        int  console_level;
+        int  offload_report_level;
+    } *data = (struct init_data*) misc_data;
+
+    // set device index and number of total devices
+    mic_index = data->device_index;
+    mic_engines_total = data->devices_total;
+
+    // initialize trace level
+    console_enabled = data->console_level;
+    offload_report_level = data->offload_report_level;
+
+    // return the process id
+    *((pid_t*) return_data) = getpid();
+}
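+
+// Illustrative only (an assumed host-side counterpart, not part of this
+// file): the host is expected to pack a matching init_data struct into the
+// misc_data argument when invoking server_init through PipelineRunFunction:
+//
+//     init_data data = { mic_index, mic_engines_total,
+//                        console_level, offload_report_level };
+//     pid_t pid;
+//     PipelineRunFunction(pipeline, init_func, 0, 0, 0, 0, 0,
+//                         &data, sizeof(data), &pid, sizeof(pid), &event);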
+
+COINATIVELIBEXPORT
+void server_var_table_size(
+    uint32_t  buffer_count,
+    void**    buffers,
+    uint64_t* buffers_len,
+    void*     misc_data,
+    uint16_t  misc_data_len,
+    void*     return_data,
+    uint16_t  return_data_len
+)
+{
+    struct Params {
+        int64_t nelems;
+        int64_t length;
+    } *params;
+
+    params = static_cast<Params*>(return_data);
+    params->length = __offload_vars.table_size(params->nelems);
+}
+
+COINATIVELIBEXPORT
+void server_var_table_copy(
+    uint32_t  buffer_count,
+    void**    buffers,
+    uint64_t* buffers_len,
+    void*     misc_data,
+    uint16_t  misc_data_len,
+    void*     return_data,
+    uint16_t  return_data_len
+)
+{
+    __offload_vars.table_copy(buffers[0], *static_cast<int64_t*>(misc_data));
+}
+
+#ifdef MYO_SUPPORT
+// temporary workaround for blocking behavior of myoiLibInit/Fini calls
+COINATIVELIBEXPORT
+void server_myoinit(
+    uint32_t  buffer_count,
+    void**    buffers,
+    uint64_t* buffers_len,
+    void*     misc_data,
+    uint16_t  misc_data_len,
+    void*     return_data,
+    uint16_t  return_data_len
+)
+{
+    __offload_myoLibInit();
+}
+
+COINATIVELIBEXPORT
+void server_myofini(
+    uint32_t  buffer_count,
+    void**    buffers,
+    uint64_t* buffers_len,
+    void*     misc_data,
+    uint16_t  misc_data_len,
+    void*     return_data,
+    uint16_t  return_data_len
+)
+{
+    __offload_myoLibFini();
+}
+#endif // MYO_SUPPORT
diff --git a/final/offload/src/coi/coi_server.h b/final/offload/src/coi/coi_server.h
new file mode 100644
index 0000000..e744d9e
--- /dev/null
+++ b/final/offload/src/coi/coi_server.h
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+// The interface between the offload library and the COI API on the target.
+
+#ifndef COI_SERVER_H_INCLUDED
+#define COI_SERVER_H_INCLUDED
+
+#include <common/COIEngine_common.h>
+#include <common/COIPerf_common.h>
+#include <sink/COIProcess_sink.h>
+#include <sink/COIPipeline_sink.h>
+#include <sink/COIBuffer_sink.h>
+#include <list>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "../liboffload_error_codes.h"
+
+// wrappers for COI API
+#define PipelineStartExecutingRunFunctions() \
+    { \
+        COIRESULT res = COIPipelineStartExecutingRunFunctions(); \
+        if (res != COI_SUCCESS) { \
+            LIBOFFLOAD_ERROR(c_pipeline_start_run_funcs, mic_index, res); \
+            exit(1); \
+        } \
+    }
+
+#define ProcessWaitForShutdown() \
+    { \
+        COIRESULT res = COIProcessWaitForShutdown(); \
+        if (res != COI_SUCCESS) { \
+            LIBOFFLOAD_ERROR(c_process_wait_shutdown, mic_index, res); \
+            exit(1); \
+        } \
+    }
+
+#define BufferAddRef(buf) \
+    { \
+        COIRESULT res = COIBufferAddRef(buf); \
+        if (res != COI_SUCCESS) { \
+            LIBOFFLOAD_ERROR(c_buf_add_ref, mic_index, res); \
+            exit(1); \
+        } \
+    }
+
+#define BufferReleaseRef(buf) \
+    { \
+        COIRESULT res = COIBufferReleaseRef(buf); \
+        if (res != COI_SUCCESS) { \
+            LIBOFFLOAD_ERROR(c_buf_release_ref, mic_index, res); \
+            exit(1); \
+        } \
+    }
+
+#define EngineGetIndex(index) \
+    { \
+        COI_ISA_TYPE isa_type; \
+        COIRESULT res = COIEngineGetIndex(&isa_type, index); \
+        if (res != COI_SUCCESS) { \
+            LIBOFFLOAD_ERROR(c_get_engine_index, mic_index, res); \
+            exit(1); \
+        } \
+    }
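+
+// Illustrative only (an assumed sink-side main routine, not part of this
+// header): the wrappers above are meant to bracket the run-function service
+// loop, e.g.:
+//
+//     PipelineStartExecutingRunFunctions();
+//     ProcessWaitForShutdown();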
+
+#endif // COI_SERVER_H_INCLUDED
diff --git a/final/offload/src/compiler_if_host.cpp b/final/offload/src/compiler_if_host.cpp
new file mode 100644
index 0000000..2bc430b
--- /dev/null
+++ b/final/offload/src/compiler_if_host.cpp
@@ -0,0 +1,323 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "compiler_if_host.h"
+
+#include <malloc.h>
+#ifndef TARGET_WINNT
+#include <alloca.h>
+#endif // TARGET_WINNT
+
+// Global counter on the host.
+// This variable is used if P2OPT_offload_do_data_persistence == 2.
+// It identifies the offload constructs contained in one procedure.
+// An increment of OFFLOAD_CALL_COUNT is inserted at the entry of every HOST
+// routine that contains offload constructs.
+static int offload_call_count = 0;
+
+extern "C" OFFLOAD OFFLOAD_TARGET_ACQUIRE(
+    TARGET_TYPE      target_type,
+    int              target_number,
+    int              is_optional,
+    _Offload_status* status,
+    const char*      file,
+    uint64_t         line
+)
+{
+    bool retval;
+    OFFLOAD ofld;
+
+    // initialize status
+    if (status != 0) {
+        status->result = OFFLOAD_UNAVAILABLE;
+        status->device_number = -1;
+        status->data_sent = 0;
+        status->data_received = 0;
+    }
+
+    // make sure the library is initialized
+    retval = __offload_init_library();
+
+    // OFFLOAD_TIMER_INIT must follow call to __offload_init_library
+    OffloadHostTimerData * timer_data = OFFLOAD_TIMER_INIT(file, line);
+
+    OFFLOAD_TIMER_START(timer_data, c_offload_host_total_offload);
+
+    OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);
+
+    // initialize all devices if init_type is on_offload_all
+    if (retval && __offload_init_type == c_init_on_offload_all) {
+        for (int i = 0; i < mic_engines_total; i++) {
+             mic_engines[i].init();
+        }
+    }
+    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);
+
+    OFFLOAD_TIMER_START(timer_data, c_offload_host_target_acquire);
+
+    if (target_type == TARGET_HOST) {
+        // The host is always available
+        retval = true;
+    }
+    else if (target_type == TARGET_MIC) {
+        if (target_number >= -1) {
+            if (retval) {
+                if (target_number >= 0) {
+                    // User provided the device number
+                    target_number = target_number % mic_engines_total;
+                }
+                else {
+                    // use device 0
+                    target_number = 0;
+                }
+
+                // reserve device in ORSL
+                if (is_optional) {
+                    if (!ORSL::try_reserve(target_number)) {
+                        target_number = -1;
+                    }
+                }
+                else {
+                    if (!ORSL::reserve(target_number)) {
+                        target_number = -1;
+                    }
+                }
+
+                // initialize device
+                if (target_number >= 0 &&
+                    __offload_init_type == c_init_on_offload) {
+                    OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);
+                    mic_engines[target_number].init();
+                    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);
+                }
+            }
+            else {
+                // fallback to CPU
+                target_number = -1;
+            }
+
+            if (target_number < 0 || !retval) {
+                if (!is_optional && status == 0) {
+                    LIBOFFLOAD_ERROR(c_device_is_not_available);
+                    exit(1);
+                }
+
+                retval = false;
+            }
+        }
+        else {
+            LIBOFFLOAD_ERROR(c_invalid_device_number);
+            exit(1);
+        }
+    }
+
+    if (retval) {
+        ofld = new OffloadDescriptor(target_number, status,
+                                     !is_optional, false, timer_data);
+        OFFLOAD_TIMER_HOST_MIC_NUM(timer_data, target_number);
+        Offload_Report_Prolog(timer_data);
+        OFFLOAD_DEBUG_TRACE_1(2, timer_data->offload_number, c_offload_start,
+                              "Starting offload: target_type = %d, "
+                              "number = %d, is_optional = %d\n",
+                              target_type, target_number, is_optional);
+
+        OFFLOAD_TIMER_STOP(timer_data, c_offload_host_target_acquire);
+    }
+    else {
+        ofld = NULL;
+
+        OFFLOAD_TIMER_STOP(timer_data, c_offload_host_target_acquire);
+        OFFLOAD_TIMER_STOP(timer_data, c_offload_host_total_offload);
+        offload_report_free_data(timer_data);
+    }
+
+    return ofld;
+}
+
+extern "C" OFFLOAD OFFLOAD_TARGET_ACQUIRE1(
+    const int*  device_num,
+    const char* file,
+    uint64_t    line
+)
+{
+    int target_number;
+
+    // make sure the library is initialized and at least one device is available
+    if (!__offload_init_library()) {
+        LIBOFFLOAD_ERROR(c_device_is_not_available);
+        exit(1);
+    }
+
+    // OFFLOAD_TIMER_INIT must follow call to __offload_init_library
+
+    OffloadHostTimerData * timer_data = OFFLOAD_TIMER_INIT(file, line);
+
+    OFFLOAD_TIMER_START(timer_data, c_offload_host_total_offload);
+
+    OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);
+
+    if (__offload_init_type == c_init_on_offload_all) {
+        for (int i = 0; i < mic_engines_total; i++) {
+             mic_engines[i].init();
+        }
+    }
+
+    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);
+
+    OFFLOAD_TIMER_START(timer_data, c_offload_host_target_acquire);
+
+    // use default device number if it is not provided
+    if (device_num != 0) {
+        target_number = *device_num;
+    }
+    else {
+        target_number = __omp_device_num;
+    }
+
+    // device number should be a non-negative integer value
+    if (target_number < 0) {
+        LIBOFFLOAD_ERROR(c_omp_invalid_device_num);
+        exit(1);
+    }
+
+    // should we do this for OpenMP?
+    target_number %= mic_engines_total;
+
+    // reserve device in ORSL
+    if (!ORSL::reserve(target_number)) {
+        LIBOFFLOAD_ERROR(c_device_is_not_available);
+        exit(1);
+    }
+
+    // initialize device(s)
+    OFFLOAD_TIMER_START(timer_data, c_offload_host_initialize);
+
+    if (__offload_init_type == c_init_on_offload) {
+        mic_engines[target_number].init();
+    }
+
+    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_initialize);
+
+    OFFLOAD ofld =
+        new OffloadDescriptor(target_number, 0, true, true, timer_data);
+
+    OFFLOAD_TIMER_HOST_MIC_NUM(timer_data, target_number);
+
+    Offload_Report_Prolog(timer_data);
+
+    OFFLOAD_DEBUG_TRACE_1(2, timer_data->offload_number, c_offload_start,
+                          "Starting OpenMP offload, device = %d\n",
+                          target_number);
+
+    OFFLOAD_TIMER_STOP(timer_data, c_offload_host_target_acquire);
+
+    return ofld;
+}
+
+int offload_offload_wrap(
+    OFFLOAD ofld,
+    const char *name,
+    int is_empty,
+    int num_vars,
+    VarDesc *vars,
+    VarDesc2 *vars2,
+    int num_waits,
+    const void **waits,
+    const void **signal,
+    int entry_id,
+    const void *stack_addr
+)
+{
+    bool ret = ofld->offload(name, is_empty, vars, vars2, num_vars,
+                             waits, num_waits, signal, entry_id, stack_addr);
+    if (!ret || signal == 0) {
+        delete ofld;
+    }
+    return ret;
+}
+
+extern "C" int OFFLOAD_OFFLOAD1(
+    OFFLOAD ofld,
+    const char *name,
+    int is_empty,
+    int num_vars,
+    VarDesc *vars,
+    VarDesc2 *vars2,
+    int num_waits,
+    const void **waits,
+    const void **signal
+)
+{
+    return offload_offload_wrap(ofld, name, is_empty,
+                            num_vars, vars, vars2,
+                            num_waits, waits,
+                            signal, /*entry_id=*/0, /*stack_addr=*/NULL);
+}
+
+extern "C" int OFFLOAD_OFFLOAD2(
+    OFFLOAD ofld,
+    const char *name,
+    int is_empty,
+    int num_vars,
+    VarDesc *vars,
+    VarDesc2 *vars2,
+    int num_waits,
+    const void** waits,
+    const void** signal,
+    int entry_id,
+    const void *stack_addr
+)
+{
+    return offload_offload_wrap(ofld, name, is_empty,
+                            num_vars, vars, vars2,
+                            num_waits, waits,
+                            signal, entry_id, stack_addr);
+}
+
+extern "C" int OFFLOAD_OFFLOAD(
+    OFFLOAD ofld,
+    const char *name,
+    int is_empty,
+    int num_vars,
+    VarDesc *vars,
+    VarDesc2 *vars2,
+    int num_waits,
+    const void **waits,
+    const void *signal,
+    int entry_id,
+    const void *stack_addr
+)
+{
+    // signal is passed by reference now
+    const void **signal_new = (signal != 0) ? &signal : 0;
+    const void **waits_new = 0;
+    int num_waits_new = 0;
+
+    // remove NULL values from the list of signals to wait for
+    if (num_waits > 0) {
+        waits_new = (const void**) alloca(sizeof(void*) * num_waits);
+        for (int i = 0; i < num_waits; i++) {
+            if (waits[i] != 0) {
+                waits_new[num_waits_new++] = waits[i];
+            }
+        }
+    }
+
+    return OFFLOAD_OFFLOAD1(ofld, name, is_empty,
+                            num_vars, vars, vars2,
+                            num_waits_new, waits_new,
+                            signal_new);
+}
+
+extern "C" int OFFLOAD_CALL_COUNT()
+{
+    offload_call_count++;
+    return offload_call_count;
+}
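
The functions above form the host half of the compiler interface: one acquire
call followed by one offload call per construct. A hypothetical emitted
sequence, assuming an entry name and a descriptor array that the compiler
would normally generate ("__offload_entry_foo" and vars are illustrative):

    extern VarDesc vars[2];               // filled in by the compiler
    static void do_one_offload(void)
    {
        int device = 0;
        OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE1(&device, __FILE__, __LINE__);
        // With signal == NULL the descriptor is deleted on return.
        OFFLOAD_OFFLOAD1(ofld, "__offload_entry_foo", /*is_empty=*/0,
                         /*num_vars=*/2, vars, /*vars2=*/NULL,
                         /*num_waits=*/0, /*waits=*/NULL, /*signal=*/NULL);
    }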
diff --git a/final/offload/src/compiler_if_host.h b/final/offload/src/compiler_if_host.h
new file mode 100644
index 0000000..4b34c51
--- /dev/null
+++ b/final/offload/src/compiler_if_host.h
@@ -0,0 +1,133 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/*! \file
+    \brief The interface between compiler-generated host code and the runtime library.
+*/
+
+#ifndef COMPILER_IF_HOST_H_INCLUDED
+#define COMPILER_IF_HOST_H_INCLUDED
+
+#include "offload_host.h"
+
+#define OFFLOAD_TARGET_ACQUIRE          OFFLOAD_PREFIX(target_acquire)
+#define OFFLOAD_TARGET_ACQUIRE1         OFFLOAD_PREFIX(target_acquire1)
+#define OFFLOAD_OFFLOAD                 OFFLOAD_PREFIX(offload)
+#define OFFLOAD_OFFLOAD1                OFFLOAD_PREFIX(offload1)
+#define OFFLOAD_OFFLOAD2                OFFLOAD_PREFIX(offload2)
+#define OFFLOAD_CALL_COUNT              OFFLOAD_PREFIX(offload_call_count)
+
+
+/*! \fn OFFLOAD_TARGET_ACQUIRE
+    \brief Attempt to acquire the target.
+    \param target_type   The type of target.
+    \param target_number The device number.
+    \param is_optional   Whether CPU fall-back is allowed.
+    \param status        Address of variable to hold offload status.
+    \param file          Filename in which this offload occurred.
+    \param line          Line number in the file where this offload occurred.
+*/
+extern "C" OFFLOAD OFFLOAD_TARGET_ACQUIRE(
+    TARGET_TYPE      target_type,
+    int              target_number,
+    int              is_optional,
+    _Offload_status* status,
+    const char*      file,
+    uint64_t         line
+);
+
+/*! \fn OFFLOAD_TARGET_ACQUIRE1
+    \brief Acquire the target for offload (OpenMP).
+    \param device_number Device number or null if not specified.
+    \param file          Filename in which this offload occurred.
+    \param line          Line number in the file where this offload occurred.
+*/
+extern "C" OFFLOAD OFFLOAD_TARGET_ACQUIRE1(
+    const int*      device_number,
+    const char*     file,
+    uint64_t        line
+);
+
+/*! \fn OFFLOAD_OFFLOAD1
+    \brief Run a function on the target using the interface for old data persistence.
+    \param o Offload descriptor created by OFFLOAD_TARGET_ACQUIRE.
+    \param name Name of offload entry point.
+    \param is_empty Nonzero if there is no code to execute (e.g., offload_transfer).
+    \param num_vars Number of variable descriptors.
+    \param vars Pointer to VarDesc array.
+    \param vars2 Pointer to VarDesc2 array.
+    \param num_waits Number of "wait" values.
+    \param waits Pointer to array of wait values.
+    \param signal Pointer to signal value or NULL.
+*/
+extern "C" int OFFLOAD_OFFLOAD1(
+    OFFLOAD o,
+    const char *name,
+    int is_empty,
+    int num_vars,
+    VarDesc *vars,
+    VarDesc2 *vars2,
+    int num_waits,
+    const void** waits,
+    const void** signal
+);
+
+/*! \fn OFFLOAD_OFFLOAD2
+    \brief Run a function on the target using the interface for new data persistence.
+    \param o Offload descriptor created by OFFLOAD_TARGET_ACQUIRE.
+    \param name Name of offload entry point.
+    \param is_empty Nonzero if there is no code to execute (e.g., offload_transfer).
+    \param num_vars Number of variable descriptors.
+    \param vars Pointer to VarDesc array.
+    \param vars2 Pointer to VarDesc2 array.
+    \param num_waits Number of "wait" values.
+    \param waits Pointer to array of wait values.
+    \param signal Pointer to signal value or NULL.
+    \param entry_id A signature for the function doing the offload.
+    \param stack_addr The stack frame address of the function doing the offload.
+*/
+extern "C" int OFFLOAD_OFFLOAD2(
+    OFFLOAD o,
+    const char *name,
+    int is_empty,
+    int num_vars,
+    VarDesc *vars,
+    VarDesc2 *vars2,
+    int num_waits,
+    const void** waits,
+    const void** signal,
+    int entry_id,
+    const void *stack_addr
+);
+
+// Run function on target (obsolete).
+// @param o    OFFLOAD object
+// @param name function name
+extern "C" int OFFLOAD_OFFLOAD(
+    OFFLOAD o,
+    const char *name,
+    int is_empty,
+    int num_vars,
+    VarDesc *vars,
+    VarDesc2 *vars2,
+    int num_waits,
+    const void** waits,
+    const void* signal,
+    int entry_id = 0,
+    const void *stack_addr = NULL
+);
+
+// Global counter on the host.
+// This variable is used if P2OPT_offload_do_data_persistence == 2.
+// It identifies the offload constructs contained in one procedure.
+// A call to OFFLOAD_CALL_COUNT() is inserted on the host at the entry of the
+// routine.
+extern "C" int  OFFLOAD_CALL_COUNT();
+
+#endif // COMPILER_IF_HOST_H_INCLUDED
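
OFFLOAD_TARGET_ACQUIRE returns NULL instead of aborting when the offload is
optional and a status block is supplied, which is how the CPU fall-back path
is selected. A hedged sketch of that path (run_on_host is an illustrative
placeholder for the host version of the offloaded code):

    static void maybe_offload(void)
    {
        _Offload_status st;
        OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(TARGET_MIC,
                                              /*target_number=*/-1,
                                              /*is_optional=*/1, &st,
                                              __FILE__, __LINE__);
        if (ofld == NULL) {
            run_on_host();    // st.result records why the device was skipped
        }
    }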
diff --git a/final/offload/src/compiler_if_target.cpp b/final/offload/src/compiler_if_target.cpp
new file mode 100644
index 0000000..1af82b8
--- /dev/null
+++ b/final/offload/src/compiler_if_target.cpp
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "compiler_if_target.h"
+
+extern "C" void OFFLOAD_TARGET_ENTER(
+    OFFLOAD ofld,
+    int vars_total,
+    VarDesc *vars,
+    VarDesc2 *vars2
+)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p, %d, %p, %p)\n", __func__, ofld,
+                        vars_total, vars, vars2);
+    ofld->merge_var_descs(vars, vars2, vars_total);
+    ofld->scatter_copyin_data();
+}
+
+extern "C" void OFFLOAD_TARGET_LEAVE(
+    OFFLOAD ofld
+)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, ofld);
+    ofld->gather_copyout_data();
+}
+
+extern "C" void OFFLOAD_TARGET_MAIN(void)
+{
+    // initialize target part
+    __offload_target_init();
+
+    // pass control to COI
+    PipelineStartExecutingRunFunctions();
+    ProcessWaitForShutdown();
+
+    OFFLOAD_DEBUG_TRACE(2, "Exiting main...\n");
+}
diff --git a/final/offload/src/compiler_if_target.h b/final/offload/src/compiler_if_target.h
new file mode 100644
index 0000000..49d2c1c
--- /dev/null
+++ b/final/offload/src/compiler_if_target.h
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/*! \file
+    \brief The interface between compiler-generated target code and the runtime library.
+*/
+
+#ifndef COMPILER_IF_TARGET_H_INCLUDED
+#define COMPILER_IF_TARGET_H_INCLUDED
+
+#include "offload_target.h"
+
+#define OFFLOAD_TARGET_ENTER            OFFLOAD_PREFIX(target_enter)
+#define OFFLOAD_TARGET_LEAVE            OFFLOAD_PREFIX(target_leave)
+#define OFFLOAD_TARGET_MAIN             OFFLOAD_PREFIX(target_main)
+
+/*! \fn OFFLOAD_TARGET_ENTER
+    \brief Fill in variable addresses using the VarDesc array, then call
+           back the runtime library to fetch data.
+    \param ofld         Offload descriptor created by runtime.
+    \param var_desc_num Number of variable descriptors.
+    \param var_desc     Pointer to VarDesc array.
+    \param var_desc2    Pointer to VarDesc2 array.
+*/
+extern "C" void OFFLOAD_TARGET_ENTER(
+    OFFLOAD ofld,
+    int var_desc_num,
+    VarDesc *var_desc,
+    VarDesc2 *var_desc2
+);
+
+/*! \fn OFFLOAD_TARGET_LEAVE
+    \brief Call back the runtime library to gather outputs using VarDesc array.
+    \param ofld Offload descriptor created by OFFLOAD_TARGET_ACQUIRE.
+*/
+extern "C" void OFFLOAD_TARGET_LEAVE(
+    OFFLOAD ofld
+);
+
+// Entry point for the target application.
+extern "C" void OFFLOAD_TARGET_MAIN(void);
+
+#endif // COMPILER_IF_TARGET_H_INCLUDED
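
On the target side the compiler emits one entry function per offload
construct, bracketing the user code with OFFLOAD_TARGET_ENTER and
OFFLOAD_TARGET_LEAVE. A hypothetical shape of such an entry (the name and
body are illustrative; real entries are generated and found via the lookup
table):

    extern "C" void __offload_entry_foo(OFFLOAD ofld, int vars_total,
                                        VarDesc *vars, VarDesc2 *vars2)
    {
        OFFLOAD_TARGET_ENTER(ofld, vars_total, vars, vars2);  // copy-in
        // ... body of the offloaded region runs here ...
        OFFLOAD_TARGET_LEAVE(ofld);                           // copy-out
    }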
diff --git a/final/offload/src/dv_util.cpp b/final/offload/src/dv_util.cpp
new file mode 100644
index 0000000..4ad7271
--- /dev/null
+++ b/final/offload/src/dv_util.cpp
@@ -0,0 +1,131 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_common.h"
+
+bool __dv_is_contiguous(const ArrDesc *dvp)
+{
+    if (dvp->Flags & ArrDescFlagsContiguous) {
+        return true;
+    }
+
+    if (dvp->Rank != 0) {
+        if (dvp->Dim[0].Mult != dvp->Len) {
+            return false;
+        }
+        for (int i = 1; i < dvp->Rank; i++) {
+            if (dvp->Dim[i].Mult !=
+                dvp->Dim[i-1].Extent * dvp->Dim[i-1].Mult) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+bool __dv_is_allocated(const ArrDesc *dvp)
+{
+    return (dvp->Flags & ArrDescFlagsDefined);
+}
+
+uint64_t __dv_data_length(const ArrDesc *dvp)
+{
+    uint64_t size;
+
+    if (dvp->Rank == 0) {
+        size = dvp->Len;
+        return size;
+    }
+
+    size = dvp->Len;
+    for (int i = 0; i < dvp->Rank; ++i) {
+        size += (dvp->Dim[i].Extent-1) * dvp->Dim[i].Mult;
+    }
+    return size;
+}
+
+uint64_t __dv_data_length(const ArrDesc *dvp, int64_t count)
+{
+    if (dvp->Rank == 0) {
+        return count;
+    }
+
+    return count * dvp->Dim[0].Mult;
+}
+
+// Create CeanReadRanges data for reading contiguous ranges of a
+// noncontiguous array defined by the argument.
+CeanReadRanges * init_read_ranges_dv(const ArrDesc *dvp)
+{
+    int64_t         len;
+    int             count;
+    int             rank = dvp->Rank;
+    CeanReadRanges *res = NULL;
+
+    if (rank != 0) {
+        int i = 0;
+        len = dvp->Len;
+        if (dvp->Dim[0].Mult == len) {
+            for (i = 1; i < rank; i++) {
+                len *= dvp->Dim[i-1].Extent;
+                if (dvp->Dim[i].Mult != len) {
+                    break;
+                }
+            }
+        }
+        res = (CeanReadRanges *)malloc(
+            sizeof(CeanReadRanges) + (rank - i) * sizeof(CeanReadDim));
+        res->last_noncont_ind = rank - i - 1;
+        count = 1;
+        for (; i < rank; i++) {
+            res->Dim[rank - i - 1].count = count;
+            res->Dim[rank - i - 1].size = dvp->Dim[i].Mult;
+            count *= dvp->Dim[i].Extent;
+        }
+        res->range_max_number = count;
+        res->range_size = len;
+        res->ptr = (void*)dvp->Base;
+        res->current_number = 0;
+        res->init_offset = 0;
+    }
+    return res;
+}
+
+#if OFFLOAD_DEBUG > 0
+void __dv_desc_dump(const char *name, const ArrDesc *dvp)
+{
+    OFFLOAD_TRACE(3, "%s DV %p\n", name, dvp);
+
+    if (dvp != 0) {
+        OFFLOAD_TRACE(3,
+                      "    dv->Base   = 0x%lx\n"
+                      "    dv->Len    = 0x%lx\n"
+                      "    dv->Offset = 0x%lx\n"
+                      "    dv->Flags  = 0x%lx\n"
+                      "    dv->Rank   = 0x%lx\n"
+                      "    dv->Resrvd = 0x%lx\n",
+                      dvp->Base,
+                      dvp->Len,
+                      dvp->Offset,
+                      dvp->Flags,
+                      dvp->Rank,
+                      dvp->Reserved);
+
+        for (int i = 0 ; i < dvp->Rank; i++) {
+            OFFLOAD_TRACE(3,
+                          "    (%d) Extent=%ld, Multiplier=%ld, LowerBound=%ld\n",
+                          i,
+                          dvp->Dim[i].Extent,
+                          dvp->Dim[i].Mult,
+                          dvp->Dim[i].LowerBound);
+        }
+    }
+}
+#endif // OFFLOAD_DEBUG > 0
diff --git a/final/offload/src/dv_util.h b/final/offload/src/dv_util.h
new file mode 100644
index 0000000..fdfa77d
--- /dev/null
+++ b/final/offload/src/dv_util.h
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef DV_UTIL_H_INCLUDED
+#define DV_UTIL_H_INCLUDED
+
+#include <stdint.h>
+
+// Dope vector declarations
+#define ArrDescMaxArrayRank         31
+
+// Dope vector flags
+#define ArrDescFlagsDefined         1
+#define ArrDescFlagsNodealloc       2
+#define ArrDescFlagsContiguous      4
+
+typedef int64_t dv_size;
+
+typedef struct DimDesc {
+    dv_size        Extent;      // Number of elements in this dimension
+    dv_size        Mult;        // Multiplier for this dimension.
+                                // The number of bytes between successive
+                                // elements in this dimension.
+    dv_size        LowerBound;  // LowerBound of this dimension
+} DimDesc;
+
+typedef struct ArrDesc {
+    dv_size        Base;        // Base address
+    dv_size        Len;         // Length of data type, used only for
+                                // character strings.
+    dv_size        Offset;
+    dv_size        Flags;       // Flags
+    dv_size        Rank;        // Rank of pointer
+    dv_size        Reserved;    // reserved for OpenMP requests
+    DimDesc Dim[ArrDescMaxArrayRank];
+} ArrDesc;
+
+typedef ArrDesc* pArrDesc;
+
+bool __dv_is_contiguous(const ArrDesc *dvp);
+
+bool __dv_is_allocated(const ArrDesc *dvp);
+
+uint64_t __dv_data_length(const ArrDesc *dvp);
+
+uint64_t __dv_data_length(const ArrDesc *dvp, int64_t nelems);
+
+CeanReadRanges * init_read_ranges_dv(const ArrDesc *dvp);
+
+#if OFFLOAD_DEBUG > 0
+void    __dv_desc_dump(const char *name, const ArrDesc *dvp);
+#else  // OFFLOAD_DEBUG > 0
+#define __dv_desc_dump(name, dvp)
+#endif // OFFLOAD_DEBUG > 0
+
+#endif // DV_UTIL_H_INCLUDED
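
The contiguity rule in __dv_is_contiguous() is easiest to see with numbers.
A sketch using a made-up dope vector for a 10 x 5 array of 8-byte elements
(all values illustrative; Flags is left zero so the arithmetic check runs):

    ArrDesc dv = {};
    dv.Len  = 8;                                  // element size in bytes
    dv.Rank = 2;
    dv.Dim[0].Extent = 10; dv.Dim[0].Mult = 8;    // Mult == Len: dense
    dv.Dim[1].Extent = 5;  dv.Dim[1].Mult = 80;   // 10 * 8: still dense
    bool dense = __dv_is_contiguous(&dv);         // true
    dv.Dim[0].Mult = 16;                          // stride-2 slice
    dense = __dv_is_contiguous(&dv);              // false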
diff --git a/final/offload/src/liboffload_error.c b/final/offload/src/liboffload_error.c
new file mode 100644
index 0000000..fc15f8b
--- /dev/null
+++ b/final/offload/src/liboffload_error.c
@@ -0,0 +1,452 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include <stdio.h>
+#include <stdarg.h>
+#ifndef va_copy
+#define va_copy(dst, src) ((dst) = (src))
+#endif
+
+#include "liboffload_msg.h"
+
+#include "liboffload_error_codes.h"
+
+/********************************************************/
+/* error-handling function: __liboffload_error_support  */
+/********************************************************/
+
+void __liboffload_error_support(error_types input_tag, ...)
+{
+    va_list args;
+    va_start(args, input_tag);
+
+    switch (input_tag) {
+        case c_device_is_not_available:
+            write_message(stderr, msg_c_device_is_not_available, args);
+            break;
+        case c_invalid_device_number:
+            write_message(stderr, msg_c_invalid_device_number, args);
+            break;
+        case c_send_func_ptr:
+            write_message(stderr, msg_c_send_func_ptr, args);
+            break;
+        case c_receive_func_ptr:
+            write_message(stderr, msg_c_receive_func_ptr, args);
+            break;
+        case c_offload_malloc:
+            write_message(stderr, msg_c_offload_malloc, args);
+            break;
+        case c_offload1:
+            write_message(stderr, msg_c_offload1, args);
+            break;
+        case c_unknown_var_type:
+            write_message(stderr, msg_c_unknown_var_type, args);
+            break;
+        case c_invalid_env_var_value:
+            write_message(stderr, msg_c_invalid_env_var_value, args);
+            break;
+        case c_invalid_env_var_int_value:
+            write_message(stderr, msg_c_invalid_env_var_int_value, args);
+            break;
+        case c_invalid_env_report_value:
+            write_message(stderr, msg_c_invalid_env_report_value, args);
+            break;
+        case c_offload_signaled1:
+            write_message(stderr, msg_c_offload_signaled1, args);
+            break;
+        case c_offload_signaled2:
+            write_message(stderr, msg_c_offload_signaled2, args);
+            break;
+        case c_myowrapper_checkresult:
+            write_message(stderr, msg_c_myowrapper_checkresult, args);
+            break;
+        case c_myotarget_checkresult:
+            write_message(stderr, msg_c_myotarget_checkresult, args);
+            break;
+        case c_offload_descriptor_offload:
+            write_message(stderr, msg_c_offload_descriptor_offload, args);
+            break;
+        case c_merge_var_descs1:
+            write_message(stderr, msg_c_merge_var_descs1, args);
+            break;
+        case c_merge_var_descs2:
+            write_message(stderr, msg_c_merge_var_descs2, args);
+            break;
+        case c_mic_parse_env_var_list1:
+            write_message(stderr, msg_c_mic_parse_env_var_list1, args);
+            break;
+        case c_mic_parse_env_var_list2:
+            write_message(stderr, msg_c_mic_parse_env_var_list2, args);
+            break;
+        case c_mic_process_exit_ret:
+            write_message(stderr, msg_c_mic_process_exit_ret, args);
+            break;
+        case c_mic_process_exit_sig:
+            write_message(stderr, msg_c_mic_process_exit_sig, args);
+            break;
+        case c_mic_process_exit:
+            write_message(stderr, msg_c_mic_process_exit, args);
+            break;
+        case c_mic_init3:
+            write_message(stderr, msg_c_mic_init3, args);
+            break;
+        case c_mic_init4:
+            write_message(stderr, msg_c_mic_init4, args);
+            break;
+        case c_mic_init5:
+            write_message(stderr, msg_c_mic_init5, args);
+            break;
+        case c_mic_init6:
+            write_message(stderr, msg_c_mic_init6, args);
+            break;
+        case c_no_static_var_data:
+            write_message(stderr, msg_c_no_static_var_data, args);
+            break;
+        case c_no_ptr_data:
+            write_message(stderr, msg_c_no_ptr_data, args);
+            break;
+        case c_get_engine_handle:
+            write_message(stderr, msg_c_get_engine_handle, args);
+            break;
+        case c_get_engine_index:
+            write_message(stderr, msg_c_get_engine_index, args);
+            break;
+        case c_process_create:
+            write_message(stderr, msg_c_process_create, args);
+            break;
+        case c_process_wait_shutdown:
+            write_message(stderr, msg_c_process_wait_shutdown, args);
+            break;
+        case c_process_proxy_flush:
+            write_message(stderr, msg_c_process_proxy_flush, args);
+            break;
+        case c_process_get_func_handles:
+            write_message(stderr, msg_c_process_get_func_handles, args);
+            break;
+        case c_load_library:
+            write_message(stderr, msg_c_load_library, args);
+            break;
+        case c_coipipe_max_number:
+            write_message(stderr, msg_c_coi_pipeline_max_number, args);
+            break;
+        case c_pipeline_create:
+            write_message(stderr, msg_c_pipeline_create, args);
+            break;
+        case c_pipeline_run_func:
+            write_message(stderr, msg_c_pipeline_run_func, args);
+            break;
+        case c_pipeline_start_run_funcs:
+            write_message(stderr, msg_c_pipeline_start_run_funcs, args);
+            break;
+        case c_buf_create:
+            write_message(stderr, msg_c_buf_create, args);
+            break;
+        case c_buf_create_out_of_mem:
+            write_message(stderr, msg_c_buf_create_out_of_mem, args);
+            break;
+        case c_buf_create_from_mem:
+            write_message(stderr, msg_c_buf_create_from_mem, args);
+            break;
+        case c_buf_destroy:
+            write_message(stderr, msg_c_buf_destroy, args);
+            break;
+        case c_buf_map:
+            write_message(stderr, msg_c_buf_map, args);
+            break;
+        case c_buf_unmap:
+            write_message(stderr, msg_c_buf_unmap, args);
+            break;
+        case c_buf_read:
+            write_message(stderr, msg_c_buf_read, args);
+            break;
+        case c_buf_write:
+            write_message(stderr, msg_c_buf_write, args);
+            break;
+        case c_buf_copy:
+            write_message(stderr, msg_c_buf_copy, args);
+            break;
+        case c_buf_get_address:
+            write_message(stderr, msg_c_buf_get_address, args);
+            break;
+        case c_buf_add_ref:
+            write_message(stderr, msg_c_buf_add_ref, args);
+            break;
+        case c_buf_release_ref:
+            write_message(stderr, msg_c_buf_release_ref, args);
+            break;
+        case c_buf_set_state:
+            write_message(stderr, msg_c_buf_set_state, args);
+            break;
+        case c_event_wait:
+            write_message(stderr, msg_c_event_wait, args);
+            break;
+        case c_zero_or_neg_ptr_len:
+            write_message(stderr, msg_c_zero_or_neg_ptr_len, args);
+            break;
+        case c_zero_or_neg_transfer_size:
+            write_message(stderr, msg_c_zero_or_neg_transfer_size, args);
+            break;
+        case c_bad_ptr_mem_range:
+            write_message(stderr, msg_c_bad_ptr_mem_range, args);
+            break;
+        case c_different_src_and_dstn_sizes:
+            write_message(stderr, msg_c_different_src_and_dstn_sizes, args);
+            break;
+        case c_ranges_dont_match:
+            write_message(stderr, msg_c_ranges_dont_match, args);
+            break;
+        case c_destination_is_over:
+            write_message(stderr, msg_c_destination_is_over, args);
+            break;
+        case c_slice_of_noncont_array:
+            write_message(stderr, msg_c_slice_of_noncont_array, args);
+            break;
+        case c_non_contiguous_dope_vector:
+            write_message(stderr, msg_c_non_contiguous_dope_vector, args);
+            break;
+        case c_pointer_array_mismatch:
+            write_message(stderr, msg_c_pointer_array_mismatch, args);
+            break;
+        case c_omp_invalid_device_num_env:
+            write_message(stderr, msg_c_omp_invalid_device_num_env, args);
+            break;
+        case c_omp_invalid_device_num:
+            write_message(stderr, msg_c_omp_invalid_device_num, args);
+            break;
+        case c_unknown_binary_type:
+            write_message(stderr, msg_c_unknown_binary_type, args);
+            break;
+        case c_multiple_target_exes:
+            write_message(stderr, msg_c_multiple_target_exes, args);
+            break;
+        case c_no_target_exe:
+            write_message(stderr, msg_c_no_target_exe, args);
+            break;
+        case c_report_unknown_timer_node:
+            write_message(stderr, msg_c_report_unknown_timer_node, args);
+            break;
+        case c_report_unknown_trace_node:
+            write_message(stderr, msg_c_report_unknown_trace_node, args);
+            break;
+    }
+    va_end(args);
+}
+
+char const * report_get_message_str(error_types input_tag)
+{
+    switch (input_tag) {
+        case c_report_title:
+            return (offload_get_message_str(msg_c_report_title));
+        case c_report_from_file:
+            return (offload_get_message_str(msg_c_report_from_file));
+        case c_report_offload:
+            return (offload_get_message_str(msg_c_report_offload));
+        case c_report_mic:
+            return (offload_get_message_str(msg_c_report_mic));
+        case c_report_file:
+            return (offload_get_message_str(msg_c_report_file));
+        case c_report_line:
+            return (offload_get_message_str(msg_c_report_line));
+        case c_report_host:
+            return (offload_get_message_str(msg_c_report_host));
+        case c_report_tag:
+            return (offload_get_message_str(msg_c_report_tag));
+        case c_report_cpu_time:
+            return (offload_get_message_str(msg_c_report_cpu_time));
+        case c_report_seconds:
+            return (offload_get_message_str(msg_c_report_seconds));
+        case c_report_cpu_to_mic_data:
+            return (offload_get_message_str(msg_c_report_cpu_to_mic_data));
+        case c_report_bytes:
+            return (offload_get_message_str(msg_c_report_bytes));
+        case c_report_mic_time:
+            return (offload_get_message_str(msg_c_report_mic_time));
+        case c_report_mic_to_cpu_data:
+            return (offload_get_message_str(msg_c_report_mic_to_cpu_data));
+        case c_report_compute:
+            return (offload_get_message_str(msg_c_report_compute));
+        case c_report_copyin_data:
+            return (offload_get_message_str(msg_c_report_copyin_data));
+        case c_report_copyout_data:
+            return (offload_get_message_str(msg_c_report_copyout_data));
+        case c_report_create_buf_host:
+            return (offload_get_message_str(msg_c_report_create_buf_host));
+        case c_report_create_buf_mic:
+            return (offload_get_message_str(msg_c_report_create_buf_mic));
+        case c_report_destroy:
+            return (offload_get_message_str(msg_c_report_destroy));
+        case c_report_gather_copyin_data:
+            return (offload_get_message_str(msg_c_report_gather_copyin_data));
+        case c_report_gather_copyout_data:
+            return (offload_get_message_str(msg_c_report_gather_copyout_data));
+        case c_report_state_signal:
+            return (offload_get_message_str(msg_c_report_state_signal));
+        case c_report_signal:
+            return (offload_get_message_str(msg_c_report_signal));
+        case c_report_wait:
+            return (offload_get_message_str(msg_c_report_wait));
+        case c_report_init:
+            return (offload_get_message_str(msg_c_report_init));
+        case c_report_init_func:
+            return (offload_get_message_str(msg_c_report_init_func));
+        case c_report_logical_card:
+            return (offload_get_message_str(msg_c_report_logical_card));
+        case c_report_mic_myo_fptr:
+            return (offload_get_message_str(msg_c_report_mic_myo_fptr));
+        case c_report_mic_myo_shared:
+            return (offload_get_message_str(msg_c_report_mic_myo_shared));
+        case c_report_myoacquire:
+            return (offload_get_message_str(msg_c_report_myoacquire));
+        case c_report_myofini:
+            return (offload_get_message_str(msg_c_report_myofini));
+        case c_report_myoinit:
+            return (offload_get_message_str(msg_c_report_myoinit));
+        case c_report_myoregister:
+            return (offload_get_message_str(msg_c_report_myoregister));
+        case c_report_myorelease:
+            return (offload_get_message_str(msg_c_report_myorelease));
+        case c_report_myosharedalignedfree:
+            return (
+                offload_get_message_str(msg_c_report_myosharedalignedfree));
+        case c_report_myosharedalignedmalloc:
+            return (
+                offload_get_message_str(msg_c_report_myosharedalignedmalloc));
+        case c_report_myosharedfree:
+            return (offload_get_message_str(msg_c_report_myosharedfree));
+        case c_report_myosharedmalloc:
+            return (offload_get_message_str(msg_c_report_myosharedmalloc));
+        case c_report_physical_card:
+            return (offload_get_message_str(msg_c_report_physical_card));
+        case c_report_receive_pointer_data:
+            return (
+                offload_get_message_str(msg_c_report_receive_pointer_data));
+        case c_report_received_pointer_data:
+            return (
+                offload_get_message_str(msg_c_report_received_pointer_data));
+        case c_report_register:
+            return (offload_get_message_str(msg_c_report_register));
+        case c_report_scatter_copyin_data:
+            return (offload_get_message_str(msg_c_report_scatter_copyin_data));
+        case c_report_scatter_copyout_data:
+            return (
+                offload_get_message_str(msg_c_report_scatter_copyout_data));
+        case c_report_send_pointer_data:
+            return (offload_get_message_str(msg_c_report_send_pointer_data));
+        case c_report_sent_pointer_data:
+            return (offload_get_message_str(msg_c_report_sent_pointer_data));
+        case c_report_start:
+            return (offload_get_message_str(msg_c_report_start));
+        case c_report_start_target_func:
+            return (offload_get_message_str(msg_c_report_start_target_func));
+        case c_report_state:
+            return (offload_get_message_str(msg_c_report_state));
+        case c_report_unregister:
+            return (offload_get_message_str(msg_c_report_unregister));
+        case c_report_var:
+            return (offload_get_message_str(msg_c_report_var));
+
+        default:
+            LIBOFFLOAD_ERROR(c_report_unknown_trace_node);
+            abort();
+    }
+}
+
+char const * report_get_host_stage_str(int i)
+{
+    switch (i) {
+        case c_offload_host_total_offload:
+            return (
+               offload_get_message_str(msg_c_report_host_total_offload_time));
+        case c_offload_host_initialize:
+            return (offload_get_message_str(msg_c_report_host_initialize));
+        case c_offload_host_target_acquire:
+            return (
+                offload_get_message_str(msg_c_report_host_target_acquire));
+        case c_offload_host_wait_deps:
+            return (offload_get_message_str(msg_c_report_host_wait_deps));
+        case c_offload_host_setup_buffers:
+            return (offload_get_message_str(msg_c_report_host_setup_buffers));
+        case c_offload_host_alloc_buffers:
+            return (offload_get_message_str(msg_c_report_host_alloc_buffers));
+        case c_offload_host_setup_misc_data:
+            return (
+                offload_get_message_str(msg_c_report_host_setup_misc_data));
+        case c_offload_host_alloc_data_buffer:
+            return (
+                offload_get_message_str(msg_c_report_host_alloc_data_buffer));
+        case c_offload_host_send_pointers:
+            return (offload_get_message_str(msg_c_report_host_send_pointers));
+        case c_offload_host_gather_inputs:
+            return (offload_get_message_str(msg_c_report_host_gather_inputs));
+        case c_offload_host_map_in_data_buffer:
+            return (
+                offload_get_message_str(msg_c_report_host_map_in_data_buffer));
+        case c_offload_host_unmap_in_data_buffer:
+            return (offload_get_message_str(
+                msg_c_report_host_unmap_in_data_buffer));
+        case c_offload_host_start_compute:
+            return (offload_get_message_str(msg_c_report_host_start_compute));
+        case c_offload_host_wait_compute:
+            return (offload_get_message_str(msg_c_report_host_wait_compute));
+        case c_offload_host_start_buffers_reads:
+            return (offload_get_message_str(
+                msg_c_report_host_start_buffers_reads));
+        case c_offload_host_scatter_outputs:
+            return (
+                offload_get_message_str(msg_c_report_host_scatter_outputs));
+        case c_offload_host_map_out_data_buffer:
+            return (offload_get_message_str(
+                msg_c_report_host_map_out_data_buffer));
+        case c_offload_host_unmap_out_data_buffer:
+            return (offload_get_message_str(
+                msg_c_report_host_unmap_out_data_buffer));
+        case c_offload_host_wait_buffers_reads:
+            return (
+                offload_get_message_str(msg_c_report_host_wait_buffers_reads));
+        case c_offload_host_destroy_buffers:
+            return (
+                offload_get_message_str(msg_c_report_host_destroy_buffers));
+        default:
+            LIBOFFLOAD_ERROR(c_report_unknown_timer_node);
+            abort();
+    }
+}
+
+char const * report_get_target_stage_str(int i)
+{
+    switch (i) {
+        case c_offload_target_total_time:
+            return (offload_get_message_str(msg_c_report_target_total_time));
+        case c_offload_target_descriptor_setup:
+            return (
+                offload_get_message_str(msg_c_report_target_descriptor_setup));
+        case c_offload_target_func_lookup:
+            return (offload_get_message_str(msg_c_report_target_func_lookup));
+        case c_offload_target_func_time:
+            return (offload_get_message_str(msg_c_report_target_func_time));
+        case c_offload_target_scatter_inputs:
+            return (
+                offload_get_message_str(msg_c_report_target_scatter_inputs));
+        case c_offload_target_add_buffer_refs:
+            return (
+                offload_get_message_str(msg_c_report_target_add_buffer_refs));
+        case c_offload_target_compute:
+            return (offload_get_message_str(msg_c_report_target_compute));
+        case c_offload_target_gather_outputs:
+            return (offload_get_message_str
+                (msg_c_report_target_gather_outputs));
+        case c_offload_target_release_buffer_refs:
+            return (offload_get_message_str(
+                msg_c_report_target_release_buffer_refs));
+        default:
+            LIBOFFLOAD_ERROR(c_report_unknown_timer_node);
+            abort();
+    }
+}
diff --git a/final/offload/src/liboffload_error_codes.h b/final/offload/src/liboffload_error_codes.h
new file mode 100644
index 0000000..982167b
--- /dev/null
+++ b/final/offload/src/liboffload_error_codes.h
@@ -0,0 +1,276 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#if !defined(LIBOFFLOAD_ERROR_CODES_H)
+#define LIBOFFLOAD_ERROR_CODES_H
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+typedef enum
+{
+    c_device_is_not_available = 0,
+    c_invalid_device_number,
+    c_offload1,
+    c_unknown_var_type,
+    c_send_func_ptr,
+    c_receive_func_ptr,
+    c_offload_malloc,
+    c_invalid_env_var_value,
+    c_invalid_env_var_int_value,
+    c_invalid_env_report_value,
+    c_offload_signaled1,
+    c_offload_signaled2,
+    c_myotarget_checkresult,
+    c_myowrapper_checkresult,
+    c_offload_descriptor_offload,
+    c_merge_var_descs1,
+    c_merge_var_descs2,
+    c_mic_parse_env_var_list1,
+    c_mic_parse_env_var_list2,
+    c_mic_process_exit_ret,
+    c_mic_process_exit_sig,
+    c_mic_process_exit,
+    c_mic_init3,
+    c_mic_init4,
+    c_mic_init5,
+    c_mic_init6,
+    c_no_static_var_data,
+    c_no_ptr_data,
+    c_get_engine_handle,
+    c_get_engine_index,
+    c_process_create,
+    c_process_get_func_handles,
+    c_process_wait_shutdown,
+    c_process_proxy_flush,
+    c_load_library,
+    c_pipeline_create,
+    c_pipeline_run_func,
+    c_pipeline_start_run_funcs,
+    c_buf_create,
+    c_buf_create_out_of_mem,
+    c_buf_create_from_mem,
+    c_buf_destroy,
+    c_buf_map,
+    c_buf_unmap,
+    c_buf_read,
+    c_buf_write,
+    c_buf_copy,
+    c_buf_get_address,
+    c_buf_add_ref,
+    c_buf_release_ref,
+    c_buf_set_state,
+    c_event_wait,
+    c_zero_or_neg_ptr_len,
+    c_zero_or_neg_transfer_size,
+    c_bad_ptr_mem_range,
+    c_different_src_and_dstn_sizes,
+    c_ranges_dont_match,
+    c_destination_is_over,
+    c_slice_of_noncont_array,
+    c_non_contiguous_dope_vector,
+    c_pointer_array_mismatch,
+    c_omp_invalid_device_num_env,
+    c_omp_invalid_device_num,
+    c_unknown_binary_type,
+    c_multiple_target_exes,
+    c_no_target_exe,
+    c_report_host,
+    c_report_target,
+    c_report_title,
+    c_report_from_file,
+    c_report_file,
+    c_report_line,
+    c_report_tag,
+    c_report_seconds,
+    c_report_bytes,
+    c_report_mic,
+    c_report_cpu_time,
+    c_report_cpu_to_mic_data,
+    c_report_mic_time,
+    c_report_mic_to_cpu_data,
+    c_report_unknown_timer_node,
+    c_report_unknown_trace_node,
+    c_report_offload,
+    c_report_w_tag,
+    c_report_state,
+    c_report_start,
+    c_report_init,
+    c_report_logical_card,
+    c_report_physical_card,
+    c_report_register,
+    c_report_init_func,
+    c_report_create_buf_host,
+    c_report_create_buf_mic,
+    c_report_send_pointer_data,
+    c_report_sent_pointer_data,
+    c_report_gather_copyin_data,
+    c_report_copyin_data,
+    c_report_state_signal,
+    c_report_signal,
+    c_report_wait,
+    c_report_compute,
+    c_report_receive_pointer_data,
+    c_report_received_pointer_data,
+    c_report_start_target_func,
+    c_report_var,
+    c_report_scatter_copyin_data,
+    c_report_gather_copyout_data,
+    c_report_scatter_copyout_data,
+    c_report_copyout_data,
+    c_report_unregister,
+    c_report_destroy,
+    c_report_myoinit,
+    c_report_myoregister,
+    c_report_myofini,
+    c_report_mic_myo_shared,
+    c_report_mic_myo_fptr,
+    c_report_myosharedmalloc,
+    c_report_myosharedfree,
+    c_report_myosharedalignedmalloc,
+    c_report_myosharedalignedfree,
+    c_report_myoacquire,
+    c_report_myorelease,
+    c_coipipe_max_number
+} error_types;
+
+enum OffloadHostPhase {
+    // Total time on host for entire offload
+    c_offload_host_total_offload = 0,
+
+    // Time to load target binary
+    c_offload_host_initialize,
+
+    // Time to acquire lrb availability dynamically
+    c_offload_host_target_acquire,
+
+    // Time to wait for dependencies
+    c_offload_host_wait_deps,
+
+    // Time to allocate pointer buffers, initiate writes for pointers
+    // and calculate size of copyin/copyout buffer
+    c_offload_host_setup_buffers,
+
+    // Time to allocate pointer buffers
+    c_offload_host_alloc_buffers,
+
+    // Time to initialize misc data
+    c_offload_host_setup_misc_data,
+
+    // Time to allocate copyin/copyout buffer
+    c_offload_host_alloc_data_buffer,
+
+    // Time to initiate writes from host pointers to buffers
+    c_offload_host_send_pointers,
+
+    // Time to gather IN data of the offload into the buffer
+    c_offload_host_gather_inputs,
+
+    // Time to map buffer
+    c_offload_host_map_in_data_buffer,
+
+    // Time to unmap buffer
+    c_offload_host_unmap_in_data_buffer,
+
+    // Time to start remote function call that does computation on lrb
+    c_offload_host_start_compute,
+
+    // Time to wait for compute to finish
+    c_offload_host_wait_compute,
+
+    // Time to initiate reads from pointer buffers
+    c_offload_host_start_buffers_reads,
+
+    // Time to update host variables with OUT data from the buffer
+    c_offload_host_scatter_outputs,
+
+    // Time to map buffer
+    c_offload_host_map_out_data_buffer,
+
+    // Time to unmap buffer
+    c_offload_host_unmap_out_data_buffer,
+
+    // Time to wait for reads from buffers to finish
+    c_offload_host_wait_buffers_reads,
+
+    // Time to destroy buffers that are no longer needed
+    c_offload_host_destroy_buffers,
+
+    // LAST TIME MONITOR
+    c_offload_host_max_phase
+};
+
+enum OffloadTargetPhase {
+    // Total time spent on the target
+    c_offload_target_total_time = 0,
+
+    // Time to initialize offload descriptor
+    c_offload_target_descriptor_setup,
+
+    // Time to find target entry point in lookup table
+    c_offload_target_func_lookup,
+
+    // Total time spent executing the offload entry
+    c_offload_target_func_time,
+
+    // Time to initialize target variables with IN values from buffer
+    c_offload_target_scatter_inputs,
+
+    // Time to add buffer reference for pointer buffers
+    c_offload_target_add_buffer_refs,
+
+    // Total time on lrb for computation
+    c_offload_target_compute,
+
+    // On lrb, time to copy OUT into buffer
+    c_offload_target_gather_outputs,
+
+    // Time to release buffer references
+    c_offload_target_release_buffer_refs,
+
+    // LAST TIME MONITOR
+    c_offload_target_max_phase
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void __liboffload_error_support(error_types input_tag, ...);
+void __liboffload_report_support(error_types input_tag, ...);
+char const *offload_get_message_str(int msgCode);
+char const * report_get_message_str(error_types input_tag);
+char const * report_get_host_stage_str(int i);
+char const * report_get_target_stage_str(int i);
+#ifdef __cplusplus
+}
+#endif
+
+#define test_msg_cat(nm, msg) \
+    fprintf(stderr, "\t TEST for %s \n \t", nm); \
+    __liboffload_error_support(msg);
+
+#define test_msg_cat1(nm, msg, ...) \
+    fprintf(stderr, "\t TEST for %s \n \t", nm); \
+    __liboffload_error_support(msg, __VA_ARGS__);
+
+void write_message(FILE * file, int msgCode, va_list args_p);
+
+#define LIBOFFLOAD_ERROR __liboffload_error_support
+
+#ifdef TARGET_WINNT
+#define LIBOFFLOAD_ABORT \
+         _set_abort_behavior(0, _WRITE_ABORT_MSG); \
+         abort()
+#else
+#define LIBOFFLOAD_ABORT \
+         abort()
+#endif
+
+#endif // !defined(LIBOFFLOAD_ERROR_CODES_H)
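
LIBOFFLOAD_ERROR expands to __liboffload_error_support(), so a call site
passes exactly the varargs that the selected message's format string expects.
A hedged sketch for c_offload_malloc, whose message takes two %lld arguments
(xmalloc, size, and align are illustrative):

    static void *xmalloc(long long size, long long align)
    {
        void *p = malloc((size_t)size);
        if (p == NULL) {
            LIBOFFLOAD_ERROR(c_offload_malloc, size, align);
            LIBOFFLOAD_ABORT;
        }
        return p;
    }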
diff --git a/final/offload/src/liboffload_msg.c b/final/offload/src/liboffload_msg.c
new file mode 100644
index 0000000..b160392
--- /dev/null
+++ b/final/offload/src/liboffload_msg.c
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+#include <stdarg.h>
+#include <stdio.h>
+
+// ===========================================================================
+// Bring in the static string table and the enumerations for indexing into
+// it.
+// ===========================================================================
+
+#include "liboffload_msg.h"
+
+# define DYNART_STDERR_PUTS(__message_text__) fputs((__message_text__),stderr)
+
+// ===========================================================================
+// Now the code for accessing the message catalogs
+// ===========================================================================
+
+
+void write_message(FILE * file, int msgCode, va_list args_p) {
+    va_list args;
+    va_copy(args, args_p);
+    vfprintf(file, MESSAGE_TABLE_NAME[ msgCode ], args);
+    va_end(args);
+    fflush(file);
+}
+
+char const *offload_get_message_str(int msgCode) {
+    return MESSAGE_TABLE_NAME[ msgCode ];
+}
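
write_message() consumes a va_list, so a wrapper with its own varargs must
forward them via va_start/va_end, as __liboffload_error_support() does in
liboffload_error.c. A minimal sketch (report_to_stderr is illustrative):

    static void report_to_stderr(int msgCode, ...)
    {
        va_list args;
        va_start(args, msgCode);
        write_message(stderr, msgCode, args);
        va_end(args);
    }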
diff --git a/final/offload/src/liboffload_msg.h b/final/offload/src/liboffload_msg.h
new file mode 100644
index 0000000..c1445f9
--- /dev/null
+++ b/final/offload/src/liboffload_msg.h
@@ -0,0 +1,326 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+// file: liboffload_msg.h
+enum {
+	__dummy__ = 0,
+	msg_c_device_is_not_available,
+	msg_c_invalid_device_number,
+	msg_c_send_func_ptr,
+	msg_c_receive_func_ptr,
+	msg_c_offload_malloc,
+	msg_c_offload1,
+	msg_c_unknown_var_type,
+	msg_c_invalid_env_var_value,
+	msg_c_invalid_env_var_int_value,
+	msg_c_invalid_env_report_value,
+	msg_c_offload_signaled1,
+	msg_c_offload_signaled2,
+	msg_c_myowrapper_checkresult,
+	msg_c_myotarget_checkresult,
+	msg_c_offload_descriptor_offload,
+	msg_c_merge_var_descs1,
+	msg_c_merge_var_descs2,
+	msg_c_mic_parse_env_var_list1,
+	msg_c_mic_parse_env_var_list2,
+	msg_c_mic_process_exit_ret,
+	msg_c_mic_process_exit_sig,
+	msg_c_mic_process_exit,
+	msg_c_mic_init3,
+	msg_c_mic_init4,
+	msg_c_mic_init5,
+	msg_c_mic_init6,
+	msg_c_no_static_var_data,
+	msg_c_no_ptr_data,
+	msg_c_get_engine_handle,
+	msg_c_get_engine_index,
+	msg_c_process_create,
+	msg_c_process_get_func_handles,
+	msg_c_process_wait_shutdown,
+	msg_c_process_proxy_flush,
+	msg_c_load_library,
+	msg_c_pipeline_create,
+	msg_c_pipeline_run_func,
+	msg_c_pipeline_start_run_funcs,
+	msg_c_buf_create,
+	msg_c_buf_create_out_of_mem,
+	msg_c_buf_create_from_mem,
+	msg_c_buf_destroy,
+	msg_c_buf_map,
+	msg_c_buf_unmap,
+	msg_c_buf_read,
+	msg_c_buf_write,
+	msg_c_buf_copy,
+	msg_c_buf_get_address,
+	msg_c_buf_add_ref,
+	msg_c_buf_release_ref,
+	msg_c_buf_set_state,
+	msg_c_event_wait,
+	msg_c_zero_or_neg_ptr_len,
+	msg_c_zero_or_neg_transfer_size,
+	msg_c_bad_ptr_mem_range,
+	msg_c_different_src_and_dstn_sizes,
+	msg_c_non_contiguous_dope_vector,
+	msg_c_omp_invalid_device_num_env,
+	msg_c_omp_invalid_device_num,
+	msg_c_unknown_binary_type,
+	msg_c_multiple_target_exes,
+	msg_c_no_target_exe,
+	msg_c_report_unknown_timer_node,
+	msg_c_report_unknown_trace_node,
+	msg_c_report_host,
+	msg_c_report_mic,
+	msg_c_report_title,
+	msg_c_report_seconds,
+	msg_c_report_bytes,
+	msg_c_report_cpu_time,
+	msg_c_report_mic_time,
+	msg_c_report_tag,
+	msg_c_report_from_file,
+	msg_c_report_file,
+	msg_c_report_line,
+	msg_c_report_cpu_to_mic_data,
+	msg_c_report_mic_to_cpu_data,
+	msg_c_report_offload,
+	msg_c_report_w_tag,
+	msg_c_report_state,
+	msg_c_report_start,
+	msg_c_report_init,
+	msg_c_report_logical_card,
+	msg_c_report_physical_card,
+	msg_c_report_register,
+	msg_c_report_init_func,
+	msg_c_report_create_buf_host,
+	msg_c_report_create_buf_mic,
+	msg_c_report_send_pointer_data,
+	msg_c_report_sent_pointer_data,
+	msg_c_report_gather_copyin_data,
+	msg_c_report_copyin_data,
+	msg_c_report_state_signal,
+	msg_c_report_signal,
+	msg_c_report_wait,
+	msg_c_report_compute,
+	msg_c_report_receive_pointer_data,
+	msg_c_report_received_pointer_data,
+	msg_c_report_start_target_func,
+	msg_c_report_var,
+	msg_c_report_scatter_copyin_data,
+	msg_c_report_gather_copyout_data,
+	msg_c_report_scatter_copyout_data,
+	msg_c_report_copyout_data,
+	msg_c_report_unregister,
+	msg_c_report_destroy,
+	msg_c_report_myoinit,
+	msg_c_report_myoregister,
+	msg_c_report_myofini,
+	msg_c_report_mic_myo_shared,
+	msg_c_report_mic_myo_fptr,
+	msg_c_report_myosharedmalloc,
+	msg_c_report_myosharedfree,
+	msg_c_report_myosharedalignedmalloc,
+	msg_c_report_myosharedalignedfree,
+	msg_c_report_myoacquire,
+	msg_c_report_myorelease,
+	msg_c_report_host_total_offload_time,
+	msg_c_report_host_initialize,
+	msg_c_report_host_target_acquire,
+	msg_c_report_host_wait_deps,
+	msg_c_report_host_setup_buffers,
+	msg_c_report_host_alloc_buffers,
+	msg_c_report_host_setup_misc_data,
+	msg_c_report_host_alloc_data_buffer,
+	msg_c_report_host_send_pointers,
+	msg_c_report_host_gather_inputs,
+	msg_c_report_host_map_in_data_buffer,
+	msg_c_report_host_unmap_in_data_buffer,
+	msg_c_report_host_start_compute,
+	msg_c_report_host_wait_compute,
+	msg_c_report_host_start_buffers_reads,
+	msg_c_report_host_scatter_outputs,
+	msg_c_report_host_map_out_data_buffer,
+	msg_c_report_host_unmap_out_data_buffer,
+	msg_c_report_host_wait_buffers_reads,
+	msg_c_report_host_destroy_buffers,
+	msg_c_report_target_total_time,
+	msg_c_report_target_descriptor_setup,
+	msg_c_report_target_func_lookup,
+	msg_c_report_target_func_time,
+	msg_c_report_target_scatter_inputs,
+	msg_c_report_target_add_buffer_refs,
+	msg_c_report_target_compute,
+	msg_c_report_target_gather_outputs,
+	msg_c_report_target_release_buffer_refs,
+	msg_c_coi_pipeline_max_number,
+	msg_c_ranges_dont_match,
+	msg_c_destination_is_over,
+	msg_c_slice_of_noncont_array,
+	msg_c_pointer_array_mismatch,
+	lastMsg = 152,
+	firstMsg = 1
+};
+
+
+#if !defined(MESSAGE_TABLE_NAME)
+#    define MESSAGE_TABLE_NAME __liboffload_message_table
+#endif
+
+static char const * MESSAGE_TABLE_NAME[] = {
+	/*   0 __dummy__                               */ "Unused message",
+	/*   1 msg_c_device_is_not_available           */ "offload error: cannot offload to MIC - device is not available",
+	/*   2 msg_c_invalid_device_number             */ "offload error: expected a number greater than or equal to -1",
+	/*   3 msg_c_send_func_ptr                     */ "offload error: cannot find function name for address %p",
+	/*   4 msg_c_receive_func_ptr                  */ "offload error: cannot find address of function %s",
+	/*   5 msg_c_offload_malloc                    */ "offload error: memory allocation failed (requested=%lld bytes, align %lld)",
+	/*   6 msg_c_offload1                          */ "offload error: device %d does not have a pending signal for wait(%p)",
+	/*   7 msg_c_unknown_var_type                  */ "offload error: unknown variable type %d",
+	/*   8 msg_c_invalid_env_var_value             */ "offload warning: ignoring invalid value specified for %s",
+	/*   9 msg_c_invalid_env_var_int_value         */ "offload warning: specify an integer value for %s",
+	/*  10 msg_c_invalid_env_report_value          */ "offload warning: ignoring %s setting; use a value in range 1-3",
+	/*  11 msg_c_offload_signaled1                 */ "offload error: invalid device number %d specified in _Offload_signaled",
+	/*  12 msg_c_offload_signaled2                 */ "offload error: invalid signal %p specified for _Offload_signaled",
+	/*  13 msg_c_myowrapper_checkresult            */ "offload error: %s failed with error %d",
+	/*  14 msg_c_myotarget_checkresult             */ "offload error: %s failed with error %d",
+	/*  15 msg_c_offload_descriptor_offload        */ "offload error: cannot find offload entry %s",
+	/*  16 msg_c_merge_var_descs1                  */ "offload error: unexpected number of variable descriptors",
+	/*  17 msg_c_merge_var_descs2                  */ "offload error: unexpected variable type",
+	/*  18 msg_c_mic_parse_env_var_list1           */ "offload error: MIC environment variable must begin with an alphabetic character",
+	/*  19 msg_c_mic_parse_env_var_list2           */ "offload error: MIC environment variable value must be specified with '='",
+	/*  20 msg_c_mic_process_exit_ret              */ "offload error: process on the device %d unexpectedly exited with code %d",
+	/*  21 msg_c_mic_process_exit_sig              */ "offload error: process on the device %d was terminated by signal %d (%s)",
+	/*  22 msg_c_mic_process_exit                  */ "offload error: process on the device %d was unexpectedly terminated",
+	/*  23 msg_c_mic_init3                         */ "offload warning: ignoring MIC_STACKSIZE setting; use a value >= 16K and a multiple of 4K",
+	/*  24 msg_c_mic_init4                         */ "offload error: thread key create failed with error %d",
+	/*  25 msg_c_mic_init5                         */ "offload warning: specify OFFLOAD_DEVICES as comma-separated physical device numbers or 'none'",
+	/*  26 msg_c_mic_init6                         */ "offload warning: OFFLOAD_DEVICES device number %d does not correspond to a physical device",
+	/*  27 msg_c_no_static_var_data                */ "offload error: cannot find data associated with statically allocated variable %p",
+	/*  28 msg_c_no_ptr_data                       */ "offload error: cannot find data associated with pointer variable %p",
+	/*  29 msg_c_get_engine_handle                 */ "offload error: cannot get device %d handle (error code %d)",
+	/*  30 msg_c_get_engine_index                  */ "offload error: cannot get physical index for logical device %d (error code %d)",
+	/*  31 msg_c_process_create                    */ "offload error: cannot start process on the device %d (error code %d)",
+	/*  32 msg_c_process_get_func_handles          */ "offload error: cannot get function handles on the device %d (error code %d)",
+	/*  33 msg_c_process_wait_shutdown             */ "offload error: wait for process shutdown failed on device %d (error code %d)",
+	/*  34 msg_c_process_proxy_flush               */ "offload error: cannot flush process output on device %d (error code %d)",
+	/*  35 msg_c_load_library                      */ "offload error: cannot load library to the device %d (error code %d)",
+	/*  36 msg_c_pipeline_create                   */ "offload error: cannot create pipeline on the device %d (error code %d)",
+	/*  37 msg_c_pipeline_run_func                 */ "offload error: cannot execute function on the device %d (error code %d)",
+	/*  38 msg_c_pipeline_start_run_funcs          */ "offload error: cannot start executing pipeline function on the device %d (error code %d)",
+	/*  39 msg_c_buf_create                        */ "offload error: cannot create buffer on device %d (error code %d)",
+	/*  40 msg_c_buf_create_out_of_mem             */ "offload error: cannot create buffer on device %d, out of memory",
+	/*  41 msg_c_buf_create_from_mem               */ "offload error: cannot create buffer from memory on device %d (error code %d)",
+	/*  42 msg_c_buf_destroy                       */ "offload error: buffer destroy failed (error code %d)",
+	/*  43 msg_c_buf_map                           */ "offload error: buffer map failed (error code %d)",
+	/*  44 msg_c_buf_unmap                         */ "offload error: buffer unmap failed (error code %d)",
+	/*  45 msg_c_buf_read                          */ "offload error: buffer read failed (error code %d)",
+	/*  46 msg_c_buf_write                         */ "offload error: buffer write failed (error code %d)",
+	/*  47 msg_c_buf_copy                          */ "offload error: buffer copy failed (error code %d)",
+	/*  48 msg_c_buf_get_address                   */ "offload error: cannot get buffer address on device %d (error code %d)",
+	/*  49 msg_c_buf_add_ref                       */ "offload error: cannot reuse buffer memory on device %d (error code %d)",
+	/*  50 msg_c_buf_release_ref                   */ "offload error: cannot release buffer memory on device %d (error code %d)",
+	/*  51 msg_c_buf_set_state                     */ "offload error: buffer set state failed (error code %d)",
+	/*  52 msg_c_event_wait                        */ "offload error: wait for event to become signaled failed (error code %d)",
+	/*  53 msg_c_zero_or_neg_ptr_len               */ "offload error: memory allocation of negative length is not supported",
+	/*  54 msg_c_zero_or_neg_transfer_size         */ "offload error: data transfer of zero or negative size is not supported",
+	/*  55 msg_c_bad_ptr_mem_range                 */ "offload error: address range partially overlaps with existing allocation",
+	/*  56 msg_c_different_src_and_dstn_sizes      */ "offload error: size of the source %d differs from size of the destination %d",
+	/*  57 msg_c_non_contiguous_dope_vector        */ "offload error: offload data transfer supports only a single contiguous memory range per variable",
+	/*  58 msg_c_omp_invalid_device_num_env        */ "offload warning: ignoring %s setting; use a non-negative integer value",
+	/*  59 msg_c_omp_invalid_device_num            */ "offload error: device number should be a non-negative integer value",
+	/*  60 msg_c_unknown_binary_type               */ "offload error: unexpected embedded target binary type, expected either an executable or shared library",
+	/*  61 msg_c_multiple_target_exes              */ "offload error: more than one target executable found",
+	/*  62 msg_c_no_target_exe                     */ "offload error: target executable is not available",
+	/*  63 msg_c_report_unknown_timer_node         */ "offload error: unknown timer node",
+	/*  64 msg_c_report_unknown_trace_node         */ "offload error: unknown trace node",
+	/*  65 msg_c_report_host                       */ "HOST",
+	/*  66 msg_c_report_mic                        */ "MIC",
+	/*  67 msg_c_report_title                      */ "timer data       (sec)",
+	/*  68 msg_c_report_seconds                    */ "(seconds)",
+	/*  69 msg_c_report_bytes                      */ "(bytes)",
+	/*  70 msg_c_report_cpu_time                   */ "CPU Time",
+	/*  71 msg_c_report_mic_time                   */ "MIC Time",
+	/*  72 msg_c_report_tag                        */ "Tag",
+	/*  73 msg_c_report_from_file                  */ "Offload from file",
+	/*  74 msg_c_report_file                       */ "File",
+	/*  75 msg_c_report_line                       */ "Line",
+	/*  76 msg_c_report_cpu_to_mic_data            */ "CPU->MIC Data",
+	/*  77 msg_c_report_mic_to_cpu_data            */ "MIC->CPU Data",
+	/*  78 msg_c_report_offload                    */ "Offload",
+	/*  79 msg_c_report_w_tag                      */ "Tag %d",
+	/*  80 msg_c_report_state                      */ "State",
+	/*  81 msg_c_report_start                      */ "Start target",
+	/*  82 msg_c_report_init                       */ "Initialize",
+	/*  83 msg_c_report_logical_card               */ "logical card",
+	/*  84 msg_c_report_physical_card              */ "physical card",
+	/*  85 msg_c_report_register                   */ "Register static data tables",
+	/*  86 msg_c_report_init_func                  */ "Setup target entry",
+	/*  87 msg_c_report_create_buf_host            */ "Create host buffer",
+	/*  88 msg_c_report_create_buf_mic             */ "Create target buffer",
+	/*  89 msg_c_report_send_pointer_data          */ "Send pointer data",
+	/*  90 msg_c_report_sent_pointer_data          */ "Host->target pointer data",
+	/*  91 msg_c_report_gather_copyin_data         */ "Gather copyin data",
+	/*  92 msg_c_report_copyin_data                */ "Host->target copyin data",
+	/*  93 msg_c_report_state_signal               */ "Signal",
+	/*  94 msg_c_report_signal                     */ "signal :",
+	/*  95 msg_c_report_wait                       */ "waits  :",
+	/*  96 msg_c_report_compute                    */ "Execute task on target",
+	/*  97 msg_c_report_receive_pointer_data       */ "Receive pointer data",
+	/*  98 msg_c_report_received_pointer_data      */ "Target->host pointer data",
+	/*  99 msg_c_report_start_target_func          */ "Start target entry",
+	/* 100 msg_c_report_var                        */ "Var",
+	/* 101 msg_c_report_scatter_copyin_data        */ "Scatter copyin data",
+	/* 102 msg_c_report_gather_copyout_data        */ "Gather copyout data",
+	/* 103 msg_c_report_scatter_copyout_data       */ "Scatter copyout data",
+	/* 104 msg_c_report_copyout_data               */ "Target->host copyout data",
+	/* 105 msg_c_report_unregister                 */ "Unregister data tables",
+	/* 106 msg_c_report_destroy                    */ "Destroy",
+	/* 107 msg_c_report_myoinit                    */ "Initialize MYO",
+	/* 108 msg_c_report_myoregister                */ "Register MYO tables",
+	/* 109 msg_c_report_myofini                    */ "Finalize MYO",
+	/* 110 msg_c_report_mic_myo_shared             */ "MIC MYO shared table register",
+	/* 111 msg_c_report_mic_myo_fptr               */ "MIC MYO fptr table register",
+	/* 112 msg_c_report_myosharedmalloc            */ "MYO shared malloc",
+	/* 113 msg_c_report_myosharedfree              */ "MYO shared free",
+	/* 114 msg_c_report_myosharedalignedmalloc     */ "MYO shared aligned malloc",
+	/* 115 msg_c_report_myosharedalignedfree       */ "MYO shared aligned free",
+	/* 116 msg_c_report_myoacquire                 */ "MYO acquire",
+	/* 117 msg_c_report_myorelease                 */ "MYO release",
+	/* 118 msg_c_report_host_total_offload_time    */ "host: total offload time",
+	/* 119 msg_c_report_host_initialize            */ "host: initialize target",
+	/* 120 msg_c_report_host_target_acquire        */ "host: acquire target",
+	/* 121 msg_c_report_host_wait_deps             */ "host: wait dependencies",
+	/* 122 msg_c_report_host_setup_buffers         */ "host: setup buffers",
+	/* 123 msg_c_report_host_alloc_buffers         */ "host: allocate buffers",
+	/* 124 msg_c_report_host_setup_misc_data       */ "host: setup misc_data",
+	/* 125 msg_c_report_host_alloc_data_buffer     */ "host: allocate buffer",
+	/* 126 msg_c_report_host_send_pointers         */ "host: send pointers",
+	/* 127 msg_c_report_host_gather_inputs         */ "host: gather inputs",
+	/* 128 msg_c_report_host_map_in_data_buffer    */ "host: map IN data buffer",
+	/* 129 msg_c_report_host_unmap_in_data_buffer  */ "host: unmap IN data buffer",
+	/* 130 msg_c_report_host_start_compute         */ "host: initiate compute",
+	/* 131 msg_c_report_host_wait_compute          */ "host: wait compute",
+	/* 132 msg_c_report_host_start_buffers_reads   */ "host: initiate pointer reads",
+	/* 133 msg_c_report_host_scatter_outputs       */ "host: scatter outputs",
+	/* 134 msg_c_report_host_map_out_data_buffer   */ "host: map OUT data buffer",
+	/* 135 msg_c_report_host_unmap_out_data_buffer */ "host: unmap OUT data buffer",
+	/* 136 msg_c_report_host_wait_buffers_reads    */ "host: wait pointer reads",
+	/* 137 msg_c_report_host_destroy_buffers       */ "host: destroy buffers",
+	/* 138 msg_c_report_target_total_time          */ "target: total time",
+	/* 139 msg_c_report_target_descriptor_setup    */ "target: setup offload descriptor",
+	/* 140 msg_c_report_target_func_lookup         */ "target: entry lookup",
+	/* 141 msg_c_report_target_func_time           */ "target: entry time",
+	/* 142 msg_c_report_target_scatter_inputs      */ "target: scatter inputs",
+	/* 143 msg_c_report_target_add_buffer_refs     */ "target: add buffer reference",
+	/* 144 msg_c_report_target_compute             */ "target: compute",
+	/* 145 msg_c_report_target_gather_outputs      */ "target: gather outputs",
+	/* 146 msg_c_report_target_release_buffer_refs */ "target: remove buffer reference",
+	/* 147 msg_c_coi_pipeline_max_number           */ "number of host threads doing offload exceeds maximum of %d",
+	/* 148 msg_c_ranges_dont_match                 */ "ranges of source and destination do not match",
+	/* 149 msg_c_destination_is_over               */ "insufficient destination memory to transfer source",
+	/* 150 msg_c_slice_of_noncont_array            */ "a non-contiguous slice may be taken of contiguous arrays only",
+	/* 151 msg_c_pointer_array_mismatch            */ "number of %s elements is less than described by the source",
+};
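+
+/* Illustrative lookup (a sketch, not part of the table itself): a message
+ * id from the enum above indexes this table to obtain a printf-style
+ * format string, e.g.
+ *
+ *     fprintf(stderr, "%s\n",
+ *             MESSAGE_TABLE_NAME[msg_c_invalid_device_number]);
+ *
+ * Entries containing format specifiers (%d, %p, ...) receive their
+ * arguments from the error-reporting macros defined elsewhere in
+ * liboffload. */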
diff --git a/final/offload/src/mic_lib.f90 b/final/offload/src/mic_lib.f90
new file mode 100644
index 0000000..0c2e4de
--- /dev/null
+++ b/final/offload/src/mic_lib.f90
@@ -0,0 +1,441 @@
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+
+! **********************************************************************************
+! * This file is intended to support the Intel(r) Many Integrated Core Architecture.
+! **********************************************************************************
+! free form Fortran source - should be named .f90
+! lines are longer than 72 characters
+
+module mic_lib
+use, intrinsic :: iso_c_binding
+
+integer, parameter:: target_mic=2
+integer, parameter:: default_target_type=target_mic
+integer, parameter:: default_target_number=0
+
+enum, bind(C)
+    enumerator :: OFFLOAD_SUCCESS  = 0
+    enumerator :: OFFLOAD_DISABLED          ! offload is disabled
+    enumerator :: OFFLOAD_UNAVAILABLE       ! card is not available
+    enumerator :: OFFLOAD_OUT_OF_MEMORY     ! not enough memory on device
+    enumerator :: OFFLOAD_PROCESS_DIED      ! target process has died
+    enumerator :: OFFLOAD_ERROR             ! unspecified error
+end enum
+
+type, bind (C) :: offload_status
+    integer(kind=c_int)    ::  result          = OFFLOAD_DISABLED
+    integer(kind=c_int)    ::  device_number   = -1
+    integer(kind=c_size_t) ::  data_sent       = 0
+    integer(kind=c_size_t) ::  data_received   = 0
+end type offload_status
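+
+! Illustrative use from host code (a sketch only; assumes the Intel
+! compiler's OFFLOAD directives with a STATUS clause, which are not
+! defined in this module):
+!   type(offload_status) :: stat
+!   !dir$ offload begin target(mic:0) status(stat)
+!   ...
+!   !dir$ offload end
+!   if (stat%result /= OFFLOAD_SUCCESS) print *, 'offload failed'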
+
+interface
+function offload_number_of_devices ()                                  &
+           bind (C, name = "_Offload_number_of_devices")
+!dec$ attributes default :: offload_number_of_devices
+  import :: c_int
+  integer (kind=c_int)        :: offload_number_of_devices
+!dec$ attributes offload:mic :: offload_number_of_devices
+!dir$ attributes known_intrinsic ::  offload_number_of_devices
+end function offload_number_of_devices
+
+function offload_signaled(target_number, signal)                       &
+           bind (C, name = "_Offload_signaled")
+!dec$ attributes default :: offload_signaled
+  import :: c_int, c_int64_t
+  integer (kind=c_int) :: offload_signaled
+  integer (kind=c_int), value :: target_number
+  integer (kind=c_int64_t), value :: signal
+!dec$ attributes offload:mic :: offload_signaled
+end function offload_signaled
+
+subroutine offload_report(val)                                         &
+           bind (C, name = "_Offload_report")
+!dec$ attributes default :: offload_report
+  import :: c_int
+  integer (kind=c_int), value :: val
+!dec$ attributes offload:mic :: offload_report
+end subroutine offload_report
+
+function offload_get_device_number()                                   &
+           bind (C, name = "_Offload_get_device_number")
+!dec$ attributes default :: offload_get_device_number
+  import :: c_int
+  integer (kind=c_int)        :: offload_get_device_number
+!dec$ attributes offload:mic :: offload_get_device_number
+end function offload_get_device_number
+
+function offload_get_physical_device_number()                          &
+           bind (C, name = "_Offload_get_physical_device_number")
+!dec$ attributes default :: offload_get_physical_device_number
+  import :: c_int
+  integer (kind=c_int)        :: offload_get_physical_device_number
+!dec$ attributes offload:mic :: offload_get_physical_device_number
+end function offload_get_physical_device_number
+
+! OpenMP API wrappers
+
+subroutine omp_set_num_threads_target (target_type,                    &
+                                       target_number,                  &
+                                       num_threads)                    &
+           bind (C, name = "omp_set_num_threads_target")
+  import :: c_int
+  integer (kind=c_int), value :: target_type, target_number, num_threads
+end subroutine omp_set_num_threads_target
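+
+! Illustrative call from host code (a sketch; device 0 and a thread count
+! of 240 are arbitrary example values):
+!   call omp_set_num_threads_target (target_mic, 0, 240)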
+
+function omp_get_max_threads_target (target_type,                      &
+                                     target_number)                    &
+         bind (C, name = "omp_get_max_threads_target")
+  import :: c_int
+  integer (kind=c_int)        :: omp_get_max_threads_target
+  integer (kind=c_int), value :: target_type, target_number
+end function omp_get_max_threads_target
+
+function omp_get_num_procs_target (target_type,                        &
+                                   target_number)                      &
+         bind (C, name = "omp_get_num_procs_target")
+  import :: c_int
+  integer (kind=c_int)        :: omp_get_num_procs_target
+  integer (kind=c_int), value :: target_type, target_number
+end function omp_get_num_procs_target
+
+subroutine omp_set_dynamic_target (target_type,                        &
+                                   target_number,                      &
+                                   num_threads)                        &
+           bind (C, name = "omp_set_dynamic_target")
+  import :: c_int
+  integer (kind=c_int), value :: target_type, target_number, num_threads
+end subroutine omp_set_dynamic_target
+
+function omp_get_dynamic_target (target_type,                          &
+                                 target_number)                        &
+         bind (C, name = "omp_get_dynamic_target")
+  import :: c_int
+  integer (kind=c_int)        :: omp_get_dynamic_target
+  integer (kind=c_int), value :: target_type, target_number
+end function omp_get_dynamic_target
+
+subroutine omp_set_nested_target (target_type,                         &
+                                  target_number,                       &
+                                  nested)                              &
+           bind (C, name = "omp_set_nested_target")
+  import :: c_int
+  integer (kind=c_int), value :: target_type, target_number, nested
+end subroutine omp_set_nested_target
+
+function omp_get_nested_target (target_type,                           &
+                                target_number)                         &
+         bind (C, name = "omp_get_nested_target")
+  import :: c_int
+  integer (kind=c_int)        :: omp_get_nested_target
+  integer (kind=c_int), value :: target_type, target_number
+end function omp_get_nested_target
+
+subroutine omp_set_schedule_target (target_type,                       &
+                                    target_number,                     &
+                                    kind,                              &
+                                    modifier)                          &
+           bind (C, name = "omp_set_schedule_target")
+  import :: c_int
+  integer (kind=c_int), value :: target_type, target_number, kind, modifier
+end subroutine omp_set_schedule_target
+
+subroutine omp_get_schedule_target (target_type,                       &
+                                    target_number,                     &
+                                    kind,                              &
+                                    modifier)                          &
+           bind (C, name = "omp_get_schedule_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: kind, modifier
+end subroutine omp_get_schedule_target
+
+! lock API functions
+
+subroutine omp_init_lock_target (target_type,                          &
+                                 target_number,                        &
+                                 lock)                                 &
+           bind (C, name = "omp_init_lock_target")
+  import :: c_int, c_intptr_t
+  !dir$ attributes known_intrinsic ::  omp_init_lock_target
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: lock
+end subroutine omp_init_lock_target
+
+subroutine omp_destroy_lock_target (target_type,                       &
+                                    target_number,                     &
+                                    lock)                              &
+           bind (C, name = "omp_destroy_lock_target")
+  import :: c_int, c_intptr_t
+  !dir$ attributes known_intrinsic ::  omp_destroy_lock_target
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: lock
+end subroutine omp_destroy_lock_target
+
+subroutine omp_set_lock_target (target_type,                           &
+                                target_number,                         &
+                                lock)                                  &
+           bind (C, name = "omp_set_lock_target")
+  import :: c_int, c_intptr_t
+  !dir$ attributes known_intrinsic ::  omp_set_lock_target
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: lock
+end subroutine omp_set_lock_target
+
+subroutine omp_unset_lock_target (target_type,                         &
+                                  target_number,                       &
+                                  lock)                                &
+           bind (C, name = "omp_unset_lock_target")
+  import :: c_int, c_intptr_t
+  !dir$ attributes known_intrinsic ::  omp_unset_lock_target
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: lock
+end subroutine omp_unset_lock_target
+
+function omp_test_lock_target (target_type,                            &
+                               target_number,                          &
+                               lock)                                   &
+           bind (C, name = "omp_test_lock_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int)        :: omp_test_lock_target
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: lock
+end function omp_test_lock_target
+
+! nested lock API functions
+
+subroutine omp_init_nest_lock_target (target_type,                     &
+                                      target_number,                   &
+                                      lock)                            &
+           bind (C, name = "omp_init_nest_lock_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: lock
+end subroutine omp_init_nest_lock_target
+
+subroutine omp_destroy_nest_lock_target (target_type,                  &
+                                         target_number,                &
+                                         lock)                         &
+           bind (C, name = "omp_destroy_nest_lock_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: lock
+end subroutine omp_destroy_nest_lock_target
+
+subroutine omp_set_nest_lock_target (target_type,                      &
+                                     target_number,                    &
+                                     lock)                             &
+           bind (C, name = "omp_set_nest_lock_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: lock
+end subroutine omp_set_nest_lock_target
+
+subroutine omp_unset_nest_lock_target (target_type,                    &
+                                       target_number,                  &
+                                       lock)                           &
+           bind (C, name = "omp_unset_nest_lock_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: lock
+end subroutine omp_unset_nest_lock_target
+
+function omp_test_nest_lock_target (target_type,                       &
+                                    target_number,                     &
+                                    lock)                              &
+           bind (C, name = "omp_test_nest_lock_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int)        :: omp_test_nest_lock_target
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: lock
+end function omp_test_nest_lock_target
+
+! kmp API functions
+
+subroutine kmp_set_stacksize_target (target_type,                      &
+                                     target_number,                    &
+                                     size)                             &
+           bind (C, name = "kmp_set_stacksize_target")
+  import :: c_int
+  integer (kind=c_int), value :: target_type, target_number, size
+end subroutine kmp_set_stacksize_target
+
+function kmp_get_stacksize_target (target_type,                        &
+                                   target_number)                      &
+         bind (C, name = "kmp_get_stacksize_target")
+  import :: c_int
+  integer (kind=c_int)        :: kmp_get_stacksize_target
+  integer (kind=c_int), value :: target_type, target_number
+end function kmp_get_stacksize_target
+
+subroutine kmp_set_stacksize_s_target (target_type,                    &
+                                       target_number,                  &
+                                       size)                           &
+           bind (C, name = "kmp_set_stacksize_s_target")
+  import :: c_int
+  integer (kind=c_int), value :: target_type, target_number, size
+end subroutine kmp_set_stacksize_s_target
+
+function kmp_get_stacksize_s_target (target_type,                      &
+                                     target_number)                    &
+         bind (C, name = "kmp_get_stacksize_s_target")
+  import :: c_int
+  integer (kind=c_int)        :: kmp_get_stacksize_s_target
+  integer (kind=c_int), value :: target_type, target_number
+end function kmp_get_stacksize_s_target
+
+subroutine kmp_set_blocktime_target (target_type,                      &
+                                     target_number,                    &
+                                     time)                             &
+           bind (C, name = "kmp_set_blocktime_target")
+  import :: c_int
+  integer (kind=c_int), value :: target_type, target_number, time
+end subroutine kmp_set_blocktime_target
+
+function kmp_get_blocktime_target (target_type,                        &
+                                   target_number)                      &
+         bind (C, name = "kmp_get_blocktime_target")
+  import :: c_int
+  integer (kind=c_int)        :: kmp_get_blocktime_target
+  integer (kind=c_int), value :: target_type, target_number
+end function kmp_get_blocktime_target
+
+subroutine kmp_set_library_serial_target (target_type,                 &
+                                          target_number)               &
+           bind (C, name = "kmp_set_library_serial_target")
+  import :: c_int
+  integer (kind=c_int), value :: target_type, target_number
+end subroutine kmp_set_library_serial_target
+
+subroutine kmp_set_library_turnaround_target (target_type,             &
+                                              target_number)           &
+           bind (C, name = "kmp_set_library_turnaround_target")
+  import :: c_int
+  integer (kind=c_int), value :: target_type, target_number
+end subroutine kmp_set_library_turnaround_target
+
+subroutine kmp_set_library_throughput_target (target_type,             &
+                                              target_number)           &
+           bind (C, name = "kmp_set_library_throughput_target")
+  import :: c_int
+  integer (kind=c_int), value :: target_type, target_number
+end subroutine kmp_set_library_throughput_target
+
+subroutine kmp_set_library_target (target_type,                        &
+                                   target_number,                      &
+                                   mode)                               &
+           bind (C, name = "kmp_set_library_target")
+  import :: c_int
+  integer (kind=c_int), value :: target_type, target_number, mode
+end subroutine kmp_set_library_target
+
+function kmp_get_library_target (target_type,                          &
+                                 target_number)                        &
+         bind (C, name = "kmp_get_library_target")
+  import :: c_int
+  integer (kind=c_int)        :: kmp_get_library_target
+  integer (kind=c_int), value :: target_type, target_number
+end function kmp_get_library_target
+
+subroutine kmp_set_defaults_target (target_type,                       &
+                                    target_number,                     &
+                                    defaults)                          &
+           bind (C, name = "kmp_set_defaults_target")
+  import :: c_int, c_char
+  character (kind=c_char) :: defaults(*)
+  integer (kind=c_int), value :: target_type, target_number
+end subroutine kmp_set_defaults_target
+
+! affinity API functions
+
+subroutine kmp_create_affinity_mask_target (target_type,               &
+                                            target_number,             &
+                                            mask)                      &
+           bind (C, name = "kmp_create_affinity_mask_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: mask
+end subroutine kmp_create_affinity_mask_target
+
+subroutine kmp_destroy_affinity_mask_target (target_type,              &
+                                             target_number,            &
+                                             mask)                     &
+           bind (C, name = "kmp_destroy_affinity_mask_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: mask
+end subroutine kmp_destroy_affinity_mask_target
+
+function kmp_set_affinity_target (target_type,                         &
+                                  target_number,                       &
+                                  mask)                                &
+           bind (C, name = "kmp_set_affinity_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int)        :: kmp_set_affinity_target
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: mask
+end function kmp_set_affinity_target
+
+function kmp_get_affinity_target (target_type,                         &
+                                  target_number,                       &
+                                  mask)                                &
+           bind (C, name = "kmp_get_affinity_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int)        :: kmp_get_affinity_target
+  integer (kind=c_int), value :: target_type, target_number
+  integer (kind=c_intptr_t), value :: mask
+end function kmp_get_affinity_target
+
+function kmp_get_affinity_max_proc_target (target_type,                &
+                                           target_number)              &
+           bind (C, name = "kmp_get_affinity_max_proc_target")
+  import :: c_int
+  integer (kind=c_int)        :: kmp_get_affinity_max_proc_target
+  integer (kind=c_int), value :: target_type, target_number
+end function kmp_get_affinity_max_proc_target
+
+function kmp_set_affinity_mask_proc_target (target_type,               &
+                                            target_number,             &
+                                            proc,                      &
+                                            mask)                      &
+           bind (C, name = "kmp_set_affinity_mask_proc_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int)        :: kmp_set_affinity_mask_proc_target
+  integer (kind=c_int), value :: target_type, target_number, proc
+  integer (kind=c_intptr_t), value :: mask
+end function kmp_set_affinity_mask_proc_target
+
+function kmp_unset_affinity_mask_proc_target (target_type,             &
+                                              target_number,           &
+                                              proc,                    &
+                                              mask)                    &
+           bind (C, name = "kmp_unset_affinity_mask_proc_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int)        :: kmp_unset_affinity_mask_proc_target
+  integer (kind=c_int), value :: target_type, target_number, proc
+  integer (kind=c_intptr_t), value :: mask
+end function kmp_unset_affinity_mask_proc_target
+
+function kmp_get_affinity_mask_proc_target (target_type,               &
+                                            target_number,             &
+                                            proc,                      &
+                                            mask)                      &
+           bind (C, name = "kmp_get_affinity_mask_proc_target")
+  import :: c_int, c_intptr_t
+  integer (kind=c_int)        :: kmp_get_affinity_mask_proc_target
+  integer (kind=c_int), value :: target_type, target_number, proc
+  integer (kind=c_intptr_t), value :: mask
+end function kmp_get_affinity_mask_proc_target
+
+end interface
+end module mic_lib
diff --git a/final/offload/src/offload.h b/final/offload/src/offload.h
new file mode 100644
index 0000000..68914b7
--- /dev/null
+++ b/final/offload/src/offload.h
@@ -0,0 +1,474 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/*
+ * Include file for Offload API.
+ */
+
+#ifndef OFFLOAD_H_INCLUDED
+#define OFFLOAD_H_INCLUDED
+
+#if defined(LINUX) || defined(FREEBSD)
+#include <bits/functexcept.h>
+#endif
+
+#include <stddef.h>
+#include <omp.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TARGET_ATTRIBUTE __declspec(target(mic))
+
+/*
+ *  The target architecture.
+ */
+typedef enum TARGET_TYPE {
+    TARGET_NONE,    /* Undefined target */
+    TARGET_HOST,    /* Host used as target */
+    TARGET_MIC      /* MIC target */
+} TARGET_TYPE;
+
+/*
+ *  The default target type.
+ */
+#define DEFAULT_TARGET_TYPE TARGET_MIC
+
+/*
+ *  The default target number.
+ */
+#define DEFAULT_TARGET_NUMBER 0
+
+/*
+ *  Offload status.
+ */
+typedef enum {
+    OFFLOAD_SUCCESS = 0,
+    OFFLOAD_DISABLED,               /* offload is disabled */
+    OFFLOAD_UNAVAILABLE,            /* card is not available */
+    OFFLOAD_OUT_OF_MEMORY,          /* not enough memory on device */
+    OFFLOAD_PROCESS_DIED,           /* target process has died */
+    OFFLOAD_ERROR                   /* unspecified error */
+} _Offload_result;
+
+typedef struct {
+    _Offload_result result;         /* result, see above */
+    int             device_number;  /* device number */
+    size_t          data_sent;      /* number of bytes sent to the target */
+    size_t          data_received;  /* number of bytes received by host */
+} _Offload_status;
+
+#define OFFLOAD_STATUS_INIT(x) \
+    ((x).result = OFFLOAD_DISABLED)
+
+#define OFFLOAD_STATUS_INITIALIZER \
+    { OFFLOAD_DISABLED, -1, 0, 0 }
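+
+/* Illustrative use (a sketch; assumes the Intel compiler's offload pragma
+ * with a status clause, which is not declared in this header):
+ *
+ *     _Offload_status st = OFFLOAD_STATUS_INITIALIZER;
+ *     #pragma offload target(mic:0) status(st)
+ *     { ... }
+ *     if (st.result != OFFLOAD_SUCCESS)
+ *         printf("offload failed on device %d\n", st.device_number);
+ */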
+
+/* Offload runtime interfaces */
+
+extern int _Offload_number_of_devices(void);
+extern int _Offload_get_device_number(void);
+extern int _Offload_get_physical_device_number(void);
+
+extern void* _Offload_shared_malloc(size_t size);
+extern void  _Offload_shared_free(void *ptr);
+
+extern void* _Offload_shared_aligned_malloc(size_t size, size_t align);
+extern void  _Offload_shared_aligned_free(void *ptr);
+
+extern int _Offload_signaled(int index, void *signal);
+extern void _Offload_report(int val);
+
+/* OpenMP API */
+
+extern void omp_set_default_device(int num);
+extern int  omp_get_default_device(void);
+extern int  omp_get_num_devices(void);
+
+/* OpenMP API wrappers */
+
+/* Set num_threads on target */
+extern void omp_set_num_threads_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int num_threads
+);
+
+/* Get max_threads from target */
+extern int omp_get_max_threads_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+/* Get num_procs from target */
+extern int omp_get_num_procs_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+/* Set dynamic on target */
+extern void omp_set_dynamic_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int num_threads
+);
+
+/* Get dynamic from target */
+extern int omp_get_dynamic_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+/* Set nested on target */
+extern void omp_set_nested_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int nested
+);
+
+/* Get nested from target */
+extern int omp_get_nested_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+extern void omp_set_schedule_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_sched_t kind,
+    int modifier
+);
+
+extern void omp_get_schedule_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_sched_t *kind,
+    int *modifier
+);
+
+/* lock API functions */
+
+typedef struct {
+    omp_lock_t lock;
+} omp_lock_target_t;
+
+extern void omp_init_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+);
+
+extern void omp_destroy_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+);
+
+extern void omp_set_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+);
+
+extern void omp_unset_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+);
+
+extern int omp_test_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+);
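+
+/* Illustrative use of the lock wrappers (a sketch; TARGET_MIC and device
+ * number 0 are example values):
+ *
+ *     omp_lock_target_t lk;
+ *     omp_init_lock_target(TARGET_MIC, 0, &lk);
+ *     omp_set_lock_target(TARGET_MIC, 0, &lk);
+ *     ... critical work on the target ...
+ *     omp_unset_lock_target(TARGET_MIC, 0, &lk);
+ *     omp_destroy_lock_target(TARGET_MIC, 0, &lk);
+ */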
+
+/* nested lock API functions */
+
+typedef struct {
+    omp_nest_lock_t lock;
+} omp_nest_lock_target_t;
+
+extern void omp_init_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+);
+
+extern void omp_destroy_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+);
+
+extern void omp_set_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+);
+
+extern void omp_unset_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+);
+
+extern int omp_test_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+);
+
+/* kmp API functions */
+
+extern void kmp_set_stacksize_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int size
+);
+
+extern int kmp_get_stacksize_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+extern void kmp_set_stacksize_s_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    size_t size
+);
+
+extern size_t kmp_get_stacksize_s_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+extern void kmp_set_blocktime_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int time
+);
+
+extern int kmp_get_blocktime_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+extern void kmp_set_library_serial_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+extern void kmp_set_library_turnaround_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+extern void kmp_set_library_throughput_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+extern void kmp_set_library_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int mode
+);
+
+extern int kmp_get_library_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+extern void kmp_set_defaults_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    char const *defaults
+);
+
+/* affinity API functions */
+
+typedef struct {
+    kmp_affinity_mask_t mask;
+} kmp_affinity_mask_target_t;
+
+extern void kmp_create_affinity_mask_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+);
+
+extern void kmp_destroy_affinity_mask_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+);
+
+extern int kmp_set_affinity_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+);
+
+extern int kmp_get_affinity_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+);
+
+extern int kmp_get_affinity_max_proc_target(
+    TARGET_TYPE target_type,
+    int target_number
+);
+
+extern int kmp_set_affinity_mask_proc_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int proc,
+    kmp_affinity_mask_target_t *mask
+);
+
+extern int kmp_unset_affinity_mask_proc_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int proc,
+    kmp_affinity_mask_target_t *mask
+);
+
+extern int kmp_get_affinity_mask_proc_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int proc,
+    kmp_affinity_mask_target_t *mask
+);
+
+#ifdef __cplusplus
+} /* extern "C" */
+
+/* Namespace for the shared_allocator. */
+namespace __offload {
+  /* This follows the specification for std::allocator. */
+  /* Forward declaration of the class template. */
+  template <typename T>
+  class shared_allocator;
+
+  /* Specialization for shared_allocator<void>. */
+  template <>
+  class shared_allocator<void> {
+  public:
+    typedef void       *pointer;
+    typedef const void *const_pointer;
+    typedef void        value_type;
+    template <class U> struct rebind { typedef shared_allocator<U> other; };
+  };
+
+  /* Definition of shared_allocator<T>. */
+  template <class T>
+  class shared_allocator {
+  public:
+    typedef size_t     size_type;
+    typedef ptrdiff_t  difference_type;
+    typedef T         *pointer;
+    typedef const T   *const_pointer;
+    typedef T         &reference;
+    typedef const T   &const_reference;
+    typedef T          value_type;
+    template <class U> struct rebind { typedef shared_allocator<U> other; };
+    shared_allocator() throw() { }
+    shared_allocator(const shared_allocator&) throw() { }
+    template <class U> shared_allocator(const shared_allocator<U>&) throw() { }
+    ~shared_allocator() throw() { }
+    pointer address(reference x) const { return &x; }
+    const_pointer address(const_reference x) const { return &x; }
+    pointer allocate(
+      size_type, shared_allocator<void>::const_pointer hint = 0);
+    void deallocate(pointer p, size_type n);
+    size_type max_size() const throw() {
+      return size_type(-1)/sizeof(T);
+    } /* max_size */
+    void construct(pointer p, const T& arg) {
+      ::new (p) T(arg);
+    } /* construct */
+    void destroy(pointer p) {
+      p->~T();
+    } /* destroy */
+  };
+
+  /* Definition for allocate. */
+  template <class T>
+  typename shared_allocator<T>::pointer
+  shared_allocator<T>::allocate(shared_allocator<T>::size_type s,
+                                shared_allocator<void>::const_pointer) {
+    /* Allocate from shared memory. */
+    void *ptr = _Offload_shared_malloc(s*sizeof(T));
+    if (ptr == 0) std::__throw_bad_alloc();
+    return static_cast<pointer>(ptr);
+  } /* allocate */
+
+  template <class T>
+  void shared_allocator<T>::deallocate(pointer p,
+                                       shared_allocator<T>::size_type) {
+    /* Free the shared memory. */
+    _Offload_shared_free(p);
+  } /* deallocate */
+
+  template <typename _T1, typename _T2>
+  inline bool operator==(const shared_allocator<_T1> &, 
+                         const shared_allocator<_T2> &) throw() {
+    return true;
+  }  /* operator== */
+
+  template <typename _T1, typename _T2>
+  inline bool operator!=(const shared_allocator<_T1> &, 
+                         const shared_allocator<_T2> &) throw() {
+    return false;
+  }  /* operator!= */
+}  /* __offload */
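+
+/* Illustrative use of shared_allocator (a sketch): place the storage of a
+ * standard container in host/target shared memory:
+ *
+ *     std::vector<float, __offload::shared_allocator<float> > v(100);
+ */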
+#endif /* __cplusplus */
+
+#endif /* OFFLOAD_H_INCLUDED */
diff --git a/final/offload/src/offload_common.cpp b/final/offload/src/offload_common.cpp
new file mode 100644
index 0000000..3681b06
--- /dev/null
+++ b/final/offload/src/offload_common.cpp
@@ -0,0 +1,170 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#if defined(LINUX) || defined(FREEBSD)
+#include <mm_malloc.h>
+#endif
+
+#include "offload_common.h"
+
+// The debug routines
+
+#if OFFLOAD_DEBUG > 0
+
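+// Dump a byte range to the console as hex when the console trace level
+// exceeds trace_level: bytes are grouped four to a word and the line is
+// flushed every 64 bytes.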
+void __dump_bytes(
+    int trace_level,
+    const void *data,
+    int len
+)
+{
+    if (console_enabled > trace_level) {
+        const uint8_t *arr = (const uint8_t*) data;
+        char buffer[4096];
+        char *bufferp;
+        int count = 0;
+
+        bufferp = buffer;
+        while (len--) {
+            sprintf(bufferp, "%02x", *arr++);
+            bufferp += 2;
+            count++;
+            if ((count&3) == 0) {
+                sprintf(bufferp, " ");
+                bufferp++;
+            }
+            if ((count&63) == 0) {
+                OFFLOAD_DEBUG_TRACE(trace_level, "%s\n", buffer);
+                bufferp = buffer;
+                count = 0;
+            }
+        }
+        if (count) {
+            OFFLOAD_DEBUG_TRACE(trace_level, "%s\n", buffer);
+        }
+    }
+}
+#endif // OFFLOAD_DEBUG
+
+// The Marshaller and associated routines
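+//
+// The Marshaller copies offload arguments into and out of a flat transfer
+// buffer: buffer_ptr tracks the current position and tfr_size accumulates
+// the number of bytes marshalled.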
+
+void Marshaller::send_data(
+    const void *data,
+    int64_t length
+)
+{
+    OFFLOAD_DEBUG_TRACE(2, "send_data(%p, %lld)\n",
+                        data, length);
+    memcpy(buffer_ptr, data, (size_t)length);
+    buffer_ptr += length;
+    tfr_size += length;
+}
+
+void Marshaller::receive_data(
+    void *data,
+    int64_t length
+)
+{
+    OFFLOAD_DEBUG_TRACE(2, "receive_data(%p, %lld)\n",
+                        data, length);
+    memcpy(data, buffer_ptr, (size_t)length);
+    buffer_ptr += length;
+    tfr_size += length;
+}
+
+// Send function pointer
+void Marshaller::send_func_ptr(
+    const void* data
+)
+{
+    const char* name;
+    size_t      length;
+
+    if (data != 0) {
+        name = __offload_funcs.find_name(data);
+        if (name == 0) {
+#if OFFLOAD_DEBUG > 0
+            if (console_enabled > 2) {
+                __offload_funcs.dump();
+            }
+#endif // OFFLOAD_DEBUG > 0
+
+            LIBOFFLOAD_ERROR(c_send_func_ptr, data);
+            exit(1);
+        }
+        length = strlen(name) + 1;
+    }
+    else {
+        name = "";
+        length = 1;
+    }
+
+    memcpy(buffer_ptr, name, length);
+    buffer_ptr += length;
+    tfr_size += length;
+}
+
+// Receive function pointer
+void Marshaller::receive_func_ptr(
+    const void** data
+)
+{
+    const char* name;
+    size_t      length;
+
+    name = (const char*) buffer_ptr;
+    if (name[0] != '\0') {
+        *data = __offload_funcs.find_addr(name);
+        if (*data == 0) {
+#if OFFLOAD_DEBUG > 0
+            if (console_enabled > 2) {
+                __offload_funcs.dump();
+            }
+#endif // OFFLOAD_DEBUG > 0
+
+            LIBOFFLOAD_ERROR(c_receive_func_ptr, name);
+            exit(1);
+        }
+        length = strlen(name) + 1;
+    }
+    else {
+        *data = 0;
+        length = 1;
+    }
+
+    buffer_ptr += length;
+    tfr_size += length;
+}
+
+// End of the Marshaller and associated routines
+
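+// Aligned allocation used by the runtime: the alignment is raised to at
+// least the size of a pointer, memory comes from _mm_malloc, and a failed
+// allocation is reported via LIBOFFLOAD_ERROR before terminating.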
+extern void *OFFLOAD_MALLOC(
+    size_t size,
+    size_t align
+)
+{
+    void *ptr;
+    int   err;
+
+    OFFLOAD_DEBUG_TRACE(2, "%s(%lld, %lld)\n", __func__, size, align);
+
+    if (align < sizeof(void*)) {
+        align = sizeof(void*);
+    }
+
+    ptr = _mm_malloc(size, align);
+    if (ptr == NULL) {
+        LIBOFFLOAD_ERROR(c_offload_malloc, size, align);
+        exit(1);
+    }
+
+    OFFLOAD_DEBUG_TRACE(2, "%s returned %p\n", __func__, ptr);
+
+    return ptr;
+}
diff --git a/final/offload/src/offload_common.h b/final/offload/src/offload_common.h
new file mode 100644
index 0000000..11cb8bb
--- /dev/null
+++ b/final/offload/src/offload_common.h
@@ -0,0 +1,444 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/*! \file
+    \brief The parts of the runtime library common to host and target
+*/
+
+#ifndef OFFLOAD_COMMON_H_INCLUDED
+#define OFFLOAD_COMMON_H_INCLUDED
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <memory.h>
+
+#include "offload.h"
+#include "offload_table.h"
+#include "offload_trace.h"
+#include "offload_timer.h"
+#include "offload_util.h"
+#include "cean_util.h"
+#include "dv_util.h"
+#include "liboffload_error_codes.h"
+
+#include <stdarg.h>
+
+// The debug routines
+
+// Host console and file logging
+extern int console_enabled;
+extern int offload_report_level;
+
+#define OFFLOAD_DO_TRACE (offload_report_level == 3)
+
+extern const char *prefix;
+extern int offload_number;
+#if !HOST_LIBRARY
+extern int mic_index;
+#endif
+
+#if HOST_LIBRARY
+void Offload_Report_Prolog(OffloadHostTimerData* timer_data);
+void Offload_Report_Epilog(OffloadHostTimerData* timer_data);
+void offload_report_free_data(OffloadHostTimerData * timer_data);
+void Offload_Timer_Print(void);
+
+#ifndef TARGET_WINNT
+#define OFFLOAD_DEBUG_INCR_OFLD_NUM() \
+        __sync_add_and_fetch(&offload_number, 1)
+#else
+#define OFFLOAD_DEBUG_INCR_OFLD_NUM() \
+        _InterlockedIncrement(reinterpret_cast<long*>(&offload_number))
+#endif
+
+#define OFFLOAD_DEBUG_PRINT_TAG_PREFIX() \
+        printf("%s:  ", prefix);
+
+#define OFFLOAD_DEBUG_PRINT_PREFIX() \
+        printf("%s:  ", prefix);
+#else
+#define OFFLOAD_DEBUG_PRINT_PREFIX() \
+        printf("%s%d:  ", prefix, mic_index);
+#endif // HOST_LIBRARY
+
+#define OFFLOAD_TRACE(trace_level, ...)  \
+    if (console_enabled >= trace_level) { \
+        OFFLOAD_DEBUG_PRINT_PREFIX(); \
+        printf(__VA_ARGS__); \
+        fflush(NULL); \
+    }
+
+#if OFFLOAD_DEBUG > 0
+
+#define OFFLOAD_DEBUG_TRACE(level, ...) \
+    OFFLOAD_TRACE(level, __VA_ARGS__)
+
+#define OFFLOAD_REPORT(level, offload_number, stage, ...) \
+    if (OFFLOAD_DO_TRACE) { \
+        offload_stage_print(stage, offload_number, __VA_ARGS__); \
+        fflush(NULL); \
+    }
+
+#define OFFLOAD_DEBUG_TRACE_1(level, offload_number, stage, ...) \
+    if (OFFLOAD_DO_TRACE) { \
+        offload_stage_print(stage, offload_number, __VA_ARGS__); \
+        fflush(NULL); \
+    } \
+    if (!OFFLOAD_DO_TRACE) { \
+        OFFLOAD_TRACE(level, __VA_ARGS__) \
+    }
+
+#define OFFLOAD_DEBUG_DUMP_BYTES(level, a, b) \
+    __dump_bytes(level, a, b)
+
+extern void __dump_bytes(
+    int level,
+    const void *data,
+    int len
+);
+
+#else
+
+#define OFFLOAD_DEBUG_TRACE(level, ...)
+#define OFFLOAD_DEBUG_TRACE_1(level, offload_number, stage, ...)
+#define OFFLOAD_REPORT(level, offload_number, stage, ...)
+#define OFFLOAD_DEBUG_DUMP_BYTES(level, a, b)
+
+#endif
+
+// Runtime interface
+
+#define OFFLOAD_PREFIX(a) __offload_##a
+
+#define OFFLOAD_MALLOC            OFFLOAD_PREFIX(malloc)
+#define OFFLOAD_FREE(a)           _mm_free(a)
+
+// Forward functions
+
+extern void *OFFLOAD_MALLOC(size_t size, size_t align);
+
+// The Marshaller
+
+//! \enum Indicator for the type of entry on an offload item list.
+enum OffloadItemType {
+    c_data =   1,       //!< Plain data
+    c_data_ptr,         //!< Pointer data
+    c_func_ptr,         //!< Function pointer
+    c_void_ptr,         //!< void*
+    c_string_ptr,       //!< C string
+    c_dv,               //!< Dope vector variable
+    c_dv_data,          //!< Dope-vector data
+    c_dv_data_slice,    //!< Dope-vector data's slice
+    c_dv_ptr,           //!< Dope-vector variable pointer
+    c_dv_ptr_data,      //!< Dope-vector pointer data
+    c_dv_ptr_data_slice,//!< Dope-vector pointer data's slice
+    c_cean_var,         //!< CEAN variable
+    c_cean_var_ptr,     //!< Pointer to CEAN variable
+    c_data_ptr_array,   //!< Pointer to data pointer array
+    c_func_ptr_array,   //!< Pointer to function pointer array
+    c_void_ptr_array,   //!< Pointer to void* pointer array
+    c_string_ptr_array  //!< Pointer to char* pointer array
+};
+
+#define VAR_TYPE_IS_PTR(t) ((t) == c_string_ptr || \
+                            (t) == c_data_ptr || \
+                            (t) == c_cean_var_ptr || \
+                            (t) == c_dv_ptr)
+
+#define VAR_TYPE_IS_SCALAR(t) ((t) == c_data || \
+                               (t) == c_void_ptr || \
+                               (t) == c_cean_var || \
+                               (t) == c_dv)
+
+#define VAR_TYPE_IS_DV_DATA(t) ((t) == c_dv_data || \
+                                (t) == c_dv_ptr_data)
+
+#define VAR_TYPE_IS_DV_DATA_SLICE(t) ((t) == c_dv_data_slice || \
+                                      (t) == c_dv_ptr_data_slice)
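+// Illustrative only, not part of the original header: a hedged sketch of
+// how a runtime might use these predicates to branch on an item's kind,
+// assuming a descriptor "vd" of the VarDesc type defined below:
+//
+//     if (VAR_TYPE_IS_PTR(vd.type.src)) {
+//         // source is referenced through a pointer (c_data_ptr, ...)
+//     }
+//     else if (VAR_TYPE_IS_SCALAR(vd.type.src)) {
+//         // source is transferred by value (c_data, c_void_ptr, ...)
+//     }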
+
+
+//! \enum Specify direction to copy offloaded variable.
+enum OffloadParameterType {
+    c_parameter_unknown = -1, //!< Unknown clause
+    c_parameter_nocopy,       //!< Variable listed in "nocopy" clause
+    c_parameter_in,           //!< Variable listed in "in" clause
+    c_parameter_out,          //!< Variable listed in "out" clause
+    c_parameter_inout         //!< Variable listed in "inout" clause
+};
+
+//! An Offload Variable descriptor
+struct VarDesc {
+    //! OffloadItemTypes of source and destination
+    union {
+        struct {
+            uint8_t dst : 4; //!< OffloadItemType of destination
+            uint8_t src : 4; //!< OffloadItemType of source
+        };
+        uint8_t bits;
+    } type;
+
+    //! OffloadParameterType that describes direction of data transfer
+    union {
+        struct {
+            uint8_t in  : 1; //!< Set if IN or INOUT
+            uint8_t out : 1; //!< Set if OUT or INOUT
+        };
+        uint8_t bits;
+    } direction;
+
+    uint8_t alloc_if;        //!< alloc_if modifier value
+    uint8_t free_if;         //!< free_if modifier value
+    uint32_t align;          //!< MIC alignment requested for pointer data
+    //! Not used by compiler; set to 0
+    /*! Used by runtime as offset to data from start of MIC buffer */
+    uint32_t mic_offset;
+    //! Flags describing this variable
+    union {
+        struct {
+            //! source variable has persistent storage
+            uint32_t is_static : 1;
+            //! destination variable has persistent storage
+            uint32_t is_static_dstn : 1;
+            //! has length for c_dv && c_dv_ptr
+            uint32_t has_length : 1;
+            //! persisted local scalar is in stack buffer
+            uint32_t is_stack_buf : 1;
+            //! buffer address is sent in data
+            uint32_t sink_addr : 1;
+            //! alloc displacement is sent in data
+            uint32_t alloc_disp : 1;
+            //! source data is noncontiguous
+            uint32_t is_noncont_src : 1;
+            //! destination data is noncontiguous
+            uint32_t is_noncont_dst : 1;
+        };
+        uint32_t bits;
+    } flags;
+    //! Not used by compiler; set to 0
+    /*! Used by runtime as offset to base from data stored in a buffer */
+    int64_t offset;
+    //! Element byte-size of data to be transferred
+    /*! For dope-vector, the size of the dope-vector      */
+    int64_t size;
+    union {
+        //! Set to 0 for array expressions and dope-vectors
+        /*! Set to 1 for scalars                          */
+        /*! Set to value of length modifier for pointers  */
+        int64_t count;
+        //! Displacement not used by compiler
+        int64_t disp;
+    };
+
+    //! This field not used by OpenMP 4.0
+    /*! The alloc section expression in #pragma offload   */
+    union {
+       void *alloc;
+       int64_t ptr_arr_offset;
+    };
+
+    //! This field not used by OpenMP 4.0
+    /*! The into section expression in #pragma offload    */
+    /*! For c_data_ptr_array this is the into ptr array   */
+    void *into;
+
+    //! For an ordinary variable, address of the variable
+    /*! For c_cean_var (C/C++ array expression),
+        pointer to arr_desc, which is an array descriptor. */
+    /*! For c_data_ptr_array (array of data pointers),
+        pointer to ptr_array_descriptor,
+        which is a descriptor for pointer array transfers. */
+    void *ptr;
+};
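+// Illustrative only: a minimal, hypothetical sketch of how a compiler
+// might fill a VarDesc for a scalar "int x" listed in an "in" clause;
+// no actual compiler output is claimed here:
+//
+//     VarDesc vd = {};
+//     vd.type.src = c_data;     // plain data on the source side
+//     vd.type.dst = c_data;     // plain data on the destination side
+//     vd.direction.in  = 1;     // "in": host -> target only
+//     vd.direction.out = 0;
+//     vd.size  = sizeof(int);   // element byte-size
+//     vd.count = 1;             // scalars use count == 1
+//     vd.ptr   = &x;            // address of the variable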
+
+//! Auxiliary struct used when -g is enabled that holds variable names
+struct VarDesc2 {
+    const char *sname; //!< Source name
+    const char *dname; //!< Destination name (when "into" is used)
+};
+
+/*! When the OffloadItemType is c_data_ptr_array
+    the ptr field of the main descriptor points to this struct.          */
+/*! The type in VarDesc1 merely says c_cean_data_ptr, but the pointer
+    type can be c_data_ptr, c_func_ptr, c_void_ptr, or c_string_ptr.
+    Therefore the actual pointer type is in the flags field of VarDesc3. */
+/*! If flag_align_is_array/flag_alloc_if_is_array/flag_free_if_is_array
+    is 0 then alignment/alloc_if/free_if are specified in VarDesc1.      */
+/*! If flag_align_is_array/flag_alloc_if_is_array/flag_free_if_is_array
+    is 1 then align_array/alloc_if_array/free_if_array specify
+    the set of alignment/alloc_if/free_if values.                        */
+/*! For the other fields, if neither the scalar nor the array flag
+    is set, then that modifier was not specified. If the bits are set
+    they specify which modifier was set and whether it was a
+    scalar or an array expression.                                       */
+struct VarDesc3
+{
+    void *ptr_array;        //!< Pointer to arr_desc of array of pointers
+    void *align_array;      //!< Scalar value or pointer to arr_desc
+    void *alloc_if_array;   //!< Scalar value or pointer to arr_desc
+    void *free_if_array;    //!< Scalar value or pointer to arr_desc
+    void *extent_start;     //!< Scalar value or pointer to arr_desc
+    void *extent_elements;  //!< Scalar value or pointer to arr_desc
+    void *into_start;       //!< Scalar value or pointer to arr_desc
+    void *into_elements;    //!< Scalar value or pointer to arr_desc
+    void *alloc_start;      //!< Scalar value or pointer to arr_desc
+    void *alloc_elements;   //!< Scalar value or pointer to arr_desc
+    /*! Flags that describe the pointer type and whether each field
+        is a scalar value or an array expression.        */
+    /*! First 6 bits are pointer array element type:
+        c_data_ptr, c_func_ptr, c_void_ptr, c_string_ptr */
+    /*! Then single bits specify:                        */
+    /*!     align_array is an array                      */
+    /*!     alloc_if_array is an array                   */
+    /*!     free_if_array is an array                    */
+    /*!     extent_start is a scalar expression          */
+    /*!     extent_start is an array expression          */
+    /*!     extent_elements is a scalar expression       */
+    /*!     extent_elements is an array expression       */
+    /*!     into_start is a scalar expression            */
+    /*!     into_start is an array expression            */
+    /*!     into_elements is a scalar expression         */
+    /*!     into_elements is an array expression         */
+    /*!     alloc_start is a scalar expression           */
+    /*!     alloc_start is an array expression           */
+    /*!     alloc_elements is a scalar expression        */
+    /*!     alloc_elements is an array expression        */
+    uint32_t array_fields;
+};
+const int flag_align_is_array = 6;
+const int flag_alloc_if_is_array = 7;
+const int flag_free_if_is_array = 8;
+const int flag_extent_start_is_scalar = 9;
+const int flag_extent_start_is_array = 10;
+const int flag_extent_elements_is_scalar = 11;
+const int flag_extent_elements_is_array = 12;
+const int flag_into_start_is_scalar = 13;
+const int flag_into_start_is_array = 14;
+const int flag_into_elements_is_scalar = 15;
+const int flag_into_elements_is_array = 16;
+const int flag_alloc_start_is_scalar = 17;
+const int flag_alloc_start_is_array = 18;
+const int flag_alloc_elements_is_scalar = 19;
+const int flag_alloc_elements_is_array = 20;
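+// Illustrative only: the flag_* constants above are bit positions within
+// VarDesc3::array_fields. A hedged sketch of decoding them, assuming a
+// descriptor "vd3" of type VarDesc3:
+//
+//     // low 6 bits: element type of the pointer array (an OffloadItemType)
+//     int elem_type = vd3.array_fields & 0x3f;
+//     // remaining bits are tested by position
+//     bool align_is_array =
+//         (vd3.array_fields >> flag_align_is_array) & 1;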
+
+// The Marshaller
+class Marshaller
+{
+private:
+    // Start address of buffer
+    char *buffer_start;
+
+    // Current pointer within buffer
+    char *buffer_ptr;
+
+    // Physical size of data sent (including flags)
+    long long buffer_size;
+
+    // User data sent/received
+    long long tfr_size;
+
+public:
+    // Constructor
+    Marshaller() :
+        buffer_start(0), buffer_ptr(0),
+        buffer_size(0), tfr_size(0)
+    {
+    }
+
+    // Return count of user data sent/received
+    long long get_tfr_size() const
+    {
+        return tfr_size;
+    }
+
+    // Return pointer to buffer
+    char *get_buffer_start() const
+    {
+        return buffer_start;
+    }
+
+    // Return current size of data in buffer
+    long long get_buffer_size() const
+    {
+        return buffer_size;
+    }
+
+    // Set buffer pointer
+    void init_buffer(
+        char *d,
+        long long s
+    )
+    {
+        buffer_start = buffer_ptr = d;
+        buffer_size = s;
+    }
+
+    // Send data
+    void send_data(
+        const void *data,
+        int64_t length
+    );
+
+    // Receive data
+    void receive_data(
+        void *data,
+        int64_t length
+    );
+
+    // Send function pointer
+    void send_func_ptr(
+        const void* data
+    );
+
+    // Receive function pointer
+    void receive_func_ptr(
+        const void** data
+    );
+};
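+// Illustrative only: a hedged usage sketch; send_data/receive_data are
+// implemented elsewhere in this library. The intended pattern is to attach
+// the marshaller to a raw transfer buffer and append or extract items in
+// the same order on both sides:
+//
+//     char buf[256];
+//     Marshaller m;
+//     m.init_buffer(buf, sizeof(buf)); // attach the transfer buffer
+//     int64_t n = 42;
+//     m.send_data(&n, sizeof(n));      // append n at the current pointer
+//     // the receiving side replays the same sequence:
+//     // m.receive_data(&n, sizeof(n));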
+
+// End of the Marshaller
+
+// The offloaded function descriptor.
+// Sent from host to target to specify which function to run.
+// Also, sets console and file tracing levels.
+struct FunctionDescriptor
+{
+    // Input data size.
+    long long in_datalen;
+
+    // Output data size.
+    long long out_datalen;
+
+    // Whether trace is requested on console.
+    // A value of 1 produces only function name and data sent/received.
+    // Values > 1 produce copious trace information.
+    uint8_t console_enabled;
+
+    // Flag controlling timing on the target side.
+    // Values > 0 enable timing on sink.
+    uint8_t timer_enabled;
+
+    int offload_report_level;
+    int offload_number;
+
+    // number of variable descriptors
+    int vars_num;
+
+    // inout data offset if data is passed as misc/return data
+    // otherwise it should be zero.
+    int data_offset;
+
+    // The name of the offloaded function
+    char data[];
+};
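+// Illustrative only: "data" is a flexible array member (a common compiler
+// extension in C++) holding the name of the offloaded function. A hedged
+// sketch of sizing such a descriptor for a hypothetical name "fname":
+//
+//     size_t size = sizeof(FunctionDescriptor) + strlen(fname) + 1;
+//     FunctionDescriptor *desc = (FunctionDescriptor*) malloc(size);
+//     strcpy(desc->data, fname);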
+
+// OFFLOAD is a pointer to an OffloadDescriptor.
+typedef struct OffloadDescriptor *OFFLOAD;
+
+#endif // OFFLOAD_COMMON_H_INCLUDED
diff --git a/final/offload/src/offload_engine.cpp b/final/offload/src/offload_engine.cpp
new file mode 100644
index 0000000..069b604
--- /dev/null
+++ b/final/offload/src/offload_engine.cpp
@@ -0,0 +1,531 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_engine.h"
+#include <signal.h>
+#include <errno.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "offload_host.h"
+#include "offload_table.h"
+
+const char* Engine::m_func_names[Engine::c_funcs_total] =
+{
+    "server_compute",
+#ifdef MYO_SUPPORT
+    "server_myoinit",
+    "server_myofini",
+#endif // MYO_SUPPORT
+    "server_init",
+    "server_var_table_size",
+    "server_var_table_copy"
+};
+
+// Symbolic representation of system signals. Fix for CQ233593
+const char* Engine::c_signal_names[Engine::c_signal_max] =
+{
+    "Unknown SIGNAL",
+    "SIGHUP",    /*  1, Hangup (POSIX).  */
+    "SIGINT",    /*  2, Interrupt (ANSI).  */
+    "SIGQUIT",   /*  3, Quit (POSIX).  */
+    "SIGILL",    /*  4, Illegal instruction (ANSI).  */
+    "SIGTRAP",   /*  5, Trace trap (POSIX).  */
+    "SIGABRT",   /*  6, Abort (ANSI).  */
+    "SIGBUS",    /*  7, BUS error (4.2 BSD).  */
+    "SIGFPE",    /*  8, Floating-point exception (ANSI).  */
+    "SIGKILL",   /*  9, Kill, unblockable (POSIX).  */
+    "SIGUSR1",   /* 10, User-defined signal 1 (POSIX).  */
+    "SIGSEGV",   /* 11, Segmentation violation (ANSI).  */
+    "SIGUSR2",   /* 12, User-defined signal 2 (POSIX).  */
+    "SIGPIPE",   /* 13, Broken pipe (POSIX).  */
+    "SIGALRM",   /* 14, Alarm clock (POSIX).  */
+    "SIGTERM",   /* 15, Termination (ANSI).  */
+    "SIGSTKFLT", /* 16, Stack fault.  */
+    "SIGCHLD",   /* 17, Child status has changed (POSIX).  */
+    "SIGCONT",   /* 18, Continue (POSIX).  */
+    "SIGSTOP",   /* 19, Stop, unblockable (POSIX).  */
+    "SIGTSTP",   /* 20, Keyboard stop (POSIX).  */
+    "SIGTTIN",   /* 21, Background read from tty (POSIX).  */
+    "SIGTTOU",   /* 22, Background write to tty (POSIX).  */
+    "SIGURG",    /* 23, Urgent condition on socket (4.2 BSD).  */
+    "SIGXCPU",   /* 24, CPU limit exceeded (4.2 BSD).  */
+    "SIGXFSZ",   /* 25, File size limit exceeded (4.2 BSD).  */
+    "SIGVTALRM", /* 26, Virtual alarm clock (4.2 BSD).  */
+    "SIGPROF",   /* 27, Profiling alarm clock (4.2 BSD).  */
+    "SIGWINCH",  /* 28, Window size change (4.3 BSD, Sun).  */
+    "SIGIO",     /* 29, I/O now possible (4.2 BSD).  */
+    "SIGPWR",    /* 30, Power failure restart (System V).  */
+    "SIGSYS"     /* 31, Bad system call.  */
+};
+
+void Engine::init(void)
+{
+    if (!m_ready) {
+        mutex_locker_t locker(m_lock);
+
+        if (!m_ready) {
+            // start process if not done yet
+            if (m_process == 0) {
+                init_process();
+            }
+
+            // load pending images
+            load_libraries();
+
+            // and (re)build pointer table
+            init_ptr_data();
+
+            // it is ready now
+            m_ready = true;
+        }
+    }
+}
+
+void Engine::init_process(void)
+{
+    COIENGINE engine;
+    COIRESULT res;
+    const char **environ;
+
+    // create environment for the target process
+    environ = (const char**) mic_env_vars.create_environ_for_card(m_index);
+    if (environ != 0) {
+        for (const char **p = environ; *p != 0; p++) {
+            OFFLOAD_DEBUG_TRACE(3, "Env Var for card %d: %s\n", m_index, *p);
+        }
+    }
+
+    // Create execution context in the specified device
+    OFFLOAD_DEBUG_TRACE(2, "Getting device %d (engine %d) handle\n", m_index,
+                        m_physical_index);
+    res = COI::EngineGetHandle(COI_ISA_KNC, m_physical_index, &engine);
+    check_result(res, c_get_engine_handle, m_index, res);
+
+    // The target executable should be available by the time we
+    // attempt to initialize the device
+    if (__target_exe == 0) {
+        LIBOFFLOAD_ERROR(c_no_target_exe);
+        exit(1);
+    }
+
+    OFFLOAD_DEBUG_TRACE(2,
+        "Loading target executable \"%s\" from %p, size %lld\n",
+        __target_exe->name, __target_exe->data, __target_exe->size);
+
+    res = COI::ProcessCreateFromMemory(
+        engine,                 // in_Engine
+        __target_exe->name,     // in_pBinaryName
+        __target_exe->data,     // in_pBinaryBuffer
+        __target_exe->size,     // in_BinaryBufferLength,
+        0,                      // in_Argc
+        0,                      // in_ppArgv
+        environ == 0,           // in_DupEnv
+        environ,                // in_ppAdditionalEnv
+        mic_proxy_io,           // in_ProxyActive
+        mic_proxy_fs_root,      // in_ProxyfsRoot
+        mic_buffer_size,        // in_BufferSpace
+        mic_library_path,       // in_LibrarySearchPath
+        __target_exe->origin,   // in_FileOfOrigin
+        __target_exe->offset,   // in_FileOfOriginOffset
+        &m_process              // out_pProcess
+    );
+    check_result(res, c_process_create, m_index, res);
+
+    // get function handles
+    res = COI::ProcessGetFunctionHandles(m_process, c_funcs_total,
+                                         m_func_names, m_funcs);
+    check_result(res, c_process_get_func_handles, m_index, res);
+
+    // initialize device side
+    pid_t pid = init_device();
+
+    // For IDB
+    if (__dbg_is_attached) {
+        // TODO: we have an in-memory executable now.
+        // Check with the IDB team what we should provide them now.
+        if (strlen(__target_exe->name) < MAX_TARGET_NAME) {
+            strcpy(__dbg_target_exe_name, __target_exe->name);
+        }
+        __dbg_target_so_pid = pid;
+        __dbg_target_id = m_physical_index;
+        __dbg_target_so_loaded();
+    }
+}
+
+void Engine::fini_process(bool verbose)
+{
+    if (m_process != 0) {
+        uint32_t sig;
+        int8_t ret;
+
+        // destroy target process
+        OFFLOAD_DEBUG_TRACE(2, "Destroying process on the device %d\n",
+                            m_index);
+
+        COIRESULT res = COI::ProcessDestroy(m_process, -1, 0, &ret, &sig);
+        m_process = 0;
+
+        if (res == COI_SUCCESS) {
+            OFFLOAD_DEBUG_TRACE(3, "Device process: signal %d, exit code %d\n",
+                                sig, ret);
+            if (verbose) {
+                if (sig != 0) {
+                    LIBOFFLOAD_ERROR(
+                        c_mic_process_exit_sig, m_index, sig,
+                        c_signal_names[sig >= c_signal_max ? 0 : sig]);
+                }
+                else {
+                    LIBOFFLOAD_ERROR(c_mic_process_exit_ret, m_index, ret);
+                }
+            }
+
+            // for idb
+            if (__dbg_is_attached) {
+                __dbg_target_so_unloaded();
+            }
+        }
+        else {
+            if (verbose) {
+                LIBOFFLOAD_ERROR(c_mic_process_exit, m_index);
+            }
+        }
+    }
+}
+
+void Engine::load_libraries()
+{
+    // load libraries collected so far
+    for (TargetImageList::iterator it = m_images.begin();
+         it != m_images.end(); it++) {
+        OFFLOAD_DEBUG_TRACE(2, "Loading library \"%s\" from %p, size %llu\n",
+                            it->name, it->data, it->size);
+
+        // load library to the device
+        COILIBRARY lib;
+        COIRESULT res;
+        res = COI::ProcessLoadLibraryFromMemory(m_process,
+                                                it->data,
+                                                it->size,
+                                                it->name,
+                                                mic_library_path,
+                                                it->origin,
+                                                it->offset,
+                                                COI_LOADLIBRARY_V1_FLAGS,
+                                                &lib);
+
+        if (res != COI_SUCCESS && res != COI_ALREADY_EXISTS) {
+            check_result(res, c_load_library, m_index, res);
+        }
+    }
+    m_images.clear();
+}
+
+static bool target_entry_cmp(
+    const VarList::BufEntry &l,
+    const VarList::BufEntry &r
+)
+{
+    const char *l_name = reinterpret_cast<const char*>(l.name);
+    const char *r_name = reinterpret_cast<const char*>(r.name);
+    return strcmp(l_name, r_name) < 0;
+}
+
+static bool host_entry_cmp(
+    const VarTable::Entry *l,
+    const VarTable::Entry *r
+)
+{
+    return strcmp(l->name, r->name) < 0;
+}
+
+void Engine::init_ptr_data(void)
+{
+    COIRESULT res;
+    COIEVENT event;
+
+    // Prepare table of host entries
+    std::vector<const VarTable::Entry*> host_table(__offload_vars.begin(),
+                                                   __offload_vars.end());
+
+    // no need to do anything further if the host table is empty
+    if (host_table.empty()) {
+        return;
+    }
+
+    // Get var table entries from the target.
+    // First we need to get size for the buffer to copy data
+    struct {
+        int64_t nelems;
+        int64_t length;
+    } params;
+
+    res = COI::PipelineRunFunction(get_pipeline(),
+                                   m_funcs[c_func_var_table_size],
+                                   0, 0, 0,
+                                   0, 0,
+                                   0, 0,
+                                   &params, sizeof(params),
+                                   &event);
+    check_result(res, c_pipeline_run_func, m_index, res);
+
+    res = COI::EventWait(1, &event, -1, 1, 0, 0);
+    check_result(res, c_event_wait, res);
+
+    if (params.length == 0) {
+        return;
+    }
+
+    // create buffer for target entries and copy data to host
+    COIBUFFER buffer;
+    res = COI::BufferCreate(params.length, COI_BUFFER_NORMAL, 0, 0, 1,
+                            &m_process, &buffer);
+    check_result(res, c_buf_create, m_index, res);
+
+    COI_ACCESS_FLAGS flags = COI_SINK_WRITE;
+    res = COI::PipelineRunFunction(get_pipeline(),
+                                   m_funcs[c_func_var_table_copy],
+                                   1, &buffer, &flags,
+                                   0, 0,
+                                   &params.nelems, sizeof(params.nelems),
+                                   0, 0,
+                                   &event);
+    check_result(res, c_pipeline_run_func, m_index, res);
+
+    res = COI::EventWait(1, &event, -1, 1, 0, 0);
+    check_result(res, c_event_wait, res);
+
+    // patch names in target data
+    VarList::BufEntry *target_table;
+    COIMAPINSTANCE map_inst;
+    res = COI::BufferMap(buffer, 0, params.length, COI_MAP_READ_ONLY, 0, 0,
+                         0, &map_inst,
+                         reinterpret_cast<void**>(&target_table));
+    check_result(res, c_buf_map, res);
+
+    VarList::table_patch_names(target_table, params.nelems);
+
+    // and sort entries
+    std::sort(target_table, target_table + params.nelems, target_entry_cmp);
+    std::sort(host_table.begin(), host_table.end(), host_entry_cmp);
+
+    // merge host and target entries and enter matching vars map
+    std::vector<const VarTable::Entry*>::const_iterator hi =
+        host_table.begin();
+    std::vector<const VarTable::Entry*>::const_iterator he =
+        host_table.end();
+    const VarList::BufEntry *ti = target_table;
+    const VarList::BufEntry *te = target_table + params.nelems;
+
+    while (hi != he && ti != te) {
+        int res = strcmp((*hi)->name, reinterpret_cast<const char*>(ti->name));
+        if (res == 0) {
+            // add matching entry to var map
+            std::pair<PtrSet::iterator, bool> res =
+                m_ptr_set.insert(PtrData((*hi)->addr, (*hi)->size));
+
+            // store address for new entries
+            if (res.second) {
+                PtrData *ptr = const_cast<PtrData*>(res.first.operator->());
+                ptr->mic_addr = ti->addr;
+                ptr->is_static = true;
+            }
+
+            hi++;
+            ti++;
+        }
+        else if (res < 0) {
+            hi++;
+        }
+        else {
+            ti++;
+        }
+    }
+
+    // cleanup
+    res = COI::BufferUnmap(map_inst, 0, 0, 0);
+    check_result(res, c_buf_unmap, res);
+
+    res = COI::BufferDestroy(buffer);
+    check_result(res, c_buf_destroy, res);
+}
+
+COIRESULT Engine::compute(
+    const std::list<COIBUFFER> &buffers,
+    const void*         data,
+    uint16_t            data_size,
+    void*               ret,
+    uint16_t            ret_size,
+    uint32_t            num_deps,
+    const COIEVENT*     deps,
+    COIEVENT*           event
+) /* const */
+{
+    COIBUFFER *bufs;
+    COI_ACCESS_FLAGS *flags;
+    COIRESULT res;
+
+    // convert buffers list to array
+    int num_bufs = buffers.size();
+    if (num_bufs > 0) {
+        bufs = (COIBUFFER*) alloca(num_bufs * sizeof(COIBUFFER));
+        flags = (COI_ACCESS_FLAGS*) alloca(num_bufs *
+                                           sizeof(COI_ACCESS_FLAGS));
+
+        int i = 0;
+        for (std::list<COIBUFFER>::const_iterator it = buffers.begin();
+             it != buffers.end(); it++) {
+            bufs[i] = *it;
+
+            // TODO: this should be fixed
+            flags[i++] = COI_SINK_WRITE;
+        }
+    }
+    else {
+        bufs = 0;
+        flags = 0;
+    }
+
+    // start computation
+    res = COI::PipelineRunFunction(get_pipeline(),
+                                   m_funcs[c_func_compute],
+                                   num_bufs, bufs, flags,
+                                   num_deps, deps,
+                                   data, data_size,
+                                   ret, ret_size,
+                                   event);
+    return res;
+}
+
+pid_t Engine::init_device(void)
+{
+    struct init_data {
+        int  device_index;
+        int  devices_total;
+        int  console_level;
+        int  offload_report_level;
+    } data;
+    COIRESULT res;
+    COIEVENT event;
+    pid_t pid;
+
+    OFFLOAD_DEBUG_TRACE_1(2, 0, c_offload_init,
+                          "Initializing device with logical index %d "
+                          "and physical index %d\n",
+                           m_index, m_physical_index);
+
+    // setup misc data
+    data.device_index = m_index;
+    data.devices_total = mic_engines_total;
+    data.console_level = console_enabled;
+    data.offload_report_level = offload_report_level;
+
+    res = COI::PipelineRunFunction(get_pipeline(),
+                                   m_funcs[c_func_init],
+                                   0, 0, 0, 0, 0,
+                                   &data, sizeof(data),
+                                   &pid, sizeof(pid),
+                                   &event);
+    check_result(res, c_pipeline_run_func, m_index, res);
+
+    res = COI::EventWait(1, &event, -1, 1, 0, 0);
+    check_result(res, c_event_wait, res);
+
+    OFFLOAD_DEBUG_TRACE(2, "Device process pid is %d\n", pid);
+
+    return pid;
+}
+
+// data associated with each thread
+struct Thread {
+    Thread(long* addr_coipipe_counter) {
+        m_addr_coipipe_counter = addr_coipipe_counter;
+        memset(m_pipelines, 0, sizeof(m_pipelines));
+    }
+
+    ~Thread() {
+#ifndef TARGET_WINNT
+        __sync_sub_and_fetch(m_addr_coipipe_counter, 1);
+#else // TARGET_WINNT
+        _InterlockedDecrement(m_addr_coipipe_counter);
+#endif // TARGET_WINNT
+        for (int i = 0; i < mic_engines_total; i++) {
+            if (m_pipelines[i] != 0) {
+                COI::PipelineDestroy(m_pipelines[i]);
+            }
+        }
+    }
+
+    COIPIPELINE get_pipeline(int index) const {
+        return m_pipelines[index];
+    }
+
+    void set_pipeline(int index, COIPIPELINE pipeline) {
+        m_pipelines[index] = pipeline;
+    }
+
+    AutoSet& get_auto_vars() {
+        return m_auto_vars;
+    }
+
+private:
+    long*       m_addr_coipipe_counter;
+    AutoSet     m_auto_vars;
+    COIPIPELINE m_pipelines[MIC_ENGINES_MAX];
+};
+
+COIPIPELINE Engine::get_pipeline(void)
+{
+    Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
+    if (thread == 0) {
+        thread = new Thread(&m_proc_number);
+        thread_setspecific(mic_thread_key, thread);
+    }
+
+    COIPIPELINE pipeline = thread->get_pipeline(m_index);
+    if (pipeline == 0) {
+        COIRESULT res;
+        int proc_num;
+
+#ifndef TARGET_WINNT
+        proc_num = __sync_fetch_and_add(&m_proc_number, 1);
+#else // TARGET_WINNT
+        proc_num = _InterlockedIncrement(&m_proc_number);
+#endif // TARGET_WINNT
+
+        if (proc_num > COI_PIPELINE_MAX_PIPELINES) {
+            LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES);
+            LIBOFFLOAD_ABORT;
+        }
+        // create pipeline for this thread
+        res = COI::PipelineCreate(m_process, 0, mic_stack_size, &pipeline);
+        check_result(res, c_pipeline_create, m_index, res);
+
+        thread->set_pipeline(m_index, pipeline);
+    }
+    return pipeline;
+}
+
+AutoSet& Engine::get_auto_vars(void)
+{
+    Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
+    if (thread == 0) {
+        thread = new Thread(&m_proc_number);
+        thread_setspecific(mic_thread_key, thread);
+    }
+
+    return thread->get_auto_vars();
+}
+
+void Engine::destroy_thread_data(void *data)
+{
+    delete static_cast<Thread*>(data);
+}
diff --git a/final/offload/src/offload_engine.h b/final/offload/src/offload_engine.h
new file mode 100644
index 0000000..d1a9631
--- /dev/null
+++ b/final/offload/src/offload_engine.h
@@ -0,0 +1,482 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef OFFLOAD_ENGINE_H_INCLUDED
+#define OFFLOAD_ENGINE_H_INCLUDED
+
+#include <limits.h>
+
+#include <list>
+#include <set>
+#include <map>
+#include "offload_common.h"
+#include "coi/coi_client.h"
+
+// Address range
+class MemRange {
+public:
+    MemRange() : m_start(0), m_length(0) {}
+    MemRange(const void *addr, uint64_t len) : m_start(addr), m_length(len) {}
+
+    const void* start() const {
+        return m_start;
+    }
+
+    const void* end() const {
+        return static_cast<const char*>(m_start) + m_length;
+    }
+
+    uint64_t length() const {
+        return m_length;
+    }
+
+    // returns true if this range overlaps the given one
+    bool overlaps(const MemRange &o) const {
+        // Two address ranges A[start, end) and B[start,end) overlap
+        // if A.start < B.end and A.end > B.start.
+        return start() < o.end() && end() > o.start();
+    }
+
+    // returns true if this range contains the given one
+    bool contains(const MemRange &o) const {
+        return start() <= o.start() && o.end() <= end();
+    }
+
+private:
+    const void* m_start;
+    uint64_t    m_length;
+};
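+// Illustrative only: the ranges are half-open, [start, end), so ranges
+// that merely touch do not overlap. A hedged example:
+//
+//     MemRange a((void*) 0x1000, 0x100);  // [0x1000, 0x1100)
+//     MemRange b((void*) 0x1100, 0x100);  // [0x1100, 0x1200)
+//     // a.overlaps(b) == false: they only touch at 0x1100
+//     // a.contains(MemRange((void*) 0x1000, 0x10)) == true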
+
+// Data associated with a pointer variable
+class PtrData {
+public:
+    PtrData(const void *addr, uint64_t len) :
+        cpu_addr(addr, len), cpu_buf(0),
+        mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0),
+        ref_count(0), is_static(false)
+    {}
+
+    //
+    // Copy constructor
+    //
+    PtrData(const PtrData& ptr):
+        cpu_addr(ptr.cpu_addr), cpu_buf(ptr.cpu_buf),
+        mic_addr(ptr.mic_addr), alloc_disp(ptr.alloc_disp),
+        mic_buf(ptr.mic_buf), mic_offset(ptr.mic_offset),
+        ref_count(ptr.ref_count), is_static(ptr.is_static)
+    {}
+
+    bool operator<(const PtrData &o) const {
+        // Variables are sorted by the CPU start address.
+        // Overlapping memory ranges are considered equal.
+        return (cpu_addr.start() < o.cpu_addr.start()) &&
+               !cpu_addr.overlaps(o.cpu_addr);
+    }
+
+    long add_reference() {
+        if (is_static) {
+            return LONG_MAX;
+        }
+#ifndef TARGET_WINNT
+        return __sync_fetch_and_add(&ref_count, 1);
+#else // TARGET_WINNT
+        return _InterlockedIncrement(&ref_count) - 1;
+#endif // TARGET_WINNT
+    }
+
+    long remove_reference() {
+        if (is_static) {
+            return LONG_MAX;
+        }
+#ifndef TARGET_WINNT
+        return __sync_sub_and_fetch(&ref_count, 1);
+#else // TARGET_WINNT
+        return _InterlockedDecrement(&ref_count);
+#endif // TARGET_WINNT
+    }
+
+    long get_reference() const {
+        if (is_static) {
+            return LONG_MAX;
+        }
+        return ref_count;
+    }
+
+public:
+    // CPU address range
+    const MemRange  cpu_addr;
+
+    // CPU and MIC buffers
+    COIBUFFER       cpu_buf;
+    COIBUFFER       mic_buf;
+
+    // placeholder for buffer address on mic
+    uint64_t        mic_addr;
+
+    uint64_t        alloc_disp;
+
+    // additional offset to pointer data on MIC for improving bandwidth for
+    // data which is not 4K aligned
+    uint32_t        mic_offset;
+
+    // if true buffers are created from static memory
+    bool            is_static;
+    mutex_t         alloc_ptr_data_lock;
+
+private:
+    // reference count for the entry
+    long            ref_count;
+};
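+// Illustrative only: because operator< treats overlapping ranges as
+// equivalent, a std::set<PtrData> lookup keyed by a zero-length range
+// finds any tracked range covering that address. A hedged sketch of the
+// idiom used by Engine::find_ptr_data later in this header, with a
+// hypothetical pointer "base":
+//
+//     std::set<PtrData> ptrs;
+//     ptrs.insert(PtrData(base, 4096));           // track [base, base+4096)
+//     // a zero-length probe inside the range compares equivalent to the
+//     // stored entry, so set::find locates it:
+//     ptrs.find(PtrData((char*) base + 100, 0));  // returns the entry above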
+
+typedef std::list<PtrData*> PtrDataList;
+
+// Data associated with automatic variable
+class AutoData {
+public:
+    AutoData(const void *addr, uint64_t len) :
+        cpu_addr(addr, len), ref_count(0)
+    {}
+
+    bool operator<(const AutoData &o) const {
+        // Variables are sorted by the CPU start address.
+        // Overlapping memory ranges are considered equal.
+        return (cpu_addr.start() < o.cpu_addr.start()) &&
+               !cpu_addr.overlaps(o.cpu_addr);
+    }
+
+    long add_reference() {
+#ifndef TARGET_WINNT
+        return __sync_fetch_and_add(&ref_count, 1);
+#else // TARGET_WINNT
+        return _InterlockedIncrement(&ref_count) - 1;
+#endif // TARGET_WINNT
+    }
+
+    long remove_reference() {
+#ifndef TARGET_WINNT
+        return __sync_sub_and_fetch(&ref_count, 1);
+#else // TARGET_WINNT
+        return _InterlockedDecrement(&ref_count);
+#endif // TARGET_WINNT
+    }
+
+    long get_reference() const {
+        return ref_count;
+    }
+
+public:
+    // CPU address range
+    const MemRange cpu_addr;
+
+private:
+    // reference count for the entry
+    long ref_count;
+};
+
+// Set of automatic variables
+typedef std::set<AutoData> AutoSet;
+
+// Target image data
+struct TargetImage
+{
+    TargetImage(const char *_name, const void *_data, uint64_t _size,
+                const char *_origin, uint64_t _offset) :
+        name(_name), data(_data), size(_size),
+        origin(_origin), offset(_offset)
+    {}
+
+    // library name
+    const char* name;
+
+    // contents and size
+    const void* data;
+    uint64_t    size;
+
+    // file of origin and offset within that file
+    const char* origin;
+    uint64_t    offset;
+};
+
+typedef std::list<TargetImage> TargetImageList;
+
+// Data associated with persistent auto objects
+struct PersistData
+{
+    PersistData(const void *addr, uint64_t routine_num, uint64_t size) :
+        stack_cpu_addr(addr), routine_id(routine_num)
+    {
+        stack_ptr_data = new PtrData(0, size);
+    }
+    // 1-st key value - beginning of the stack on the CPU
+    const void *   stack_cpu_addr;
+    // 2-nd key value - identifier of the routine invocation on the CPU
+    uint64_t   routine_id;
+    // corresponding PtrData; only stack_ptr_data->mic_buf is used
+    PtrData * stack_ptr_data;
+    // used to get the offset of the variable in the stack buffer
+    char * cpu_stack_addr;
+};
+
+typedef std::list<PersistData> PersistDataList;
+
+// class representing a single engine
+struct Engine {
+    friend void __offload_init_library_once(void);
+    friend void __offload_fini_library(void);
+
+#define check_result(res, tag, ...) \
+    { \
+        if (res == COI_PROCESS_DIED) { \
+            fini_process(true); \
+            exit(1); \
+        } \
+        if (res != COI_SUCCESS) { \
+            __liboffload_error_support(tag, __VA_ARGS__); \
+            exit(1); \
+        } \
+    }
+
+    int get_logical_index() const {
+        return m_index;
+    }
+
+    int get_physical_index() const {
+        return m_physical_index;
+    }
+
+    const COIPROCESS& get_process() const {
+        return m_process;
+    }
+
+    // initialize device
+    void init(void);
+
+    // add new library
+    void add_lib(const TargetImage &lib)
+    {
+        m_lock.lock();
+        m_ready = false;
+        m_images.push_back(lib);
+        m_lock.unlock();
+    }
+
+    COIRESULT compute(
+        const std::list<COIBUFFER> &buffers,
+        const void*         data,
+        uint16_t            data_size,
+        void*               ret,
+        uint16_t            ret_size,
+        uint32_t            num_deps,
+        const COIEVENT*     deps,
+        COIEVENT*           event
+    );
+
+#ifdef MYO_SUPPORT
+    // temporary workaround for blocking behavior for myoiLibInit/Fini calls
+    void init_myo(COIEVENT *event) {
+        COIRESULT res;
+        res = COI::PipelineRunFunction(get_pipeline(),
+                                       m_funcs[c_func_myo_init],
+                                       0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                       event);
+        check_result(res, c_pipeline_run_func, m_index, res);
+    }
+
+    void fini_myo(COIEVENT *event) {
+        COIRESULT res;
+        res = COI::PipelineRunFunction(get_pipeline(),
+                                       m_funcs[c_func_myo_fini],
+                                       0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                       event);
+        check_result(res, c_pipeline_run_func, m_index, res);
+    }
+#endif // MYO_SUPPORT
+
+    //
+    // Memory association table
+    //
+    PtrData* find_ptr_data(const void *ptr) {
+        m_ptr_lock.lock();
+        PtrSet::iterator res = m_ptr_set.find(PtrData(ptr, 0));
+        m_ptr_lock.unlock();
+        if (res == m_ptr_set.end()) {
+            return 0;
+        }
+        return const_cast<PtrData*>(res.operator->());
+    }
+
+    PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
+        m_ptr_lock.lock();
+        std::pair<PtrSet::iterator, bool> res =
+            m_ptr_set.insert(PtrData(ptr, len));
+        PtrData* ptr_data = const_cast<PtrData*>(res.first.operator->());
+        m_ptr_lock.unlock();
+
+        is_new = res.second;
+        if (is_new) {
+            // It is necessary to lock as soon as possible.
+            // The unlock must be done at the call site of insert_ptr_data,
+            // in the branch taken when is_new is true.
+            ptr_data->alloc_ptr_data_lock.lock();
+        }
+        return ptr_data;
+    }
+
+    void remove_ptr_data(const void *ptr) {
+        m_ptr_lock.lock();
+        m_ptr_set.erase(PtrData(ptr, 0));
+        m_ptr_lock.unlock();
+    }
+
+    //
+    // Automatic variables
+    //
+    AutoData* find_auto_data(const void *ptr) {
+        AutoSet &auto_vars = get_auto_vars();
+        AutoSet::iterator res = auto_vars.find(AutoData(ptr, 0));
+        if (res == auto_vars.end()) {
+            return 0;
+        }
+        return const_cast<AutoData*>(res.operator->());
+    }
+
+    AutoData* insert_auto_data(const void *ptr, uint64_t len) {
+        AutoSet &auto_vars = get_auto_vars();
+        std::pair<AutoSet::iterator, bool> res =
+            auto_vars.insert(AutoData(ptr, len));
+        return const_cast<AutoData*>(res.first.operator->());
+    }
+
+    void remove_auto_data(const void *ptr) {
+        get_auto_vars().erase(AutoData(ptr, 0));
+    }
+
+    //
+    // Signals
+    //
+    void add_signal(const void *signal, OffloadDescriptor *desc) {
+        m_signal_lock.lock();
+        m_signal_map[signal] = desc;
+        m_signal_lock.unlock();
+    }
+
+    OffloadDescriptor* find_signal(const void *signal, bool remove) {
+        OffloadDescriptor *desc = 0;
+
+        m_signal_lock.lock();
+        {
+            SignalMap::iterator it = m_signal_map.find(signal);
+            if (it != m_signal_map.end()) {
+                desc = it->second;
+                if (remove) {
+                    m_signal_map.erase(it);
+                }
+            }
+        }
+        m_signal_lock.unlock();
+
+        return desc;
+    }
+
+    // stop device process
+    void fini_process(bool verbose);
+
+    // list of stacks active at the engine
+    PersistDataList m_persist_list;
+
+private:
+    Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false),
+               m_proc_number(0)
+    {}
+
+    ~Engine() {
+        if (m_process != 0) {
+            fini_process(false);
+        }
+    }
+
+    // set indexes
+    void set_indexes(int logical_index, int physical_index) {
+        m_index = logical_index;
+        m_physical_index = physical_index;
+    }
+
+    // start process on device
+    void init_process();
+
+    void load_libraries(void);
+    void init_ptr_data(void);
+
+    // performs library initialization on the device side
+    pid_t init_device(void);
+
+private:
+    // get pipeline associated with a calling thread
+    COIPIPELINE get_pipeline(void);
+
+    // get automatic vars set associated with the calling thread
+    AutoSet& get_auto_vars(void);
+
+    // destructor for thread data
+    static void destroy_thread_data(void *data);
+
+private:
+    typedef std::set<PtrData> PtrSet;
+    typedef std::map<const void*, OffloadDescriptor*> SignalMap;
+
+    // device indexes
+    int         m_index;
+    int         m_physical_index;
+
+    // number of COI pipes created for the engine
+    long        m_proc_number;
+
+    // process handle
+    COIPROCESS  m_process;
+
+    // If false, device either has not been initialized or new libraries
+    // have been added.
+    bool        m_ready;
+    mutex_t     m_lock;
+
+    // List of libraries to be loaded
+    TargetImageList m_images;
+
+    // var table
+    PtrSet      m_ptr_set;
+    mutex_t     m_ptr_lock;
+
+    // signals
+    SignalMap m_signal_map;
+    mutex_t   m_signal_lock;
+
+    // constants for accessing device function handles
+    enum {
+        c_func_compute = 0,
+#ifdef MYO_SUPPORT
+        c_func_myo_init,
+        c_func_myo_fini,
+#endif // MYO_SUPPORT
+        c_func_init,
+        c_func_var_table_size,
+        c_func_var_table_copy,
+        c_funcs_total
+    };
+    static const char* m_func_names[c_funcs_total];
+
+    // device function handles
+    COIFUNCTION m_funcs[c_funcs_total];
+
+    // int -> name mapping for device signals
+    static const int   c_signal_max = 32;
+    static const char* c_signal_names[c_signal_max];
+};
+
+#endif // OFFLOAD_ENGINE_H_INCLUDED
diff --git a/final/offload/src/offload_env.cpp b/final/offload/src/offload_env.cpp
new file mode 100644
index 0000000..d037338
--- /dev/null
+++ b/final/offload/src/offload_env.cpp
@@ -0,0 +1,354 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_env.h"
+#include <string.h>
+#include <ctype.h>
+#include "offload_util.h"
+#include "liboffload_error_codes.h"
+
+// for environment variables valid on all cards
+const int MicEnvVar::any_card = -1;
+
+MicEnvVar::~MicEnvVar()
+{
+    for (std::list<MicEnvVar::CardEnvVars*>::const_iterator
+         it = card_spec_list.begin();
+         it != card_spec_list.end(); it++) {
+        CardEnvVars *card_data = *it;
+        delete card_data;
+    }
+}
+
+MicEnvVar::VarValue::~VarValue()
+{
+    free(env_var_value);
+}
+
+MicEnvVar::CardEnvVars::~CardEnvVars()
+{
+    for (std::list<MicEnvVar::VarValue*>::const_iterator it = env_vars.begin();
+        it != env_vars.end(); it++) {
+            VarValue *var_value = *it;
+            delete var_value;
+    }
+}
+
+// Search the "card_spec_list" list for the card with the given "number"
+
+MicEnvVar::CardEnvVars* MicEnvVar::get_card(int number)
+{
+    if (number == any_card) {
+        return &common_vars;
+    }
+    for (std::list<MicEnvVar::CardEnvVars*>::const_iterator
+         it = card_spec_list.begin();
+         it != card_spec_list.end(); it++) {
+        CardEnvVars *card_data = *it;
+        if (card_data->card_number == number) {
+            return card_data;
+        }
+    }
+    return NULL;
+}
+
+// Search the "env_vars" list for an environment variable with the given name
+
+MicEnvVar::VarValue* MicEnvVar::CardEnvVars::find_var(
+    char* env_var_name,
+    int env_var_name_length
+)
+{
+    for (std::list<MicEnvVar::VarValue*>::const_iterator it = env_vars.begin();
+        it != env_vars.end(); it++) {
+            VarValue *var_value = *it;
+            if (var_value->length == env_var_name_length &&
+                !strncmp(var_value->env_var, env_var_name,
+                         env_var_name_length)) {
+                return var_value;
+            }
+    }
+    return NULL;
+}
+
+void MicEnvVar::analyze_env_var(char *env_var_string)
+{
+    char          *env_var_name;
+    char          *env_var_def;
+    int           card_number;
+    int           env_var_name_length;
+    MicEnvVarKind env_var_kind;
+
+    env_var_kind = get_env_var_kind(env_var_string,
+                                    &card_number,
+                                    &env_var_name,
+                                    &env_var_name_length,
+                                    &env_var_def);
+    switch (env_var_kind) {
+        case c_mic_var:
+        case c_mic_card_var:
+            add_env_var(card_number,
+                        env_var_name,
+                        env_var_name_length,
+                        env_var_def);
+            break;
+        case c_mic_card_env:
+            mic_parse_env_var_list(card_number, env_var_def);
+            break;
+        case c_no_mic:
+        default:
+            break;
+    }
+}
+
+void MicEnvVar::add_env_var(
+    int card_number,
+    char *env_var_name,
+    int env_var_name_length,
+    char *env_var_def
+)
+{
+    VarValue *var;
+    CardEnvVars *card;
+
+    // This case corresponds to a common env var definition of the form
+    // <mic-prefix>_<var>
+    if (card_number == any_card) {
+        card = &common_vars;
+    }
+    else {
+        card = get_card(card_number);
+        if (!card) {
+            // a definition for a new card was encountered
+            card = new CardEnvVars(card_number);
+            card_spec_list.push_back(card);
+        }
+
+    }
+    var = card->find_var(env_var_name, env_var_name_length);
+    if (!var) {
+        // put new env var definition in "env_var" list
+        var = new VarValue(env_var_name, env_var_name_length, env_var_def);
+        card->env_vars.push_back(var);
+    }
+}
+
+// The routine analyzes the string pointed to by the "env_var_string"
+// argument according to the following syntax:
+//
+// Specification of prefix for MIC environment variables
+// MIC_ENV_PREFIX=<mic-prefix>
+//
+// Setting single MIC environment variable
+// <mic-prefix>_<var>=<value>
+// <mic-prefix>_<card-number>_<var>=<value>
+
+// Setting multiple MIC environment variables
+// <mic-prefix>_<card-number>_ENV=<env-vars>
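+//
+// Illustrative only (hypothetical prefix and values): with
+// MIC_ENV_PREFIX=PHI,
+//     PHI_OMP_NUM_THREADS=240     sets OMP_NUM_THREADS on every card
+//     PHI_2_OMP_NUM_THREADS=120   sets it on card 2 only
+//     PHI_2_ENV="A=1|B=2"         sets several variables on card 2,
+//                                 separated by '|'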
+
+MicEnvVarKind MicEnvVar::get_env_var_kind(
+    char *env_var_string,
+    int *card_number,
+    char **env_var_name,
+    int *env_var_name_length,
+    char **env_var_def
+)
+{
+    int len = strlen(prefix);
+    char *c = env_var_string;
+    int num = 0;
+    bool card_is_set = false;
+
+    if (strncmp(c, prefix, len) != 0 || c[len] != '_') {
+        return c_no_mic;
+    }
+    c += len + 1;
+
+    *card_number = any_card;
+    if (isdigit(*c)) {
+        while (isdigit(*c)) {
+            num = (*c++ - '0') + (num * 10);
+        }
+        if (*c != '_') {
+            return c_no_mic;
+        }
+        c++;
+        *card_number = num;
+        card_is_set = true;
+    }
+    if (!isalpha(*c)) {
+        return c_no_mic;
+    }
+    *env_var_name = *env_var_def = c;
+    if (strncmp(c, "ENV=", 4) == 0) {
+        if (!card_is_set) {
+            *env_var_name_length = 3;
+            *env_var_name = *env_var_def = c;
+            *env_var_def = strdup(*env_var_def);
+            return c_mic_var;
+        }
+        *env_var_def = c + strlen("ENV=");
+        *env_var_def = strdup(*env_var_def);
+        return c_mic_card_env;
+    }
+    if (isalpha(*c)) {
+        *env_var_name_length = 0;
+        while (isalnum(*c) || *c == '_') {
+            c++;
+            (*env_var_name_length)++;
+        }
+    }
+    if (*c != '=') {
+        return c_no_mic;
+    }
+    *env_var_def = strdup(*env_var_def);
+    return card_is_set? c_mic_card_var : c_mic_var;
+}
+
+// Analyzing <env-vars> of the form:
+// <mic-prefix>_<card-number>_ENV=<env-vars>
+// where:
+//
+// <env-vars>:
+//                <env-var>
+//                <env-vars> | <env-var>
+//
+// <env-var>:
+//                variable=value
+//                variable="value"
+//                variable=
+
+void MicEnvVar::mic_parse_env_var_list(
+    int card_number, char *env_vars_def_list)
+{
+    char *c = env_vars_def_list;
+    char *env_var_name;
+    int  env_var_name_length;
+    char *env_var_def;
+    bool var_is_quoted;
+
+    if (*c == '"') {
+        c++;
+    }
+    while (*c != 0) {
+        var_is_quoted = false;
+        env_var_name = c;
+        env_var_name_length = 0;
+        if (isalpha(*c)) {
+            while (isalnum(*c) || *c == '_') {
+                c++;
+                env_var_name_length++;
+            }
+        }
+        else {
+            LIBOFFLOAD_ERROR(c_mic_parse_env_var_list1);
+            return;
+        }
+        if (*c != '=') {
+            LIBOFFLOAD_ERROR(c_mic_parse_env_var_list2);
+            return;
+        }
+        c++;
+
+        if (*c == '"') {
+            var_is_quoted = true;
+            c++;
+        }
+        // Environment variable values that contain | will need to be escaped.
+        while (*c != 0 && *c != '|' &&
+               (!var_is_quoted || *c != '"'))
+        {
+            // skip escaped symbol
+            if (*c == '\\') {
+                c++;
+            }
+            c++;
+        }
+        if (var_is_quoted) {
+            c++; // for "
+            while (*c != 0 && *c != '|') {
+                c++;
+            }
+        }
+
+        int sz = c - env_var_name;
+        // allocate sz + 1 bytes so the terminating 0 stays in bounds
+        env_var_def = (char*)malloc(sz + 1);
+        memcpy(env_var_def, env_var_name, sz);
+        env_var_def[sz] = 0;
+
+        if (*c == '|') {
+            c++;
+            while (*c != 0 && *c == ' ') {
+                c++;
+            }
+        }
+        add_env_var(card_number,
+                    env_var_name,
+                    env_var_name_length,
+                    env_var_def);
+    }
+}
+
+// Collect all definitions for the card with number "card_num".
+// The returned result is a vector of string pointers, each defining one
+// environment variable; the vector is terminated by a NULL pointer.
+// At the beginning of the vector are the env vars defined as
+// <mic-prefix>_<card-number>_<var>=<value>
+// or
+// <mic-prefix>_<card-number>_ENV=<env-vars>
+// where <card-number> is equal to "card_num".
+// They are followed by definitions that are valid for any card
+// and absent from the previous definitions.
+
+char** MicEnvVar::create_environ_for_card(int card_num)
+{
+    VarValue *var_value;
+    VarValue *var_value_find;
+    CardEnvVars *card_data = get_card(card_num);
+    CardEnvVars *card_data_common;
+    std::list<char*> new_env;
+    char **rez;
+
+    if (!prefix) {
+        return NULL;
+    }
+    // There are no card-specific env var definitions for the card
+    // with number "card_num"
+    if (!card_data) {
+        return create_environ_for_card(any_card);
+    }
+
+    for (std::list<MicEnvVar::VarValue*>::const_iterator
+         it = card_data->env_vars.begin();
+         it != card_data->env_vars.end(); it++) {
+        var_value = *it;
+        new_env.push_back(var_value->env_var_value);
+    }
+
+    if (card_num != any_card) {
+        card_data_common = get_card(any_card);
+        for (std::list<MicEnvVar::VarValue*>::const_iterator
+             it = card_data_common->env_vars.begin();
+             it != card_data_common->env_vars.end(); it++) {
+            var_value = *it;
+            var_value_find = card_data->find_var(var_value->env_var,
+                                                 var_value->length);
+            if (!var_value_find) {
+                new_env.push_back(var_value->env_var_value);
+            }
+        }
+    }
+
+    int new_env_size = new_env.size();
+    rez = (char**) malloc((new_env_size + 1) * sizeof(char*));
+    std::copy(new_env.begin(), new_env.end(), rez);
+    rez[new_env_size] = 0;
+    return rez;
+}
diff --git a/final/offload/src/offload_env.h b/final/offload/src/offload_env.h
new file mode 100644
index 0000000..f035ff6
--- /dev/null
+++ b/final/offload/src/offload_env.h
@@ -0,0 +1,91 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef OFFLOAD_ENV_H_INCLUDED
+#define OFFLOAD_ENV_H_INCLUDED
+
+#include <list>
+
+// data structure and routines to parse MIC user environment and pass to MIC
+
+enum MicEnvVarKind
+{
+    c_no_mic,         // not MIC env var
+    c_mic_var,        // for <mic-prefix>_<var>
+    c_mic_card_var,   // for <mic-prefix>_<card-number>_<var>
+    c_mic_card_env    // for <mic-prefix>_<card-number>_ENV
+};
+
+struct MicEnvVar {
+public:
+    MicEnvVar() : prefix(0) {}
+    ~MicEnvVar();
+
+    void analyze_env_var(char *env_var_string);
+    char** create_environ_for_card(int card_num);
+    MicEnvVarKind get_env_var_kind(
+        char *env_var_string,
+        int *card_number,
+        char **env_var_name,
+        int *env_var_name_length,
+        char **env_var_def
+    );
+    void add_env_var(
+        int card_number,
+        char *env_var_name,
+        int env_var_name_length,
+        char *env_var_def
+    );
+
+    void set_prefix(const char *pref) {
+        prefix = (pref && *pref != '\0') ? pref : 0;
+    }
+
+    struct VarValue {
+    public:
+        char* env_var;
+        int   length;
+        char* env_var_value;
+
+        VarValue(char* var, int ln, char* value)
+        {
+            env_var = var;
+            length = ln;
+            env_var_value = value;
+        }
+        ~VarValue();
+    };
+
+    struct CardEnvVars {
+    public:
+
+        int card_number;
+        std::list<struct VarValue*> env_vars;
+
+        CardEnvVars() { card_number = any_card; }
+        CardEnvVars(int num) { card_number = num; }
+        ~CardEnvVars();
+
+        void add_new_env_var(int number, char *env_var, int length,
+                             char *env_var_value);
+        VarValue* find_var(char* env_var_name, int env_var_name_length);
+    };
+    static const int any_card;
+
+private:
+    void         mic_parse_env_var_list(int card_number, char *env_var_def);
+    CardEnvVars* get_card(int number);
+
+    const char *prefix;
+    std::list<struct CardEnvVars *> card_spec_list;
+    CardEnvVars common_vars;
+};
+
+#endif // OFFLOAD_ENV_H_INCLUDED
diff --git a/final/offload/src/offload_host.cpp b/final/offload/src/offload_host.cpp
new file mode 100644
index 0000000..38d5139
--- /dev/null
+++ b/final/offload/src/offload_host.cpp
@@ -0,0 +1,4360 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+// Forward declarations, as the following two functions are declared as
+// friends in offload_engine.h.
+// Clang does not like static to come after a friend declaration.
+static void __offload_init_library_once(void);
+static void __offload_fini_library(void);
+
+#include "offload_host.h"
+#ifdef MYO_SUPPORT
+#include "offload_myo_host.h"
+#endif
+
+#include <malloc.h>
+#ifndef TARGET_WINNT
+#include <alloca.h>
+#include <elf.h>
+#endif // TARGET_WINNT
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <bitset>
+
+#if defined(HOST_WINNT)
+#define PATH_SEPARATOR ";"
+#else
+#define PATH_SEPARATOR ":"
+#endif
+
+#define GET_OFFLOAD_NUMBER(timer_data) \
+    ((timer_data) ? (timer_data)->offload_number : 0)
+
+#ifdef TARGET_WINNT
+// Small subset of ELF declarations for Windows which is needed to compile
+// this file. ELF header is used to understand what binary type is contained
+// in the target image - shared library or executable.
+
+typedef uint16_t Elf64_Half;
+typedef uint32_t Elf64_Word;
+typedef uint64_t Elf64_Addr;
+typedef uint64_t Elf64_Off;
+
+#define EI_NIDENT   16
+
+#define ET_EXEC     2
+#define ET_DYN      3
+
+typedef struct
+{
+    unsigned char e_ident[EI_NIDENT];
+    Elf64_Half    e_type;
+    Elf64_Half    e_machine;
+    Elf64_Word    e_version;
+    Elf64_Addr    e_entry;
+    Elf64_Off     e_phoff;
+    Elf64_Off     e_shoff;
+    Elf64_Word    e_flags;
+    Elf64_Half    e_ehsize;
+    Elf64_Half    e_phentsize;
+    Elf64_Half    e_phnum;
+    Elf64_Half    e_shentsize;
+    Elf64_Half    e_shnum;
+    Elf64_Half    e_shstrndx;
+} Elf64_Ehdr;
+#endif // TARGET_WINNT
+
+// Host console and file logging
+const char *prefix;
+int console_enabled = 0;
+int offload_number = 0;
+
+static const char *htrace_envname = "H_TRACE";
+static const char *offload_report_envname = "OFFLOAD_REPORT";
+static const char *timer_envname = "H_TIME";
+
+// Trace information
+static const char* vardesc_direction_as_string[] = {
+    "NOCOPY",
+    "IN",
+    "OUT",
+    "INOUT"
+};
+static const char* vardesc_type_as_string[] = {
+    "unknown",
+    "data",
+    "data_ptr",
+    "func_ptr",
+    "void_ptr",
+    "string_ptr",
+    "dv",
+    "dv_data",
+    "dv_data_slice",
+    "dv_ptr",
+    "dv_ptr_data",
+    "dv_ptr_data_slice",
+    "cean_var",
+    "cean_var_ptr",
+    "c_data_ptr_array",
+    "c_func_ptr_array",
+    "c_void_ptr_array",
+    "c_string_ptr_array"
+};
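+// Both tables above are indexed directly with values taken from the var
+// descriptors (direction.bits and type.src/type.dst below), so their order
+// is assumed to mirror the corresponding enumerations.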
+
+Engine*         mic_engines = 0;
+uint32_t        mic_engines_total = 0;
+pthread_key_t   mic_thread_key;
+MicEnvVar       mic_env_vars;
+uint64_t        cpu_frequency = 0;
+
+// MIC_STACKSIZE
+uint32_t mic_stack_size = 12 * 1024 * 1024;
+
+// MIC_BUFFERSIZE
+uint64_t mic_buffer_size = 0;
+
+// MIC_LD_LIBRARY_PATH
+char* mic_library_path = 0;
+
+// MIC_PROXY_IO
+bool mic_proxy_io = true;
+
+// MIC_PROXY_FS_ROOT
+char* mic_proxy_fs_root = 0;
+
+// Threshold for creating buffers with large pages. A buffer is created
+// with the large-page hint if its size exceeds the threshold value.
+// Large pages are currently disabled by default (the threshold defaults
+// to UINT64_MAX) due to HSD 4114629.
+uint64_t __offload_use_2mb_buffers = 0xffffffffffffffffULL;
+static const char *mic_use_2mb_buffers_envname  =
+    "MIC_USE_2MB_BUFFERS";
+
+static uint64_t __offload_use_async_buffer_write = 2 * 1024 * 1024;
+static const char *mic_use_async_buffer_write_envname  =
+    "MIC_USE_ASYNC_BUFFER_WRITE";
+
+static uint64_t __offload_use_async_buffer_read = 2 * 1024 * 1024;
+static const char *mic_use_async_buffer_read_envname  =
+    "MIC_USE_ASYNC_BUFFER_READ";
+
+// device initialization type
+OffloadInitType __offload_init_type = c_init_on_offload_all;
+static const char *offload_init_envname = "OFFLOAD_INIT";
+
+// active wait
+static bool __offload_active_wait = true;
+static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT";
+
+// OMP_DEFAULT_DEVICE
+int __omp_device_num = 0;
+static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE";
+
+// The list of pending target libraries
+static bool            __target_libs;
+static TargetImageList __target_libs_list;
+static mutex_t         __target_libs_lock;
+static mutex_t         stack_alloc_lock;
+
+// Target executable
+TargetImage*           __target_exe;
+
+static char * offload_get_src_base(void * ptr, uint8_t type)
+{
+    char *base;
+    if (VAR_TYPE_IS_PTR(type)) {
+        base = *static_cast<char**>(ptr);
+    }
+    else if (VAR_TYPE_IS_SCALAR(type)) {
+        base = static_cast<char*>(ptr);
+    }
+    else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) {
+        ArrDesc *dvp;
+        if (VAR_TYPE_IS_DV_DATA_SLICE(type)) {
+            const arr_desc *ap = static_cast<const arr_desc*>(ptr);
+            dvp = (type == c_dv_data_slice) ?
+                  reinterpret_cast<ArrDesc*>(ap->base) :
+                  *reinterpret_cast<ArrDesc**>(ap->base);
+        }
+        else {
+            dvp = (type == c_dv_data) ?
+                  static_cast<ArrDesc*>(ptr) :
+                  *static_cast<ArrDesc**>(ptr);
+        }
+        base = reinterpret_cast<char*>(dvp->Base);
+    }
+    else {
+        base = NULL;
+    }
+    return base;
+}
+
+void OffloadDescriptor::report_coi_error(error_types msg, COIRESULT res)
+{
+    // special case for the 'process died' error
+    if (res == COI_PROCESS_DIED) {
+        m_device.fini_process(true);
+    }
+    else {
+        switch (msg) {
+            case c_buf_create:
+                if (res == COI_OUT_OF_MEMORY) {
+                    msg = c_buf_create_out_of_mem;
+                }
+                /* fallthru */
+
+            case c_buf_create_from_mem:
+            case c_buf_get_address:
+            case c_pipeline_create:
+            case c_pipeline_run_func:
+                LIBOFFLOAD_ERROR(msg, m_device.get_logical_index(), res);
+                break;
+
+            case c_buf_read:
+            case c_buf_write:
+            case c_buf_copy:
+            case c_buf_map:
+            case c_buf_unmap:
+            case c_buf_destroy:
+            case c_buf_set_state:
+                LIBOFFLOAD_ERROR(msg, res);
+                break;
+
+            default:
+                break;
+        }
+    }
+
+    exit(1);
+}
+
+_Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const
+{
+    switch (res) {
+        case COI_SUCCESS:
+            return OFFLOAD_SUCCESS;
+
+        case COI_PROCESS_DIED:
+            return OFFLOAD_PROCESS_DIED;
+
+        case COI_OUT_OF_MEMORY:
+            return OFFLOAD_OUT_OF_MEMORY;
+
+        default:
+            return OFFLOAD_ERROR;
+    }
+}
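+
+// Typical use of translate_coi_error, mirroring the error-handling pattern
+// repeated throughout this file (a sketch):
+//     if (res != COI_SUCCESS) {
+//         if (m_status != 0)
+//             m_status->result = translate_coi_error(res); // optional offload
+//         else if (m_is_mandatory)
+//             report_coi_error(c_buf_create, res);         // fatal
+//     }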
+
+bool OffloadDescriptor::alloc_ptr_data(
+    PtrData* &ptr_data,
+    void *base,
+    int64_t disp,
+    int64_t size,
+    int64_t alloc_disp,
+    int align
+)
+{
+    // total length of base
+    int64_t length = disp + size;
+    bool is_new;
+
+    OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
+                  base, length);
+
+    // add new entry
+    ptr_data = m_device.insert_ptr_data(base, length, is_new);
+    if (is_new) {
+
+        OFFLOAD_TRACE(3, "Added new association\n");
+
+        if (length > 0) {
+            OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
+            COIRESULT res;
+
+            // align should be a power of 2
+            if (align > 0 && (align & (align - 1)) == 0) {
+                // offset within mic_buffer. Can do offset optimization
+                // only when source address alignment satisfies requested
+                // alignment on the target (cq172736).
+                if ((reinterpret_cast<intptr_t>(base) & (align - 1)) == 0) {
+                    ptr_data->mic_offset = reinterpret_cast<intptr_t>(base) & 4095;
+                }
+            }
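+            // Note: (align & (align - 1)) == 0 above is the standard
+            // power-of-two test; masking base with 4095 keeps the source's
+            // offset within its 4 KB page so that host and target addresses
+            // stay congruent modulo the page size.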
+
+            // buffer size and flags
+            uint64_t buffer_size = length + ptr_data->mic_offset;
+            uint32_t buffer_flags = 0;
+
+            // create buffer with large pages if data length exceeds
+            // large page threshold
+            if (length >= __offload_use_2mb_buffers) {
+                buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
+            }
+
+            // create CPU buffer
+            OFFLOAD_DEBUG_TRACE_1(3,
+                          GET_OFFLOAD_NUMBER(get_timer_data()),
+                          c_offload_create_buf_host,
+                          "Creating buffer from source memory %p, "
+                          "length %lld\n", base, length);
+
+            // The result is not checked because we can continue without a
+            // CPU buffer. In that case we will use COIBufferRead/Write
+            // instead of COIBufferCopy.
+            COI::BufferCreateFromMemory(length,
+                                        COI_BUFFER_NORMAL,
+                                        0,
+                                        base,
+                                        1,
+                                        &m_device.get_process(),
+                                        &ptr_data->cpu_buf);
+
+            OFFLOAD_DEBUG_TRACE_1(3,
+                          GET_OFFLOAD_NUMBER(get_timer_data()),
+                          c_offload_create_buf_mic,
+                          "Creating buffer for sink: size %lld, offset %d, "
+                          "flags =0x%x\n", buffer_size - alloc_disp,
+                          ptr_data->mic_offset, buffer_flags);
+
+            // create MIC buffer
+            res = COI::BufferCreate(buffer_size - alloc_disp,
+                                    COI_BUFFER_NORMAL,
+                                    buffer_flags,
+                                    0,
+                                    1,
+                                    &m_device.get_process(),
+                                    &ptr_data->mic_buf);
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                }
+                else if (m_is_mandatory) {
+                    report_coi_error(c_buf_create, res);
+                }
+                ptr_data->alloc_ptr_data_lock.unlock();
+                return false;
+            }
+
+            // make buffer valid on the device.
+            res = COI::BufferSetState(ptr_data->mic_buf,
+                                      m_device.get_process(),
+                                      COI_BUFFER_VALID,
+                                      COI_BUFFER_NO_MOVE,
+                                      0, 0, 0);
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                }
+                else if (m_is_mandatory) {
+                    report_coi_error(c_buf_set_state, res);
+                }
+                ptr_data->alloc_ptr_data_lock.unlock();
+                return false;
+            }
+
+            res = COI::BufferSetState(ptr_data->mic_buf,
+                                      COI_PROCESS_SOURCE,
+                                      COI_BUFFER_INVALID,
+                                      COI_BUFFER_NO_MOVE,
+                                      0, 0, 0);
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                }
+                else if (m_is_mandatory) {
+                    report_coi_error(c_buf_set_state, res);
+                }
+                ptr_data->alloc_ptr_data_lock.unlock();
+                return false;
+            }
+        }
+
+        ptr_data->alloc_disp = alloc_disp;
+        ptr_data->alloc_ptr_data_lock.unlock();
+    }
+    else {
+        mutex_locker_t locker(ptr_data->alloc_ptr_data_lock);
+
+        OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, "
+                      "is_static %d\n",
+                      ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
+                      ptr_data->is_static);
+
+        // This is not a new entry. Make sure that provided address range fits
+        // into existing one.
+        MemRange addr_range(base, length - ptr_data->alloc_disp);
+        if (!ptr_data->cpu_addr.contains(addr_range)) {
+            LIBOFFLOAD_ERROR(c_bad_ptr_mem_range);
+            exit(1);
+        }
+
+        // if the entry is associated with static data it may not have buffers
+        // created because they are created on demand.
+        if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+bool OffloadDescriptor::find_ptr_data(
+    PtrData* &ptr_data,
+    void *base,
+    int64_t disp,
+    int64_t size,
+    bool report_error
+)
+{
+    // total length of base
+    int64_t length = disp + size;
+
+    OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
+                  "length %lld\n", base, length);
+
+    // find existing association in pointer table
+    ptr_data = m_device.find_ptr_data(base);
+    if (ptr_data == 0) {
+        if (report_error) {
+            LIBOFFLOAD_ERROR(c_no_ptr_data, base);
+            exit(1);
+        }
+        OFFLOAD_TRACE(3, "Association does not exist\n");
+        return true;
+    }
+
+    OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
+                  ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
+                  ptr_data->is_static);
+
+    // make sure that provided address range fits into existing one
+    MemRange addr_range(base, length);
+    if (!ptr_data->cpu_addr.contains(addr_range)) {
+        if (report_error) {
+            LIBOFFLOAD_ERROR(c_bad_ptr_mem_range);
+            exit(1);
+        }
+        OFFLOAD_TRACE(3, "Existing association partially overlaps with "
+                      "data address range\n");
+        ptr_data = 0;
+        return true;
+    }
+
+    // if the entry is associated with static data it may not have buffers
+    // created because they are created on demand.
+    if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
+        return false;
+    }
+
+    return true;
+}
+
+bool OffloadDescriptor::init_static_ptr_data(PtrData *ptr_data)
+{
+    OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
+
+    if (ptr_data->cpu_buf == 0) {
+        OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n",
+                      ptr_data->cpu_addr.start());
+
+        COIRESULT res = COI::BufferCreateFromMemory(
+            ptr_data->cpu_addr.length(),
+            COI_BUFFER_NORMAL,
+            0,
+            const_cast<void*>(ptr_data->cpu_addr.start()),
+            1, &m_device.get_process(),
+            &ptr_data->cpu_buf);
+
+        if (res != COI_SUCCESS) {
+            if (m_status != 0) {
+                m_status->result = translate_coi_error(res);
+                return false;
+            }
+            report_coi_error(c_buf_create_from_mem, res);
+        }
+    }
+
+    if (ptr_data->mic_buf == 0) {
+        OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n",
+                      ptr_data->mic_addr);
+
+        COIRESULT res = COI::BufferCreateFromMemory(
+            ptr_data->cpu_addr.length(),
+            COI_BUFFER_NORMAL,
+            COI_SINK_MEMORY,
+            reinterpret_cast<void*>(ptr_data->mic_addr),
+            1, &m_device.get_process(),
+            &ptr_data->mic_buf);
+
+        if (res != COI_SUCCESS) {
+            if (m_status != 0) {
+                m_status->result = translate_coi_error(res);
+                return false;
+            }
+            report_coi_error(c_buf_create_from_mem, res);
+        }
+    }
+
+    return true;
+}
+
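+// Lazily resolve a buffer's address on the sink: the address is queried from
+// COI only once and then cached in ptr_data->mic_addr.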
+bool OffloadDescriptor::init_mic_address(PtrData *ptr_data)
+{
+    if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
+        COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
+                                                  &ptr_data->mic_addr);
+        if (res != COI_SUCCESS) {
+            if (m_status != 0) {
+                m_status->result = translate_coi_error(res);
+            }
+            else if (m_is_mandatory) {
+                report_coi_error(c_buf_get_address, res);
+            }
+            return false;
+        }
+    }
+    return true;
+}
+
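+// Zero-fill the target stack buffer by writing a host-allocated, zeroed
+// block of the same size into it.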
+bool OffloadDescriptor::nullify_target_stack(
+    COIBUFFER targ_buf,
+    uint64_t size
+)
+{
+    char * ptr = (char*)malloc(size);
+    COIRESULT res;
+
+    if (ptr == NULL) {
+        LIBOFFLOAD_ERROR(c_malloc);
+        exit(1);
+    }
+    memset(ptr, 0, size);
+    res = COI::BufferWrite(
+        targ_buf,
+        0,
+        ptr,
+        size,
+        COI_COPY_UNSPECIFIED,
+        0, 0, 0);
+    free(ptr);
+    if (res != COI_SUCCESS) {
+        if (m_status != 0) {
+            m_status->result = translate_coi_error(res);
+            return false;
+        }
+        report_coi_error(c_buf_write, res);
+    }
+    return true;
+}
+
+bool OffloadDescriptor::offload_stack_memory_manager(
+    const void * stack_begin,
+    int  routine_id,
+    int  buf_size,
+    int  align,
+    bool *is_new)
+{
+    mutex_locker_t locker(stack_alloc_lock);
+
+    PersistData * new_el;
+    PersistDataList::iterator it_begin = m_device.m_persist_list.begin();
+    PersistDataList::iterator it_end;
+    int erase = 0;
+
+    *is_new = false;
+
+    for (PersistDataList::iterator it = m_device.m_persist_list.begin();
+        it != m_device.m_persist_list.end(); it++) {
+        PersistData cur_el = *it;
+
+        if (stack_begin > it->stack_cpu_addr) {
+            // this stack data must be destroyed
+            m_destroy_stack.push_front(cur_el.stack_ptr_data);
+            it_end = it;
+            erase++;
+        }
+        else if (stack_begin == it->stack_cpu_addr) {
+            if (routine_id != it->routine_id) {
+                // this stack data must be destroyed
+                m_destroy_stack.push_front(cur_el.stack_ptr_data);
+                it_end = it;
+                erase++;
+                break;
+            }
+            else {
+                // stack data is reused
+                m_stack_ptr_data = it->stack_ptr_data;
+                if (erase > 0) {
+                    // all obsolete stack sections must be erased from the list
+                    m_device.m_persist_list.erase(it_begin, ++it_end);
+
+                    m_in_datalen +=
+                        erase * sizeof(new_el->stack_ptr_data->mic_addr);
+                }
+                OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n",
+                                 m_stack_ptr_data->mic_addr);
+                return true;
+            }
+        }
+        else if (stack_begin < it->stack_cpu_addr) {
+            break;
+        }
+    }
+
+    if (erase > 0) {
+        // all obsolete stack sections must be erased from the list
+        m_device.m_persist_list.erase(it_begin, ++it_end);
+        m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr);
+    }
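+    // Note: the sizeof expressions above do not evaluate their operand, so
+    // naming new_el before it is assigned is well-defined; sizeof only
+    // inspects the type (one mic_addr slot per erased stack section).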
+    // create a new persistent stack entry
+    new_el = new PersistData(stack_begin, routine_id, buf_size);
+    // create MIC buffer
+    COIRESULT res;
+    uint32_t buffer_flags = 0;
+
+    // create buffer with large pages if data length exceeds
+    // large page threshold
+    if (buf_size >= __offload_use_2mb_buffers) {
+        buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
+    }
+    res = COI::BufferCreate(buf_size,
+        COI_BUFFER_NORMAL,
+        buffer_flags,
+        0,
+        1,
+        &m_device.get_process(),
+        &new_el->stack_ptr_data->mic_buf);
+    if (res != COI_SUCCESS) {
+        if (m_status != 0) {
+            m_status->result = translate_coi_error(res);
+        }
+        else if (m_is_mandatory) {
+            report_coi_error(c_buf_create, res);
+        }
+        return false;
+    }
+    // make buffer valid on the device.
+    res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
+        m_device.get_process(),
+        COI_BUFFER_VALID,
+        COI_BUFFER_NO_MOVE,
+        0, 0, 0);
+    if (res != COI_SUCCESS) {
+        if (m_status != 0) {
+            m_status->result = translate_coi_error(res);
+        }
+        else if (m_is_mandatory) {
+            report_coi_error(c_buf_set_state, res);
+        }
+        return false;
+    }
+    res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
+        COI_PROCESS_SOURCE,
+        COI_BUFFER_INVALID,
+        COI_BUFFER_NO_MOVE,
+        0, 0, 0);
+    if (res != COI_SUCCESS) {
+        if (m_status != 0) {
+            m_status->result = translate_coi_error(res);
+        }
+        else if (m_is_mandatory) {
+            report_coi_error(c_buf_set_state, res);
+        }
+        return false;
+    }
+    // the persistence algorithm requires the target stack to start nullified
+    if (!nullify_target_stack(new_el->stack_ptr_data->mic_buf, buf_size)) {
+        return false;
+    }
+
+    m_stack_ptr_data = new_el->stack_ptr_data;
+    init_mic_address(m_stack_ptr_data);
+    OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n",
+                      m_stack_ptr_data->mic_addr);
+    m_device.m_persist_list.push_front(*new_el);
+    *is_new = true;
+    return true;
+}
+
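+// setup_descriptors makes a private copy of the variable descriptors and
+// walks them once to size the in/out transfer areas and to create or look up
+// the buffers that each pointer variable needs.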
+bool OffloadDescriptor::setup_descriptors(
+    VarDesc *vars,
+    VarDesc2 *vars2,
+    int vars_total,
+    int entry_id,
+    const void *stack_addr
+)
+{
+    COIRESULT res;
+
+    OffloadTimer timer(get_timer_data(), c_offload_host_setup_buffers);
+
+    // make a copy of variable descriptors
+    m_vars_total = vars_total;
+    if (vars_total > 0) {
+        m_vars = (VarDesc*) malloc(m_vars_total * sizeof(VarDesc));
+        memcpy(m_vars, vars, m_vars_total * sizeof(VarDesc));
+        m_vars_extra = (VarExtra*) malloc(m_vars_total * sizeof(VarExtra));
+    }
+
+    // dependencies
+    m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * (m_vars_total  + 1));
+    if (m_vars_total > 0) {
+        m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_vars_total);
+    }
+
+    // copyin/copyout data length
+    m_in_datalen = 0;
+    m_out_datalen = 0;
+
+    // First pass over variable descriptors
+    // - Calculate size of the input and output non-pointer data
+    // - Allocate buffers for input and output pointers
+    for (int i = 0; i < m_vars_total; i++) {
+        void*   alloc_base = NULL;
+        int64_t alloc_disp = 0;
+        int64_t alloc_size;
+        bool    src_is_for_mic = (m_vars[i].direction.out ||
+                                  m_vars[i].into == NULL);
+
+        const char *var_sname = "";
+        if (vars2 != NULL && i < vars_total) {
+            if (vars2[i].sname != NULL) {
+                var_sname = vars2[i].sname;
+            }
+        }
+        OFFLOAD_TRACE(2, "   VarDesc %d, var=%s, %s, %s\n",
+            i, var_sname,
+            vardesc_direction_as_string[m_vars[i].direction.bits],
+            vardesc_type_as_string[m_vars[i].type.src]);
+        if (vars2 != NULL && i < vars_total && vars2[i].dname != NULL) {
+            OFFLOAD_TRACE(2, "              into=%s, %s\n", vars2[i].dname,
+                vardesc_type_as_string[m_vars[i].type.dst]);
+        }
+        OFFLOAD_TRACE(2,
+            "              type_src=%d, type_dstn=%d, direction=%d, "
+            "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
+            "offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n",
+            m_vars[i].type.src,
+            m_vars[i].type.dst,
+            m_vars[i].direction.bits,
+            m_vars[i].alloc_if,
+            m_vars[i].free_if,
+            m_vars[i].align,
+            m_vars[i].mic_offset,
+            m_vars[i].flags.bits,
+            m_vars[i].offset,
+            m_vars[i].size,
+            m_vars[i].count,
+            m_vars[i].ptr,
+            m_vars[i].into);
+
+        if (m_vars[i].alloc != NULL) {
+            // array descriptor
+            const arr_desc *ap =
+                static_cast<const arr_desc*>(m_vars[i].alloc);
+
+            // debug dump
+            __arr_desc_dump("    ", "ALLOC", ap, 0);
+
+            __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
+
+            alloc_base = reinterpret_cast<void*>(ap->base);
+        }
+
+        m_vars_extra[i].cpu_disp = 0;
+        m_vars_extra[i].cpu_offset = 0;
+        m_vars_extra[i].src_data = 0;
+        m_vars_extra[i].read_rng_src = 0;
+        m_vars_extra[i].read_rng_dst = 0;
+        // flag is_arr_ptr_el is 1 only for var_descs generated
+        // for c_data_ptr_array type
+        if (i < vars_total) {
+            m_vars_extra[i].is_arr_ptr_el = 0;
+        }
+
+        switch (m_vars[i].type.src) {
+            case c_data_ptr_array:
+                {
+                    const arr_desc *ap;
+                    const VarDesc3 *vd3 =
+                        static_cast<const VarDesc3*>(m_vars[i].ptr);
+                    int flags = vd3->array_fields;
+                    OFFLOAD_TRACE(2,
+                        "              pointer array flags = %04x\n", flags);
+                    OFFLOAD_TRACE(2,
+                        "              pointer array type is %s\n",
+                        vardesc_type_as_string[flags & 0x3f]);
+                    ap = static_cast<const arr_desc*>(vd3->ptr_array);
+                    __arr_desc_dump("              ", "ptr array", ap, 0);
+                    if (m_vars[i].into) {
+                        ap = static_cast<const arr_desc*>(m_vars[i].into);
+                        __arr_desc_dump(
+                            "              ", "into array", ap, 0);
+                    }
+                    if ((flags & (1<<flag_align_is_array)) != 0) {
+                        ap = static_cast<const arr_desc*>(vd3->align_array);
+                        __arr_desc_dump(
+                            "              ", "align array", ap, 0);
+                    }
+                    if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
+                        ap = static_cast<const arr_desc*>(vd3->alloc_if_array);
+                        __arr_desc_dump(
+                            "              ", "alloc_if array", ap, 0);
+                    }
+                    if ((flags & (1<<flag_free_if_is_array)) != 0) {
+                        ap = static_cast<const arr_desc*>(vd3->free_if_array);
+                        __arr_desc_dump(
+                            "              ", "free_if array", ap, 0);
+                    }
+                    if ((flags & (1<<flag_extent_start_is_array)) != 0) {
+                        ap = static_cast<const arr_desc*>(vd3->extent_start);
+                        __arr_desc_dump(
+                            "              ", "extent_start array", ap, 0);
+                    } else if ((flags &
+                        (1<<flag_extent_start_is_scalar)) != 0) {
+                        OFFLOAD_TRACE(2,
+                            "              extent_start scalar = %lld\n",
+                            (long long)vd3->extent_start);
+                    }
+                    if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
+                        ap = static_cast<const arr_desc*>
+                            (vd3->extent_elements);
+                        __arr_desc_dump(
+                            "              ", "extent_elements array", ap, 0);
+                    } else if ((flags &
+                        (1<<flag_extent_elements_is_scalar)) != 0) {
+                        OFFLOAD_TRACE(2,
+                            "              extent_elements scalar = %lld\n",
+                            (long long)vd3->extent_elements);
+                    }
+                    if ((flags & (1<<flag_into_start_is_array)) != 0) {
+                        ap = static_cast<const arr_desc*>(vd3->into_start);
+                        __arr_desc_dump(
+                            "              ", "into_start array", ap, 0);
+                    } else if ((flags &
+                        (1<<flag_into_start_is_scalar)) != 0) {
+                        OFFLOAD_TRACE(2,
+                            "              into_start scalar = %lld\n",
+                            (long long)vd3->into_start);
+                    }
+                    if ((flags & (1<<flag_into_elements_is_array)) != 0) {
+                        ap = static_cast<const arr_desc*>(vd3->into_elements);
+                        __arr_desc_dump(
+                            "              ", "into_elements array", ap, 0);
+                    } else if ((flags &
+                        (1<<flag_into_elements_is_scalar)) != 0) {
+                        OFFLOAD_TRACE(2,
+                            "              into_elements scalar = %lld\n",
+                            (long long)vd3->into_elements);
+                    }
+                    if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
+                        ap = static_cast<const arr_desc*>(vd3->alloc_start);
+                        __arr_desc_dump(
+                            "              ", "alloc_start array", ap, 0);
+                    } else if ((flags &
+                        (1<<flag_alloc_start_is_scalar)) != 0) {
+                        OFFLOAD_TRACE(2,
+                            "              alloc_start scalar = %lld\n",
+                            (long long)vd3->alloc_start);
+                    }
+                    if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
+                        ap = static_cast<const arr_desc*>(vd3->alloc_elements);
+                        __arr_desc_dump(
+                            "              ", "alloc_elements array", ap, 0);
+                    } else if ((flags &
+                        (1<<flag_alloc_elements_is_scalar)) != 0) {
+                        OFFLOAD_TRACE(2,
+                            "              alloc_elements scalar = %lld\n",
+                            (long long)vd3->alloc_elements);
+                    }
+                }
+                if (!gen_var_descs_for_pointer_array(i)) {
+                    return false;
+                }
+                break;
+
+            case c_data:
+            case c_void_ptr:
+            case c_cean_var:
+                // In all later uses:
+                // - VarDesc.size will hold the length of the data to be
+                //   transferred
+                // - VarDesc.disp will hold an offset from base
+                if (m_vars[i].type.src == c_cean_var) {
+                    // array descriptor
+                    const arr_desc *ap =
+                        static_cast<const arr_desc*>(m_vars[i].ptr);
+
+                    // debug dump
+                    __arr_desc_dump("", "IN/OUT", ap, 0);
+
+                    // offset and length are derived from the array descriptor
+                    __arr_data_offset_and_length(ap, m_vars[i].disp,
+                                                 m_vars[i].size);
+                    if (!is_arr_desc_contiguous(ap)) {
+                        m_vars[i].flags.is_noncont_src = 1;
+                        m_vars_extra[i].read_rng_src =
+                            init_read_ranges_arr_desc(ap);
+                    }
+                    // all necessary information about length and offset is
+                    // transferred in var descriptor. There is no need to send
+                    // array descriptor to the target side.
+                    m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
+                }
+                else {
+                    m_vars[i].size *= m_vars[i].count;
+                    m_vars[i].disp = 0;
+                }
+
+                if (m_vars[i].direction.bits) {
+                    // make sure that transfer size > 0
+                    if (m_vars[i].size <= 0) {
+                        LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size);
+                        exit(1);
+                    }
+
+                    if (m_vars[i].flags.is_static) {
+                        PtrData *ptr_data;
+
+                        // find data associated with variable
+                        if (!find_ptr_data(ptr_data,
+                                           m_vars[i].ptr,
+                                           m_vars[i].disp,
+                                           m_vars[i].size,
+                                           false)) {
+                            return false;
+                        }
+
+                        if (ptr_data != 0) {
+                            // offset to base from the beginning of the buffer
+                            // memory
+                            m_vars[i].offset =
+                                (char*) m_vars[i].ptr -
+                                (char*) ptr_data->cpu_addr.start();
+                        }
+                        else {
+                            m_vars[i].flags.is_static = false;
+                            if (m_vars[i].into == NULL) {
+                                m_vars[i].flags.is_static_dstn = false;
+                            }
+                        }
+                        m_vars_extra[i].src_data = ptr_data;
+                    }
+
+                    if (m_is_openmp) {
+                        if (m_vars[i].flags.is_static) {
+                            // Static data is transferred only by omp target
+                            // update construct which passes zeros for
+                            // alloc_if and free_if.
+                            if (m_vars[i].alloc_if || m_vars[i].free_if) {
+                                m_vars[i].direction.bits = c_parameter_nocopy;
+                            }
+                        }
+                        else {
+                            AutoData *auto_data;
+                            if (m_vars[i].alloc_if) {
+                                auto_data = m_device.insert_auto_data(
+                                    m_vars[i].ptr, m_vars[i].size);
+                                auto_data->add_reference();
+                            }
+                            else {
+                                // TODO: what should be done if var is not in
+                                // the table?
+                                auto_data = m_device.find_auto_data(
+                                    m_vars[i].ptr);
+                            }
+
+                            // For automatic variables data is transferred
+                            // only if alloc_if == 0 && free_if == 0
+                            // or reference count is 1
+                            if ((m_vars[i].alloc_if || m_vars[i].free_if) &&
+                                auto_data != 0 &&
+                                auto_data->get_reference() != 1) {
+                                m_vars[i].direction.bits = c_parameter_nocopy;
+                            }
+
+                            // save data for later use
+                            m_vars_extra[i].auto_data = auto_data;
+                        }
+                    }
+
+                    if (m_vars[i].direction.in &&
+                        !m_vars[i].flags.is_static) {
+                        m_in_datalen += m_vars[i].size;
+
+                        // for a non-static target destination defined as a
+                        // CEAN expression we pass its size and disp to the
+                        // target
+                        if (m_vars[i].into == NULL &&
+                            m_vars[i].type.src == c_cean_var) {
+                            m_in_datalen += 2 * sizeof(uint64_t);
+                        }
+                        m_need_runfunction = true;
+                    }
+                    if (m_vars[i].direction.out &&
+                        !m_vars[i].flags.is_static) {
+                        m_out_datalen += m_vars[i].size;
+                        m_need_runfunction = true;
+                    }
+                }
+                break;
+
+            case c_dv:
+                if (m_vars[i].direction.bits ||
+                    m_vars[i].alloc_if ||
+                    m_vars[i].free_if) {
+                    ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].ptr);
+
+                    // debug dump
+                    __dv_desc_dump("IN/OUT", dvp);
+
+                    // send dope vector contents excluding base
+                    m_in_datalen += m_vars[i].size - sizeof(uint64_t);
+                    m_need_runfunction = true;
+                }
+                break;
+
+            case c_string_ptr:
+                if ((m_vars[i].direction.bits ||
+                     m_vars[i].alloc_if ||
+                     m_vars[i].free_if) &&
+                    m_vars[i].size == 0) {
+                    m_vars[i].size = 1;
+                    m_vars[i].count =
+                        strlen(*static_cast<char**>(m_vars[i].ptr)) + 1;
+                }
+                /* fallthru */
+
+            case c_data_ptr:
+                if (m_vars[i].flags.is_stack_buf &&
+                    !m_vars[i].direction.bits &&
+                    m_vars[i].alloc_if) {
+                    // this var_desc is for stack buffer
+                    bool is_new;
+
+                    if (!offload_stack_memory_manager(
+                            stack_addr, entry_id,
+                            m_vars[i].count, m_vars[i].align, &is_new)) {
+                        return false;
+                    }
+                    if (is_new) {
+                        m_compute_buffers.push_back(
+                            m_stack_ptr_data->mic_buf);
+                        m_device.m_persist_list.front().cpu_stack_addr =
+                            static_cast<char*>(m_vars[i].ptr);
+                    }
+                    else {
+                        m_vars[i].flags.sink_addr = 1;
+                        m_in_datalen += sizeof(m_stack_ptr_data->mic_addr);
+                    }
+                    m_vars[i].size = m_destroy_stack.size();
+                    m_vars_extra[i].src_data = m_stack_ptr_data;
+                    // need to add reference for buffer
+                    m_need_runfunction = true;
+                    break;
+                }
+                /* fallthru */
+
+            case c_cean_var_ptr:
+            case c_dv_ptr:
+                if (m_vars[i].type.src == c_cean_var_ptr) {
+                    // array descriptor
+                    const arr_desc *ap =
+                        static_cast<const arr_desc*>(m_vars[i].ptr);
+
+                    // debug dump
+                    __arr_desc_dump("", "IN/OUT", ap, 1);
+
+                    // offset and length are derived from the array descriptor
+                    __arr_data_offset_and_length(ap, m_vars[i].disp,
+                                                 m_vars[i].size);
+
+                    if (!is_arr_desc_contiguous(ap)) {
+                        m_vars[i].flags.is_noncont_src = 1;
+                        m_vars_extra[i].read_rng_src =
+                            init_read_ranges_arr_desc(ap);
+                    }
+                    // all necessary information about length and offset is
+                    // transferred in var descriptor. There is no need to send
+                    // array descriptor to the target side.
+                    m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
+                }
+                else if (m_vars[i].type.src == c_dv_ptr) {
+                    // need to send DV to the device unless it is 'nocopy'
+                    if (m_vars[i].direction.bits ||
+                        m_vars[i].alloc_if ||
+                        m_vars[i].free_if) {
+                        ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
+
+                        // debug dump
+                        __dv_desc_dump("IN/OUT", dvp);
+
+                        m_vars[i].direction.bits = c_parameter_in;
+                    }
+
+                    // no displacement
+                    m_vars[i].disp = 0;
+                }
+                else {
+                    // c_data_ptr or c_string_ptr
+                    m_vars[i].size *= m_vars[i].count;
+                    m_vars[i].disp = 0;
+                }
+
+                if (m_vars[i].direction.bits ||
+                    m_vars[i].alloc_if ||
+                    m_vars[i].free_if) {
+                    PtrData *ptr_data;
+
+                    // check that buffer length >= 0
+                    if (m_vars[i].alloc_if &&
+                        m_vars[i].disp + m_vars[i].size < 0) {
+                        LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
+                        exit(1);
+                    }
+
+                    // base address
+                    void *base = *static_cast<void**>(m_vars[i].ptr);
+
+                    // allocate buffer if we have no INTO and don't need
+                    // allocation for the ptr at target
+                    if (src_is_for_mic) {
+                        if (m_vars[i].flags.is_stack_buf) {
+                            // for stack persistent objects ptr data is created
+                            // by var_desc with number 0.
+                            // Its ptr_data is stored at m_stack_ptr_data
+                            ptr_data = m_stack_ptr_data;
+                            m_vars[i].flags.sink_addr = 1;
+                        }
+                        else if (m_vars[i].alloc_if) {
+                            // add new entry
+                            if (!alloc_ptr_data(
+                                    ptr_data,
+                                    base,
+                                    (alloc_base != NULL) ?
+                                        alloc_disp : m_vars[i].disp,
+                                    (alloc_base != NULL) ?
+                                        alloc_size : m_vars[i].size,
+                                    alloc_disp,
+                                    (alloc_base != NULL) ?
+                                        0 : m_vars[i].align)) {
+                                return false;
+                            }
+
+                            if (ptr_data->add_reference() == 0 &&
+                                ptr_data->mic_buf != 0) {
+                                // add buffer to the list of buffers that
+                                // are passed to dispatch call
+                                m_compute_buffers.push_back(
+                                    ptr_data->mic_buf);
+                            }
+                            else {
+                                // will send buffer address to device
+                                m_vars[i].flags.sink_addr = 1;
+                            }
+
+                            if (!ptr_data->is_static) {
+                                // need to add reference for buffer
+                                m_need_runfunction = true;
+                            }
+                        }
+                        else {
+                            bool error_if_not_found = true;
+                            if (m_is_openmp) {
+                                // For omp target update variable is ignored
+                                // if it does not exist.
+                                if (!m_vars[i].alloc_if &&
+                                    !m_vars[i].free_if) {
+                                    error_if_not_found = false;
+                                }
+                            }
+
+                            // use existing association from pointer table
+                            if (!find_ptr_data(ptr_data,
+                                               base,
+                                               m_vars[i].disp,
+                                               m_vars[i].size,
+                                               error_if_not_found)) {
+                                return false;
+                            }
+
+                            if (m_is_openmp) {
+                                // make var nocopy if it does not exist
+                                if (ptr_data == 0) {
+                                    m_vars[i].direction.bits =
+                                        c_parameter_nocopy;
+                                }
+                            }
+
+                            if (ptr_data != 0) {
+                                m_vars[i].flags.sink_addr = 1;
+                            }
+                        }
+
+                        if (ptr_data != 0) {
+                            if (m_is_openmp) {
+                                // data is transferred only if
+                                // alloc_if == 0 && free_if == 0
+                                // or reference count is 1
+                                if ((m_vars[i].alloc_if ||
+                                     m_vars[i].free_if) &&
+                                    ptr_data->get_reference() != 1) {
+                                    m_vars[i].direction.bits =
+                                        c_parameter_nocopy;
+                                }
+                            }
+
+                            if (ptr_data->alloc_disp != 0) {
+                                m_vars[i].flags.alloc_disp = 1;
+                                m_in_datalen += sizeof(alloc_disp);
+                            }
+
+                            if (m_vars[i].flags.sink_addr) {
+                                // get the buffer's address on the sink
+                                if (!init_mic_address(ptr_data)) {
+                                    return false;
+                                }
+
+                                m_in_datalen += sizeof(ptr_data->mic_addr);
+                            }
+
+                            if (!ptr_data->is_static && m_vars[i].free_if) {
+                                // need to decrement buffer reference on target
+                                m_need_runfunction = true;
+                            }
+
+                            // offset to base from the beginning of the buffer
+                            // memory
+                            m_vars[i].offset = (char*) base -
+                                (char*) ptr_data->cpu_addr.start();
+
+                            // copy other pointer properties to var descriptor
+                            m_vars[i].mic_offset = ptr_data->mic_offset;
+                            m_vars[i].flags.is_static = ptr_data->is_static;
+                        }
+                    }
+                    else {
+                        if (!find_ptr_data(ptr_data,
+                                           base,
+                                           m_vars[i].disp,
+                                           m_vars[i].size,
+                                           false)) {
+                            return false;
+                        }
+                        if (ptr_data) {
+                            m_vars[i].offset =
+                                (char*) base -
+                                (char*) ptr_data->cpu_addr.start();
+                        }
+                    }
+
+                    // save pointer data
+                    m_vars_extra[i].src_data = ptr_data;
+                }
+                break;
+
+            case c_func_ptr:
+                if (m_vars[i].direction.in) {
+                    m_in_datalen += __offload_funcs.max_name_length();
+                }
+                if (m_vars[i].direction.out) {
+                    m_out_datalen += __offload_funcs.max_name_length();
+                }
+                m_need_runfunction = true;
+                break;
+
+            case c_dv_data:
+            case c_dv_ptr_data:
+            case c_dv_data_slice:
+            case c_dv_ptr_data_slice:
+                ArrDesc *dvp;
+                if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
+                    const arr_desc *ap;
+                    ap = static_cast<const arr_desc*>(m_vars[i].ptr);
+
+                    dvp = (m_vars[i].type.src == c_dv_data_slice) ?
+                          reinterpret_cast<ArrDesc*>(ap->base) :
+                          *reinterpret_cast<ArrDesc**>(ap->base);
+                }
+                else {
+                    dvp = (m_vars[i].type.src == c_dv_data) ?
+                          static_cast<ArrDesc*>(m_vars[i].ptr) :
+                          *static_cast<ArrDesc**>(m_vars[i].ptr);
+                }
+
+                // if allocatable dope vector isn't allocated don't
+                // transfer its data
+                if (!__dv_is_allocated(dvp)) {
+                    m_vars[i].direction.bits = c_parameter_nocopy;
+                    m_vars[i].alloc_if = 0;
+                    m_vars[i].free_if = 0;
+                }
+                if (m_vars[i].direction.bits ||
+                    m_vars[i].alloc_if ||
+                    m_vars[i].free_if) {
+                    const arr_desc *ap;
+
+                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
+                        ap = static_cast<const arr_desc*>(m_vars[i].ptr);
+
+                        // debug dump
+                        __arr_desc_dump("", "IN/OUT", ap, 0);
+                    }
+                    if (!__dv_is_contiguous(dvp)) {
+                        m_vars[i].flags.is_noncont_src = 1;
+                        m_vars_extra[i].read_rng_src =
+                            init_read_ranges_dv(dvp);
+                    }
+
+                    // size and displacement
+                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
+                        // offset and length are derived from the
+                        // array descriptor
+                        __arr_data_offset_and_length(ap,
+                                                     m_vars[i].disp,
+                                                     m_vars[i].size);
+                        if (m_vars[i].direction.bits) {
+                            if (!is_arr_desc_contiguous(ap)) {
+                                if (m_vars[i].flags.is_noncont_src) {
+                                    LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
+                                    return false;
+                                }
+                                m_vars[i].flags.is_noncont_src = 1;
+                                m_vars_extra[i].read_rng_src =
+                                    init_read_ranges_arr_desc(ap);
+                            }
+                        }
+                    }
+                    else {
+                        if (m_vars[i].flags.has_length) {
+                            m_vars[i].size =
+                                __dv_data_length(dvp, m_vars[i].count);
+                        }
+                        else {
+                            m_vars[i].size = __dv_data_length(dvp);
+                        }
+                        m_vars[i].disp = 0;
+                    }
+
+                    // check that length >= 0
+                    if (m_vars[i].alloc_if &&
+                        (m_vars[i].disp + m_vars[i].size < 0)) {
+                        LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
+                        exit(1);
+                    }
+
+                    // base address
+                    void *base = reinterpret_cast<void*>(dvp->Base);
+                    PtrData *ptr_data;
+
+                    // allocate buffer if we have no INTO and don't need
+                    // allocation for the ptr at target
+                    if (src_is_for_mic) {
+                        if (m_vars[i].alloc_if) {
+                            // add new entry
+                            if (!alloc_ptr_data(
+                                    ptr_data,
+                                    base,
+                                    (alloc_base != NULL) ?
+                                        alloc_disp : m_vars[i].disp,
+                                    (alloc_base != NULL) ?
+                                        alloc_size : m_vars[i].size,
+                                    alloc_disp,
+                                    (alloc_base != NULL) ?
+                                        0 : m_vars[i].align)) {
+                                return false;
+                            }
+
+                            if (ptr_data->add_reference() == 0 &&
+                                ptr_data->mic_buf != 0) {
+                                // add buffer to the list of buffers
+                                // that are passed to dispatch call
+                                m_compute_buffers.push_back(
+                                    ptr_data->mic_buf);
+                            }
+                            else {
+                                // will send buffer address to device
+                                m_vars[i].flags.sink_addr = 1;
+                            }
+
+                            if (!ptr_data->is_static) {
+                                // need to add reference for buffer
+                                m_need_runfunction = true;
+                            }
+                        }
+                        else {
+                            bool error_if_not_found = true;
+                            if (m_is_openmp) {
+                                // For omp target update variable is ignored
+                                // if it does not exist.
+                                if (!m_vars[i].alloc_if &&
+                                    !m_vars[i].free_if) {
+                                    error_if_not_found = false;
+                                }
+                            }
+
+                            // use existing association from pointer table
+                            if (!find_ptr_data(ptr_data,
+                                               base,
+                                               m_vars[i].disp,
+                                               m_vars[i].size,
+                                               error_if_not_found)) {
+                                return false;
+                            }
+
+                            if (m_is_openmp) {
+                                // make var nocopy if it does not exist
+                                if (ptr_data == 0) {
+                                    m_vars[i].direction.bits =
+                                        c_parameter_nocopy;
+                                }
+                            }
+
+                            if (ptr_data != 0) {
+                                // need to update base in dope vector on device
+                                m_vars[i].flags.sink_addr = 1;
+                            }
+                        }
+
+                        if (ptr_data != 0) {
+                            if (m_is_openmp) {
+                                // data is transferred only if
+                                // alloc_if == 0 && free_if == 0
+                                // or reference count is 1
+                                if ((m_vars[i].alloc_if ||
+                                     m_vars[i].free_if) &&
+                                    ptr_data->get_reference() != 1) {
+                                    m_vars[i].direction.bits =
+                                        c_parameter_nocopy;
+                                }
+                            }
+
+                            if (ptr_data->alloc_disp != 0) {
+                                m_vars[i].flags.alloc_disp = 1;
+                                m_in_datalen += sizeof(alloc_disp);
+                            }
+
+                            if (m_vars[i].flags.sink_addr) {
+                                // get the buffer's address on the sink
+                                if (!init_mic_address(ptr_data)) {
+                                    return false;
+                                }
+
+                                m_in_datalen += sizeof(ptr_data->mic_addr);
+                            }
+
+                            if (!ptr_data->is_static && m_vars[i].free_if) {
+                                // need to decrement buffer reference on target
+                                m_need_runfunction = true;
+                            }
+
+                            // offset to base from the beginning of the buffer
+                            // memory
+                            m_vars[i].offset =
+                                (char*) base -
+                                (char*) ptr_data->cpu_addr.start();
+
+                            // copy other pointer properties to var descriptor
+                            m_vars[i].mic_offset = ptr_data->mic_offset;
+                            m_vars[i].flags.is_static = ptr_data->is_static;
+                        }
+                    }
+                    else { // !src_is_for_mic
+                        if (!find_ptr_data(ptr_data,
+                                           base,
+                                           m_vars[i].disp,
+                                           m_vars[i].size,
+                                           false)) {
+                            return false;
+                        }
+                        m_vars[i].offset = !ptr_data ? 0 :
+                                (char*) base -
+                                (char*) ptr_data->cpu_addr.start();
+                    }
+
+                    // save pointer data
+                    m_vars_extra[i].src_data = ptr_data;
+                }
+                break;
+
+            default:
+                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.src);
+                LIBOFFLOAD_ABORT;
+        }
+        if (m_vars[i].type.src == c_data_ptr_array) {
+            continue;
+        }
+
+        if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
+            m_vars[i].offset = static_cast<char*>(m_vars[i].ptr) -
+                m_device.m_persist_list.front().cpu_stack_addr;
+        }
+        // if source is used at CPU save its offset and disp
+        if (m_vars[i].into == NULL || m_vars[i].direction.in) {
+            m_vars_extra[i].cpu_offset = m_vars[i].offset;
+            m_vars_extra[i].cpu_disp   = m_vars[i].disp;
+        }
+
+        // If "into" is defined we need to do similar work for it
+        if (!m_vars[i].into) {
+            continue;
+        }
+
+        int64_t into_disp = 0, into_offset = 0;
+
+        switch (m_vars[i].type.dst) {
+            case c_data_ptr_array:
+                break;
+            case c_data:
+            case c_void_ptr:
+            case c_cean_var: {
+                int64_t size = m_vars[i].size;
+
+                if (m_vars[i].type.dst == c_cean_var) {
+                    // array descriptor
+                    const arr_desc *ap =
+                        static_cast<const arr_desc*>(m_vars[i].into);
+
+                    // debug dump
+                    __arr_desc_dump("    ", "INTO", ap, 0);
+
+                    // offset and length are derived from the array descriptor
+                    __arr_data_offset_and_length(ap, into_disp, size);
+
+                    if (!is_arr_desc_contiguous(ap)) {
+                        m_vars[i].flags.is_noncont_dst = 1;
+                        m_vars_extra[i].read_rng_dst =
+                            init_read_ranges_arr_desc(ap);
+                        if (!cean_ranges_match(
+                            m_vars_extra[i].read_rng_src,
+                            m_vars_extra[i].read_rng_dst)) {
+                            LIBOFFLOAD_ERROR(c_ranges_dont_match);
+                            exit(1);
+                        }
+                    }
+                    m_vars[i].into = reinterpret_cast<void*>(ap->base);
+                }
+
+                int64_t size_src = m_vars_extra[i].read_rng_src ?
+                    cean_get_transf_size(m_vars_extra[i].read_rng_src) :
+                    m_vars[i].size;
+                int64_t size_dst = m_vars_extra[i].read_rng_dst ?
+                    cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
+                    size;
+                // The "into" size is expected to be at least the src size
+                if (size_src > size_dst) {
+                    LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
+                                     size_src, size_dst);
+                    exit(1);
+                }
+
+                if (m_vars[i].direction.bits) {
+                    if (m_vars[i].flags.is_static_dstn) {
+                        PtrData *ptr_data;
+
+                        // find data associated with variable
+                        if (!find_ptr_data(ptr_data, m_vars[i].into,
+                                           into_disp, size, false)) {
+                            return false;
+                        }
+                        if (ptr_data != 0) {
+                            // offset to base from the beginning of the buffer
+                            // memory
+                            into_offset =
+                                (char*) m_vars[i].into -
+                                (char*) ptr_data->cpu_addr.start();
+                        }
+                        else {
+                            m_vars[i].flags.is_static_dstn = false;
+                        }
+                        m_vars_extra[i].dst_data = ptr_data;
+                    }
+                }
+
+                if (m_vars[i].direction.in &&
+                    !m_vars[i].flags.is_static_dstn) {
+                    m_in_datalen += m_vars[i].size;
+
+                    // for a non-static target destination defined as a CEAN
+                    // expression we pass its size and disp to the target
+                    if (m_vars[i].type.dst == c_cean_var) {
+                        m_in_datalen += 2 * sizeof(uint64_t);
+                    }
+                    m_need_runfunction = true;
+                }
+                break;
+            }
+
+            case c_dv:
+                if (m_vars[i].direction.bits ||
+                    m_vars[i].alloc_if ||
+                    m_vars[i].free_if) {
+                    ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].into);
+
+                    // debug dump
+                    __dv_desc_dump("INTO", dvp);
+
+                    // send dope vector contents excluding base
+                    m_in_datalen += m_vars[i].size - sizeof(uint64_t);
+                    m_need_runfunction = true;
+                }
+                break;
+
+            case c_string_ptr:
+            case c_data_ptr:
+            case c_cean_var_ptr:
+            case c_dv_ptr: {
+                int64_t size = m_vars[i].size;
+
+                if (m_vars[i].type.dst == c_cean_var_ptr) {
+                    // array descriptor
+                    const arr_desc *ap =
+                        static_cast<const arr_desc*>(m_vars[i].into);
+
+                    // debug dump
+                    __arr_desc_dump("    ", "INTO", ap, 1);
+
+                    // offset and length are derived from the array descriptor
+                    __arr_data_offset_and_length(ap, into_disp, size);
+
+                    if (!is_arr_desc_contiguous(ap)) {
+                        m_vars[i].flags.is_noncont_dst = 1;
+                        m_vars_extra[i].read_rng_dst =
+                            init_read_ranges_arr_desc(ap);
+                        if (!cean_ranges_match(
+                            m_vars_extra[i].read_rng_src,
+                            m_vars_extra[i].read_rng_dst)) {
+                            LIBOFFLOAD_ERROR(c_ranges_dont_match);
+                        }
+                    }
+                    m_vars[i].into = reinterpret_cast<char**>(ap->base);
+                }
+                else if (m_vars[i].type.dst == c_dv_ptr) {
+                    // need to send DV to the device unless it is 'nocopy'
+                    if (m_vars[i].direction.bits ||
+                        m_vars[i].alloc_if ||
+                        m_vars[i].free_if) {
+                        ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].into);
+
+                        // debug dump
+                        __dv_desc_dump("INTO", dvp);
+
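+                        // force the IN direction so the dope vector
+                        // contents are sent to the device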
+                        m_vars[i].direction.bits = c_parameter_in;
+                    }
+                }
+
+                int64_t size_src = m_vars_extra[i].read_rng_src ?
+                    cean_get_transf_size(m_vars_extra[i].read_rng_src) :
+                    m_vars[i].size;
+                int64_t size_dst = m_vars_extra[i].read_rng_dst ?
+                    cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
+                    size;
+                // The "into" size is expected to be at least the src size
+                if (size_src > size_dst) {
+                    LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
+                                     size_src, size_dst);
+                    exit(1);
+                }
+
+                if (m_vars[i].direction.bits) {
+                    PtrData *ptr_data;
+
+                    // base address
+                    void *base = *static_cast<void**>(m_vars[i].into);
+
+                    if (m_vars[i].direction.in) {
+                        // allocate buffer
+                        if (m_vars[i].flags.is_stack_buf) {
+                            // for stack persistent objects the ptr data is
+                            // created by var_desc number 0 and stored at
+                            // m_stack_ptr_data
+                            ptr_data = m_stack_ptr_data;
+                            m_vars[i].flags.sink_addr = 1;
+                        }
+                        else if (m_vars[i].alloc_if) {
+                            // add new entry
+                            if (!alloc_ptr_data(
+                                    ptr_data,
+                                    base,
+                                    (alloc_base != NULL) ?
+                                        alloc_disp : into_disp,
+                                    (alloc_base != NULL) ?
+                                        alloc_size : size,
+                                    alloc_disp,
+                                    (alloc_base != NULL) ?
+                                        0 : m_vars[i].align)) {
+                                return false;
+                            }
+
+                            if (ptr_data->add_reference() == 0 &&
+                                ptr_data->mic_buf != 0) {
+                                // add buffer to the list of buffers that
+                                // are passed to dispatch call
+                                m_compute_buffers.push_back(
+                                    ptr_data->mic_buf);
+                            }
+                            else {
+                                // will send buffer address to device
+                                m_vars[i].flags.sink_addr = 1;
+                            }
+
+                            if (!ptr_data->is_static) {
+                                // need to add reference for buffer
+                                m_need_runfunction = true;
+                            }
+                        }
+                        else {
+                            // use existing association from pointer table
+                            if (!find_ptr_data(ptr_data, base, into_disp, size)) {
+                                return false;
+                            }
+                            m_vars[i].flags.sink_addr = 1;
+                        }
+
+                        if (ptr_data->alloc_disp != 0) {
+                            m_vars[i].flags.alloc_disp = 1;
+                            m_in_datalen += sizeof(alloc_disp);
+                        }
+
+                        if (m_vars[i].flags.sink_addr) {
+                            // get the buffer's address on the sink
+                            if (!init_mic_address(ptr_data)) {
+                                return false;
+                            }
+
+                            m_in_datalen += sizeof(ptr_data->mic_addr);
+                        }
+
+                        if (!ptr_data->is_static && m_vars[i].free_if) {
+                            // need to decrement buffer reference on target
+                            m_need_runfunction = true;
+                        }
+
+                        // copy other pointer properties to var descriptor
+                        m_vars[i].mic_offset = ptr_data->mic_offset;
+                        m_vars[i].flags.is_static_dstn = ptr_data->is_static;
+                    }
+                    else {
+                        if (!find_ptr_data(ptr_data,
+                                           base,
+                                           into_disp,
+                                           m_vars[i].size,
+                                           false)) {
+                            return false;
+                        }
+                    }
+                    if (ptr_data) {
+                        into_offset =
+                            (char*) base -
+                            (char*) ptr_data->cpu_addr.start();
+                    }
+                    // save pointer data
+                    m_vars_extra[i].dst_data = ptr_data;
+                }
+                break;
+            }
+
+            case c_func_ptr:
+                break;
+
+            case c_dv_data:
+            case c_dv_ptr_data:
+            case c_dv_data_slice:
+            case c_dv_ptr_data_slice:
+                if (m_vars[i].direction.bits ||
+                    m_vars[i].alloc_if ||
+                    m_vars[i].free_if) {
+                    const arr_desc *ap;
+                    ArrDesc *dvp;
+                    PtrData *ptr_data;
+                    int64_t disp;
+                    int64_t size;
+
+                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
+                        ap = static_cast<const arr_desc*>(m_vars[i].into);
+
+                        // debug dump
+                        __arr_desc_dump("    ", "INTO", ap, 0);
+
+                        dvp = (m_vars[i].type.dst == c_dv_data_slice) ?
+                              reinterpret_cast<ArrDesc*>(ap->base) :
+                              *reinterpret_cast<ArrDesc**>(ap->base);
+                    }
+                    else {
+                        dvp = (m_vars[i].type.dst == c_dv_data) ?
+                              static_cast<ArrDesc*>(m_vars[i].into) :
+                              *static_cast<ArrDesc**>(m_vars[i].into);
+                    }
+                    if (!__dv_is_contiguous(dvp)) {
+                        m_vars[i].flags.is_noncont_dst = 1;
+                        m_vars_extra[i].read_rng_dst =
+                            init_read_ranges_dv(dvp);
+                    }
+                    // size and displacement
+                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
+                        // offset and length are derived from the array
+                        // descriptor
+                        __arr_data_offset_and_length(ap, into_disp, size);
+                        if (m_vars[i].direction.bits) {
+                            if (!is_arr_desc_contiguous(ap)) {
+                                if (m_vars[i].flags.is_noncont_dst) {
+                                    LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
+                                    return false;
+                                }
+                                m_vars[i].flags.is_noncont_dst = 1;
+                                m_vars_extra[i].read_rng_dst =
+                                    init_read_ranges_arr_desc(ap);
+                                if (!cean_ranges_match(
+                                    m_vars_extra[i].read_rng_src,
+                                    m_vars_extra[i].read_rng_dst)) {
+                                    LIBOFFLOAD_ERROR(c_ranges_dont_match);
+                                }
+                            }
+                        }
+                    }
+                    else {
+                        if (m_vars[i].flags.has_length) {
+                            size = __dv_data_length(dvp, m_vars[i].count);
+                        }
+                        else {
+                            size = __dv_data_length(dvp);
+                        }
+                        disp = 0;
+                    }
+
+                    int64_t size_src =
+                        m_vars_extra[i].read_rng_src ?
+                        cean_get_transf_size(m_vars_extra[i].read_rng_src) :
+                        m_vars[i].size;
+                    int64_t size_dst =
+                        m_vars_extra[i].read_rng_dst ?
+                        cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
+                        size;
+                    // The "into" size is expected to be at least the
+                    // src size
+                    if (size_src > size_dst) {
+                        LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
+                            size_src, size_dst);
+                        exit(1);
+                    }
+
+                    // base address
+                    void *base = reinterpret_cast<void*>(dvp->Base);
+
+                    // allocate buffer
+                    if (m_vars[i].direction.in) {
+                        if (m_vars[i].alloc_if) {
+                            // add new entry
+                            if (!alloc_ptr_data(
+                                    ptr_data,
+                                    base,
+                                    (alloc_base != NULL) ?
+                                        alloc_disp : into_disp,
+                                    (alloc_base != NULL) ?
+                                        alloc_size : size,
+                                    alloc_disp,
+                                    (alloc_base != NULL) ?
+                                        0 : m_vars[i].align)) {
+                                return false;
+                            }
+                            if (ptr_data->add_reference() == 0 &&
+                                ptr_data->mic_buf != 0) {
+                                // add buffer to the list of buffers
+                                // that are passed to dispatch call
+                                m_compute_buffers.push_back(
+                                    ptr_data->mic_buf);
+                            }
+                            else {
+                                // will send buffer address to device
+                                m_vars[i].flags.sink_addr = 1;
+                            }
+
+                            if (!ptr_data->is_static) {
+                                // need to add reference for buffer
+                                m_need_runfunction = true;
+                            }
+                        }
+                        else {
+                            // use existing association from pointer table
+                            if (!find_ptr_data(ptr_data, base, into_disp, size)) {
+                                return false;
+                            }
+
+                            // need to update base in dope vector on device
+                            m_vars[i].flags.sink_addr = 1;
+                        }
+
+                        if (ptr_data->alloc_disp != 0) {
+                            m_vars[i].flags.alloc_disp = 1;
+                            m_in_datalen += sizeof(alloc_disp);
+                        }
+
+                        if (m_vars[i].flags.sink_addr) {
+                            // get the buffer's address on the sink
+                            if (!init_mic_address(ptr_data)) {
+                                return false;
+                            }
+                            m_in_datalen += sizeof(ptr_data->mic_addr);
+                        }
+
+                        if (!ptr_data->is_static && m_vars[i].free_if) {
+                            // need to decrement buffer reference on target
+                            m_need_runfunction = true;
+                        }
+
+                        // offset to base from the beginning of the buffer
+                        // memory
+                        into_offset =
+                            (char*) base - (char*) ptr_data->cpu_addr.start();
+
+                        // copy other pointer properties to var descriptor
+                        m_vars[i].mic_offset = ptr_data->mic_offset;
+                        m_vars[i].flags.is_static_dstn = ptr_data->is_static;
+                    }
+                    else { // src_is_for_mic
+                        if (!find_ptr_data(ptr_data,
+                                           base,
+                                           into_disp,
+                                           size,
+                                           false)) {
+                            return false;
+                        }
+                        into_offset = !ptr_data ?
+                            0 :
+                            (char*) base - (char*) ptr_data->cpu_addr.start();
+                    }
+
+                    // save pointer data
+                    m_vars_extra[i].dst_data = ptr_data;
+                }
+                break;
+
+            default:
+                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.dst);
+                LIBOFFLOAD_ABORT;
+        }
+        // if into is used at CPU save its offset and disp
+        if (m_vars[i].direction.out) {
+            m_vars_extra[i].cpu_offset = into_offset;
+            m_vars_extra[i].cpu_disp   = into_disp;
+        }
+        else {
+            if (m_vars[i].flags.is_stack_buf) {
+                into_offset = static_cast<char*>(m_vars[i].into) -
+                    m_device.m_persist_list.front().cpu_stack_addr;
+            }
+            m_vars[i].offset = into_offset;
+            m_vars[i].disp   = into_disp;
+        }
+    }
+
+    return true;
+}
+
+bool OffloadDescriptor::setup_misc_data(const char *name)
+{
+    OffloadTimer timer(get_timer_data(), c_offload_host_setup_misc_data);
+
+    // we can skip the run function call together with the wait if the
+    // offloaded region is empty and there is no user-defined non-pointer
+    // IN/OUT data
+    if (m_need_runfunction) {
+        // variable descriptors are sent as input data
+        m_in_datalen += m_vars_total * sizeof(VarDesc);
+
+        // timer data is sent as a part of the output data
+        m_out_datalen += OFFLOAD_TIMER_DATALEN();
+
+        // max from input data and output data length
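+        // (the same misc/return area carries both directions, so it is
+        // sized for the larger of the two)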
+        uint64_t data_len = m_in_datalen > m_out_datalen ? m_in_datalen :
+                                                           m_out_datalen;
+
+        // Misc data has the following layout
+        //     <Function Descriptor>
+        //     <Function Name>
+        //     <In/Out Data>            (optional)
+        //
+        // We can transfer copyin/copyout data in the misc/return data passed
+        // to the run function call if its size does not exceed
+        // COI_PIPELINE_MAX_IN_MISC_DATA_LEN. Otherwise we have to allocate a
+        // buffer for it.
+
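+        // A sketch of the size computation below, assuming the entry name is
+        // "foo": sizeof(FunctionDescriptor) + strlen("foo") + 1 bytes,
+        // rounded up to the next multiple of 8 by (size + 7) & ~7.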
+        m_func_desc_size = sizeof(FunctionDescriptor) + strlen(name) + 1;
+        m_func_desc_size = (m_func_desc_size + 7) & ~7;
+
+        int misc_data_offset = 0;
+        int misc_data_size = 0;
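+        // misc_data_offset stays 0 when a separate COI buffer carries the
+        // copyin/copyout data; otherwise it points past the function
+        // descriptor and entry name into the misc data area.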
+        if (data_len > 0) {
+            if (m_func_desc_size +
+                m_in_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN &&
+                m_out_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN) {
+                // use misc/return data for copyin/copyout
+                misc_data_offset = m_func_desc_size;
+                misc_data_size = data_len;
+            }
+            else {
+                OffloadTimer timer_buf(get_timer_data(),
+                                       c_offload_host_alloc_data_buffer);
+
+                // send/receive data using buffer
+                COIRESULT res = COI::BufferCreate(data_len,
+                                                  COI_BUFFER_NORMAL,
+                                                  0, 0,
+                                                  1, &m_device.get_process(),
+                                                  &m_inout_buf);
+                if (res != COI_SUCCESS) {
+                    if (m_status != 0) {
+                        m_status->result = translate_coi_error(res);
+                        return false;
+                    }
+                    report_coi_error(c_buf_create, res);
+                }
+
+                m_compute_buffers.push_back(m_inout_buf);
+                m_destroy_buffers.push_back(m_inout_buf);
+            }
+        }
+
+        // initialize function descriptor
+        m_func_desc = (FunctionDescriptor*) malloc(m_func_desc_size +
+                                                   misc_data_size);
+        m_func_desc->console_enabled = console_enabled;
+        m_func_desc->timer_enabled =
+            timer_enabled || (offload_report_level && offload_report_enabled);
+        m_func_desc->offload_report_level = offload_report_level;
+        m_func_desc->offload_number = GET_OFFLOAD_NUMBER(get_timer_data());
+        m_func_desc->in_datalen = m_in_datalen;
+        m_func_desc->out_datalen = m_out_datalen;
+        m_func_desc->vars_num = m_vars_total;
+        m_func_desc->data_offset = misc_data_offset;
+
+        // append entry name
+        strcpy(m_func_desc->data, name);
+    }
+
+    return true;
+}
+
+bool OffloadDescriptor::wait_dependencies(
+    const void **waits,
+    int num_waits
+)
+{
+    OffloadTimer timer(get_timer_data(), c_offload_host_wait_deps);
+    bool ret = true;
+
+    for (int i = 0; i < num_waits; i++) {
+
+        OffloadDescriptor *task = m_device.find_signal(waits[i], true);
+        if (task == 0) {
+            LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
+                             waits[i]);
+            LIBOFFLOAD_ABORT;
+        }
+
+        if (!task->offload_finish()) {
+            ret = false;
+        }
+
+        task->cleanup();
+        delete task;
+    }
+
+    return ret;
+}
+
+bool OffloadDescriptor::offload(
+    const char *name,
+    bool is_empty,
+    VarDesc *vars,
+    VarDesc2 *vars2,
+    int vars_total,
+    const void **waits,
+    int num_waits,
+    const void **signal,
+    int entry_id,
+    const void *stack_addr
+)
+{
+    if (signal == 0) {
+        OFFLOAD_DEBUG_TRACE_1(1,
+                      GET_OFFLOAD_NUMBER(get_timer_data()),
+                      c_offload_init_func,
+                      "Offload function %s, is_empty=%d, #varDescs=%d, "
+                      "#waits=%d, signal=none\n",
+                      name, is_empty, vars_total, num_waits);
+        OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
+                      c_offload_sent_pointer_data,
+                      "#Wait : %d \n", num_waits);
+        OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
+                      c_offload_signal,
+                      "none %d\n", 0);
+    }
+    else {
+        OFFLOAD_DEBUG_TRACE_1(1,
+                      GET_OFFLOAD_NUMBER(get_timer_data()),
+                      c_offload_init_func,
+                      "Offload function %s, is_empty=%d, #varDescs=%d, "
+                      "#waits=%d, signal=%p\n",
+                      name, is_empty, vars_total, num_waits,
+                      *signal);
+
+        OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
+                      c_offload_signal,
+                      "%p\n", *signal);
+    }
+    OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
+                      c_offload_wait,
+                      "#Wait : %d  %p\n", num_waits, waits);
+
+    if (m_status != 0) {
+        m_status->result = OFFLOAD_SUCCESS;
+        m_status->device_number = m_device.get_logical_index();
+    }
+
+    m_need_runfunction = !is_empty;
+
+    // wait for dependencies to finish
+    if (!wait_dependencies(waits, num_waits)) {
+        cleanup();
+        return false;
+    }
+
+    // setup buffers
+    if (!setup_descriptors(vars, vars2, vars_total, entry_id, stack_addr)) {
+        cleanup();
+        return false;
+    }
+
+    // initiate send for pointers; we want to do this as early as possible.
+    if (!send_pointer_data(signal != 0)) {
+        cleanup();
+        return false;
+    }
+
+    // setup misc data for run function
+    if (!setup_misc_data(name)) {
+        cleanup();
+        return false;
+    }
+
+    // gather copyin data into buffer
+    if (!gather_copyin_data()) {
+        cleanup();
+        return false;
+    }
+
+    // Start the computation
+    if (!compute()) {
+        cleanup();
+        return false;
+    }
+
+    // initiate receive for pointers
+    if (!receive_pointer_data(signal != 0)) {
+        cleanup();
+        return false;
+    }
+
+    // if there is a signal, save the descriptor for later use.
+    if (signal != 0) {
+        m_device.add_signal(*signal, this);
+        return true;
+    }
+
+    // wait for the offload to finish.
+    if (!offload_finish()) {
+        cleanup();
+        return false;
+    }
+
+    cleanup();
+    return true;
+}
+
+bool OffloadDescriptor::offload_finish()
+{
+    COIRESULT res;
+
+    // wait for compute dependencies to become signaled
+    if (m_in_deps_total > 0) {
+        OffloadTimer timer(get_timer_data(), c_offload_host_wait_compute);
+
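+        // With active wait enabled we spin on a zero-timeout poll to keep
+        // the CPU busy; otherwise the -1 timeout blocks until all events
+        // are signaled.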
+        if (__offload_active_wait) {
+            // keep CPU busy
+            do {
+                res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
+            }
+            while (res == COI_TIME_OUT_REACHED);
+        }
+        else {
+            res = COI::EventWait(m_in_deps_total, m_in_deps, -1, 1, 0, 0);
+        }
+
+        if (res != COI_SUCCESS) {
+            if (m_status != 0) {
+                m_status->result = translate_coi_error(res);
+                return false;
+            }
+            report_coi_error(c_event_wait, res);
+        }
+    }
+
+    // scatter copyout data received from target
+    if (!scatter_copyout_data()) {
+        return false;
+    }
+    // wait for receive dependencies to become signaled
+    if (m_out_deps_total > 0) {
+        OffloadTimer timer(get_timer_data(), c_offload_host_wait_buffers_reads);
+
+        if (__offload_active_wait) {
+            // keep CPU busy
+            do {
+                res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
+            }
+            while (res == COI_TIME_OUT_REACHED);
+        }
+        else {
+            res = COI::EventWait(m_out_deps_total, m_out_deps, -1, 1, 0, 0);
+        }
+
+        if (res != COI_SUCCESS) {
+            if (m_status != 0) {
+                m_status->result = translate_coi_error(res);
+                return false;
+            }
+            report_coi_error(c_event_wait, res);
+        }
+    }
+
+    // destroy buffers
+    {
+        OffloadTimer timer(get_timer_data(), c_offload_host_destroy_buffers);
+
+        for (BufferList::const_iterator it = m_destroy_buffers.begin();
+             it != m_destroy_buffers.end(); it++) {
+            res = COI::BufferDestroy(*it);
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                    return false;
+                }
+                report_coi_error(c_buf_destroy, res);
+            }
+        }
+    }
+
+    return true;
+}
+
+void OffloadDescriptor::cleanup()
+{
+    // release device in orsl
+    ORSL::release(m_device.get_logical_index());
+
+    OFFLOAD_TIMER_STOP(get_timer_data(), c_offload_host_total_offload);
+
+    // report stuff
+    Offload_Report_Epilog(get_timer_data());
+}
+
+bool OffloadDescriptor::is_signaled()
+{
+    bool signaled = true;
+    COIRESULT res;
+
+    // check compute and receive dependencies
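+    // a zero timeout makes EventWait return immediately, so this is a
+    // non-blocking poll of the dependencies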
+    if (m_in_deps_total > 0) {
+        res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
+        signaled = signaled && (res == COI_SUCCESS);
+    }
+    if (m_out_deps_total > 0) {
+        res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
+        signaled = signaled && (res == COI_SUCCESS);
+    }
+
+    return signaled;
+}
+
+// Send pointer data if the source, the destination, or both are
+// noncontiguous. It is guaranteed that the destination length is sufficient
+// for the transferred data.
+bool OffloadDescriptor::send_noncontiguous_pointer_data(
+    int i,
+    PtrData* src_data,
+    PtrData* dst_data,
+    COIEVENT *event
+    )
+{
+    int64_t offset_src, offset_dst;
+    int64_t length_src, length_dst;
+    int64_t length_src_cur, length_dst_cur;
+    int64_t send_size, data_sent = 0;
+    COIRESULT res;
+    bool dst_is_empty = true;
+    bool src_is_empty = true;
+
+    // Set length_src and length_dst
+    length_src = (m_vars_extra[i].read_rng_src) ?
+        m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
+    length_dst = !m_vars[i].into ? length_src :
+                     (m_vars_extra[i].read_rng_dst) ?
+                     m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
+    send_size = (length_src < length_dst) ? length_src : length_dst;
+
+    // successively get contiguous ranges,
+    // determine the corresponding destination offset and send the data
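+    // (send_size is the smaller of the contiguous source and destination
+    // range sizes; each iteration transfers send_size bytes and advances
+    // whichever side has exhausted its current range)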
+    do {
+        if (src_is_empty) {
+            if (m_vars_extra[i].read_rng_src) {
+                if (!get_next_range(m_vars_extra[i].read_rng_src,
+                         &offset_src)) {
+                    // source ranges are exhausted - nothing to send
+                    break;
+                }
+            }
+            else if (data_sent == 0) {
+                offset_src = m_vars_extra[i].cpu_disp;
+            }
+            else {
+                break;
+            }
+            length_src_cur = length_src;
+        }
+        else {
+            // if the source is contiguous or its contiguous range is greater
+            // than the destination one
+            offset_src += send_size;
+        }
+        length_src_cur -= send_size;
+        src_is_empty = length_src_cur == 0;
+
+        if (dst_is_empty) {
+            if (m_vars[i].into) {
+                if (m_vars_extra[i].read_rng_dst) {
+                    if (!get_next_range(m_vars_extra[i].read_rng_dst,
+                             &offset_dst)) {
+                        // destination ranges are exhausted
+                        LIBOFFLOAD_ERROR(c_destination_is_over);
+                        return false;
+                    }
+                }
+                // into is contiguous.
+                else {
+                    offset_dst = m_vars[i].disp;
+                }
+                length_dst_cur = length_dst;
+            }
+            // same as source
+            else {
+                offset_dst = offset_src;
+                length_dst_cur = length_src;
+            }
+        }
+        else {
+            // if the destination is contiguous or its contiguous range is
+            // greater than the source one
+            offset_dst += send_size;
+        }
+        length_dst_cur -= send_size;
+        dst_is_empty = length_dst_cur == 0;
+
+        if (src_data != 0 && src_data->cpu_buf != 0) {
+            res = COI::BufferCopy(
+                dst_data->mic_buf,
+                src_data->cpu_buf,
+                m_vars[i].mic_offset - dst_data->alloc_disp +
+                m_vars[i].offset + offset_dst,
+                m_vars_extra[i].cpu_offset + offset_src,
+                send_size,
+                COI_COPY_UNSPECIFIED,
+                0, 0,
+                event);
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                    return false;
+                }
+                report_coi_error(c_buf_copy, res);
+            }
+        }
+        else {
+            char *base = offload_get_src_base(m_vars[i].ptr,
+                m_vars[i].type.src);
+
+            res = COI::BufferWrite(
+                dst_data->mic_buf,
+                m_vars[i].mic_offset - dst_data->alloc_disp +
+                m_vars[i].offset + offset_dst,
+                base + offset_src,
+                send_size,
+                COI_COPY_UNSPECIFIED,
+                0, 0,
+                event);
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                    return false;
+                }
+                report_coi_error(c_buf_write, res);
+            }
+        }
+        data_sent += length_src;
+    }
+    while (true);
+    return true;
+}
+
+bool OffloadDescriptor::send_pointer_data(bool is_async)
+{
+    OffloadTimer timer(get_timer_data(), c_offload_host_send_pointers);
+
+    uint64_t ptr_sent = 0;
+    COIRESULT res;
+
+    // Initiate send for pointer data
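+    // For asynchronous offloads, and for transfers of at least
+    // __offload_use_async_buffer_write bytes, each write records a COI
+    // event in m_in_deps so that the compute task can depend on it.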
+    for (int i = 0; i < m_vars_total; i++) {
+        switch (m_vars[i].type.dst) {
+            case c_data_ptr_array:
+                break;
+            case c_data:
+            case c_void_ptr:
+            case c_cean_var:
+                if (m_vars[i].direction.in &&
+                    m_vars[i].flags.is_static_dstn) {
+                    COIEVENT *event =
+                        (is_async ||
+                         m_vars[i].size >= __offload_use_async_buffer_write) ?
+                        &m_in_deps[m_in_deps_total++] : 0;
+                    PtrData* dst_data = m_vars[i].into ?
+                                            m_vars_extra[i].dst_data :
+                                            m_vars_extra[i].src_data;
+                    PtrData* src_data =
+                        (VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
+                         VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
+                         m_vars[i].flags.is_static) ?
+                            m_vars_extra[i].src_data : 0;
+
+                    if (m_vars[i].flags.is_noncont_src ||
+                        m_vars[i].flags.is_noncont_dst) {
+                        if (!send_noncontiguous_pointer_data(
+                                i, src_data, dst_data, event)) {
+                            return false;
+                        }
+                    }
+                    else if (src_data != 0 && src_data->cpu_buf != 0) {
+                        res = COI::BufferCopy(
+                            dst_data->mic_buf,
+                            src_data->cpu_buf,
+                            m_vars[i].mic_offset - dst_data->alloc_disp +
+                            m_vars[i].offset + m_vars[i].disp,
+                            m_vars_extra[i].cpu_offset +
+                            m_vars_extra[i].cpu_disp,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            0, 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_copy, res);
+                        }
+                    }
+                    else {
+                        char *base = offload_get_src_base(m_vars[i].ptr,
+                                                          m_vars[i].type.src);
+                        res = COI::BufferWrite(
+                            dst_data->mic_buf,
+                            m_vars[i].mic_offset - dst_data->alloc_disp +
+                            m_vars[i].offset + m_vars[i].disp,
+                            base + m_vars_extra[i].cpu_disp,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            0, 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_write, res);
+                        }
+                    }
+                    ptr_sent += m_vars[i].size;
+                }
+                break;
+
+            case c_string_ptr:
+            case c_data_ptr:
+            case c_cean_var_ptr:
+            case c_dv_ptr:
+                if (m_vars[i].direction.in && m_vars[i].size > 0) {
+                    COIEVENT *event =
+                        (is_async ||
+                         m_vars[i].size >= __offload_use_async_buffer_write) ?
+                        &m_in_deps[m_in_deps_total++] : 0;
+                    PtrData* dst_data = m_vars[i].into ?
+                                            m_vars_extra[i].dst_data :
+                                            m_vars_extra[i].src_data;
+                    PtrData* src_data =
+                        (VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
+                         VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
+                         m_vars[i].flags.is_static) ?
+                            m_vars_extra[i].src_data : 0;
+
+                    if (m_vars[i].flags.is_noncont_src ||
+                        m_vars[i].flags.is_noncont_dst) {
+                        if (!send_noncontiguous_pointer_data(
+                                i, src_data, dst_data, event)) {
+                            return false;
+                        }
+                    }
+                    else if (src_data != 0 && src_data->cpu_buf != 0) {
+                        res = COI::BufferCopy(
+                            dst_data->mic_buf,
+                            src_data->cpu_buf,
+                            m_vars[i].mic_offset - dst_data->alloc_disp +
+                            m_vars[i].offset + m_vars[i].disp,
+                            m_vars_extra[i].cpu_offset +
+                            m_vars_extra[i].cpu_disp,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            0, 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_copy, res);
+                        }
+                    }
+                    else {
+                        char *base = offload_get_src_base(m_vars[i].ptr,
+                                                          m_vars[i].type.src);
+                        res = COI::BufferWrite(
+                            dst_data->mic_buf,
+                            m_vars[i].mic_offset - dst_data->alloc_disp +
+                            m_vars[i].offset + m_vars[i].disp,
+                            base + m_vars_extra[i].cpu_disp,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            0, 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_write, res);
+                        }
+                    }
+
+                    ptr_sent += m_vars[i].size;
+                }
+                break;
+
+            case c_dv_data:
+            case c_dv_ptr_data:
+                if (m_vars[i].direction.in &&
+                    m_vars[i].size > 0) {
+                    PtrData *ptr_data = m_vars[i].into ?
+                                        m_vars_extra[i].dst_data :
+                                        m_vars_extra[i].src_data;
+                    PtrData* src_data = m_vars_extra[i].src_data;
+
+                    COIEVENT *event =
+                        (is_async ||
+                         m_vars[i].size >= __offload_use_async_buffer_write) ?
+                        &m_in_deps[m_in_deps_total++] : 0;
+
+                    if (m_vars[i].flags.is_noncont_src ||
+                        m_vars[i].flags.is_noncont_dst) {
+                        if (!send_noncontiguous_pointer_data(
+                                i, src_data, ptr_data, event)) {
+                            return false;
+                        }
+                    }
+                    else if (src_data && src_data->cpu_buf != 0) {
+                        res = COI::BufferCopy(
+                            ptr_data->mic_buf,
+                            src_data->cpu_buf,
+                            m_vars[i].offset + ptr_data->mic_offset -
+                            ptr_data->alloc_disp +
+                            m_vars[i].disp,
+                            m_vars_extra[i].cpu_offset +
+                            m_vars_extra[i].cpu_disp,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            0, 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_copy, res);
+                        }
+                    }
+                    else {
+                        char *base = offload_get_src_base(m_vars[i].ptr,
+                                                          m_vars[i].type.src);
+                        res = COI::BufferWrite(
+                            ptr_data->mic_buf,
+                            ptr_data->mic_offset - ptr_data->alloc_disp +
+                            m_vars[i].offset + m_vars[i].disp,
+                            base + m_vars_extra[i].cpu_disp,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            0, 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_write, res);
+                        }
+                    }
+                    ptr_sent += m_vars[i].size;
+                }
+                break;
+
+            case c_dv_data_slice:
+            case c_dv_ptr_data_slice:
+                if (m_vars[i].direction.in &&
+                    m_vars[i].size > 0) {
+                    PtrData *dst_data = m_vars[i].into ?
+                                        m_vars_extra[i].dst_data :
+                                        m_vars_extra[i].src_data;
+                    PtrData* src_data =
+                        (VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
+                        VAR_TYPE_IS_DV_DATA(m_vars[i].type.src) ||
+                        VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src) ||
+                        VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
+                        m_vars[i].flags.is_static) ?
+                            m_vars_extra[i].src_data : 0;
+                    COIEVENT *event =
+                        (is_async ||
+                         m_vars[i].size >= __offload_use_async_buffer_write) ?
+                        &m_in_deps[m_in_deps_total++] : 0;
+                    if (m_vars[i].flags.is_noncont_src ||
+                        m_vars[i].flags.is_noncont_dst) {
+                        if (!send_noncontiguous_pointer_data(
+                                i, src_data, dst_data, event)) {
+                            return false;
+                        }
+                    }
+                    else if (src_data && src_data->cpu_buf != 0) {
+                        res = COI::BufferCopy(
+                            dst_data->mic_buf,
+                            src_data->cpu_buf,
+                            m_vars[i].offset - dst_data->alloc_disp +
+                            dst_data->mic_offset +
+                            m_vars[i].disp,
+                            m_vars_extra[i].cpu_offset +
+                            m_vars_extra[i].cpu_disp,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            0, 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_copy, res);
+                        }
+                    }
+                    else {
+                        char *base = offload_get_src_base(m_vars[i].ptr,
+                                                          m_vars[i].type.src);
+                        res = COI::BufferWrite(
+                            dst_data->mic_buf,
+                            dst_data->mic_offset - dst_data->alloc_disp +
+                            m_vars[i].offset + m_vars[i].disp,
+                            base + m_vars_extra[i].cpu_disp,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            0, 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_write, res);
+                        }
+                    }
+
+                    ptr_sent += m_vars[i].size;
+                }
+                break;
+
+            default:
+                break;
+        }
+
+        // The alloc field isn't used on the target,
+        // so we can reuse it for the offset of array pointers.
+        if (m_vars_extra[i].is_arr_ptr_el) {
+            m_vars[i].ptr_arr_offset = m_vars_extra[i].ptr_arr_offset;
+        }
+    }
+
+    if (m_status) {
+        m_status->data_sent += ptr_sent;
+    }
+
+    OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), ptr_sent);
+    OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
+                  c_offload_sent_pointer_data,
+                  "Total pointer data sent to target: [%lld] bytes\n",
+                  ptr_sent);
+
+    return true;
+}
+
+bool OffloadDescriptor::gather_copyin_data()
+{
+    OffloadTimer timer(get_timer_data(), c_offload_host_gather_inputs);
+
+    if (m_need_runfunction && m_in_datalen > 0) {
+        COIMAPINSTANCE map_inst;
+        char *data;
+
+        // init marshaller
+        if (m_inout_buf != 0) {
+            OffloadTimer timer_map(get_timer_data(),
+                                   c_offload_host_map_in_data_buffer);
+
+            COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_in_datalen,
+                                           COI_MAP_WRITE_ENTIRE_BUFFER,
+                                           0, 0, 0, &map_inst,
+                                           reinterpret_cast<void**>(&data));
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                    return false;
+                }
+                report_coi_error(c_buf_map, res);
+            }
+        }
+        else {
+            data = (char*) m_func_desc + m_func_desc->data_offset;
+        }
+
+        // send variable descriptors
+        memcpy(data, m_vars, m_vars_total * sizeof(VarDesc));
+        data += m_vars_total * sizeof(VarDesc);
+
+        // init marshaller
+        m_in.init_buffer(data, m_in_datalen);
+
+        // Gather copy data into buffer
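+        // For each variable we first marshal alloc_disp (if flagged) and
+        // the sink address (if flagged), then the type-specific payload.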
+        for (int i = 0; i < m_vars_total; i++) {
+            bool src_is_for_mic = (m_vars[i].direction.out ||
+                                   m_vars[i].into == NULL);
+            PtrData* ptr_data = src_is_for_mic ?
+                                m_vars_extra[i].src_data :
+                                m_vars_extra[i].dst_data;
+            if (m_vars[i].flags.alloc_disp) {
+                m_in.send_data(&ptr_data->alloc_disp,
+                               sizeof(ptr_data->alloc_disp));
+            }
+
+            // send sink address to the target
+            if (m_vars[i].flags.sink_addr) {
+                m_in.send_data(&ptr_data->mic_addr,
+                               sizeof(ptr_data->mic_addr));
+            }
+
+            switch (m_vars[i].type.dst) {
+                case c_data_ptr_array:
+                    break;
+                case c_data:
+                case c_void_ptr:
+                case c_cean_var:
+                    if (m_vars[i].direction.in &&
+                        !m_vars[i].flags.is_static_dstn) {
+
+                        char *ptr = offload_get_src_base(m_vars[i].ptr,
+                                                         m_vars[i].type.src);
+                        if (m_vars[i].type.dst == c_cean_var) {
+                            // offset and length are derived from the array
+                            // descriptor
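+                            // (this pairs with the 2 * sizeof(uint64_t)
+                            // reserved in m_in_datalen during setup)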
+                            int64_t size = m_vars[i].size;
+                            int64_t disp = m_vars[i].disp;
+                            m_in.send_data(reinterpret_cast<char*>(&size),
+                                           sizeof(int64_t));
+                            m_in.send_data(reinterpret_cast<char*>(&disp),
+                                           sizeof(int64_t));
+                        }
+
+                        m_in.send_data(ptr + m_vars_extra[i].cpu_disp,
+                                       m_vars[i].size);
+                    }
+                    break;
+
+                case c_dv:
+                    if (m_vars[i].direction.bits ||
+                        m_vars[i].alloc_if ||
+                        m_vars[i].free_if) {
+                        // send dope vector excluding base
+                        char *ptr = static_cast<char*>(m_vars[i].ptr);
+                        m_in.send_data(ptr + sizeof(uint64_t),
+                                       m_vars[i].size - sizeof(uint64_t));
+                    }
+                    break;
+
+                case c_data_ptr:
+                    // send to target addresses of obsolete
+                    // stacks to be released
+                    if (m_vars[i].flags.is_stack_buf &&
+                        !m_vars[i].direction.bits &&
+                        m_vars[i].alloc_if &&
+                        m_vars[i].size != 0) {
+                        for (PtrDataList::iterator it =
+                            m_destroy_stack.begin();
+                            it != m_destroy_stack.end(); it++) {
+                            PtrData * ptr_data = *it;
+                            m_in.send_data(&(ptr_data->mic_addr),
+                                sizeof(ptr_data->mic_addr));
+                        }
+                    }
+                    break;
+                case c_func_ptr:
+                    if (m_vars[i].direction.in) {
+                        m_in.send_func_ptr(*((const void**) m_vars[i].ptr));
+                    }
+                    break;
+
+                default:
+                    break;
+            }
+        }
+
+        if (m_status) {
+            m_status->data_sent += m_in.get_tfr_size();
+        }
+
+        if (m_func_desc->data_offset == 0) {
+            OffloadTimer timer_unmap(get_timer_data(),
+                                     c_offload_host_unmap_in_data_buffer);
+            COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                    return false;
+                }
+                report_coi_error(c_buf_unmap, res);
+            }
+        }
+    }
+
+    OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), m_in.get_tfr_size());
+    OFFLOAD_DEBUG_TRACE_1(1,
+                  GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_copyin_data,
+                  "Total copyin data sent to target: [%lld] bytes\n",
+                  m_in.get_tfr_size());
+
+    return true;
+}
+
+bool OffloadDescriptor::compute()
+{
+    OffloadTimer timer(get_timer_data(), c_offload_host_start_compute);
+
+    if (m_need_runfunction) {
+        OFFLOAD_DEBUG_TRACE_1(2, GET_OFFLOAD_NUMBER(get_timer_data()),
+                              c_offload_compute, "Compute task on MIC\n");
+
+        void* misc = m_func_desc;
+        int   misc_len = m_func_desc_size;
+        void* ret = 0;
+        int   ret_len = 0;
+
+        if (m_func_desc->data_offset != 0) {
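+            // the copyin payload rides in the misc data area; the same
+            // area at data_offset receives the output data after the run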
+            misc_len += m_in_datalen;
+
+            if (m_out_datalen > 0) {
+                ret = (char*) m_func_desc + m_func_desc->data_offset;
+                ret_len = m_out_datalen;
+            }
+        }
+
+        // dispatch task
+        COIRESULT res;
+        COIEVENT event;
+        res = m_device.compute(m_compute_buffers,
+                               misc, misc_len,
+                               ret, ret_len,
+                               m_in_deps_total,
+                               m_in_deps_total > 0 ? m_in_deps : 0,
+                               &event);
+        if (res != COI_SUCCESS) {
+            if (m_status != 0) {
+                m_status->result = translate_coi_error(res);
+                return false;
+            }
+            report_coi_error(c_pipeline_run_func, res);
+        }
+
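+        // from this point the only input dependency is the run-function
+        // completion event; subsequent receives wait on it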
+        m_in_deps_total = 1;
+        m_in_deps[0] = event;
+    }
+
+    return true;
+}
+
+// Receive pointer data if the source, the destination, or both are
+// noncontiguous. It is guaranteed that the destination length is sufficient
+// for the transferred data.
+bool OffloadDescriptor::receive_noncontiguous_pointer_data(
+    int i,
+    char* base,
+    COIBUFFER dst_buf,
+    COIEVENT *event
+)
+{
+    int64_t offset_src, offset_dst;
+    int64_t length_src, length_dst;
+    int64_t length_src_cur, length_dst_cur;
+    int64_t receive_size, data_received = 0;
+    COIRESULT res;
+    bool dst_is_empty = true;
+    bool src_is_empty = true;
+
+    // Set length_src and length_dst
+    length_src = (m_vars_extra[i].read_rng_src) ?
+        m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
+    length_dst = !m_vars[i].into ? length_src :
+                     (m_vars_extra[i].read_rng_dst) ?
+                     m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
+    receive_size = (length_src < length_dst) ? length_src : length_dst;
+
+    // sequentially walk the contiguous ranges, determine the corresponding
+    // destination offset and receive the data
+    do {
+        // get source offset
+        if (src_is_empty) {
+            if (m_vars_extra[i].read_rng_src) {
+                if (!get_next_range(m_vars_extra[i].read_rng_src,
+                         &offset_src)) {
+                    // source ranges are exhausted - nothing more to receive
+                    break;
+                }
+            }
+            else if (data_received == 0) {
+                offset_src = 0;
+            }
+            else {
+                break;
+            }
+            length_src_cur = length_src;
+        }
+        else {
+            // the source is contiguous, or its current contiguous range is
+            // larger than the destination's
+            offset_src += receive_size;
+        }
+        length_src_cur -= receive_size;
+        src_is_empty = length_src_cur == 0;
+
+        // get destination offset
+        if (dst_is_empty) {
+            if (m_vars[i].into) {
+                if (m_vars_extra[i].read_rng_dst) {
+                    if (!get_next_range(m_vars_extra[i].read_rng_dst,
+                             &offset_dst)) {
+                        // destination ranges are exhausted
+                        LIBOFFLOAD_ERROR(c_destination_is_over);
+                        return false;
+                    }
+                }
+                // destination is contiguous.
+                else {
+                    offset_dst = m_vars_extra[i].cpu_disp;
+                }
+                length_dst_cur = length_dst;
+            }
+            // same as source
+            else {
+                offset_dst = offset_src;
+                length_dst_cur = length_src;
+            }
+        }
+        else {
+            // the destination is contiguous, or its current contiguous range
+            // is larger than the source's
+            offset_dst += receive_size;
+        }
+        length_dst_cur -= receive_size;
+        dst_is_empty = length_dst_cur == 0;
+
+        if (dst_buf != 0) {
+            res = COI::BufferCopy(
+                dst_buf,
+                m_vars_extra[i].src_data->mic_buf,
+                m_vars_extra[i].cpu_offset + offset_dst,
+                m_vars[i].offset + offset_src +
+                m_vars[i].mic_offset -
+                m_vars_extra[i].src_data->alloc_disp,
+                receive_size,
+                COI_COPY_UNSPECIFIED,
+                m_in_deps_total,
+                m_in_deps_total > 0 ? m_in_deps : 0,
+                event);
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                    return false;
+                }
+                report_coi_error(c_buf_copy, res);
+            }
+        }
+        else {
+            res = COI::BufferRead(
+                m_vars_extra[i].src_data->mic_buf,
+                m_vars[i].offset + offset_src +
+                m_vars[i].mic_offset -
+                m_vars_extra[i].src_data->alloc_disp,
+                base + offset_dst,
+                receive_size,
+                COI_COPY_UNSPECIFIED,
+                m_in_deps_total,
+                m_in_deps_total > 0 ? m_in_deps : 0,
+                event);
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                    return false;
+                }
+                report_coi_error(c_buf_read, res);
+            }
+        }
+        data_received += receive_size;
+    }
+    while (true);
+    return true;
+}
+
+bool OffloadDescriptor::receive_pointer_data(bool is_async)
+{
+    OffloadTimer timer(get_timer_data(), c_offload_host_start_buffers_reads);
+
+    uint64_t ptr_received = 0;
+    COIRESULT res;
+
+    for (int i = 0; i < m_vars_total; i++) {
+        switch (m_vars[i].type.src) {
+            case c_data_ptr_array:
+                break;
+            case c_data:
+            case c_void_ptr:
+            case c_cean_var:
+                if (m_vars[i].direction.out &&
+                    m_vars[i].flags.is_static) {
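+                    // Record a completion event when the read must be
+                    // asynchronous: an async offload, pending input
+                    // dependencies, or a transfer large enough to exceed
+                    // the __offload_use_async_buffer_read threshold.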
+                    COIEVENT *event =
+                        (is_async ||
+                         m_in_deps_total > 0 ||
+                         m_vars[i].size >= __offload_use_async_buffer_read) ?
+                        &m_out_deps[m_out_deps_total++] : 0;
+                    PtrData *ptr_data = NULL;
+                    COIBUFFER dst_buf = NULL; // buffer on the host
+                    char *base = NULL;
+
+                    if (VAR_TYPE_IS_PTR(m_vars[i].type.dst)) {
+                        ptr_data = m_vars[i].into ?
+                                   m_vars_extra[i].dst_data :
+                                   m_vars_extra[i].src_data;
+                    }
+                    else if (VAR_TYPE_IS_SCALAR(m_vars[i].type.dst)) {
+                        if (m_vars[i].flags.is_static_dstn) {
+                            ptr_data = m_vars[i].into ?
+                                       m_vars_extra[i].dst_data :
+                                       m_vars_extra[i].src_data;
+                        }
+                    }
+                    dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
+                    if (dst_buf == NULL) {
+                        base = offload_get_src_base(
+                            m_vars[i].into ?
+                            static_cast<char*>(m_vars[i].into) :
+                            static_cast<char*>(m_vars[i].ptr),
+                            m_vars[i].type.dst);
+                    }
+
+                    if (m_vars[i].flags.is_noncont_src ||
+                        m_vars[i].flags.is_noncont_dst) {
+                        receive_noncontiguous_pointer_data(
+                            i, base, dst_buf, event);
+                    }
+                    else if (dst_buf != 0) {
+                        res = COI::BufferCopy(
+                            dst_buf,
+                            m_vars_extra[i].src_data->mic_buf,
+                            m_vars_extra[i].cpu_offset +
+                            m_vars_extra[i].cpu_disp,
+                            m_vars[i].offset + m_vars[i].disp,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            m_in_deps_total,
+                            m_in_deps_total > 0 ? m_in_deps : 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_copy, res);
+                        }
+                    }
+                    else {
+                        res = COI::BufferRead(
+                            m_vars_extra[i].src_data->mic_buf,
+                            m_vars[i].offset + m_vars[i].disp,
+                            base + m_vars_extra[i].cpu_offset +
+                            m_vars_extra[i].cpu_disp,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            m_in_deps_total,
+                            m_in_deps_total > 0 ? m_in_deps : 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_read, res);
+                        }
+                    }
+                    ptr_received += m_vars[i].size;
+                }
+                break;
+
+            case c_string_ptr:
+            case c_data_ptr:
+            case c_cean_var_ptr:
+            case c_dv_data:
+            case c_dv_ptr_data:
+            case c_dv_data_slice:
+            case c_dv_ptr_data_slice:
+            case c_dv_ptr: {
+                COIBUFFER dst_buf = NULL; // buffer on host
+                if (m_vars[i].direction.out && m_vars[i].size > 0) {
+                    COIEVENT *event =
+                        (is_async ||
+                         m_in_deps_total > 0 ||
+                         m_vars[i].size >= __offload_use_async_buffer_read) ?
+                        &m_out_deps[m_out_deps_total++] : 0;
+
+                    uint64_t dst_offset = 0;
+                    char *base = static_cast<char*>(m_vars[i].ptr);
+
+                    if (VAR_TYPE_IS_PTR(m_vars[i].type.dst)) {
+                        PtrData *ptr_data = m_vars[i].into ?
+                                            m_vars_extra[i].dst_data :
+                                            m_vars_extra[i].src_data;
+                        dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
+                        if (dst_buf == NULL) {
+                            base = m_vars[i].into ?
+                                   *static_cast<char**>(m_vars[i].into) :
+                                   *static_cast<char**>(m_vars[i].ptr);
+                        }
+                        dst_offset = m_vars_extra[i].cpu_offset +
+                                     m_vars_extra[i].cpu_disp;
+                    }
+                    else if (VAR_TYPE_IS_SCALAR(m_vars[i].type.dst)) {
+                        if (m_vars[i].flags.is_static_dstn) {
+                            dst_buf = m_vars[i].into ?
+                                        m_vars_extra[i].dst_data->cpu_buf :
+                                        m_vars_extra[i].src_data->cpu_buf;
+                        }
+                        if (dst_buf == NULL) {
+                            base = offload_get_src_base(
+                                m_vars[i].into ?
+                                static_cast<char*>(m_vars[i].into) :
+                                static_cast<char*>(m_vars[i].ptr),
+                                m_vars[i].type.dst);
+                        }
+                        dst_offset = m_vars_extra[i].cpu_offset +
+                                     m_vars_extra[i].cpu_disp;
+                    }
+                    else if (VAR_TYPE_IS_DV_DATA(m_vars[i].type.dst) ||
+                             VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
+                        PtrData *ptr_data = m_vars[i].into != 0 ?
+                                            m_vars_extra[i].dst_data :
+                                            m_vars_extra[i].src_data;
+                        dst_buf = ptr_data != 0 ? ptr_data->cpu_buf : 0;
+                        if (dst_buf == NULL) {
+                            base = offload_get_src_base(
+                                m_vars[i].into ?
+                                static_cast<char*>(m_vars[i].into) :
+                                static_cast<char*>(m_vars[i].ptr),
+                                m_vars[i].type.dst);
+
+                        }
+                        dst_offset = m_vars_extra[i].cpu_offset +
+                                     m_vars_extra[i].cpu_disp;
+                    }
+
+                    if (m_vars[i].flags.is_noncont_src ||
+                        m_vars[i].flags.is_noncont_dst) {
+                        receive_noncontiguous_pointer_data(
+                            i, base, dst_buf, event);
+                    }
+                    else if (dst_buf != 0) {
+                        res = COI::BufferCopy(
+                            dst_buf,
+                            m_vars_extra[i].src_data->mic_buf,
+                            dst_offset,
+                            m_vars[i].offset + m_vars[i].disp +
+                                m_vars[i].mic_offset -
+                                m_vars_extra[i].src_data->alloc_disp,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            m_in_deps_total,
+                            m_in_deps_total > 0 ? m_in_deps : 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_copy, res);
+                        }
+                    }
+                    else {
+                        res = COI::BufferRead(
+                            m_vars_extra[i].src_data->mic_buf,
+                            m_vars[i].offset + m_vars[i].disp +
+                                m_vars[i].mic_offset -
+                                m_vars_extra[i].src_data->alloc_disp,
+                            base + dst_offset,
+                            m_vars[i].size,
+                            COI_COPY_UNSPECIFIED,
+                            m_in_deps_total,
+                            m_in_deps_total > 0 ? m_in_deps : 0,
+                            event);
+                        if (res != COI_SUCCESS) {
+                            if (m_status != 0) {
+                                m_status->result = translate_coi_error(res);
+                                return false;
+                            }
+                            report_coi_error(c_buf_read, res);
+                        }
+                    }
+                    ptr_received += m_vars[i].size;
+                }
+                break;
+            }
+
+            default:
+                break;
+        }
+
+        // destroy buffers for obsolete stacks
+        if (m_destroy_stack.size() != 0) {
+            for (PtrDataList::iterator it = m_destroy_stack.begin();
+                it != m_destroy_stack.end(); it++) {
+                PtrData *ptr_data = *it;
+                m_destroy_buffers.push_back(ptr_data->mic_buf);
+                OFFLOAD_TRACE(3, "Removing stack buffer with addr %p\n",
+                                  ptr_data->mic_addr);
+            }
+            m_destroy_stack.clear();
+        }
+        if (m_vars[i].free_if) {
+            // remove association for automatic variables
+            if (m_is_openmp && !m_vars[i].flags.is_static &&
+                (m_vars[i].type.src == c_data ||
+                 m_vars[i].type.src == c_void_ptr ||
+                 m_vars[i].type.src == c_cean_var)) {
+                AutoData *auto_data = m_vars_extra[i].auto_data;
+                if (auto_data != 0 && auto_data->remove_reference() == 0) {
+                    m_device.remove_auto_data(auto_data->cpu_addr.start());
+                }
+            }
+
+            // destroy buffers
+            if (m_vars[i].direction.out || m_vars[i].into == NULL) {
+                if (!VAR_TYPE_IS_PTR(m_vars[i].type.src) &&
+                    !VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src) &&
+                    !VAR_TYPE_IS_DV_DATA(m_vars[i].type.src)) {
+                    continue;
+                }
+
+                PtrData *ptr_data = m_vars_extra[i].src_data;
+                if (ptr_data->remove_reference() == 0) {
+                    // destroy buffers
+                    if (ptr_data->cpu_buf != 0) {
+                        m_destroy_buffers.push_back(ptr_data->cpu_buf);
+                    }
+                    if (ptr_data->mic_buf != 0) {
+                        m_destroy_buffers.push_back(ptr_data->mic_buf);
+                    }
+                    OFFLOAD_TRACE(3, "Removing association for addr %p\n",
+                                  ptr_data->cpu_addr.start());
+
+                    // remove association from map
+                    m_device.remove_ptr_data(ptr_data->cpu_addr.start());
+                }
+            }
+            else if (VAR_TYPE_IS_PTR(m_vars[i].type.dst) ||
+                     VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst) ||
+                     VAR_TYPE_IS_DV_DATA(m_vars[i].type.dst)) {
+                PtrData *ptr_data = m_vars_extra[i].dst_data;
+                if (ptr_data->remove_reference() == 0) {
+                    // destroy buffers
+                    if (ptr_data->cpu_buf != 0) {
+                        m_destroy_buffers.push_back(ptr_data->cpu_buf);
+                    }
+                    if (ptr_data->mic_buf != 0) {
+                        m_destroy_buffers.push_back(ptr_data->mic_buf);
+                    }
+                    OFFLOAD_TRACE(3, "Removing association for addr %p\n",
+                                  ptr_data->cpu_addr.start());
+
+                    // remove association from map
+                    m_device.remove_ptr_data(ptr_data->cpu_addr.start());
+                }
+            }
+        }
+    }
+
+    if (m_status) {
+        m_status->data_received += ptr_received;
+    }
+
+    OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), ptr_received);
+    OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
+                  c_offload_received_pointer_data,
+                  "Total pointer data received from target: [%lld] bytes\n",
+                  ptr_received);
+
+    return true;
+}
+
+bool OffloadDescriptor::scatter_copyout_data()
+{
+    OffloadTimer timer(get_timer_data(), c_offload_host_scatter_outputs);
+
+    if (m_need_runfunction && m_out_datalen > 0) {
+
+        // total size that needs to be transferred from target to host
+        COIMAPINSTANCE map_inst;
+        char *data;
+
+        // output data buffer
+        if (m_func_desc->data_offset == 0) {
+            OffloadTimer timer_map(get_timer_data(),
+                                   c_offload_host_map_out_data_buffer);
+
+            COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_out_datalen,
+                                           COI_MAP_READ_ONLY, 0, 0, 0,
+                                           &map_inst,
+                                           reinterpret_cast<void**>(&data));
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                    return false;
+                }
+                report_coi_error(c_buf_map, res);
+            }
+        }
+        else {
+            data = (char*) m_func_desc + m_func_desc->data_offset;
+        }
+
+        // get timing data
+        OFFLOAD_TIMER_TARGET_DATA(get_timer_data(), data);
+        data += OFFLOAD_TIMER_DATALEN();
+
+        // initialize output marshaller
+        m_out.init_buffer(data, m_out_datalen);
+
+        for (int i = 0; i < m_vars_total; i++) {
+            switch (m_vars[i].type.src) {
+                case c_data_ptr_array:
+                    break;
+                case c_data:
+                case c_void_ptr:
+                case c_cean_var:
+                    if (m_vars[i].direction.out &&
+                        !m_vars[i].flags.is_static) {
+
+                        if (m_vars[i].into) {
+                            char *ptr = offload_get_src_base(
+                                static_cast<char*>(m_vars[i].into),
+                                m_vars[i].type.dst);
+                            m_out.receive_data(ptr + m_vars_extra[i].cpu_disp,
+                                               m_vars[i].size);
+                        }
+                        else {
+                            m_out.receive_data(
+                                static_cast<char*>(m_vars[i].ptr) +
+                                    m_vars_extra[i].cpu_disp,
+                                m_vars[i].size);
+                        }
+                    }
+                    break;
+
+                case c_func_ptr:
+                    if (m_vars[i].direction.out) {
+                        m_out.receive_func_ptr((const void**) m_vars[i].ptr);
+                    }
+                    break;
+
+                default:
+                    break;
+            }
+        }
+
+        if (m_status) {
+            m_status->data_received += m_out.get_tfr_size();
+        }
+
+        if (m_func_desc->data_offset == 0) {
+            OffloadTimer timer_unmap(get_timer_data(),
+                                     c_offload_host_unmap_out_data_buffer);
+
+            COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
+            if (res != COI_SUCCESS) {
+                if (m_status != 0) {
+                    m_status->result = translate_coi_error(res);
+                    return false;
+                }
+                report_coi_error(c_buf_unmap, res);
+            }
+        }
+    }
+
+    OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), m_out.get_tfr_size());
+    OFFLOAD_TRACE(1, "Total copyout data received from target: [%lld] bytes\n",
+                  m_out.get_tfr_size());
+
+    return true;
+}
+
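+// Compute offset, size and element count for an array descriptor. Example
+// (illustrative): a contiguous descriptor of 100 8-byte elements yields
+// ptr_ranges == NULL, size == 800 and el_number == 100; a noncontiguous one
+// gets a CeanReadRanges object, with size set to a single contiguous range.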
+void get_arr_desc_numbers(
+    const arr_desc *ap,
+    int64_t el_size,
+    int64_t &offset,
+    int64_t &size,
+    int     &el_number,
+    CeanReadRanges* &ptr_ranges
+)
+{
+    if (is_arr_desc_contiguous(ap)) {
+        ptr_ranges = NULL;
+        __arr_data_offset_and_length(ap, offset, size);
+        el_number = size / el_size;
+    }
+    else {
+        ptr_ranges = init_read_ranges_arr_desc(ap);
+        el_number = (ptr_ranges->range_size / el_size) *
+                    ptr_ranges->range_max_number;
+        size = ptr_ranges->range_size;
+    }
+}
+
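+// Build a rank-1 array descriptor on the heap. Example (illustrative):
+// make_arr_desc(p, 10, 100, sizeof(double)) covers elements p[10]..p[109]
+// with unit stride.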
+arr_desc * make_arr_desc(
+    void*   ptr_val,
+    int64_t extent_start_val,
+    int64_t extent_elements_val,
+    int64_t size
+)
+{
+    arr_desc *res;
+    res = (arr_desc *)malloc(sizeof(arr_desc));
+    res->base = reinterpret_cast<int64_t>(ptr_val);
+    res->rank = 1;
+    res->dim[0].size = size;
+    res->dim[0].lindex = 0;
+    res->dim[0].lower = extent_start_val;
+    res->dim[0].upper = extent_elements_val + extent_start_val - 1;
+    res->dim[0].stride = 1;
+    return res;
+}
+
+bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
+{
+    int             pointers_number;
+    int             tmp_val;
+    int             new_index = m_vars_total;
+    const arr_desc *ap;
+    const VarDesc3 *vd3 = static_cast<const VarDesc3*>(m_vars[i].ptr);
+    int             flags = vd3->array_fields;
+    bool            src_is_for_mic = (m_vars[i].direction.out ||
+                                      m_vars[i].into == NULL);
+
+    ReadArrElements<void *>  ptr;
+    ReadArrElements<void *>  into;
+    ReadArrElements<int64_t> ext_start;
+    ReadArrElements<int64_t> ext_elements;
+    ReadArrElements<int64_t> align;
+    ReadArrElements<int64_t> alloc_if;
+    ReadArrElements<int64_t> free_if;
+    ReadArrElements<int64_t> into_start;
+    ReadArrElements<int64_t> into_elem;
+    ReadArrElements<int64_t> alloc_start;
+    ReadArrElements<int64_t> alloc_elem;
+
+
+    ap = static_cast<const arr_desc*>(vd3->ptr_array);
+
+    // "pointers_number" for total number of transferred pointers.
+    // For each of them we create new var_desc and put it at the bottom
+    // of the var_desc's array
+    get_arr_desc_numbers(ap, sizeof(void *), ptr.offset, ptr.size,
+        pointers_number, ptr.ranges);
+    ptr.base = reinterpret_cast<char*>(ap->base);
+
+    // 2. prepare memory for new var_descs
+    m_vars_total += pointers_number;
+    m_vars       = (VarDesc*)realloc(m_vars, m_vars_total * sizeof(VarDesc));
+    m_vars_extra =
+        (VarExtra*)realloc(m_vars_extra, m_vars_total * sizeof(VarExtra));
+    m_in_deps    =
+        (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * (m_vars_total + 1));
+    m_out_deps   =
+        (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_vars_total);
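+    // m_in_deps is sized with one extra slot, presumably so the run-function
+    // completion event recorded in compute() always fits alongside one event
+    // per variable.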
+
+    // 3. Prepare for reading new var_desc's fields
+    //    EXTENT START
+    if ((flags & (1<<flag_extent_start_is_array)) != 0) {
+        ap = static_cast<const arr_desc*>(vd3->extent_start);
+        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_start.offset,
+            ext_start.size, tmp_val, ext_start.ranges);
+        ext_start.base = reinterpret_cast<char*>(ap->base);
+        ext_start.el_size = ap->dim[ap->rank - 1].size;
+
+        if (tmp_val < pointers_number) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
+            return false;
+        }
+    }
+    else if ((flags & (1<<flag_extent_start_is_scalar)) != 0) {
+        ext_start.val = (int64_t)vd3->extent_start;
+    }
+    else {
+        ext_start.val = 0;
+    }
+
+    //    EXTENT ELEMENTS NUMBER
+    if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
+        ap = static_cast<const arr_desc*>(vd3->extent_elements);
+        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
+            ext_elements.offset, ext_elements.size,
+            tmp_val, ext_elements.ranges);
+        ext_elements.base = reinterpret_cast<char*>(ap->base);
+        ext_elements.el_size = ap->dim[ap->rank - 1].size;
+
+        if (tmp_val < pointers_number) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
+            return false;
+        }
+    }
+    else if ((flags & (1<<flag_extent_elements_is_scalar)) != 0) {
+        ext_elements.val = (int64_t)vd3->extent_elements;
+    }
+    else {
+        ext_elements.val = m_vars[i].count;
+    }
+
+    //    ALLOC_IF
+    if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
+        ap = static_cast<const arr_desc*>(vd3->alloc_if_array);
+        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_if.offset,
+            alloc_if.size, tmp_val, alloc_if.ranges);
+        alloc_if.base = reinterpret_cast<char*>(ap->base);
+        alloc_if.el_size = ap->dim[ap->rank - 1].size;
+
+        if (tmp_val < pointers_number) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
+            return false;
+        }
+    }
+    else {
+        alloc_if.val = m_vars[i].count;
+    }
+
+    //    FREE_IF
+    if ((flags & (1<<flag_free_if_is_array)) != 0) {
+        ap = static_cast<const arr_desc*>(vd3->free_if_array);
+        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, free_if.offset,
+            free_if.size, tmp_val, free_if.ranges);
+        free_if.base = reinterpret_cast<char*>(ap->base);
+        free_if.el_size = ap->dim[ap->rank - 1].size;
+
+        if (tmp_val < pointers_number) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
+            return false;
+        }
+    }
+    else {
+        free_if.val = m_vars[i].count;
+    }
+
+    //    ALIGN
+
+    if ((flags & (1<<flag_align_is_array)) != 0) {
+        ap = static_cast<const arr_desc*>(vd3->align_array);
+        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, align.offset,
+            align.size, tmp_val, align.ranges);
+        align.base = reinterpret_cast<char*>(ap->base);
+        align.el_size = ap->dim[ap->rank - 1].size;
+
+        if (tmp_val < pointers_number) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
+            return false;
+        }
+    }
+    else {
+        align.val = m_vars[i].align;
+    }
+
+    // 3.1 INTO
+
+    if (m_vars[i].into) {
+        ap = static_cast<const arr_desc*>(m_vars[i].into);
+        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into.offset,
+            into.size, tmp_val, into.ranges);
+        into.base = reinterpret_cast<char*>(ap->base);
+
+        if (tmp_val < pointers_number) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
+            return false;
+        }
+    }
+
+    // 3.2 INTO_START
+
+    if ((flags & (1<<flag_into_start_is_array)) != 0) {
+        ap = static_cast<const arr_desc*>(vd3->into_start);
+        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_start.offset,
+            into_start.size, tmp_val, into_start.ranges);
+        into_start.base = reinterpret_cast<char*>(ap->base);
+        into_start.el_size = ap->dim[ap->rank - 1].size;
+
+        if (tmp_val < pointers_number) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
+            return false;
+        }
+    }
+    else if ((flags & (1<<flag_into_start_is_scalar)) != 0) {
+        into_start.val = (int64_t)vd3->into_start;
+    }
+    else {
+        into_start.val = 0;
+    }
+
+    // 3.3 INTO_ELEMENTS
+
+    if ((flags & (1<<flag_into_elements_is_array)) != 0) {
+        ap = static_cast<const arr_desc*>(vd3->into_elements);
+        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_elem.offset,
+            into_elem.size, tmp_val, into_elem.ranges);
+        into_elem.base = reinterpret_cast<char*>(ap->base);
+        into_elem.el_size = ap->dim[ap->rank - 1].size;
+
+        if (tmp_val < pointers_number) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
+            return false;
+        }
+    }
+    else if ((flags & (1<<flag_into_elements_is_scalar)) != 0) {
+        into_elem.val = (int64_t)vd3->into_elements;
+    }
+    else {
+        into_elem.val = m_vars[i].count;
+    }
+
+    //    alloc_start
+
+    if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
+        ap = static_cast<const arr_desc*>(vd3->alloc_start);
+        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
+            alloc_start.offset, alloc_start.size, tmp_val,
+            alloc_start.ranges);
+        alloc_start.base = reinterpret_cast<char*>(ap->base);
+        alloc_start.el_size = ap->dim[ap->rank - 1].size;
+
+        if (tmp_val < pointers_number) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
+            return false;
+        }
+    }
+    else if ((flags & (1<<flag_alloc_start_is_scalar)) != 0) {
+        alloc_start.val = (int64_t)vd3->alloc_start;
+    }
+    else {
+        alloc_start.val = 0;
+    }
+
+    //    alloc_elem
+
+    if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
+        ap = static_cast<const arr_desc*>(vd3->alloc_elements);
+        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_elem.offset,
+            alloc_elem.size, tmp_val, alloc_elem.ranges);
+        alloc_elem.base = reinterpret_cast<char*>(ap->base);
+        alloc_elem.el_size = ap->dim[ap->rank - 1].size;
+        if (tmp_val < pointers_number) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch,
+                             "alloc_extent elements");
+            return false;
+        }
+    }
+    else if ((flags & (1<<flag_alloc_elements_is_scalar)) != 0) {
+        alloc_elem.val = (int64_t)vd3->alloc_elements;
+    }
+    else {
+        alloc_elem.val = 0;
+    }
+
+    for (int k = 0; k < pointers_number; k++) {
+        int type = flags & 0x3f;
+        int type_src, type_dst;
+        //  Get new values
+        // type_src, type_dst
+        type_src = type_dst = (type == c_data_ptr_array) ?
+                              c_data_ptr   : (type == c_func_ptr_array) ?
+                              c_func_ptr   : (type == c_void_ptr_array) ?
+                              c_void_ptr   : (type == c_string_ptr_array) ?
+                              c_string_ptr : 0;
+
+        // Get ptr val
+        if (!ptr.read_next(true)) {
+            break;
+        }
+        else {
+            ptr.val = (void*)(ptr.base + ptr.offset);
+        }
+
+        // !!! An error during the reading phase is an internal error, since
+        // !!! any mismatch must have been detected earlier
+
+        // Get into val
+        if (m_vars[i].into) {
+            if (!into.read_next(true)) {
+                LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
+                LIBOFFLOAD_ABORT;
+            }
+            else {
+                into.val = (void*)(into.base + into.offset);
+            }
+        }
+
+        // Get other components of the clause
+        if (!ext_start.read_next(flags & (1<<flag_extent_start_is_array))) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
+            LIBOFFLOAD_ABORT;
+        }
+        if (!ext_elements.read_next(
+                flags & (1<<flag_extent_elements_is_array))) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
+            LIBOFFLOAD_ABORT;
+        }
+        if (!alloc_if.read_next(flags & (1<<flag_alloc_if_is_array))) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
+            LIBOFFLOAD_ABORT;
+        }
+        if (!free_if.read_next(flags & (1<<flag_free_if_is_array))) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
+            LIBOFFLOAD_ABORT;
+        }
+        if (!align.read_next(flags & (1<<flag_align_is_array))) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
+            LIBOFFLOAD_ABORT;
+        }
+        if (!into_start.read_next(flags & (1<<flag_into_start_is_array))) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
+            LIBOFFLOAD_ABORT;
+        }
+        if (!into_elem.read_next(flags & (1<<flag_into_elements_is_array))) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
+            LIBOFFLOAD_ABORT;
+        }
+        if (!alloc_start.read_next(flags & (1<<flag_alloc_start_is_array))) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
+            LIBOFFLOAD_ABORT;
+        }
+        if (!alloc_elem.read_next(
+                 flags & (1<<flag_alloc_elements_is_array))) {
+            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent elements");
+            LIBOFFLOAD_ABORT;
+        }
+
+        m_vars[new_index + k].direction.bits = m_vars[i].direction.bits;
+        m_vars[new_index + k].alloc_if = alloc_if.val;
+        m_vars[new_index + k].free_if = free_if.val;
+        m_vars[new_index + k].align = align.val;
+        m_vars[new_index + k].mic_offset = 0;
+        m_vars[new_index + k].flags.bits = m_vars[i].flags.bits;
+        m_vars[new_index + k].offset = 0;
+        m_vars[new_index + k].size = m_vars[i].size;
+
+        if (ext_start.val == 0) {
+            m_vars[new_index + k].count = ext_elements.val;
+            m_vars[new_index + k].ptr = ptr.val;
+            if (type_src == c_string_ptr) {
+                m_vars[new_index + k].size = 0;
+            }
+        }
+        else {
+            m_vars[new_index + k].count = 0;
+            m_vars[new_index + k].ptr =
+                static_cast<void*>(make_arr_desc(
+                ptr.val,
+                ext_start.val,
+                ext_elements.val,
+                m_vars[i].size));
+
+            type_src = type_src == c_data_ptr   ? c_cean_var_ptr :
+                       type_src == c_string_ptr ? c_cean_var_ptr :
+                                   type_src;
+            if (!m_vars[i].into) {
+                type_dst = type_src;
+            }
+        }
+
+        if (m_vars[i].into && into_elem.val != 0) {
+            m_vars[new_index + k].into =
+                static_cast<void*>(make_arr_desc(
+                into.val,
+                into_start.val,
+                into_elem.val,
+                m_vars[i].size));
+            type_dst = (type == c_data_ptr_array) ? c_cean_var_ptr :
+                       (type == c_string_ptr_array) ? c_cean_var_ptr :
+                        type_src;
+        }
+        else {
+            m_vars[new_index + k].into = NULL;
+        }
+
+        if (alloc_elem.val != 0) {
+            m_vars[new_index + k].alloc =
+                static_cast<void*>(make_arr_desc(
+                ptr.val,
+                alloc_start.val,
+                alloc_elem.val,
+                m_vars[i].size));
+        }
+        else {
+            m_vars[new_index + k].alloc = NULL;
+        }
+
+        m_vars[new_index + k].type.src = type_src;
+        m_vars[new_index + k].type.dst = type_dst;
+
+        m_vars_extra[new_index + k].is_arr_ptr_el = 1;
+        m_vars_extra[new_index + k].ptr_arr_offset =
+            src_is_for_mic ? ptr.offset : into.offset;
+    }
+    // The count and alloc fields are not used on the target, so they can be
+    // reused for pointer arrays.
+    m_vars[i].count = pointers_number;
+    m_vars[i].ptr_arr_offset = new_index;
+    return true;
+}
+
+static void __offload_fini_library(void)
+{
+    OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
+    if (mic_engines_total > 0) {
+        delete[] mic_engines;
+
+        if (mic_proxy_fs_root != 0) {
+            free(mic_proxy_fs_root);
+            mic_proxy_fs_root = 0;
+        }
+
+        if (mic_library_path != 0) {
+            free(mic_library_path);
+            mic_library_path = 0;
+        }
+
+        // destroy thread key
+        thread_key_delete(mic_thread_key);
+    }
+
+    // unload COI library
+    if (COI::is_available) {
+        COI::fini();
+    }
+
+    OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ... done\n");
+}
+
+static void __offload_init_library_once(void)
+{
+    COIRESULT res;
+    uint32_t num_devices;
+    std::bitset<MIC_ENGINES_MAX> devices;
+
+    prefix = report_get_message_str(c_report_host);
+
+    // initialize trace
+    const char *env_var = getenv(htrace_envname);
+    if (env_var != 0 && *env_var != '\0') {
+        int64_t new_val;
+        if (__offload_parse_int_string(env_var, new_val)) {
+            console_enabled = new_val & 0x0f;
+        }
+    }
+
+    env_var = getenv(offload_report_envname);
+    if (env_var != 0 && *env_var != '\0') {
+        int64_t env_val;
+        if (__offload_parse_int_string(env_var, env_val)) {
+            if (env_val == OFFLOAD_REPORT_1 ||
+                env_val == OFFLOAD_REPORT_2 ||
+                env_val == OFFLOAD_REPORT_3) {
+                offload_report_level = env_val;
+            }
+            else {
+                LIBOFFLOAD_ERROR(c_invalid_env_report_value,
+                                 offload_report_envname);
+            }
+        }
+        else {
+            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
+                             offload_report_envname);
+        }
+    }
+    else if (!offload_report_level) {
+        env_var = getenv(timer_envname);
+        if (env_var != 0 && *env_var != '\0') {
+            timer_enabled = atoi(env_var);
+        }
+    }
+
+    // initialize COI
+    if (!COI::init()) {
+        return;
+    }
+
+    // get number of devices installed in the system
+    res = COI::EngineGetCount(COI_ISA_KNC, &num_devices);
+    if (res != COI_SUCCESS) {
+        return;
+    }
+
+    if (num_devices > MIC_ENGINES_MAX) {
+        num_devices = MIC_ENGINES_MAX;
+    }
+
+    // fill in the list of devices that can be used for offloading
+    env_var = getenv("OFFLOAD_DEVICES");
+    if (env_var != 0) {
+        if (strcasecmp(env_var, "none") != 0) {
+            // value is composed of comma separated physical device indexes
+            char *buf = strdup(env_var);
+            char *str, *ptr;
+            for (str = strtok_r(buf, ",", &ptr); str != 0;
+                 str = strtok_r(0, ",", &ptr)) {
+                // convert string to an int
+                int64_t num;
+                if (!__offload_parse_int_string(str, num)) {
+                    LIBOFFLOAD_ERROR(c_mic_init5);
+
+                    // fallback to using all installed devices
+                    devices.reset();
+                    for (int i = 0; i < num_devices; i++) {
+                        devices.set(i);
+                    }
+                    break;
+                }
+                if (num < 0 || num >= num_devices) {
+                    LIBOFFLOAD_ERROR(c_mic_init6, num);
+                    continue;
+                }
+                devices.set(num);
+            }
+            free(buf);
+        }
+    }
+    else {
+        // use all available devices
+        for (int i = 0; i < num_devices; i++) {
+            COIENGINE engine;
+            res = COI::EngineGetHandle(COI_ISA_KNC, i, &engine);
+            if (res == COI_SUCCESS) {
+                devices.set(i);
+            }
+        }
+    }
+
+    mic_engines_total = devices.count();
+
+    // no need to continue if there are no devices to offload to
+    if (mic_engines_total <= 0) {
+        return;
+    }
+
+    // initialize indexes for available devices
+    mic_engines = new Engine[mic_engines_total];
+    for (int p_idx = 0, l_idx = 0; p_idx < num_devices; p_idx++) {
+        if (devices[p_idx]) {
+            mic_engines[l_idx].set_indexes(l_idx, p_idx);
+            l_idx++;
+        }
+    }
+
+    // library search path for device binaries
+    env_var = getenv("MIC_LD_LIBRARY_PATH");
+    if (env_var != 0) {
+        mic_library_path = strdup(env_var);
+    }
+
+    // memory size reserved for COI buffers
+    env_var = getenv("MIC_BUFFERSIZE");
+    if (env_var != 0) {
+        uint64_t new_size;
+        if (__offload_parse_size_string(env_var, new_size)) {
+            mic_buffer_size = new_size;
+        }
+        else {
+            LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_BUFFERSIZE");
+        }
+    }
+
+    // determine stacksize for the pipeline on the device
+    env_var = getenv("MIC_STACKSIZE");
+    if (env_var != 0 && *env_var != '\0') {
+        uint64_t new_size;
+        if (__offload_parse_size_string(env_var, new_size) &&
+            (new_size >= 16384) && ((new_size & 4095) == 0)) {
+            mic_stack_size = new_size;
+        }
+        else {
+            LIBOFFLOAD_ERROR(c_mic_init3);
+        }
+    }
+
+    // proxy I/O
+    env_var = getenv("MIC_PROXY_IO");
+    if (env_var != 0 && *env_var != '\0') {
+        int64_t new_val;
+        if (__offload_parse_int_string(env_var, new_val)) {
+            mic_proxy_io = new_val;
+        }
+        else {
+            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value, "MIC_PROXY_IO");
+        }
+    }
+    env_var = getenv("MIC_PROXY_FS_ROOT");
+    if (env_var != 0 && *env_var != '\0') {
+        mic_proxy_fs_root = strdup(env_var);
+    }
+
+    // Prepare the environment for the target process using the following
+    // rules:
+    // - If MIC_ENV_PREFIX is set then any environment variable on the
+    //   host which has that prefix is copied to the device without
+    //   the prefix.
+    //   All other host environment variables are ignored.
+    // - If MIC_ENV_PREFIX is not set or if MIC_ENV_PREFIX="" then the host
+    //   environment is duplicated.
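+    // Example (illustrative): with MIC_ENV_PREFIX=MIC, a host setting of
+    // MIC_OMP_NUM_THREADS=240 would reach the target process as
+    // OMP_NUM_THREADS=240, while variables without the prefix are dropped.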
+    env_var = getenv("MIC_ENV_PREFIX");
+    if (env_var != 0 && *env_var != '\0') {
+        mic_env_vars.set_prefix(env_var);
+
+        int len = strlen(env_var);
+        for (int i = 0; environ[i] != 0; i++) {
+            if (strncmp(environ[i], env_var, len) == 0 &&
+                strncmp(environ[i], "MIC_LD_LIBRARY_PATH", 19) != 0 &&
+                environ[i][len] != '=') {
+                mic_env_vars.analyze_env_var(environ[i]);
+            }
+        }
+    }
+
+    // create key for thread data
+    if (thread_key_create(&mic_thread_key, Engine::destroy_thread_data)) {
+        LIBOFFLOAD_ERROR(c_mic_init4, errno);
+        return;
+    }
+
+    // cpu frequency
+    cpu_frequency = COI::PerfGetCycleFrequency();
+
+    env_var = getenv(mic_use_2mb_buffers_envname);
+    if (env_var != 0 && *env_var != '\0') {
+        uint64_t new_size;
+        if (__offload_parse_size_string(env_var, new_size)) {
+            __offload_use_2mb_buffers = new_size;
+        }
+        else {
+            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
+                             mic_use_2mb_buffers_envname);
+        }
+    }
+
+    env_var = getenv(mic_use_async_buffer_write_envname);
+    if (env_var != 0 && *env_var != '\0') {
+        uint64_t new_size;
+        if (__offload_parse_size_string(env_var, new_size)) {
+            __offload_use_async_buffer_write = new_size;
+        }
+    }
+
+    env_var = getenv(mic_use_async_buffer_read_envname);
+    if (env_var != 0 && *env_var != '\0') {
+        uint64_t new_size;
+        if (__offload_parse_size_string(env_var, new_size)) {
+            __offload_use_async_buffer_read = new_size;
+        }
+    }
+
+    // mic initialization type
+    env_var = getenv(offload_init_envname);
+    if (env_var != 0 && *env_var != '\0') {
+        if (strcmp(env_var, "on_offload") == 0) {
+            __offload_init_type = c_init_on_offload;
+        }
+        else if (strcmp(env_var, "on_offload_all") == 0) {
+            __offload_init_type = c_init_on_offload_all;
+        }
+#ifndef TARGET_WINNT
+        else if (strcmp(env_var, "on_start") == 0) {
+            __offload_init_type = c_init_on_start;
+        }
+#endif // TARGET_WINNT
+        else {
+            LIBOFFLOAD_ERROR(c_invalid_env_var_value, offload_init_envname);
+        }
+    }
+
+    // active wait
+    env_var = getenv(offload_active_wait_envname);
+    if (env_var != 0 && *env_var != '\0') {
+        int64_t new_val;
+        if (__offload_parse_int_string(env_var, new_val)) {
+            __offload_active_wait = new_val;
+        }
+        else {
+            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
+                             offload_active_wait_envname);
+        }
+    }
+
+    // omp device num
+    env_var = getenv(omp_device_num_envname);
+    if (env_var != 0 && *env_var != '\0') {
+        int64_t new_val;
+        if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
+            __omp_device_num = new_val;
+        }
+        else {
+            LIBOFFLOAD_ERROR(c_omp_invalid_device_num_env,
+                             omp_device_num_envname);
+        }
+    }
+
+    // init ORSL
+    ORSL::init();
+}
+
+extern int __offload_init_library(void)
+{
+    // do one-time initialization
+    static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
+    __offload_run_once(&ctrl, __offload_init_library_once);
+
+    // offload is available if COI is available and the number of devices > 0
+    bool is_available = COI::is_available && (mic_engines_total > 0);
+
+    // register pending libraries if there are any
+    if (is_available && __target_libs) {
+        mutex_locker_t locker(__target_libs_lock);
+
+        for (TargetImageList::iterator it = __target_libs_list.begin();
+             it != __target_libs_list.end(); it++) {
+            // Register library in COI
+            COI::ProcessRegisterLibraries(1, &it->data, &it->size,
+                                          &it->origin, &it->offset);
+
+            // add lib to all engines
+            for (int i = 0; i < mic_engines_total; i++) {
+                mic_engines[i].add_lib(*it);
+            }
+        }
+
+        __target_libs = false;
+        __target_libs_list.clear();
+    }
+
+    return is_available;
+}
+
+extern "C" void __offload_register_image(const void *target_image)
+{
+    const struct Image *image = static_cast<const struct Image*>(target_image);
+
+    // decode image
+    const char *name = image->data;
+    const void *data = image->data + strlen(image->data) + 1;
+    uint64_t    size = image->size;
+    const char *origin = 0;
+    uint64_t    offset = 0;
+
+    // our actions depend on the image type
+    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
+    switch (hdr->e_type) {
+        case ET_EXEC:
+            // Each offload application is supposed to have only one target
+            // image representing the target executable.
+            // No thread synchronization is required here as the initialization
+            // code is always executed in a single thread.
+            if (__target_exe != 0) {
+                LIBOFFLOAD_ERROR(c_multiple_target_exes);
+                exit(1);
+            }
+            __target_exe = new TargetImage(name, data, size, origin, offset);
+
+            // Registration code for execs is always called from the context
+            // of main and thus we can safely call any function here,
+            // including LoadLibrary API on windows. This is the place where
+            // we do the offload library initialization.
+            if (__offload_init_library()) {
+                // initialize engine if init_type is on_start
+                if (__offload_init_type == c_init_on_start) {
+                    for (int i = 0; i < mic_engines_total; i++) {
+                        mic_engines[i].init();
+                    }
+                }
+            }
+            break;
+
+        case ET_DYN:
+            // Registration code for libraries is called from the DllMain
+            // context (on windows) and thus we cannot do anything useful
+            // here. So we just add it to the list of pending libraries for
+            // later use.
+            __target_libs_lock.lock();
+            __target_libs = true;
+            __target_libs_list.push_back(TargetImage(name, data, size,
+                                                     origin, offset));
+            __target_libs_lock.unlock();
+            break;
+
+        default:
+            // something is definitely wrong, issue an error and exit
+            LIBOFFLOAD_ERROR(c_unknown_binary_type);
+            exit(1);
+    }
+}
+
+extern "C" void __offload_unregister_image(const void *target_image)
+{
+    // Target image is packed as follows:
+    //      8 bytes                - size of the target binary
+    //      null-terminated string - binary name
+    //      <size> bytes           - binary contents
+    const struct Image {
+         int64_t size;
+         char data[];
+    } *image = static_cast<const struct Image*>(target_image);
+
+    // decode image
+    const char *name = image->data;
+    const void *data = image->data + strlen(image->data) + 1;
+
+    // our actions depend on the image type
+    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
+    if (hdr->e_type == ET_EXEC) {
+        // We are executing the exec's destructors.
+        // It is time to do a library cleanup.
+        if (timer_enabled) {
+            Offload_Timer_Print();
+        }
+
+#ifdef MYO_SUPPORT
+        __offload_myoFini();
+#endif // MYO_SUPPORT
+
+        __offload_fini_library();
+    }
+}
+
+// Runtime trace interface for user programs
+
+void __offload_console_trace(int level)
+{
+    console_enabled = level;
+}
+
+// User-visible offload API
+
+int _Offload_number_of_devices(void)
+{
+    __offload_init_library();
+    return mic_engines_total;
+}
+
+int _Offload_get_device_number(void)
+{
+    return -1;
+}
+
+int _Offload_get_physical_device_number(void)
+{
+    return -1;
+}
+
+int _Offload_signaled(int index, void *signal)
+{
+    __offload_init_library();
+
+    // check index value
+    if (index < 0 || mic_engines_total <= 0) {
+        LIBOFFLOAD_ERROR(c_offload_signaled1, index);
+        LIBOFFLOAD_ABORT;
+    }
+
+    // find associated async task
+    OffloadDescriptor *task =
+        mic_engines[index % mic_engines_total].find_signal(signal, false);
+    if (task == 0) {
+        LIBOFFLOAD_ERROR(c_offload_signaled2, signal);
+        LIBOFFLOAD_ABORT;
+    }
+
+    return task->is_signaled();
+}
+
+void _Offload_report(int val)
+{
+    if (val == OFFLOAD_REPORT_ON ||
+        val == OFFLOAD_REPORT_OFF) {
+        offload_report_enabled = val;
+    }
+}
+
+// IDB support
+int   __dbg_is_attached = 0;
+int   __dbg_target_id = -1;
+pid_t __dbg_target_so_pid = -1;
+char  __dbg_target_exe_name[MAX_TARGET_NAME] = {0};
+const int __dbg_api_major_version = 1;
+const int __dbg_api_minor_version = 0;
+
+void __dbg_target_so_loaded()
+{
+}
+void __dbg_target_so_unloaded()
+{
+}
diff --git a/final/offload/src/offload_host.h b/final/offload/src/offload_host.h
new file mode 100644
index 0000000..ea23996
--- /dev/null
+++ b/final/offload/src/offload_host.h
@@ -0,0 +1,343 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/*! \file
+    \brief The parts of the runtime library used only on the host
+*/
+
+#ifndef OFFLOAD_HOST_H_INCLUDED
+#define OFFLOAD_HOST_H_INCLUDED
+
+#ifndef TARGET_WINNT
+#include <unistd.h>
+#endif // TARGET_WINNT
+#include "offload_common.h"
+#include "offload_util.h"
+#include "offload_engine.h"
+#include "offload_env.h"
+#include "offload_orsl.h"
+#include "coi/coi_client.h"
+
+// MIC engines.
+extern Engine*  mic_engines;
+extern uint32_t mic_engines_total;
+
+//! The target image is packed as follows.
+/*!      1. 8 bytes containing the size of the target binary          */
+/*!      2. a null-terminated string which is the binary name         */
+/*!      3. <size> number of bytes that are the contents of the image */
+/*!      The address of symbol __offload_target_image
+             is the address of this structure.                        */
+struct Image {
+     int64_t size; //!< Size in bytes of the target binary name and contents
+     char data[];  //!< The name and contents of the target image
+};
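+// Example layout (illustrative) for a target binary named "a.out":
+//     [ int64_t size ][ 'a','.','o','u','t','\0' ][ <binary contents> ]
+// where data[] holds the name immediately followed by the contents.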
+
+// The offload descriptor.
+class OffloadDescriptor
+{
+public:
+    OffloadDescriptor(
+        int index,
+        _Offload_status *status,
+        bool is_mandatory,
+        bool is_openmp,
+        OffloadHostTimerData * timer_data
+    ) :
+        m_device(mic_engines[index % mic_engines_total]),
+        m_is_mandatory(is_mandatory),
+        m_is_openmp(is_openmp),
+        m_inout_buf(0),
+        m_func_desc(0),
+        m_func_desc_size(0),
+        m_in_deps(0),
+        m_in_deps_total(0),
+        m_out_deps(0),
+        m_out_deps_total(0),
+        m_vars(0),
+        m_vars_extra(0),
+        m_status(status),
+        m_timer_data(timer_data)
+    {}
+
+    ~OffloadDescriptor()
+    {
+        if (m_in_deps != 0) {
+            free(m_in_deps);
+        }
+        if (m_out_deps != 0) {
+            free(m_out_deps);
+        }
+        if (m_func_desc != 0) {
+            free(m_func_desc);
+        }
+        if (m_vars != 0) {
+            free(m_vars);
+            free(m_vars_extra);
+        }
+    }
+
+    bool offload(const char *name, bool is_empty,
+                 VarDesc *vars, VarDesc2 *vars2, int vars_total,
+                 const void **waits, int num_waits, const void **signal,
+                 int entry_id, const void *stack_addr);
+    bool offload_finish();
+
+    bool is_signaled();
+
+    OffloadHostTimerData* get_timer_data() const {
+        return m_timer_data;
+    }
+
+private:
+    bool wait_dependencies(const void **waits, int num_waits);
+    bool setup_descriptors(VarDesc *vars, VarDesc2 *vars2, int vars_total,
+                           int entry_id, const void *stack_addr);
+    bool setup_misc_data(const char *name);
+    bool send_pointer_data(bool is_async);
+    bool send_noncontiguous_pointer_data(
+        int i,
+        PtrData* src_buf,
+        PtrData* dst_buf,
+        COIEVENT *event);
+    bool receive_noncontiguous_pointer_data(
+        int i,
+        char* src_data,
+        COIBUFFER dst_buf,
+        COIEVENT *event);
+
+    bool gather_copyin_data();
+
+    bool compute();
+
+    bool receive_pointer_data(bool is_async);
+    bool scatter_copyout_data();
+
+    void cleanup();
+
+    bool find_ptr_data(PtrData* &ptr_data, void *base, int64_t disp,
+                       int64_t length, bool error_does_not_exist = true);
+    bool alloc_ptr_data(PtrData* &ptr_data, void *base, int64_t disp,
+                        int64_t length, int64_t alloc_disp, int align);
+    bool init_static_ptr_data(PtrData *ptr_data);
+    bool init_mic_address(PtrData *ptr_data);
+    bool offload_stack_memory_manager(const void * stack_begin, int routine_id,
+                                      int buf_size, int align, bool *is_new);
+    bool nullify_target_stack(COIBUFFER targ_buf, uint64_t size);
+
+    bool gen_var_descs_for_pointer_array(int i);
+
+    void report_coi_error(error_types msg, COIRESULT res);
+    _Offload_result translate_coi_error(COIRESULT res) const;
+
+private:
+    typedef std::list<COIBUFFER> BufferList;
+
+    // extra data associated with each variable descriptor
+    struct VarExtra {
+        PtrData* src_data;
+        PtrData* dst_data;
+        AutoData* auto_data;
+        int64_t cpu_disp;
+        int64_t cpu_offset;
+        CeanReadRanges *read_rng_src;
+        CeanReadRanges *read_rng_dst;
+        int64_t ptr_arr_offset;
+        bool is_arr_ptr_el;
+    };
+
+    template<typename T> class ReadArrElements {
+    public:
+        ReadArrElements():
+            ranges(NULL),
+            el_size(sizeof(T)),
+            offset(0),
+            count(0),
+            is_empty(true),
+            base(NULL)
+        {}
+
+        bool read_next(bool flag)
+        {
+            if (flag) {
+                if (is_empty) {
+                    if (ranges) {
+                        if (!get_next_range(ranges, &offset)) {
+                            // ranges are over
+                            return false;
+                        }
+                    }
+                    // all contiguous elements are over
+                    else if (count != 0) {
+                        return false;
+                    }
+
+                    length_cur = size;
+                }
+                else {
+                    offset += el_size;
+                }
+                val = (T)get_el_value(base, offset, el_size);
+                length_cur -= el_size;
+                count++;
+                is_empty = length_cur == 0;
+            }
+            return true;
+        }
+    public:
+        CeanReadRanges * ranges;
+        T       val;
+        int     el_size;
+        int64_t size,
+                offset,
+                length_cur;
+        bool    is_empty;
+        int     count;
+        char   *base;
+    };
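+
+    // Iteration sketch (illustrative only; "buf", "el_count" and "use" are
+    // hypothetical). In the contiguous case (ranges == NULL), elements are
+    // consumed until read_next() returns false:
+    //
+    //     ReadArrElements<int64_t> rd;
+    //     rd.base = buf;
+    //     rd.size = el_count * rd.el_size;
+    //     while (rd.read_next(true)) {
+    //         use(rd.val);
+    //     }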
+
+    // ptr_data for persistent auto objects
+    PtrData*    m_stack_ptr_data;
+    PtrDataList m_destroy_stack;
+
+    // Engine
+    Engine& m_device;
+
+    // if true, the offload is mandatory
+    bool m_is_mandatory;
+
+    // if true, the offload has an OpenMP origin
+    const bool m_is_openmp;
+
+    // The Marshaller for the inputs of the offloaded region.
+    Marshaller m_in;
+
+    // The Marshaller for the outputs of the offloaded region.
+    Marshaller m_out;
+
+    // List of buffers that are passed to dispatch call
+    BufferList m_compute_buffers;
+
+    // List of buffers that need to be destroyed at the end of offload
+    BufferList m_destroy_buffers;
+
+    // Variable descriptors
+    VarDesc*  m_vars;
+    VarExtra* m_vars_extra;
+    int       m_vars_total;
+
+    // Pointer to a user-specified status variable
+    _Offload_status *m_status;
+
+    // Function descriptor
+    FunctionDescriptor* m_func_desc;
+    uint32_t            m_func_desc_size;
+
+    // Buffer for transferring copyin/copyout data
+    COIBUFFER m_inout_buf;
+
+    // Dependencies
+    COIEVENT *m_in_deps;
+    uint32_t  m_in_deps_total;
+    COIEVENT *m_out_deps;
+    uint32_t  m_out_deps_total;
+
+    // Timer data
+    OffloadHostTimerData *m_timer_data;
+
+    // copyin/copyout data length
+    uint64_t m_in_datalen;
+    uint64_t m_out_datalen;
+
+    // A boolean value calculated in setup_descriptors. If true, we need to
+    // run a function on the target; otherwise the run may be optimized away.
+    bool m_need_runfunction;
+};
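+
+// Lifecycle sketch (illustrative only; every argument value below is
+// hypothetical): a host-side offload constructs a descriptor, runs the
+// offload and then waits for it to finish:
+//
+//     OffloadDescriptor ofld(device_index, &status, /*is_mandatory=*/true,
+//                            /*is_openmp=*/false, timer_data);
+//     if (ofld.offload(entry_name, /*is_empty=*/false, vars, vars2,
+//                      vars_total, waits, num_waits, signal, entry_id,
+//                      stack_addr)) {
+//         ofld.offload_finish();
+//     }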
+
+// Initialization types for MIC
+enum OffloadInitType {
+    c_init_on_start,         // all devices before entering main
+    c_init_on_offload,       // single device before starting the first offload
+    c_init_on_offload_all    // all devices before starting the first offload
+};
+
+// Initializes library and registers specified offload image.
+extern "C" void __offload_register_image(const void* image);
+extern "C" void __offload_unregister_image(const void* image);
+
+// Initializes offload runtime library.
+extern int __offload_init_library(void);
+
+// thread data for associating pipelines with threads
+extern pthread_key_t mic_thread_key;
+
+// Environment variables for devices
+extern MicEnvVar mic_env_vars;
+
+// CPU frequency
+extern uint64_t cpu_frequency;
+
+// LD_LIBRARY_PATH for MIC libraries
+extern char* mic_library_path;
+
+// stack size for target
+extern uint32_t mic_stack_size;
+
+// Preallocated memory size for buffers on MIC
+extern uint64_t mic_buffer_size;
+
+// Setting controlling inout proxy
+extern bool  mic_proxy_io;
+extern char* mic_proxy_fs_root;
+
+// Threshold for creating buffers with large pages
+extern uint64_t __offload_use_2mb_buffers;
+
+// offload initialization type
+extern OffloadInitType __offload_init_type;
+
+// Device number to offload to when device is not explicitly specified.
+extern int __omp_device_num;
+
+// target executable
+extern TargetImage* __target_exe;
+
+// IDB support
+
+// Called by the offload runtime after initialization of offload infrastructure
+// has been completed.
+extern "C" void  __dbg_target_so_loaded();
+
+// Called by the offload runtime when the offload infrastructure is about to be
+// shut down, currently at application exit.
+extern "C" void  __dbg_target_so_unloaded();
+
+// Null-terminated string containing path to the process image of the hosting
+// application (offload_main)
+#define MAX_TARGET_NAME 512
+extern "C" char  __dbg_target_exe_name[MAX_TARGET_NAME];
+
+// Integer specifying the process id
+extern "C" pid_t __dbg_target_so_pid;
+
+// Integer specifying the 0-based device number
+extern "C" int   __dbg_target_id;
+
+// Set to non-zero by the host-side debugger to enable offload debugging
+// support
+extern "C" int   __dbg_is_attached;
+
+// Major version of the debugger support API
+extern "C" const int __dbg_api_major_version;
+
+// Minor version of the debugger support API
+extern "C" const int __dbg_api_minor_version;
+
+#endif // OFFLOAD_HOST_H_INCLUDED
diff --git a/final/offload/src/offload_myo_host.cpp b/final/offload/src/offload_myo_host.cpp
new file mode 100644
index 0000000..2e1c186
--- /dev/null
+++ b/final/offload/src/offload_myo_host.cpp
@@ -0,0 +1,805 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_myo_host.h"
+#include <errno.h>
+#include <malloc.h>
+#include "offload_host.h"
+
+#if defined(LINUX) || defined(FREEBSD)
+#include <mm_malloc.h>
+#endif
+
+#define MYO_VERSION1    "MYO_1.0"
+
+extern "C" void __cilkrts_cilk_for_32(void*, void*, uint32_t, int32_t);
+extern "C" void __cilkrts_cilk_for_64(void*, void*, uint64_t, int32_t);
+
+#ifndef TARGET_WINNT
+#pragma weak __cilkrts_cilk_for_32
+#pragma weak __cilkrts_cilk_for_64
+#endif // TARGET_WINNT
+
+#ifdef TARGET_WINNT
+#define MYO_TABLE_END_MARKER() reinterpret_cast<const char*>(-1)
+#else // TARGET_WINNT
+#define MYO_TABLE_END_MARKER() reinterpret_cast<const char*>(0)
+#endif // TARGET_WINNT
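+
+// Termination sketch (illustrative only; "var_a" and "var_a_ctor" are
+// hypothetical, and the entry is assumed to start with its varName and
+// sharedAddr fields): the table scans below stop at an entry whose name
+// equals MYO_TABLE_END_MARKER(), so a terminated table looks like:
+//
+//     static void var_a_ctor(void);
+//     SharedTableEntry table[] = {
+//         { "var_a", (void*) &var_a_ctor },
+//         { MYO_TABLE_END_MARKER(), 0 },
+//     };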
+
+class MyoWrapper {
+public:
+    MyoWrapper() : m_lib_handle(0), m_is_available(false)
+    {}
+
+    bool is_available() const {
+        return m_is_available;
+    }
+
+    bool LoadLibrary(void);
+
+    // unloads the library (currently a no-op; the DL_close call below is
+    // disabled)
+    void UnloadLibrary(void) {
+//        if (m_lib_handle != 0) {
+//            DL_close(m_lib_handle);
+//            m_lib_handle = 0;
+//        }
+    }
+
+    // Wrappers for MYO client functions
+    void LibInit(void *arg, void *func) const {
+        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myoinit,
+                                 "%s(%p, %p)\n", __func__, arg, func);
+        CheckResult(__func__, m_lib_init(arg, func));
+    }
+
+    void LibFini(void) const {
+        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myofini, "%s()\n", __func__);
+        m_lib_fini();
+    }
+
+    void* SharedMalloc(size_t size) const {
+        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedmalloc,
+                                 "%s(%lld)\n", __func__, size);
+        return m_shared_malloc(size);
+    }
+
+    void SharedFree(void *ptr) const {
+        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedfree,
+                                 "%s(%p)\n", __func__, ptr);
+        m_shared_free(ptr);
+    }
+
+    void* SharedAlignedMalloc(size_t size, size_t align) const {
+        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedalignedmalloc,
+                                 "%s(%lld, %lld)\n", __func__, size, align);
+        return m_shared_aligned_malloc(size, align);
+    }
+
+    void SharedAlignedFree(void *ptr) const {
+        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myosharedalignedfree,
+                              "%s(%p)\n", __func__, ptr);
+        m_shared_aligned_free(ptr);
+    }
+
+    void Acquire(void) const {
+        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myoacquire,
+                              "%s()\n", __func__);
+        CheckResult(__func__, m_acquire());
+    }
+
+    void Release(void) const {
+        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myorelease,
+                            "%s()\n", __func__);
+        CheckResult(__func__, m_release());
+    }
+
+    void HostVarTablePropagate(void *table, int num_entries) const {
+        OFFLOAD_DEBUG_TRACE(4, "%s(%p, %d)\n", __func__, table, num_entries);
+        CheckResult(__func__, m_host_var_table_propagate(table, num_entries));
+    }
+
+    void HostFptrTableRegister(void *table, int num_entries,
+                               int ordered) const {
+        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_myoregister,
+                            "%s(%p, %d, %d)\n", __func__, table,
+                            num_entries, ordered);
+        CheckResult(__func__,
+                    m_host_fptr_table_register(table, num_entries, ordered));
+    }
+
+    void RemoteThunkCall(void *thunk, void *args, int device) {
+        OFFLOAD_DEBUG_TRACE(4, "%s(%p, %p, %d)\n", __func__, thunk, args,
+                            device);
+        CheckResult(__func__, m_remote_thunk_call(thunk, args, device));
+    }
+
+    MyoiRFuncCallHandle RemoteCall(char *func, void *args, int device) const {
+        OFFLOAD_DEBUG_TRACE(4, "%s(%s, %p, %d)\n", __func__, func, args,
+                            device);
+        return m_remote_call(func, args, device);
+    }
+
+    void GetResult(MyoiRFuncCallHandle handle) const {
+        OFFLOAD_DEBUG_TRACE(4, "%s(%p)\n", __func__, handle);
+        CheckResult(__func__, m_get_result(handle));
+    }
+
+private:
+    void CheckResult(const char *func, MyoError error) const {
+        if (error != MYO_SUCCESS) {
+            LIBOFFLOAD_ERROR(c_myowrapper_checkresult, func, error);
+            exit(1);
+        }
+    }
+
+private:
+    void* m_lib_handle;
+    bool  m_is_available;
+
+    // pointers to functions from myo library
+    MyoError (*m_lib_init)(void*, void*);
+    void     (*m_lib_fini)(void);
+    void*    (*m_shared_malloc)(size_t);
+    void     (*m_shared_free)(void*);
+    void*    (*m_shared_aligned_malloc)(size_t, size_t);
+    void     (*m_shared_aligned_free)(void*);
+    MyoError (*m_acquire)(void);
+    MyoError (*m_release)(void);
+    MyoError (*m_host_var_table_propagate)(void*, int);
+    MyoError (*m_host_fptr_table_register)(void*, int, int);
+    MyoError (*m_remote_thunk_call)(void*, void*, int);
+    MyoiRFuncCallHandle (*m_remote_call)(char*, void*, int);
+    MyoError (*m_get_result)(MyoiRFuncCallHandle);
+};
+
+bool MyoWrapper::LoadLibrary(void)
+{
+#ifndef TARGET_WINNT
+    const char *lib_name = "libmyo-client.so";
+#else // TARGET_WINNT
+    const char *lib_name = "myo-client.dll";
+#endif // TARGET_WINNT
+
+    OFFLOAD_DEBUG_TRACE(2, "Loading MYO library %s ...\n", lib_name);
+
+    m_lib_handle = DL_open(lib_name);
+    if (m_lib_handle == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to load the library. errno = %d\n",
+                            errno);
+        return false;
+    }
+
+    m_lib_init = (MyoError (*)(void*, void*))
+        DL_sym(m_lib_handle, "myoiLibInit", MYO_VERSION1);
+    if (m_lib_init == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoiLibInit");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_lib_fini = (void (*)(void))
+        DL_sym(m_lib_handle, "myoiLibFini", MYO_VERSION1);
+    if (m_lib_fini == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoiLibFini");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_shared_malloc = (void* (*)(size_t))
+        DL_sym(m_lib_handle, "myoSharedMalloc", MYO_VERSION1);
+    if (m_shared_malloc == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoSharedMalloc");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_shared_free = (void (*)(void*))
+        DL_sym(m_lib_handle, "myoSharedFree", MYO_VERSION1);
+    if (m_shared_free == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoSharedFree");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_shared_aligned_malloc = (void* (*)(size_t, size_t))
+        DL_sym(m_lib_handle, "myoSharedAlignedMalloc", MYO_VERSION1);
+    if (m_shared_aligned_malloc == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoSharedAlignedMalloc");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_shared_aligned_free = (void (*)(void*))
+        DL_sym(m_lib_handle, "myoSharedAlignedFree", MYO_VERSION1);
+    if (m_shared_aligned_free == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoSharedAlignedFree");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_acquire = (MyoError (*)(void))
+        DL_sym(m_lib_handle, "myoAcquire", MYO_VERSION1);
+    if (m_acquire == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoAcquire");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_release = (MyoError (*)(void))
+        DL_sym(m_lib_handle, "myoRelease", MYO_VERSION1);
+    if (m_release == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoRelease");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_host_var_table_propagate = (MyoError (*)(void*, int))
+        DL_sym(m_lib_handle, "myoiHostVarTablePropagate", MYO_VERSION1);
+    if (m_host_var_table_propagate == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoiHostVarTablePropagate");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_host_fptr_table_register = (MyoError (*)(void*, int, int))
+        DL_sym(m_lib_handle, "myoiHostFptrTableRegister", MYO_VERSION1);
+    if (m_host_fptr_table_register == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoiHostFptrTableRegister");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_remote_thunk_call = (MyoError (*)(void*, void*, int))
+        DL_sym(m_lib_handle, "myoiRemoteThunkCall", MYO_VERSION1);
+    if (m_remote_thunk_call == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoiRemoteThunkCall");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_remote_call = (MyoiRFuncCallHandle (*)(char*, void*, int))
+        DL_sym(m_lib_handle, "myoiRemoteCall", MYO_VERSION1);
+    if (m_remote_call == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoiRemoteCall");
+        UnloadLibrary();
+        return false;
+    }
+
+    m_get_result = (MyoError (*)(MyoiRFuncCallHandle))
+        DL_sym(m_lib_handle, "myoiGetResult", MYO_VERSION1);
+    if (m_get_result == 0) {
+        OFFLOAD_DEBUG_TRACE(2, "Failed to find %s in MYO library\n",
+                            "myoiGetResult");
+        UnloadLibrary();
+        return false;
+    }
+
+    OFFLOAD_DEBUG_TRACE(2, "The library was successfully loaded\n");
+
+    m_is_available = true;
+
+    return true;
+}
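+
+// Note: every lookup above follows the same DL_sym/check/unload pattern; a
+// table-driven variant (sketch only, not what this file does) could cut the
+// repetition:
+//
+//     struct Sym { const char *name; void **slot; } syms[] = {
+//         { "myoiLibInit", reinterpret_cast<void**>(&m_lib_init) },
+//         { "myoiLibFini", reinterpret_cast<void**>(&m_lib_fini) },
+//         // ... one row per imported function ...
+//     };
+//     for (size_t i = 0; i < sizeof(syms) / sizeof(syms[0]); i++) {
+//         *syms[i].slot = DL_sym(m_lib_handle, syms[i].name, MYO_VERSION1);
+//         if (*syms[i].slot == 0) {
+//             UnloadLibrary();
+//             return false;
+//         }
+//     }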
+
+static bool myo_is_available;
+static MyoWrapper myo_wrapper;
+
+struct MyoTable
+{
+    MyoTable(SharedTableEntry *tab, int len) : var_tab(tab), var_tab_len(len)
+    {}
+
+    SharedTableEntry*   var_tab;
+    int                 var_tab_len;
+};
+
+typedef std::list<MyoTable> MyoTableList;
+static MyoTableList __myo_table_list;
+static mutex_t      __myo_table_lock;
+static bool         __myo_tables = false;
+
+static void __offload_myo_shared_table_register(SharedTableEntry *entry);
+static void __offload_myo_shared_init_table_register(InitTableEntry* entry);
+static void __offload_myo_fptr_table_register(FptrTableEntry *entry);
+
+static void __offload_myoLoadLibrary_once(void)
+{
+    if (__offload_init_library()) {
+        myo_wrapper.LoadLibrary();
+    }
+}
+
+static bool __offload_myoLoadLibrary(void)
+{
+    static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
+    __offload_run_once(&ctrl, __offload_myoLoadLibrary_once);
+
+    return myo_wrapper.is_available();
+}
+
+static void __offload_myoInit_once(void)
+{
+    if (!__offload_myoLoadLibrary()) {
+        return;
+    }
+
+    // initialize all devices
+    for (int i = 0; i < mic_engines_total; i++) {
+        mic_engines[i].init();
+    }
+
+    // load and initialize MYO library
+    OFFLOAD_DEBUG_TRACE(2, "Initializing MYO library ...\n");
+
+    COIEVENT events[MIC_ENGINES_MAX];
+    MyoiUserParams params[MIC_ENGINES_MAX+1];
+
+    // load target library to all devices
+    for (int i = 0; i < mic_engines_total; i++) {
+        mic_engines[i].init_myo(&events[i]);
+
+        params[i].type = MYOI_USERPARAMS_DEVID;
+        params[i].nodeid = mic_engines[i].get_physical_index() + 1;
+    }
+
+    params[mic_engines_total].type = MYOI_USERPARAMS_LAST_MSG;
+
+    // initialize myo runtime on host
+    myo_wrapper.LibInit(params, 0);
+
+    // wait for the target init calls to finish
+    COIRESULT res;
+    res = COI::EventWait(mic_engines_total, events, -1, 1, 0, 0);
+    if (res != COI_SUCCESS) {
+        LIBOFFLOAD_ERROR(c_event_wait, res);
+        exit(1);
+    }
+
+    myo_is_available = true;
+
+    OFFLOAD_DEBUG_TRACE(2, "Initializing MYO library ... done\n");
+}
+
+static bool __offload_myoInit(void)
+{
+    static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
+    __offload_run_once(&ctrl, __offload_myoInit_once);
+
+    // register pending shared var tables
+    if (myo_is_available && __myo_tables) {
+        mutex_locker_t locker(__myo_table_lock);
+
+        if (__myo_tables) {
+            //  Register tables with MYO so it can propagate to target.
+            for(MyoTableList::const_iterator it = __myo_table_list.begin();
+                it != __myo_table_list.end(); ++it) {
+#ifdef TARGET_WINNT
+                for (SharedTableEntry *entry = it->var_tab;
+                     entry->varName != MYO_TABLE_END_MARKER(); entry++) {
+                    if (entry->varName == 0) {
+                        continue;
+                    }
+                    myo_wrapper.HostVarTablePropagate(entry, 1);
+                }
+#else // TARGET_WINNT
+                myo_wrapper.HostVarTablePropagate(it->var_tab,
+                                                  it->var_tab_len);
+#endif // TARGET_WINNT
+            }
+
+            __myo_table_list.clear();
+            __myo_tables = false;
+        }
+    }
+
+    return myo_is_available;
+}
+
+static bool shared_table_entries(
+    SharedTableEntry *entry
+)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
+
+    for (; entry->varName != MYO_TABLE_END_MARKER(); entry++) {
+#ifdef TARGET_WINNT
+        if (entry->varName == 0) {
+            continue;
+        }
+#endif // TARGET_WINNT
+
+        return true;
+    }
+
+    return false;
+}
+
+static bool fptr_table_entries(
+    FptrTableEntry *entry
+)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
+
+    for (; entry->funcName != MYO_TABLE_END_MARKER(); entry++) {
+#ifdef TARGET_WINNT
+        if (entry->funcName == 0) {
+            continue;
+        }
+#endif // TARGET_WINNT
+
+        return true;
+    }
+
+    return false;
+}
+
+extern "C" void __offload_myoRegisterTables(
+    InitTableEntry* init_table,
+    SharedTableEntry *shared_table,
+    FptrTableEntry *fptr_table
+)
+{
+    // Check whether we need to initialize the MYO library. It is
+    // initialized only if at least one MYO table is not empty.
+    if (shared_table_entries(shared_table) || fptr_table_entries(fptr_table)) {
+        // make sure myo library is loaded
+        __offload_myoLoadLibrary();
+
+        // register tables
+        __offload_myo_shared_table_register(shared_table);
+        __offload_myo_fptr_table_register(fptr_table);
+        __offload_myo_shared_init_table_register(init_table);
+    }
+}
+
+void __offload_myoFini(void)
+{
+    if (myo_is_available) {
+        OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
+
+        COIEVENT events[MIC_ENGINES_MAX];
+
+        // kick off myoiLibFini calls on all devices
+        for (int i = 0; i < mic_engines_total; i++) {
+            mic_engines[i].fini_myo(&events[i]);
+        }
+
+        // cleanup myo runtime on host
+        myo_wrapper.LibFini();
+
+        // wait for the target fini calls to finish
+        COIRESULT res;
+        res = COI::EventWait(mic_engines_total, events, -1, 1, 0, 0);
+        if (res != COI_SUCCESS) {
+            LIBOFFLOAD_ERROR(c_event_wait, res);
+            exit(1);
+        }
+    }
+}
+
+static void __offload_myo_shared_table_register(
+    SharedTableEntry *entry
+)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
+
+    SharedTableEntry *start = entry;
+    int entries = 0;
+
+    // allocate shared memory for vars
+    for (; entry->varName != MYO_TABLE_END_MARKER(); entry++) {
+#ifdef TARGET_WINNT
+        if (entry->varName == 0) {
+            OFFLOAD_DEBUG_TRACE(4, "skip registering a NULL MyoSharedTable entry\n");
+            continue;
+        }
+#endif // TARGET_WINNT
+
+        OFFLOAD_DEBUG_TRACE(4, "registering MyoSharedTable entry for %s @%p\n",
+                            entry->varName, entry);
+
+        // Invoke the function to create shared memory
+        reinterpret_cast<void(*)(void)>(entry->sharedAddr)();
+        entries++;
+    }
+
+    // add the table to the list if it is not empty
+    if (entries > 0) {
+        mutex_locker_t locker(__myo_table_lock);
+        __myo_table_list.push_back(MyoTable(start, entries));
+        __myo_tables = true;
+    }
+}
+
+static void __offload_myo_shared_init_table_register(InitTableEntry* entry)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
+
+#ifdef TARGET_WINNT
+    for (; entry->funcName != MYO_TABLE_END_MARKER(); entry++) {
+        if (entry->funcName == 0) {
+            OFFLOAD_DEBUG_TRACE(4, "skip registering a NULL MyoSharedInit entry\n");
+            continue;
+        }
+
+        //  Invoke the function to init the shared memory
+        entry->func();
+    }
+#else // TARGET_WINNT
+    for (; entry->func != 0; entry++) {
+        // Invoke the function to init the shared memory
+        entry->func();
+    }
+#endif // TARGET_WINNT
+}
+
+static void __offload_myo_fptr_table_register(
+    FptrTableEntry *entry
+)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
+
+    FptrTableEntry *start = entry;
+    int entries = 0;
+
+    for (; entry->funcName != MYO_TABLE_END_MARKER(); entry++) {
+#ifdef TARGET_WINNT
+        if (entry->funcName == 0) {
+            OFFLOAD_DEBUG_TRACE(4, "skip registering a NULL MyoFptrTable entry\n");
+            continue;
+        }
+#endif // TARGET_WINNT
+
+        if (!myo_wrapper.is_available()) {
+            *(static_cast<void**>(entry->localThunkAddr)) = entry->funcAddr;
+        }
+
+        OFFLOAD_DEBUG_TRACE(4, "registering MyoFptrTable entry for %s @%p\n",
+                            entry->funcName, entry);
+
+#ifdef TARGET_WINNT
+        if (myo_wrapper.is_available()) {
+            myo_wrapper.HostFptrTableRegister(entry, 1, false);
+        }
+#endif // TARGET_WINNT
+
+        entries++;
+    }
+
+#ifndef TARGET_WINNT
+    if (myo_wrapper.is_available() && entries > 0) {
+        myo_wrapper.HostFptrTableRegister(start, entries, false);
+    }
+#endif // TARGET_WINNT
+}
+
+extern "C" int __offload_myoIsAvailable(int target_number)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%d)\n", __func__, target_number);
+
+    if (target_number >= -2) {
+        bool is_default_number = (target_number == -2);
+
+        if (__offload_myoInit()) {
+            if (target_number >= 0) {
+                // User provided the device number
+                int num = target_number % mic_engines_total;
+
+                // reserve device in ORSL
+                target_number = ORSL::reserve(num) ? num : -1;
+            }
+            else {
+                // try to use device 0
+                target_number = ORSL::reserve(0) ? 0 : -1;
+            }
+
+            // make sure device is initialized
+            if (target_number >= 0) {
+                mic_engines[target_number].init();
+            }
+        }
+        else {
+            // fallback to CPU
+            target_number = -1;
+        }
+
+        if (target_number < 0 && !is_default_number) {
+            LIBOFFLOAD_ERROR(c_device_is_not_available);
+            exit(1);
+        }
+    }
+    else {
+        LIBOFFLOAD_ERROR(c_invalid_device_number);
+        exit(1);
+    }
+
+    return target_number;
+}
+
+extern "C" void __offload_myoiRemoteIThunkCall(
+    void *thunk,
+    void *arg,
+    int target_number
+)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p, %p, %d)\n", __func__, thunk, arg,
+                        target_number);
+
+    myo_wrapper.Release();
+    myo_wrapper.RemoteThunkCall(thunk, arg, target_number);
+    myo_wrapper.Acquire();
+
+    ORSL::release(target_number);
+}
+
+extern "C" void* _Offload_shared_malloc(size_t size)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%lld)\n", __func__, size);
+
+    if (__offload_myoLoadLibrary()) {
+        return myo_wrapper.SharedMalloc(size);
+    }
+    else {
+        return malloc(size);
+    }
+}
+
+extern "C" void _Offload_shared_free(void *ptr)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, ptr);
+
+    if (__offload_myoLoadLibrary()) {
+        myo_wrapper.SharedFree(ptr);
+    }
+    else {
+        free(ptr);
+    }
+}
+
+extern "C" void* _Offload_shared_aligned_malloc(size_t size, size_t align)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%lld, %lld)\n", __func__, size, align);
+
+    if (__offload_myoLoadLibrary()) {
+        return myo_wrapper.SharedAlignedMalloc(size, align);
+    }
+    else {
+        if (align < sizeof(void*)) {
+            align = sizeof(void*);
+        }
+        return _mm_malloc(size, align);
+    }
+}
+
+extern "C" void _Offload_shared_aligned_free(void *ptr)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, ptr);
+
+    if (__offload_myoLoadLibrary()) {
+        myo_wrapper.SharedAlignedFree(ptr);
+    }
+    else {
+        _mm_free(ptr);
+    }
+}
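+
+// Usage sketch (illustrative only; "n" is hypothetical): the allocators
+// above fall back to the regular heap when the MYO library is unavailable,
+// so callers can use them unconditionally:
+//
+//     int *buf = (int*) _Offload_shared_malloc(n * sizeof(int));
+//     // ... buf is visible to host and target when MYO is active ...
+//     _Offload_shared_free(buf);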
+
+extern "C" void __intel_cilk_for_32_offload(
+    int size,
+    void (*copy_constructor)(void*, void*),
+    int target_number,
+    void *raddr,
+    void *closure_object,
+    unsigned int iters,
+    unsigned int grain_size)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
+
+    target_number = __offload_myoIsAvailable(target_number);
+    if (target_number >= 0) {
+        struct S {
+            void *M1;
+            unsigned int M2;
+            unsigned int M3;
+            char closure[];
+        } *args;
+
+        args = (struct S*) _Offload_shared_malloc(sizeof(struct S) + size);
+        args->M1 = raddr;
+        args->M2 = iters;
+        args->M3 = grain_size;
+
+        if (copy_constructor == 0) {
+            memcpy(args->closure, closure_object, size);
+        }
+        else {
+            copy_constructor(args->closure, closure_object);
+        }
+
+        myo_wrapper.Release();
+        myo_wrapper.GetResult(
+            myo_wrapper.RemoteCall("__intel_cilk_for_32_offload",
+                                   args, target_number)
+        );
+        myo_wrapper.Acquire();
+
+        _Offload_shared_free(args);
+
+        ORSL::release(target_number);
+    }
+    else {
+        __cilkrts_cilk_for_32(raddr,
+                              closure_object,
+                              iters,
+                              grain_size);
+    }
+}
+
+extern "C" void __intel_cilk_for_64_offload(
+    int size,
+    void (*copy_constructor)(void*, void*),
+    int target_number,
+    void *raddr,
+    void *closure_object,
+    uint64_t iters,
+    uint64_t grain_size)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
+
+    target_number = __offload_myoIsAvailable(target_number);
+    if (target_number >= 0) {
+        struct S {
+            void *M1;
+            uint64_t M2;
+            uint64_t M3;
+            char closure[];
+        } *args;
+
+        args = (struct S*) _Offload_shared_malloc(sizeof(struct S) + size);
+        args->M1 = raddr;
+        args->M2 = iters;
+        args->M3 = grain_size;
+
+        if (copy_constructor == 0) {
+            memcpy(args->closure, closure_object, size);
+        }
+        else {
+            copy_constructor(args->closure, closure_object);
+        }
+
+        myo_wrapper.Release();
+        myo_wrapper.GetResult(
+            myo_wrapper.RemoteCall("__intel_cilk_for_64_offload", args,
+                                   target_number)
+        );
+        myo_wrapper.Acquire();
+
+        _Offload_shared_free(args);
+
+        ORSL::release(target_number);
+    }
+    else {
+        __cilkrts_cilk_for_64(raddr,
+                              closure_object,
+                              iters,
+                              grain_size);
+    }
+}
diff --git a/final/offload/src/offload_myo_host.h b/final/offload/src/offload_myo_host.h
new file mode 100644
index 0000000..92a61f4
--- /dev/null
+++ b/final/offload/src/offload_myo_host.h
@@ -0,0 +1,80 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef OFFLOAD_MYO_HOST_H_INCLUDED
+#define OFFLOAD_MYO_HOST_H_INCLUDED
+
+#include <myotypes.h>
+#include <myoimpl.h>
+#include <myo.h>
+#include "offload.h"
+
+typedef MyoiSharedVarEntry      SharedTableEntry;
+//typedef MyoiHostSharedFptrEntry FptrTableEntry;
+typedef struct {
+    //! Function Name
+    const char *funcName;
+    //! Function Address
+    void *funcAddr;
+    //! Local Thunk Address
+    void *localThunkAddr;
+#ifdef TARGET_WINNT
+    // Dummy to pad up to 32 bytes
+    void *dummy;
+#endif // TARGET_WINNT
+} FptrTableEntry;
+
+struct InitTableEntry {
+#ifdef TARGET_WINNT
+    // Function name; also pads the entry up to 16 bytes on Windows
+    const char *funcName;
+#endif // TARGET_WINNT
+    void (*func)(void);
+};
+
+#ifdef TARGET_WINNT
+#define OFFLOAD_MYO_SHARED_TABLE_SECTION_START          ".MyoSharedTable$a"
+#define OFFLOAD_MYO_SHARED_TABLE_SECTION_END            ".MyoSharedTable$z"
+
+#define OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_START     ".MyoSharedInitTable$a"
+#define OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_END       ".MyoSharedInitTable$z"
+
+#define OFFLOAD_MYO_FPTR_TABLE_SECTION_START            ".MyoFptrTable$a"
+#define OFFLOAD_MYO_FPTR_TABLE_SECTION_END              ".MyoFptrTable$z"
+#else  // TARGET_WINNT
+#define OFFLOAD_MYO_SHARED_TABLE_SECTION_START          ".MyoSharedTable."
+#define OFFLOAD_MYO_SHARED_TABLE_SECTION_END            ".MyoSharedTable."
+
+#define OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_START     ".MyoSharedInitTable."
+#define OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_END       ".MyoSharedInitTable."
+
+#define OFFLOAD_MYO_FPTR_TABLE_SECTION_START            ".MyoFptrTable."
+#define OFFLOAD_MYO_FPTR_TABLE_SECTION_END              ".MyoFptrTable."
+#endif // TARGET_WINNT
+
+#pragma section(OFFLOAD_MYO_SHARED_TABLE_SECTION_START, read, write)
+#pragma section(OFFLOAD_MYO_SHARED_TABLE_SECTION_END, read, write)
+
+#pragma section(OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_START, read, write)
+#pragma section(OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_END, read, write)
+
+#pragma section(OFFLOAD_MYO_FPTR_TABLE_SECTION_START, read, write)
+#pragma section(OFFLOAD_MYO_FPTR_TABLE_SECTION_END, read, write)
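+
+// Bracketing sketch (illustrative only; "__myo_table_start" and
+// "__myo_table_end" are hypothetical): on Windows the "$a"/"$z" suffixes
+// sort alphabetically within a section, so variables placed in the start
+// and end sections delimit the compiler-emitted table entries that land
+// in between:
+//
+//     __declspec(allocate(OFFLOAD_MYO_SHARED_TABLE_SECTION_START))
+//     static SharedTableEntry __myo_table_start = { 0 };
+//     __declspec(allocate(OFFLOAD_MYO_SHARED_TABLE_SECTION_END))
+//     static SharedTableEntry __myo_table_end = { 0 };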
+
+extern "C" void __offload_myoRegisterTables(
+    InitTableEntry *init_table,
+    SharedTableEntry *shared_table,
+    FptrTableEntry *fptr_table
+);
+
+extern void __offload_myoFini(void);
+
+#endif // OFFLOAD_MYO_HOST_H_INCLUDED
diff --git a/final/offload/src/offload_myo_target.cpp b/final/offload/src/offload_myo_target.cpp
new file mode 100644
index 0000000..eeb1c4f
--- /dev/null
+++ b/final/offload/src/offload_myo_target.cpp
@@ -0,0 +1,184 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_myo_target.h"
+#include "offload_target.h"
+
+extern "C" void __cilkrts_cilk_for_32(void*, void*, uint32_t, int32_t);
+extern "C" void __cilkrts_cilk_for_64(void*, void*, uint64_t, int32_t);
+
+#pragma weak __cilkrts_cilk_for_32
+#pragma weak __cilkrts_cilk_for_64
+
+static void CheckResult(const char *func, MyoError error) {
+    if (error != MYO_SUCCESS) {
+        LIBOFFLOAD_ERROR(c_myotarget_checkresult, func, error);
+        exit(1);
+    }
+}
+
+static void __offload_myo_shared_table_register(SharedTableEntry *entry)
+{
+    int entries = 0;
+    SharedTableEntry *t_start;
+
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
+
+    t_start = entry;
+    while (t_start->varName != 0) {
+        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_mic_myo_shared,
+                              "myo shared entry name = \"%s\" addr = %p\n",
+                              t_start->varName, t_start->sharedAddr);
+        t_start++;
+        entries++;
+    }
+
+    if (entries > 0) {
+        OFFLOAD_DEBUG_TRACE(3, "myoiMicVarTableRegister(%p, %d)\n", entry,
+                            entries);
+        CheckResult("myoiMicVarTableRegister",
+                    myoiMicVarTableRegister(entry, entries));
+    }
+}
+
+static void __offload_myo_fptr_table_register(
+    FptrTableEntry *entry
+)
+{
+    int entries = 0;
+    FptrTableEntry *t_start;
+
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, entry);
+
+    t_start = entry;
+    while (t_start->funcName != 0) {
+        OFFLOAD_DEBUG_TRACE_1(4, 0, c_offload_mic_myo_fptr,
+                              "myo fptr entry name = \"%s\" addr = %p\n",
+                              t_start->funcName, t_start->funcAddr);
+        t_start++;
+        entries++;
+    }
+
+    if (entries > 0) {
+        OFFLOAD_DEBUG_TRACE(3, "myoiTargetFptrTableRegister(%p, %d, 0)\n",
+                            entry, entries);
+        CheckResult("myoiTargetFptrTableRegister",
+                    myoiTargetFptrTableRegister(entry, entries, 0));
+    }
+}
+
+extern "C" void __offload_myoAcquire(void)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
+    CheckResult("myoAcquire", myoAcquire());
+}
+
+extern "C" void __offload_myoRelease(void)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
+    CheckResult("myoRelease", myoRelease());
+}
+
+extern "C" void __intel_cilk_for_32_offload_wrapper(void *args_)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
+
+    struct S {
+        void *M1;
+        unsigned int M2;
+        unsigned int M3;
+        char closure[];
+    } *args = (struct S*) args_;
+
+    __cilkrts_cilk_for_32(args->M1, args->closure, args->M2, args->M3);
+}
+
+extern "C" void __intel_cilk_for_64_offload_wrapper(void *args_)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
+
+    struct S {
+        void *M1;
+        uint64_t M2;
+        uint64_t M3;
+        char closure[];
+    } *args = (struct S*) args_;
+
+    __cilkrts_cilk_for_64(args->M1, args->closure, args->M2, args->M3);
+}
+
+static void __offload_myo_once_init(void)
+{
+    CheckResult("myoiRemoteFuncRegister",
+                myoiRemoteFuncRegister(
+                    (MyoiRemoteFuncType) __intel_cilk_for_32_offload_wrapper,
+                    "__intel_cilk_for_32_offload"));
+    CheckResult("myoiRemoteFuncRegister",
+                myoiRemoteFuncRegister(
+                    (MyoiRemoteFuncType) __intel_cilk_for_64_offload_wrapper,
+                    "__intel_cilk_for_64_offload"));
+}
+
+extern "C" void __offload_myoRegisterTables(
+    SharedTableEntry *shared_table,
+    FptrTableEntry *fptr_table
+)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s\n", __func__);
+
+    // one time registration of Intel(R) Cilk(TM) language entries
+    static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+    pthread_once(&once_control, __offload_myo_once_init);
+
+    // register this module's tables; skip if both tables are empty
+    if (shared_table->varName == 0 && fptr_table->funcName == 0) {
+        return;
+    }
+
+    __offload_myo_shared_table_register(shared_table);
+    __offload_myo_fptr_table_register(fptr_table);
+}
+
+extern "C" void* _Offload_shared_malloc(size_t size)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%lld)\n", __func__, size);
+    return myoSharedMalloc(size);
+}
+
+extern "C" void _Offload_shared_free(void *ptr)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, ptr);
+    myoSharedFree(ptr);
+}
+
+extern "C" void* _Offload_shared_aligned_malloc(size_t size, size_t align)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%lld, %lld)\n", __func__, size, align);
+    return myoSharedAlignedMalloc(size, align);
+}
+
+extern "C" void _Offload_shared_aligned_free(void *ptr)
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s(%p)\n", __func__, ptr);
+    myoSharedAlignedFree(ptr);
+}
+
+// temporary workaround for blocking behavior of myoiLibInit/Fini calls
+extern "C" void __offload_myoLibInit()
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s()\n", __func__);
+    CheckResult("myoiLibInit", myoiLibInit(0, 0));
+}
+
+extern "C" void __offload_myoLibFini()
+{
+    OFFLOAD_DEBUG_TRACE(3, "%s()\n", __func__);
+    myoiLibFini();
+}
diff --git a/final/offload/src/offload_myo_target.h b/final/offload/src/offload_myo_target.h
new file mode 100644
index 0000000..8b7f789
--- /dev/null
+++ b/final/offload/src/offload_myo_target.h
@@ -0,0 +1,54 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef OFFLOAD_MYO_TARGET_H_INCLUDED
+#define OFFLOAD_MYO_TARGET_H_INCLUDED
+
+#include <myotypes.h>
+#include <myoimpl.h>
+#include <myo.h>
+#include "offload.h"
+
+typedef MyoiSharedVarEntry          SharedTableEntry;
+typedef MyoiTargetSharedFptrEntry   FptrTableEntry;
+
+#ifdef TARGET_WINNT
+#define OFFLOAD_MYO_SHARED_TABLE_SECTION_START          ".MyoSharedTable$a"
+#define OFFLOAD_MYO_SHARED_TABLE_SECTION_END            ".MyoSharedTable$z"
+
+#define OFFLOAD_MYO_FPTR_TABLE_SECTION_START            ".MyoFptrTable$a"
+#define OFFLOAD_MYO_FPTR_TABLE_SECTION_END              ".MyoFptrTable$z"
+#else  // TARGET_WINNT
+#define OFFLOAD_MYO_SHARED_TABLE_SECTION_START          ".MyoSharedTable."
+#define OFFLOAD_MYO_SHARED_TABLE_SECTION_END            ".MyoSharedTable."
+
+#define OFFLOAD_MYO_FPTR_TABLE_SECTION_START            ".MyoFptrTable."
+#define OFFLOAD_MYO_FPTR_TABLE_SECTION_END              ".MyoFptrTable."
+#endif // TARGET_WINNT
+
+#pragma section(OFFLOAD_MYO_SHARED_TABLE_SECTION_START, read, write)
+#pragma section(OFFLOAD_MYO_SHARED_TABLE_SECTION_END, read, write)
+
+#pragma section(OFFLOAD_MYO_FPTR_TABLE_SECTION_START, read, write)
+#pragma section(OFFLOAD_MYO_FPTR_TABLE_SECTION_END, read, write)
+
+extern "C" void __offload_myoRegisterTables(
+    SharedTableEntry *shared_table,
+    FptrTableEntry *fptr_table
+);
+
+extern "C" void __offload_myoAcquire(void);
+extern "C" void __offload_myoRelease(void);
+
+// temporary workaround for the blocking behavior of myoiLibInit/Fini calls
+extern "C" void __offload_myoLibInit();
+extern "C" void __offload_myoLibFini();
+
+#endif // OFFLOAD_MYO_TARGET_H_INCLUDED
diff --git a/final/offload/src/offload_omp_host.cpp b/final/offload/src/offload_omp_host.cpp
new file mode 100644
index 0000000..edd4445
--- /dev/null
+++ b/final/offload/src/offload_omp_host.cpp
@@ -0,0 +1,851 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include <omp.h>
+#include "offload.h"
+#include "compiler_if_host.h"
+
+// OpenMP API
+
+void omp_set_default_device(int num)
+{
+    if (num >= 0) {
+        __omp_device_num = num;
+    }
+}
+
+int omp_get_default_device(void)
+{
+    return __omp_device_num;
+}
+
+int omp_get_num_devices()
+{
+    __offload_init_library();
+    return mic_engines_total;
+}
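+
+// Usage sketch (illustrative only): host code can pick an explicit default
+// device once the runtime reports at least one device:
+//
+//     if (omp_get_num_devices() > 0) {
+//         omp_set_default_device(0);
+//     }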
+
+// OpenMP API wrappers
+
+static void omp_set_int_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int setting,
+    const char* f_name
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          f_name, 0);
+    if (ofld) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_in;
+        vars[0].size = sizeof(int);
+        vars[0].count = 1;
+        vars[0].ptr = &setting;
+
+        OFFLOAD_OFFLOAD(ofld, f_name, 0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+static int omp_get_int_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    const char * f_name
+)
+{
+    int setting = 0;
+
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          f_name, 0);
+    if (ofld) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_out;
+        vars[0].size = sizeof(int);
+        vars[0].count = 1;
+        vars[0].ptr = &setting;
+
+        OFFLOAD_OFFLOAD(ofld, f_name, 0, 1, vars, NULL, 0, 0, 0);
+    }
+    return setting;
+}
+
+void omp_set_num_threads_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int num_threads
+)
+{
+    omp_set_int_target(target_type, target_number, num_threads,
+                       "omp_set_num_threads_target");
+}
+
+int omp_get_max_threads_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return omp_get_int_target(target_type, target_number,
+                              "omp_get_max_threads_target");
+}
+
+int omp_get_num_procs_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return omp_get_int_target(target_type, target_number,
+                              "omp_get_num_procs_target");
+}
+
+void omp_set_dynamic_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int num_threads
+)
+{
+    omp_set_int_target(target_type, target_number, num_threads,
+                       "omp_set_dynamic_target");
+}
+
+int omp_get_dynamic_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return omp_get_int_target(target_type, target_number,
+                              "omp_get_dynamic_target");
+}
+
+void omp_set_nested_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int nested
+)
+{
+    omp_set_int_target(target_type, target_number, nested,
+                       "omp_set_nested_target");
+}
+
+int omp_get_nested_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return omp_get_int_target(target_type, target_number,
+                              "omp_get_nested_target");
+}
+
+void omp_set_schedule_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_sched_t kind,
+    int modifier
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[2] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_in;
+        vars[0].size = sizeof(omp_sched_t);
+        vars[0].count = 1;
+        vars[0].ptr = &kind;
+
+        vars[1].type.src = c_data;
+        vars[1].type.dst = c_data;
+        vars[1].direction.bits = c_parameter_in;
+        vars[1].size = sizeof(int);
+        vars[1].count = 1;
+        vars[1].ptr = &modifier;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_set_schedule_target",
+                        0, 2, vars, NULL, 0, 0, 0);
+    }
+}
+
+void omp_get_schedule_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_sched_t *kind,
+    int *modifier
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[2] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_out;
+        vars[0].size = sizeof(omp_sched_t);
+        vars[0].count = 1;
+        vars[0].ptr = kind;
+
+        vars[1].type.src = c_data;
+        vars[1].type.dst = c_data;
+        vars[1].direction.bits = c_parameter_out;
+        vars[1].size = sizeof(int);
+        vars[1].count = 1;
+        vars[1].ptr = modifier;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_get_schedule_target",
+                        0, 2, vars, NULL, 0, 0, 0);
+    }
+}
+
+// lock API functions
+
+void omp_init_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_out;
+        vars[0].size = sizeof(omp_lock_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = lock;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_init_lock_target",
+                        0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+void omp_destroy_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_in;
+        vars[0].size = sizeof(omp_lock_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = lock;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_destroy_lock_target",
+                        0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+void omp_set_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_inout;
+        vars[0].size = sizeof(omp_lock_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = lock;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_set_lock_target",
+                        0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+void omp_unset_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_inout;
+        vars[0].size = sizeof(omp_lock_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = lock;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_unset_lock_target",
+                        0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+int omp_test_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+)
+{
+    int result = 0;
+
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[2] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_inout;
+        vars[0].size = sizeof(omp_lock_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = lock;
+
+        vars[1].type.src = c_data;
+        vars[1].type.dst = c_data;
+        vars[1].direction.bits = c_parameter_out;
+        vars[1].size = sizeof(int);
+        vars[1].count = 1;
+        vars[1].ptr = &result;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_test_lock_target",
+                        0, 2, vars, NULL, 0, 0, 0);
+    }
+    return result;
+}
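+
+// Usage sketch (illustrative only; TARGET_MIC and device 0 are assumed
+// placeholders):
+//
+//     omp_lock_target_t lock;
+//     omp_init_lock_target(TARGET_MIC, 0, &lock);
+//     omp_set_lock_target(TARGET_MIC, 0, &lock);
+//     omp_unset_lock_target(TARGET_MIC, 0, &lock);
+//     omp_destroy_lock_target(TARGET_MIC, 0, &lock);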
+
+// nested lock API functions
+
+void omp_init_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_out;
+        vars[0].size = sizeof(omp_nest_lock_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = lock;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_init_nest_lock_target",
+                        0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+void omp_destroy_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_in;
+        vars[0].size = sizeof(omp_nest_lock_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = lock;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_destroy_nest_lock_target",
+                        0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+void omp_set_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_inout;
+        vars[0].size = sizeof(omp_nest_lock_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = lock;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_set_nest_lock_target",
+                        0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+void omp_unset_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_inout;
+        vars[0].size = sizeof(omp_nest_lock_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = lock;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_unset_nest_lock_target",
+                        0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+int omp_test_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+)
+{
+    int result = 0;
+
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[2] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_inout;
+        vars[0].size = sizeof(omp_nest_lock_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = lock;
+
+        vars[1].type.src = c_data;
+        vars[1].type.dst = c_data;
+        vars[1].direction.bits = c_parameter_out;
+        vars[1].size = sizeof(int);
+        vars[1].count = 1;
+        vars[1].ptr = &result;
+
+        OFFLOAD_OFFLOAD(ofld, "omp_test_nest_lock_target",
+                        0, 2, vars, NULL, 0, 0, 0);
+    }
+    return result;
+}
+
+// kmp API functions
+
+void kmp_set_stacksize_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int size
+)
+{
+    omp_set_int_target(target_type, target_number, size,
+                       "kmp_set_stacksize_target");
+}
+
+int kmp_get_stacksize_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return omp_get_int_target(target_type, target_number,
+                              "kmp_get_stacksize_target");
+}
+
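+// Note: the wrappers below go through omp_set_int_target/omp_get_int_target,
+// which marshal the value with size sizeof(int), so size_t values are
+// narrowed in transit.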
+void kmp_set_stacksize_s_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    size_t size
+)
+{
+    omp_set_int_target(target_type, target_number, size,
+                       "kmp_set_stacksize_s_target");
+}
+
+size_t kmp_get_stacksize_s_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return omp_get_int_target(target_type, target_number,
+                              "kmp_get_stacksize_s_target");
+}
+
+void kmp_set_blocktime_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int time
+)
+{
+    omp_set_int_target(target_type, target_number, time,
+                       "kmp_set_blocktime_target");
+}
+
+int kmp_get_blocktime_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return omp_get_int_target(target_type, target_number,
+                              "kmp_get_blocktime_target");
+}
+
+void kmp_set_library_serial_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        OFFLOAD_OFFLOAD(ofld, "kmp_set_library_serial_target",
+                        0, 0, 0, 0, 0, 0, 0);
+    }
+}
+
+void kmp_set_library_turnaround_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        OFFLOAD_OFFLOAD(ofld, "kmp_set_library_turnaround_target",
+                        0, 0, 0, 0, 0, 0, 0);
+    }
+}
+
+void kmp_set_library_throughput_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        OFFLOAD_OFFLOAD(ofld, "kmp_set_library_throughput_target",
+                        0, 0, 0, 0, 0, 0, 0);
+    }
+}
+
+void kmp_set_library_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int mode
+)
+{
+    omp_set_int_target(target_type, target_number, mode,
+                       "kmp_set_library_target");
+}
+
+int kmp_get_library_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return omp_get_int_target(target_type, target_number,
+                              "kmp_get_library_target");
+}
+
+void kmp_set_defaults_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    char const *defaults
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_string_ptr;
+        vars[0].type.dst = c_string_ptr;
+        vars[0].direction.bits = c_parameter_in;
+        vars[0].alloc_if = 1;
+        vars[0].free_if = 1;
+        vars[0].ptr = &defaults;
+
+        OFFLOAD_OFFLOAD(ofld, "kmp_set_defaults_target",
+                        0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+// affinity API functions
+
+void kmp_create_affinity_mask_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_out;
+        vars[0].size = sizeof(kmp_affinity_mask_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = mask;
+
+        OFFLOAD_OFFLOAD(ofld, "kmp_create_affinity_mask_target",
+                        0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+void kmp_destroy_affinity_mask_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[1] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_in;
+        vars[0].size = sizeof(kmp_affinity_mask_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = mask;
+
+        OFFLOAD_OFFLOAD(ofld, "kmp_destroy_affinity_mask_target",
+                        0, 1, vars, NULL, 0, 0, 0);
+    }
+}
+
+int kmp_set_affinity_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    int result = 1;
+
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[2] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_in;
+        vars[0].size = sizeof(kmp_affinity_mask_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = mask;
+
+        vars[1].type.src = c_data;
+        vars[1].type.dst = c_data;
+        vars[1].direction.bits = c_parameter_out;
+        vars[1].size = sizeof(int);
+        vars[1].count = 1;
+        vars[1].ptr = &result;
+
+        OFFLOAD_OFFLOAD(ofld, "kmp_set_affinity_target",
+                        0, 2, vars, NULL, 0, 0, 0);
+    }
+    return result;
+}
+
+int kmp_get_affinity_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    int result = 1;
+
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[2] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_inout;
+        vars[0].size = sizeof(kmp_affinity_mask_target_t);
+        vars[0].count = 1;
+        vars[0].ptr = mask;
+
+        vars[1].type.src = c_data;
+        vars[1].type.dst = c_data;
+        vars[1].direction.bits = c_parameter_out;
+        vars[1].size = sizeof(int);
+        vars[1].count = 1;
+        vars[1].ptr = &result;
+
+        OFFLOAD_OFFLOAD(ofld, "kmp_get_affinity_target",
+                        0, 2, vars, NULL, 0, 0, 0);
+    }
+    return result;
+}
+
+int kmp_get_affinity_max_proc_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return omp_get_int_target(target_type, target_number,
+                              "kmp_get_affinity_max_proc_target");
+}
+
+int kmp_set_affinity_mask_proc_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int proc,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    int result = 1;
+
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[3] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_in;
+        vars[0].size = sizeof(int);
+        vars[0].count = 1;
+        vars[0].ptr = &proc;
+
+        vars[1].type.src = c_data;
+        vars[1].type.dst = c_data;
+        vars[1].direction.bits = c_parameter_inout;
+        vars[1].size = sizeof(kmp_affinity_mask_target_t);
+        vars[1].count = 1;
+        vars[1].ptr = mask;
+
+        vars[2].type.src = c_data;
+        vars[2].type.dst = c_data;
+        vars[2].direction.bits = c_parameter_out;
+        vars[2].size = sizeof(int);
+        vars[2].count = 1;
+        vars[2].ptr = &result;
+
+        OFFLOAD_OFFLOAD(ofld, "kmp_set_affinity_mask_proc_target",
+                        0, 3, vars, NULL, 0, 0, 0);
+    }
+    return result;
+}
+
+int kmp_unset_affinity_mask_proc_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int proc,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    int result = 1;
+
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[3] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_in;
+        vars[0].size = sizeof(int);
+        vars[0].count = 1;
+        vars[0].ptr = &proc;
+
+        vars[1].type.src = c_data;
+        vars[1].type.dst = c_data;
+        vars[1].direction.bits = c_parameter_inout;
+        vars[1].size = sizeof(kmp_affinity_mask_target_t);
+        vars[1].count = 1;
+        vars[1].ptr = mask;
+
+        vars[2].type.src = c_data;
+        vars[2].type.dst = c_data;
+        vars[2].direction.bits = c_parameter_out;
+        vars[2].size = sizeof(int);
+        vars[2].count = 1;
+        vars[2].ptr = &result;
+
+        OFFLOAD_OFFLOAD(ofld, "kmp_unset_affinity_mask_proc_target",
+                        0, 3, vars, NULL, 0, 0, 0);
+    }
+    return result;
+}
+
+int kmp_get_affinity_mask_proc_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int proc,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    int result = 1;
+
+    OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
+                                          __func__, 0);
+    if (ofld != 0) {
+        VarDesc vars[3] = {0};
+
+        vars[0].type.src = c_data;
+        vars[0].type.dst = c_data;
+        vars[0].direction.bits = c_parameter_in;
+        vars[0].size = sizeof(int);
+        vars[0].count = 1;
+        vars[0].ptr = &proc;
+
+        vars[1].type.src = c_data;
+        vars[1].type.dst = c_data;
+        vars[1].direction.bits = c_parameter_in;
+        vars[1].size = sizeof(kmp_affinity_mask_target_t);
+        vars[1].count = 1;
+        vars[1].ptr = mask;
+
+        vars[2].type.src = c_data;
+        vars[2].type.dst = c_data;
+        vars[2].direction.bits = c_parameter_out;
+        vars[2].size = sizeof(int);
+        vars[2].count = 1;
+        vars[2].ptr = &result;
+
+        OFFLOAD_OFFLOAD(ofld, "kmp_get_affinity_mask_proc_target",
+                        0, 3, vars, NULL, 0, 0, 0);
+    }
+    return result;
+}
diff --git a/final/offload/src/offload_omp_target.cpp b/final/offload/src/offload_omp_target.cpp
new file mode 100644
index 0000000..1f2052a
--- /dev/null
+++ b/final/offload/src/offload_omp_target.cpp
@@ -0,0 +1,1021 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include <omp.h>
+#include "offload.h"
+#include "compiler_if_target.h"
+
+// OpenMP API
+
+void omp_set_default_device(int num)
+{
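+    // Intentionally a no-op on the target side: the card's own index is
+    // fixed (see omp_get_default_device below), so the request is ignored.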
+}
+
+int omp_get_default_device(void)
+{
+    return mic_index;
+}
+
+int omp_get_num_devices()
+{
+    return mic_engines_total;
+}
+
+// OpenMP API wrappers
+
+static void omp_send_int_to_host(
+    void *ofld_,
+    int setting
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_out;
+    vars[0].ptr = &setting;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+static int omp_get_int_from_host(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    int setting;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_in;
+    vars[0].ptr = &setting;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    OFFLOAD_TARGET_LEAVE(ofld);
+
+    return setting;
+}
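+
+// How the integer wrappers below are reached (illustrative, derived from
+// the entry table in offload_table.cpp): a host-side call such as
+// omp_set_num_threads_target() offloads an entry named
+// "omp_set_num_threads_target"; the predefined table maps that name to
+// omp_set_num_threads_lrb(), which pulls the int with
+// omp_get_int_from_host() and applies it via the local OpenMP API.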
+
+void omp_set_num_threads_lrb(
+    void *ofld
+)
+{
+    int num_threads;
+
+    num_threads = omp_get_int_from_host(ofld);
+    omp_set_num_threads(num_threads);
+}
+
+void omp_get_max_threads_lrb(
+    void *ofld
+)
+{
+    int num_threads;
+
+    num_threads = omp_get_max_threads();
+    omp_send_int_to_host(ofld, num_threads);
+}
+
+void omp_get_num_procs_lrb(
+    void *ofld
+)
+{
+    int num_procs;
+
+    num_procs = omp_get_num_procs();
+    omp_send_int_to_host(ofld, num_procs);
+}
+
+void omp_set_dynamic_lrb(
+    void *ofld
+)
+{
+    int dynamic;
+
+    dynamic = omp_get_int_from_host(ofld);
+    omp_set_dynamic(dynamic);
+}
+
+void omp_get_dynamic_lrb(
+    void *ofld
+)
+{
+    int dynamic;
+
+    dynamic = omp_get_dynamic();
+    omp_send_int_to_host(ofld, dynamic);
+}
+
+void omp_set_nested_lrb(
+    void *ofld
+)
+{
+    int nested;
+
+    nested = omp_get_int_from_host(ofld);
+    omp_set_nested(nested);
+}
+
+void omp_get_nested_lrb(
+    void *ofld
+)
+{
+    int nested;
+
+    nested = omp_get_nested();
+    omp_send_int_to_host(ofld, nested);
+}
+
+void omp_set_schedule_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[2] = {0};
+    omp_sched_t kind;
+    int modifier;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_in;
+    vars[0].ptr = &kind;
+
+    vars[1].type.src = c_data;
+    vars[1].type.dst = c_data;
+    vars[1].direction.bits = c_parameter_in;
+    vars[1].ptr = &modifier;
+
+    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
+    omp_set_schedule(kind, modifier);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void omp_get_schedule_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[2] = {0};
+    omp_sched_t kind;
+    int modifier;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_out;
+    vars[0].ptr = &kind;
+
+    vars[1].type.src = c_data;
+    vars[1].type.dst = c_data;
+    vars[1].direction.bits = c_parameter_out;
+    vars[1].ptr = &modifier;
+
+    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
+    omp_get_schedule(&kind, &modifier);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+// lock API functions
+
+void omp_init_lock_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    omp_lock_target_t lock;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_out;
+    vars[0].ptr = &lock;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    omp_init_lock(&lock.lock);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void omp_destroy_lock_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    omp_lock_target_t lock;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_in;
+    vars[0].ptr = &lock;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    omp_destroy_lock(&lock.lock);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void omp_set_lock_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    omp_lock_target_t lock;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_inout;
+    vars[0].ptr = &lock;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    omp_set_lock(&lock.lock);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void omp_unset_lock_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    omp_lock_target_t lock;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_inout;
+    vars[0].ptr = &lock;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    omp_unset_lock(&lock.lock);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void omp_test_lock_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[2] = {0};
+    omp_lock_target_t lock;
+    int result;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_inout;
+    vars[0].ptr = &lock;
+
+    vars[1].type.src = c_data;
+    vars[1].type.dst = c_data;
+    vars[1].direction.bits = c_parameter_out;
+    vars[1].ptr = &result;
+
+    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
+    result = omp_test_lock(&lock.lock);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
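+
+// Note: the out-direction descriptor for `result` is presumably flushed to
+// the host when OFFLOAD_TARGET_LEAVE runs, which is why the test-lock
+// wrappers assign result between the ENTER and LEAVE calls.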
+
+// nested lock API functions
+
+void omp_init_nest_lock_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    omp_nest_lock_target_t lock;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_out;
+    vars[0].ptr = &lock;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    omp_init_nest_lock(&lock.lock);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void omp_destroy_nest_lock_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    omp_nest_lock_target_t lock;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_in;
+    vars[0].ptr = &lock;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    omp_destroy_nest_lock(&lock.lock);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void omp_set_nest_lock_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    omp_nest_lock_target_t lock;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_inout;
+    vars[0].ptr = &lock;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    omp_set_nest_lock(&lock.lock);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void omp_unset_nest_lock_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    omp_nest_lock_target_t lock;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_inout;
+    vars[0].ptr = &lock;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    omp_unset_nest_lock(&lock.lock);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void omp_test_nest_lock_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[2] = {0};
+    omp_nest_lock_target_t lock;
+    int result;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_inout;
+    vars[0].ptr = &lock;
+
+    vars[1].type.src = c_data;
+    vars[1].type.dst = c_data;
+    vars[1].direction.bits = c_parameter_out;
+    vars[1].ptr = &result;
+
+    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
+    result = omp_test_nest_lock(&lock.lock);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+// kmp API functions
+
+void kmp_set_stacksize_lrb(
+    void *ofld
+)
+{
+    int size;
+
+    size = omp_get_int_from_host(ofld);
+    kmp_set_stacksize(size);
+}
+
+void kmp_get_stacksize_lrb(
+    void *ofld
+)
+{
+    int size;
+
+    size = kmp_get_stacksize();
+    omp_send_int_to_host(ofld, size);
+}
+
+void kmp_set_stacksize_s_lrb(
+    void *ofld
+)
+{
+    int size;
+
+    size = omp_get_int_from_host(ofld);
+    kmp_set_stacksize_s(size);
+}
+
+void kmp_get_stacksize_s_lrb(
+    void *ofld
+)
+{
+    int size;
+
+    size = kmp_get_stacksize_s();
+    omp_send_int_to_host(ofld, size);
+}
+
+void kmp_set_blocktime_lrb(
+    void *ofld
+)
+{
+    int time;
+
+    time = omp_get_int_from_host(ofld);
+    kmp_set_blocktime(time);
+}
+
+void kmp_get_blocktime_lrb(
+    void *ofld
+)
+{
+    int time;
+
+    time = kmp_get_blocktime();
+    omp_send_int_to_host(ofld, time);
+}
+
+void kmp_set_library_serial_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+
+    OFFLOAD_TARGET_ENTER(ofld, 0, 0, 0);
+    kmp_set_library_serial();
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void kmp_set_library_turnaround_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+
+    OFFLOAD_TARGET_ENTER(ofld, 0, 0, 0);
+    kmp_set_library_turnaround();
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void kmp_set_library_throughput_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+
+    OFFLOAD_TARGET_ENTER(ofld, 0, 0, 0);
+    kmp_set_library_throughput();
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void kmp_set_library_lrb(
+    void *ofld
+)
+{
+    int mode;
+
+    mode = omp_get_int_from_host(ofld);
+    kmp_set_library(mode);
+}
+
+void kmp_get_library_lrb(
+    void *ofld
+)
+{
+    int mode;
+
+    mode = kmp_get_library();
+    omp_send_int_to_host(ofld, mode);
+}
+
+void kmp_set_defaults_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    char *defaults = 0;
+
+    vars[0].type.src = c_string_ptr;
+    vars[0].type.dst = c_string_ptr;
+    vars[0].direction.bits = c_parameter_in;
+    vars[0].ptr = &defaults;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    kmp_set_defaults(defaults);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+// affinity API functions
+
+void kmp_create_affinity_mask_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    kmp_affinity_mask_target_t mask;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_out;
+    vars[0].ptr = &mask;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    kmp_create_affinity_mask(&mask.mask);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void kmp_destroy_affinity_mask_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[1] = {0};
+    kmp_affinity_mask_target_t mask;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_in;
+    vars[0].ptr = &mask;
+
+    OFFLOAD_TARGET_ENTER(ofld, 1, vars, NULL);
+    kmp_destroy_affinity_mask(&mask.mask);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void kmp_set_affinity_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[2] = {0};
+    kmp_affinity_mask_target_t mask;
+    int result;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_in;
+    vars[0].ptr = &mask;
+
+    vars[1].type.src = c_data;
+    vars[1].type.dst = c_data;
+    vars[1].direction.bits = c_parameter_out;
+    vars[1].ptr = &result;
+
+    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
+    result = kmp_set_affinity(&mask.mask);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void kmp_get_affinity_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[2] = {0};
+    kmp_affinity_mask_target_t mask;
+    int result;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_inout;
+    vars[0].ptr = &mask;
+
+    vars[1].type.src = c_data;
+    vars[1].type.dst = c_data;
+    vars[1].direction.bits = c_parameter_out;
+    vars[1].ptr = &result;
+
+    OFFLOAD_TARGET_ENTER(ofld, 2, vars, NULL);
+    result = kmp_get_affinity(&mask.mask);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void kmp_get_affinity_max_proc_lrb(
+    void *ofld
+)
+{
+    int max_proc;
+
+    max_proc = kmp_get_affinity_max_proc();
+    omp_send_int_to_host(ofld, max_proc);
+}
+
+void kmp_set_affinity_mask_proc_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[3] = {0};
+    kmp_affinity_mask_target_t mask;
+    int proc, result;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_in;
+    vars[0].ptr = &proc;
+
+    vars[1].type.src = c_data;
+    vars[1].type.dst = c_data;
+    vars[1].direction.bits = c_parameter_inout;
+    vars[1].ptr = &mask;
+
+    vars[2].type.src = c_data;
+    vars[2].type.dst = c_data;
+    vars[2].direction.bits = c_parameter_out;
+    vars[2].ptr = &result;
+
+    OFFLOAD_TARGET_ENTER(ofld, 3, vars, NULL);
+    result = kmp_set_affinity_mask_proc(proc, &mask.mask);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void kmp_unset_affinity_mask_proc_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[3] = {0};
+    kmp_affinity_mask_target_t mask;
+    int proc, result;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_in;
+    vars[0].ptr = &proc;
+
+    vars[1].type.src = c_data;
+    vars[1].type.dst = c_data;
+    vars[1].direction.bits = c_parameter_inout;
+    vars[1].ptr = &mask;
+
+    vars[2].type.src = c_data;
+    vars[2].type.dst = c_data;
+    vars[2].direction.bits = c_parameter_out;
+    vars[2].ptr = &result;
+
+    OFFLOAD_TARGET_ENTER(ofld, 3, vars, NULL);
+    result = kmp_unset_affinity_mask_proc(proc, &mask.mask);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+void kmp_get_affinity_mask_proc_lrb(
+    void *ofld_
+)
+{
+    OFFLOAD ofld = (OFFLOAD) ofld_;
+    VarDesc vars[3] = {0};
+    kmp_affinity_mask_target_t mask;
+    int proc, result;
+
+    vars[0].type.src = c_data;
+    vars[0].type.dst = c_data;
+    vars[0].direction.bits = c_parameter_in;
+    vars[0].ptr = &proc;
+
+    vars[1].type.src = c_data;
+    vars[1].type.dst = c_data;
+    vars[1].direction.bits = c_parameter_in;
+    vars[1].ptr = &mask;
+
+    vars[2].type.src = c_data;
+    vars[2].type.dst = c_data;
+    vars[2].direction.bits = c_parameter_out;
+    vars[2].ptr = &result;
+
+    OFFLOAD_TARGET_ENTER(ofld, 3, vars, NULL);
+    result = kmp_get_affinity_mask_proc(proc, &mask.mask);
+    OFFLOAD_TARGET_LEAVE(ofld);
+}
+
+// Target-side stubs for the host functions (to avoid unresolved symbols)
+// These are needed for the offload function table
+
+void omp_set_num_threads_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int num_threads
+)
+{
+}
+
+int omp_get_max_threads_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return 0;
+}
+
+int omp_get_num_procs_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return 0;
+}
+
+void omp_set_dynamic_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int dynamic
+)
+{
+}
+
+int omp_get_dynamic_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return 0;
+}
+
+void omp_set_nested_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int nested
+)
+{
+}
+
+int omp_get_nested_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return 0;
+}
+
+void omp_set_schedule_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_sched_t kind,
+    int modifier
+)
+{
+}
+
+void omp_get_schedule_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_sched_t *kind,
+    int *modifier
+)
+{
+}
+
+void omp_init_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+)
+{
+}
+
+void omp_destroy_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+)
+{
+}
+
+void omp_set_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+)
+{
+}
+
+void omp_unset_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+)
+{
+}
+
+int omp_test_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_lock_target_t *lock
+)
+{
+    return 0;
+}
+
+void omp_init_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+)
+{
+}
+
+void omp_destroy_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+)
+{
+}
+
+void omp_set_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+)
+{
+}
+
+void omp_unset_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+)
+{
+}
+
+int omp_test_nest_lock_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    omp_nest_lock_target_t *lock
+)
+{
+    return 0;
+}
+
+void kmp_set_stacksize_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int size
+)
+{
+}
+
+int kmp_get_stacksize_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return 0;
+}
+
+void kmp_set_stacksize_s_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    size_t size
+)
+{
+}
+
+size_t kmp_get_stacksize_s_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return 0;
+}
+
+void kmp_set_blocktime_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int time
+)
+{
+}
+
+int kmp_get_blocktime_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return 0;
+}
+
+void kmp_set_library_serial_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+}
+
+void kmp_set_library_turnaround_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+}
+
+void kmp_set_library_throughput_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+}
+
+void kmp_set_library_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int mode
+)
+{
+}
+
+int kmp_get_library_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return 0;
+}
+
+void kmp_set_defaults_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    char const *defaults
+)
+{
+}
+
+void kmp_create_affinity_mask_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+)
+{
+}
+
+void kmp_destroy_affinity_mask_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+)
+{
+}
+
+int kmp_set_affinity_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    return 0;
+}
+
+int kmp_get_affinity_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    return 0;
+}
+
+int kmp_get_affinity_max_proc_target(
+    TARGET_TYPE target_type,
+    int target_number
+)
+{
+    return 0;
+}
+
+int kmp_set_affinity_mask_proc_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int proc,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    return 0;
+}
+
+int kmp_unset_affinity_mask_proc_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int proc,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    return 0;
+}
+
+int kmp_get_affinity_mask_proc_target(
+    TARGET_TYPE target_type,
+    int target_number,
+    int proc,
+    kmp_affinity_mask_target_t *mask
+)
+{
+    return 0;
+}
diff --git a/final/offload/src/offload_orsl.cpp b/final/offload/src/offload_orsl.cpp
new file mode 100644
index 0000000..6162f8a
--- /dev/null
+++ b/final/offload/src/offload_orsl.cpp
@@ -0,0 +1,84 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_orsl.h"
+#include <stdlib.h>
+#include "offload_host.h"
+#include "orsl-lite/include/orsl-lite.h"
+
+namespace ORSL {
+
+static bool            is_enabled = false;
+static const ORSLTag   my_tag = "Offload";
+
+void init()
+{
+    const char *env_var = getenv("OFFLOAD_ENABLE_ORSL");
+    if (env_var != 0 && *env_var != '\0') {
+        int64_t new_val;
+        if (__offload_parse_int_string(env_var, new_val)) {
+            is_enabled = new_val;
+        }
+        else {
+            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
+                             "OFFLOAD_ENABLE_ORSL");
+        }
+    }
+
+    if (is_enabled) {
+        OFFLOAD_DEBUG_TRACE(2, "ORSL is enabled\n");
+    }
+    else {
+        OFFLOAD_DEBUG_TRACE(2, "ORSL is disabled\n");
+    }
+}
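+
+// Usage sketch (hypothetical command line): launching an application as
+//
+//     OFFLOAD_ENABLE_ORSL=1 ./app
+//
+// turns the reserve()/try_reserve()/release() calls below into real
+// ORSL-lite reservations; with the variable unset they are no-ops that
+// always succeed.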
+
+bool reserve(int device)
+{
+    if (is_enabled) {
+        int pnum = mic_engines[device].get_physical_index();
+        ORSLBusySet bset;
+
+        bset.type = BUSY_SET_FULL;
+        if (ORSLReserve(1, &pnum, &bset, my_tag) != 0) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool try_reserve(int device)
+{
+    if (is_enabled) {
+        int pnum = mic_engines[device].get_physical_index();
+        ORSLBusySet bset;
+
+        bset.type = BUSY_SET_FULL;
+        if (ORSLTryReserve(1, &pnum, &bset, my_tag) != 0) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void release(int device)
+{
+    if (is_enabled) {
+        int pnum = mic_engines[device].get_physical_index();
+        ORSLBusySet bset;
+
+        bset.type = BUSY_SET_FULL;
+        if (ORSLRelease(1, &pnum, &bset, my_tag) != 0) {
+            // should never get here
+        }
+    }
+}
+
+} // namespace ORSL
diff --git a/final/offload/src/offload_orsl.h b/final/offload/src/offload_orsl.h
new file mode 100644
index 0000000..cdb86f9
--- /dev/null
+++ b/final/offload/src/offload_orsl.h
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef OFFLOAD_ORSL_H_INCLUDED
+#define OFFLOAD_ORSL_H_INCLUDED
+
+// ORSL interface
+namespace ORSL {
+
+extern void init();
+
+extern bool reserve(int device);
+extern bool try_reserve(int device);
+extern void release(int device);
+
+} // namespace ORSL
+
+#endif // OFFLOAD_ORSL_H_INCLUDED
diff --git a/final/offload/src/offload_table.cpp b/final/offload/src/offload_table.cpp
new file mode 100644
index 0000000..cf165df
--- /dev/null
+++ b/final/offload/src/offload_table.cpp
@@ -0,0 +1,375 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_table.h"
+#include "offload_common.h"
+
+#if !HOST_LIBRARY
+// Predefined offload entries
+extern void omp_set_num_threads_lrb(void*);
+extern void omp_get_max_threads_lrb(void*);
+extern void omp_get_num_procs_lrb(void*);
+extern void omp_set_dynamic_lrb(void*);
+extern void omp_get_dynamic_lrb(void*);
+extern void omp_set_nested_lrb(void*);
+extern void omp_get_nested_lrb(void*);
+extern void omp_set_schedule_lrb(void*);
+extern void omp_get_schedule_lrb(void*);
+
+extern void omp_init_lock_lrb(void*);
+extern void omp_destroy_lock_lrb(void*);
+extern void omp_set_lock_lrb(void*);
+extern void omp_unset_lock_lrb(void*);
+extern void omp_test_lock_lrb(void*);
+
+extern void omp_init_nest_lock_lrb(void*);
+extern void omp_destroy_nest_lock_lrb(void*);
+extern void omp_set_nest_lock_lrb(void*);
+extern void omp_unset_nest_lock_lrb(void*);
+extern void omp_test_nest_lock_lrb(void*);
+
+extern void kmp_set_stacksize_lrb(void*);
+extern void kmp_get_stacksize_lrb(void*);
+extern void kmp_set_stacksize_s_lrb(void*);
+extern void kmp_get_stacksize_s_lrb(void*);
+extern void kmp_set_blocktime_lrb(void*);
+extern void kmp_get_blocktime_lrb(void*);
+extern void kmp_set_library_serial_lrb(void*);
+extern void kmp_set_library_turnaround_lrb(void*);
+extern void kmp_set_library_throughput_lrb(void*);
+extern void kmp_set_library_lrb(void*);
+extern void kmp_get_library_lrb(void*);
+extern void kmp_set_defaults_lrb(void*);
+
+extern void kmp_create_affinity_mask_lrb(void*);
+extern void kmp_destroy_affinity_mask_lrb(void*);
+extern void kmp_set_affinity_lrb(void*);
+extern void kmp_get_affinity_lrb(void*);
+extern void kmp_get_affinity_max_proc_lrb(void*);
+extern void kmp_set_affinity_mask_proc_lrb(void*);
+extern void kmp_unset_affinity_mask_proc_lrb(void*);
+extern void kmp_get_affinity_mask_proc_lrb(void*);
+
+// Predefined entries on the target side
+static FuncTable::Entry predefined_entries[] = {
+    "omp_set_num_threads_target",
+    (void*) &omp_set_num_threads_lrb,
+    "omp_get_max_threads_target",
+    (void*) &omp_get_max_threads_lrb,
+    "omp_get_num_procs_target",
+    (void*) &omp_get_num_procs_lrb,
+    "omp_set_dynamic_target",
+    (void*) &omp_set_dynamic_lrb,
+    "omp_get_dynamic_target",
+    (void*) &omp_get_dynamic_lrb,
+    "omp_set_nested_target",
+    (void*) &omp_set_nested_lrb,
+    "omp_get_nested_target",
+    (void*) &omp_get_nested_lrb,
+    "omp_set_schedule_target",
+    (void*) &omp_set_schedule_lrb,
+    "omp_get_schedule_target",
+    (void*) &omp_get_schedule_lrb,
+
+    "omp_init_lock_target",
+    (void*) &omp_init_lock_lrb,
+    "omp_destroy_lock_target",
+    (void*) &omp_destroy_lock_lrb,
+    "omp_set_lock_target",
+    (void*) &omp_set_lock_lrb,
+    "omp_unset_lock_target",
+    (void*) &omp_unset_lock_lrb,
+    "omp_test_lock_target",
+    (void*) &omp_test_lock_lrb,
+
+    "omp_init_nest_lock_target",
+    (void*) &omp_init_nest_lock_lrb,
+    "omp_destroy_nest_lock_target",
+    (void*) &omp_destroy_nest_lock_lrb,
+    "omp_set_nest_lock_target",
+    (void*) &omp_set_nest_lock_lrb,
+    "omp_unset_nest_lock_target",
+    (void*) &omp_unset_nest_lock_lrb,
+    "omp_test_nest_lock_target",
+    (void*) &omp_test_nest_lock_lrb,
+
+    "kmp_set_stacksize_target",
+    (void*) &kmp_set_stacksize_lrb,
+    "kmp_get_stacksize_target",
+    (void*) &kmp_get_stacksize_lrb,
+    "kmp_set_stacksize_s_target",
+    (void*) &kmp_set_stacksize_s_lrb,
+    "kmp_get_stacksize_s_target",
+    (void*) &kmp_get_stacksize_s_lrb,
+    "kmp_set_blocktime_target",
+    (void*) &kmp_set_blocktime_lrb,
+    "kmp_get_blocktime_target",
+    (void*) &kmp_get_blocktime_lrb,
+    "kmp_set_library_serial_target",
+    (void*) &kmp_set_library_serial_lrb,
+    "kmp_set_library_turnaround_target",
+    (void*) &kmp_set_library_turnaround_lrb,
+    "kmp_set_library_throughput_target",
+    (void*) &kmp_set_library_throughput_lrb,
+    "kmp_set_library_target",
+    (void*) &kmp_set_library_lrb,
+    "kmp_get_library_target",
+    (void*) &kmp_get_library_lrb,
+    "kmp_set_defaults_target",
+    (void*) &kmp_set_defaults_lrb,
+
+    "kmp_create_affinity_mask_target",
+    (void*) &kmp_create_affinity_mask_lrb,
+    "kmp_destroy_affinity_mask_target",
+    (void*) &kmp_destroy_affinity_mask_lrb,
+    "kmp_set_affinity_target",
+    (void*) &kmp_set_affinity_lrb,
+    "kmp_get_affinity_target",
+    (void*) &kmp_get_affinity_lrb,
+    "kmp_get_affinity_max_proc_target",
+    (void*) &kmp_get_affinity_max_proc_lrb,
+    "kmp_set_affinity_mask_proc_target",
+    (void*) &kmp_set_affinity_mask_proc_lrb,
+    "kmp_unset_affinity_mask_proc_target",
+    (void*) &kmp_unset_affinity_mask_proc_lrb,
+    "kmp_get_affinity_mask_proc_target",
+    (void*) &kmp_get_affinity_mask_proc_lrb,
+
+    (const char*) -1,
+    (void*) -1
+};
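+
+// (The flat initializer above relies on C++ brace elision: each
+// name/pointer pair fills one FuncTable::Entry, and the (const char*) -1
+// pair is the end-of-table sentinel that the lookup loops test for.)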
+
+static FuncList::Node predefined_table = {
+    { predefined_entries, -1 },
+    0, 0
+};
+
+// Entry table
+FuncList __offload_entries(&predefined_table);
+#else
+FuncList __offload_entries;
+#endif // !HOST_LIBRARY
+
+// Function table. No predefined entries.
+FuncList __offload_funcs;
+
+// Var table
+VarList  __offload_vars;
+
+// Given a function name, returns the associated function pointer
+const void* FuncList::find_addr(const char *name)
+{
+    const void* func = 0;
+
+    m_lock.lock();
+
+    for (Node *n = m_head; n != 0; n = n->next) {
+        for (const Table::Entry *e = n->table.entries;
+             e->name != (const char*) -1; e++) {
+            if (e->name != 0 && strcmp(e->name, name) == 0) {
+                func = e->func;
+                break;
+            }
+        }
+    }
+
+    m_lock.unlock();
+
+    return func;
+}
+
+// Given a function pointer, returns the associated function name
+const char* FuncList::find_name(const void *func)
+{
+    const char* name = 0;
+
+    m_lock.lock();
+
+    for (Node *n = m_head; n != 0; n = n->next) {
+        for (const Table::Entry *e = n->table.entries;
+             e->name != (const char*) -1; e++) {
+            if (e->func == func) {
+                name = e->name;
+                break;
+            }
+        }
+    }
+
+    m_lock.unlock();
+
+    return name;
+}
+
+// Returns max name length from all tables
+int64_t FuncList::max_name_length(void)
+{
+    if (m_max_name_len < 0) {
+        m_lock.lock();
+
+        m_max_name_len = 0;
+        for (Node *n = m_head; n != 0; n = n->next) {
+            if (n->table.max_name_len < 0) {
+                n->table.max_name_len = 0;
+
+                // calculate max name length in a single table
+                for (const Table::Entry *e = n->table.entries;
+                     e->name != (const char*) -1; e++) {
+                    if (e->name != 0) {
+                        size_t len = strlen(e->name) + 1;
+                        if (n->table.max_name_len < len) {
+                            n->table.max_name_len = len;
+                        }
+                    }
+                }
+            }
+
+            // select max from all tables
+            if (m_max_name_len < n->table.max_name_len) {
+                m_max_name_len = n->table.max_name_len;
+            }
+        }
+
+        m_lock.unlock();
+    }
+    return m_max_name_len;
+}
+
+// Debugging dump
+void FuncList::dump(void)
+{
+    OFFLOAD_DEBUG_TRACE(2, "Function table:\n");
+
+    m_lock.lock();
+
+    for (Node *n = m_head; n != 0; n = n->next) {
+        for (const Table::Entry *e = n->table.entries;
+             e->name != (const char*) -1; e++) {
+            if (e->name != 0) {
+                OFFLOAD_DEBUG_TRACE(2, "%p %s\n", e->func, e->name);
+            }
+        }
+    }
+
+    m_lock.unlock();
+}
+
+// Debugging dump
+void VarList::dump(void)
+{
+    OFFLOAD_DEBUG_TRACE(2, "Var table:\n");
+
+    m_lock.lock();
+
+    for (Node *n = m_head; n != 0; n = n->next) {
+        for (const Table::Entry *e = n->table.entries;
+             e->name != (const char*) -1; e++) {
+            if (e->name != 0) {
+#if HOST_LIBRARY
+                OFFLOAD_DEBUG_TRACE(2, "%s %p %ld\n", e->name, e->addr,
+                                    e->size);
+#else  // HOST_LIBRARY
+                OFFLOAD_DEBUG_TRACE(2, "%s %p\n", e->name, e->addr);
+#endif // HOST_LIBRARY
+            }
+        }
+    }
+
+    m_lock.unlock();
+}
+
+// Returns the buffer size required for the table;
+// nelems receives the number of elements
+int64_t VarList::table_size(int64_t &nelems)
+{
+    int64_t length = 0;
+
+    nelems = 0;
+
+    // calculate string table size and number of elements
+    for (Node *n = m_head; n != 0; n = n->next) {
+        for (const Table::Entry *e = n->table.entries;
+             e->name != (const char*) -1; e++) {
+            if (e->name != 0) {
+                length += strlen(e->name) + 1;
+                nelems++;
+            }
+        }
+    }
+
+    return nelems * sizeof(BufEntry) + length;
+}
+
+// Copy the table to the given buffer
+void VarList::table_copy(void *buf, int64_t nelems)
+{
+    BufEntry* elems = static_cast<BufEntry*>(buf);
+    char*     names = reinterpret_cast<char*>(elems + nelems);
+
+    // copy entries to buffer
+    for (Node *n = m_head; n != 0; n = n->next) {
+        for (const Table::Entry *e = n->table.entries;
+             e->name != (const char*) -1; e++) {
+            if (e->name != 0) {
+                // name field contains offset to the name from the beginning
+                // of the buffer
+                elems->name = names - static_cast<char*>(buf);
+                elems->addr = reinterpret_cast<intptr_t>(e->addr);
+
+                // copy name to string table
+                const char *name = e->name;
+                while ((*names++ = *name++) != '\0');
+
+                elems++;
+            }
+        }
+    }
+}
+
+// patch name offsets in a buffer
+void VarList::table_patch_names(void *buf, int64_t nelems)
+{
+    BufEntry* elems = static_cast<BufEntry*>(buf);
+    for (int i = 0; i < nelems; i++) {
+        elems[i].name += reinterpret_cast<intptr_t>(buf);
+    }
+}
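+
+// Illustrative buffer layout for table_size()/table_copy()/
+// table_patch_names() (a sketch derived from the code above):
+//
+//     [ BufEntry 0 | BufEntry 1 | ... | "name0\0" "name1\0" ... ]
+//       name = offset of the string from the buffer start
+//       addr = variable address on the sending side
+//
+// Offsets survive the transfer unchanged; table_patch_names() then adds
+// the receiving buffer's base address so each name becomes a valid char*.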
+
+// Adds the given list elements to the global lookup table lists
+extern "C" void __offload_register_tables(
+    FuncList::Node *entry_table,
+    FuncList::Node *func_table,
+    VarList::Node *var_table
+)
+{
+    OFFLOAD_DEBUG_TRACE(2, "Registering offload function entry table %p\n",
+                           entry_table);
+    __offload_entries.add_table(entry_table);
+
+    OFFLOAD_DEBUG_TRACE(2, "Registering function table %p\n", func_table);
+    __offload_funcs.add_table(func_table);
+
+    OFFLOAD_DEBUG_TRACE(2, "Registering var table %p\n", var_table);
+    __offload_vars.add_table(var_table);
+}
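+
+// Registration sketch, as compiler-generated startup code might emit it
+// (all names here are hypothetical):
+//
+//     static FuncList::Node my_entry_node = { { my_entries, -1 }, 0, 0 };
+//     static FuncList::Node my_func_node  = { { my_funcs,   -1 }, 0, 0 };
+//     static VarList::Node  my_var_node   = { { my_vars },        0, 0 };
+//     __offload_register_tables(&my_entry_node, &my_func_node,
+//                               &my_var_node);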
+
+// Removes the given list elements from the global lookup table lists
+extern "C" void __offload_unregister_tables(
+    FuncList::Node *entry_table,
+    FuncList::Node *func_table,
+    VarList::Node *var_table
+)
+{
+    OFFLOAD_DEBUG_TRACE(2, "Unregistering offload function entry table %p\n",
+                           entry_table);
+    __offload_entries.remove_table(entry_table);
+
+    OFFLOAD_DEBUG_TRACE(2, "Unregistering function table %p\n", func_table);
+    __offload_funcs.remove_table(func_table);
+
+    OFFLOAD_DEBUG_TRACE(2, "Unregistering var table %p\n", var_table);
+    __offload_vars.remove_table(var_table);
+}
diff --git a/final/offload/src/offload_table.h b/final/offload/src/offload_table.h
new file mode 100644
index 0000000..cfced3e
--- /dev/null
+++ b/final/offload/src/offload_table.h
@@ -0,0 +1,301 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/*! \file
+    \brief Function and Variable tables used by the runtime library
+*/
+
+#ifndef OFFLOAD_TABLE_H_INCLUDED
+#define OFFLOAD_TABLE_H_INCLUDED
+
+#include <iterator>
+#include "offload_util.h"
+
+// Template representing a doubly linked list of tables
+template <typename T> class TableList {
+public:
+    // table type
+    typedef T Table;
+
+    // List node
+    struct Node {
+        Table   table;
+        Node*   prev;
+        Node*   next;
+    };
+
+public:
+    explicit TableList(Node *node = 0) : m_head(node) {}
+
+    void add_table(Node *node) {
+        m_lock.lock();
+
+        if (m_head != 0) {
+            node->next = m_head;
+            m_head->prev = node;
+        }
+        m_head = node;
+
+        m_lock.unlock();
+    }
+
+    void remove_table(Node *node) {
+        m_lock.lock();
+
+        if (node->next != 0) {
+            node->next->prev = node->prev;
+        }
+        if (node->prev != 0) {
+            node->prev->next = node->next;
+        }
+        if (m_head == node) {
+            m_head = node->next;
+        }
+
+        m_lock.unlock();
+    }
+
+protected:
+    Node*           m_head;
+    mutex_t         m_lock;
+};
+
+// Function lookup table.
+struct FuncTable {
+    //! Function table entry
+    /*! This table contains functions created from offload regions.   */
+    /*! Each entry consists of a pointer to the function's "key"
+        and the function address.                                     */
+    /*! Each shared library or executable may contain one such table. */
+    /*! The end of the table is marked with an entry whose name field
+        has value -1.                                                 */
+    struct Entry {
+        const char* name; //!< Name of the function
+        void*       func; //!< Address of the function
+    };
+
+    // entries
+    const Entry *entries;
+
+    // max name length
+    int64_t max_name_len;
+};
+
+// Function table
+class FuncList : public TableList<FuncTable> {
+public:
+    explicit FuncList(Node *node = 0) : TableList<Table>(node),
+                                        m_max_name_len(-1)
+    {}
+
+    // add table to the list
+    void add_table(Node *node) {
+        // recalculate max function name length
+        m_max_name_len = -1;
+
+        // add table
+        TableList<Table>::add_table(node);
+    }
+
+    // find function address for the given name
+    const void* find_addr(const char *name);
+
+    // find function name for the given address
+    const char* find_name(const void *addr);
+
+    // max name length from all tables in the list
+    int64_t max_name_length(void);
+
+    // debug dump
+    void dump(void);
+
+private:
+    // max name length across all tables
+    int64_t m_max_name_len;
+};
+
+// Table entry for static variables
+struct VarTable {
+    //! Variable table entry
+    /*! This table contains statically allocated variables marked with
+        __declspec(target(mic)) or #pragma omp declare target.          */
+    /*! Each entry consists of a pointer to the variable's "key",
+        the variable address and its size in bytes.                     */
+    /*! Because memory allocation is done from the host,
+        the MIC table does not need the size of the variable.           */
+    /*! Padding to make the table entry size a power of 2 is necessary
+        to avoid "holes" between table contributions from different object
+        files on Windows when debug information is specified with /Zi.  */
+    struct Entry {
+        const char* name; //!< Name of the variable
+        void*       addr; //!< Address of the variable
+
+#if HOST_LIBRARY
+        uint64_t    size;
+
+#ifdef TARGET_WINNT
+        // padding to make entry size a power of 2
+        uint64_t    padding;
+#endif // TARGET_WINNT
+#endif
+    };
+
+    // Table terminated by an entry with name == -1
+    const Entry *entries;
+};
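+
+// For example, for a host variable declared under
+//     #pragma omp declare target
+//     int counter;
+// the compiler would contribute an entry roughly of the form
+//     { "counter", &counter, sizeof(counter) }   // size on the host only
+// (names here are hypothetical).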
+
+// List of var tables
+class VarList : public TableList<VarTable> {
+public:
+    VarList() : TableList<Table>()
+    {}
+
+    // debug dump
+    void dump();
+
+public:
+    // var table list iterator
+    class Iterator : public std::iterator<std::input_iterator_tag,
+                                          Table::Entry> {
+    public:
+        Iterator() : m_node(0), m_entry(0) {}
+
+        explicit Iterator(Node *node) {
+            new_node(node);
+        }
+
+        Iterator& operator++() {
+            if (m_entry != 0) {
+                m_entry++;
+                while (m_entry->name == 0) {
+                    m_entry++;
+                }
+                if (m_entry->name == reinterpret_cast<const char*>(-1)) {
+                    new_node(m_node->next);
+                }
+            }
+            return *this;
+        }
+
+        bool operator==(const Iterator &other) const {
+            return m_entry == other.m_entry;
+        }
+
+        bool operator!=(const Iterator &other) const {
+            return m_entry != other.m_entry;
+        }
+
+        const Table::Entry* operator*() const {
+            return m_entry;
+        }
+
+    private:
+        void new_node(Node *node) {
+            m_node = node;
+            m_entry = 0;
+            while (m_node != 0) {
+                m_entry = m_node->table.entries;
+                while (m_entry->name == 0) {
+                    m_entry++;
+                }
+                if (m_entry->name != reinterpret_cast<const char*>(-1)) {
+                    break;
+                }
+                m_node = m_node->next;
+                m_entry = 0;
+            }
+        }
+
+    private:
+        Node                *m_node;
+        const Table::Entry  *m_entry;
+    };
+
+    Iterator begin() const {
+        return Iterator(m_head);
+    }
+
+    Iterator end() const {
+        return Iterator();
+    }
+
+public:
+    // Entry representation in a copy buffer
+    struct BufEntry {
+        intptr_t name;
+        intptr_t addr;
+    };
+
+    // Calculates the number of elements in the table (returned in nelems)
+    // and returns the size of the buffer required to hold the table
+    int64_t table_size(int64_t &nelems);
+
+    // Copy table contents to the given buffer, which must be large enough
+    // to hold all elements plus the string table.
+    void table_copy(void *buf, int64_t nelems);
+
+    // Patch name offsets in a table after it has been copied to the other side
+    static void table_patch_names(void *buf, int64_t nelems);
+};
+
+extern FuncList __offload_entries;
+extern FuncList __offload_funcs;
+extern VarList  __offload_vars;
+
+// Section names where the lookup tables are stored
+#ifdef TARGET_WINNT
+#define OFFLOAD_ENTRY_TABLE_SECTION_START   ".OffloadEntryTable$a"
+#define OFFLOAD_ENTRY_TABLE_SECTION_END     ".OffloadEntryTable$z"
+
+#define OFFLOAD_FUNC_TABLE_SECTION_START    ".OffloadFuncTable$a"
+#define OFFLOAD_FUNC_TABLE_SECTION_END      ".OffloadFuncTable$z"
+
+#define OFFLOAD_VAR_TABLE_SECTION_START     ".OffloadVarTable$a"
+#define OFFLOAD_VAR_TABLE_SECTION_END       ".OffloadVarTable$z"
+
+#define OFFLOAD_CRTINIT_SECTION_START       ".CRT$XCT"
+
+#pragma section(OFFLOAD_CRTINIT_SECTION_START, read)
+
+#else  // TARGET_WINNT
+
+#define OFFLOAD_ENTRY_TABLE_SECTION_START   ".OffloadEntryTable."
+#define OFFLOAD_ENTRY_TABLE_SECTION_END     ".OffloadEntryTable."
+
+#define OFFLOAD_FUNC_TABLE_SECTION_START    ".OffloadFuncTable."
+#define OFFLOAD_FUNC_TABLE_SECTION_END      ".OffloadFuncTable."
+
+#define OFFLOAD_VAR_TABLE_SECTION_START     ".OffloadVarTable."
+#define OFFLOAD_VAR_TABLE_SECTION_END       ".OffloadVarTable."
+#endif // TARGET_WINNT
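+
+// Note on the names above: on Windows the "$a"/"$z" suffixes rely on the
+// linker sorting grouped sections alphabetically, so the START/END markers
+// bracket every table fragment placed between them; elsewhere START and
+// END share one section name, presumably leaving ordering to link order.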
+
+#pragma section(OFFLOAD_ENTRY_TABLE_SECTION_START, read, write)
+#pragma section(OFFLOAD_ENTRY_TABLE_SECTION_END, read, write)
+
+#pragma section(OFFLOAD_FUNC_TABLE_SECTION_START, read, write)
+#pragma section(OFFLOAD_FUNC_TABLE_SECTION_END, read, write)
+
+#pragma section(OFFLOAD_VAR_TABLE_SECTION_START, read, write)
+#pragma section(OFFLOAD_VAR_TABLE_SECTION_END, read, write)
+
+
+// register/unregister given tables
+extern "C" void __offload_register_tables(
+    FuncList::Node *entry_table,
+    FuncList::Node *func_table,
+    VarList::Node *var_table
+);
+
+extern "C" void __offload_unregister_tables(
+    FuncList::Node *entry_table,
+    FuncList::Node *func_table,
+    VarList::Node *var_table
+);
+#endif  // OFFLOAD_TABLE_H_INCLUDED
diff --git a/final/offload/src/offload_target.cpp b/final/offload/src/offload_target.cpp
new file mode 100644
index 0000000..cfc1b04
--- /dev/null
+++ b/final/offload/src/offload_target.cpp
@@ -0,0 +1,754 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_target.h"
+#include <stdlib.h>
+#include <unistd.h>
+#ifdef SEP_SUPPORT
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#endif // SEP_SUPPORT
+#include <omp.h>
+#include <map>
+
+// typedef offload_func_with_parms.
+// Pointer to function that represents an offloaded entry point.
+// The single void* parameter is a temporary fix for passing parameters
+// on the stack.
+typedef void (*offload_func_with_parms)(void *);
+
+// Target console and file logging
+const char *prefix;
+int console_enabled = 0;
+int offload_report_level = 0;
+
+// Trace information
+static const char* vardesc_direction_as_string[] = {
+    "NOCOPY",
+    "IN",
+    "OUT",
+    "INOUT"
+};
+static const char* vardesc_type_as_string[] = {
+    "unknown",
+    "data",
+    "data_ptr",
+    "func_ptr",
+    "void_ptr",
+    "string_ptr",
+    "dv",
+    "dv_data",
+    "dv_data_slice",
+    "dv_ptr",
+    "dv_ptr_data",
+    "dv_ptr_data_slice",
+    "cean_var",
+    "cean_var_ptr",
+    "c_data_ptr_array"
+};
+
+int mic_index = -1;
+int mic_engines_total = -1;
+uint64_t mic_frequency = 0;
+int offload_number = 0;
+static std::map<void*, RefInfo*> ref_data;
+static mutex_t add_ref_lock;
+
+#ifdef SEP_SUPPORT
+static const char*  sep_monitor_env = "SEP_MONITOR";
+static bool         sep_monitor = false;
+static const char*  sep_device_env = "SEP_DEVICE";
+static const char*  sep_device =  "/dev/sep3.8/c";
+static int          sep_counter = 0;
+
+#define SEP_API_IOC_MAGIC   99
+#define SEP_IOCTL_PAUSE     _IO (SEP_API_IOC_MAGIC, 31)
+#define SEP_IOCTL_RESUME    _IO (SEP_API_IOC_MAGIC, 32)
+
+static void add_ref_count(void * buf, bool created)
+{
+    mutex_locker_t locker(add_ref_lock);
+    RefInfo * info = ref_data[buf];
+
+    if (info) {
+        info->count++;
+    }
+    else {
+        info = new RefInfo((int)created, (long)1);
+    }
+    info->is_added |= created;
+    ref_data[buf] = info;
+}
+
+static void BufReleaseRef(void * buf)
+{
+    mutex_locker_t locker(add_ref_lock);
+    RefInfo * info = ref_data[buf];
+
+    if (info) {
+        --info->count;
+        if (info->count == 0 && info->is_added) {
+            BufferReleaseRef(buf);
+            info->is_added = 0;
+        }
+    }
+}
+
+static int VTPauseSampling(void)
+{
+    int ret = -1;
+    int handle = open(sep_device, O_RDWR);
+    if (handle > 0) {
+        ret = ioctl(handle, SEP_IOCTL_PAUSE);
+        close(handle);
+    }
+    return ret;
+}
+
+static int VTResumeSampling(void)
+{
+    int ret = -1;
+    int handle = open(sep_device, O_RDWR);
+    if (handle > 0) {
+        ret = ioctl(handle, SEP_IOCTL_RESUME);
+        close(handle);
+    }
+    return ret;
+}
+#endif // SEP_SUPPORT
+
+void OffloadDescriptor::offload(
+    uint32_t  buffer_count,
+    void**    buffers,
+    void*     misc_data,
+    uint16_t  misc_data_len,
+    void*     return_data,
+    uint16_t  return_data_len
+)
+{
+    FunctionDescriptor *func = (FunctionDescriptor*) misc_data;
+    const char *name = func->data;
+    OffloadDescriptor ofld;
+    char *in_data = 0;
+    char *out_data = 0;
+    char *timer_data = 0;
+
+    console_enabled = func->console_enabled;
+    timer_enabled = func->timer_enabled;
+    offload_report_level = func->offload_report_level;
+    offload_number = func->offload_number;
+    ofld.set_offload_number(func->offload_number);
+
+#ifdef SEP_SUPPORT
+    if (sep_monitor) {
+        if (__sync_fetch_and_add(&sep_counter, 1) == 0) {
+            OFFLOAD_DEBUG_TRACE(2, "VTResumeSampling\n");
+            VTResumeSampling();
+        }
+    }
+#endif // SEP_SUPPORT
+
+    OFFLOAD_DEBUG_TRACE_1(2, ofld.get_offload_number(),
+                          c_offload_start_target_func,
+                          "Offload \"%s\" started\n", name);
+
+    // initialize timer data
+    OFFLOAD_TIMER_INIT();
+
+    OFFLOAD_TIMER_START(c_offload_target_total_time);
+
+    OFFLOAD_TIMER_START(c_offload_target_descriptor_setup);
+
+    // get input/output buffer addresses
+    if (func->in_datalen > 0 || func->out_datalen > 0) {
+        if (func->data_offset != 0) {
+            in_data = (char*) misc_data + func->data_offset;
+            out_data = (char*) return_data;
+        }
+        else {
+            char *inout_buf = (char*) buffers[--buffer_count];
+            in_data = inout_buf;
+            out_data = inout_buf;
+        }
+    }
+
+    // assign variable descriptors
+    ofld.m_vars_total = func->vars_num;
+    if (ofld.m_vars_total > 0) {
+        uint64_t var_data_len = ofld.m_vars_total * sizeof(VarDesc);
+
+        ofld.m_vars = (VarDesc*) malloc(var_data_len);
+        memcpy(ofld.m_vars, in_data, var_data_len);
+
+        in_data += var_data_len;
+        func->in_datalen -= var_data_len;
+    }
+
+    // timer data
+    if (func->timer_enabled) {
+        uint64_t timer_data_len = OFFLOAD_TIMER_DATALEN();
+
+        timer_data = out_data;
+        out_data += timer_data_len;
+        func->out_datalen -= timer_data_len;
+    }
+
+    // init Marshallers
+    ofld.m_in.init_buffer(in_data, func->in_datalen);
+    ofld.m_out.init_buffer(out_data, func->out_datalen);
+
+    // copy buffers to offload descriptor
+    std::copy(buffers, buffers + buffer_count,
+              std::back_inserter(ofld.m_buffers));
+
+    OFFLOAD_TIMER_STOP(c_offload_target_descriptor_setup);
+
+    // find offload entry address
+    OFFLOAD_TIMER_START(c_offload_target_func_lookup);
+
+    offload_func_with_parms entry = (offload_func_with_parms)
+        __offload_entries.find_addr(name);
+
+    if (entry == NULL) {
+#if OFFLOAD_DEBUG > 0
+        if (console_enabled > 2) {
+            __offload_entries.dump();
+        }
+#endif
+        LIBOFFLOAD_ERROR(c_offload_descriptor_offload, name);
+        exit(1);
+    }
+
+    OFFLOAD_TIMER_STOP(c_offload_target_func_lookup);
+
+    OFFLOAD_TIMER_START(c_offload_target_func_time);
+
+    // execute offload entry
+    entry(&ofld);
+
+    OFFLOAD_TIMER_STOP(c_offload_target_func_time);
+
+    OFFLOAD_TIMER_STOP(c_offload_target_total_time);
+
+    // copy timer data to the buffer
+    OFFLOAD_TIMER_TARGET_DATA(timer_data);
+
+    OFFLOAD_DEBUG_TRACE(2, "Offload \"%s\" finished\n", name);
+
+#ifdef SEP_SUPPORT
+    if (sep_monitor) {
+        if (__sync_sub_and_fetch(&sep_counter, 1) == 0) {
+            OFFLOAD_DEBUG_TRACE(2, "VTPauseSampling\n");
+            VTPauseSampling();
+        }
+    }
+#endif // SEP_SUPPORT
+}
+
+void OffloadDescriptor::merge_var_descs(
+    VarDesc *vars,
+    VarDesc2 *vars2,
+    int vars_total
+)
+{
+    // the target must have at least as many locally generated variable
+    // descriptors as it received from the host
+    if (m_vars_total < vars_total) {
+        LIBOFFLOAD_ERROR(c_merge_var_descs1);
+        exit(1);
+    }
+
+    for (int i = 0; i < m_vars_total; i++) {
+        if (i < vars_total) {
+            // variable type must match
+            if (m_vars[i].type.bits != vars[i].type.bits) {
+                LIBOFFLOAD_ERROR(c_merge_var_descs2);
+                exit(1);
+            }
+
+            m_vars[i].ptr = vars[i].ptr;
+            m_vars[i].into = vars[i].into;
+
+            const char *var_sname = "";
+            if (vars2 != NULL) {
+                if (vars2[i].sname != NULL) {
+                    var_sname = vars2[i].sname;
+                }
+            }
+            OFFLOAD_DEBUG_TRACE_1(2, get_offload_number(), c_offload_var,
+                "   VarDesc %d, var=%s, %s, %s\n",
+                i, var_sname,
+                vardesc_direction_as_string[m_vars[i].direction.bits],
+                vardesc_type_as_string[m_vars[i].type.src]);
+            if (vars2 != NULL && vars2[i].dname != NULL) {
+                OFFLOAD_TRACE(2, "              into=%s, %s\n", vars2[i].dname,
+                    vardesc_type_as_string[m_vars[i].type.dst]);
+            }
+        }
+        OFFLOAD_TRACE(2,
+            "              type_src=%d, type_dstn=%d, direction=%d, "
+            "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
+            "offset=%lld, size=%lld, count/disp=%lld, ptr=%p into=%p\n",
+            m_vars[i].type.src,
+            m_vars[i].type.dst,
+            m_vars[i].direction.bits,
+            m_vars[i].alloc_if,
+            m_vars[i].free_if,
+            m_vars[i].align,
+            m_vars[i].mic_offset,
+            m_vars[i].flags.bits,
+            m_vars[i].offset,
+            m_vars[i].size,
+            m_vars[i].count,
+            m_vars[i].ptr,
+            m_vars[i].into);
+    }
+}
+
+void OffloadDescriptor::scatter_copyin_data()
+{
+    OFFLOAD_TIMER_START(c_offload_target_scatter_inputs);
+
+    OFFLOAD_DEBUG_TRACE(2, "IN  buffer @ %p size %lld\n",
+                        m_in.get_buffer_start(),
+                        m_in.get_buffer_size());
+    OFFLOAD_DEBUG_DUMP_BYTES(2, m_in.get_buffer_start(),
+                             m_in.get_buffer_size());
+
+    // receive data
+    for (int i = 0; i < m_vars_total; i++) {
+        bool src_is_for_mic = (m_vars[i].direction.out ||
+                               m_vars[i].into == NULL);
+        void** ptr_addr = src_is_for_mic ?
+                          static_cast<void**>(m_vars[i].ptr) :
+                          static_cast<void**>(m_vars[i].into);
+        int type = src_is_for_mic ? m_vars[i].type.src :
+                                    m_vars[i].type.dst;
+        bool is_static = src_is_for_mic ?
+                         m_vars[i].flags.is_static :
+                         m_vars[i].flags.is_static_dstn;
+        void *ptr = NULL;
+
+        if (m_vars[i].flags.alloc_disp) {
+            int64_t offset = 0;
+            m_in.receive_data(&offset, sizeof(offset));
+            m_vars[i].offset = -offset;
+        }
+        if (VAR_TYPE_IS_DV_DATA_SLICE(type) ||
+            VAR_TYPE_IS_DV_DATA(type)) {
+            ArrDesc *dvp = (type == c_dv_data_slice || type == c_dv_data)?
+                  reinterpret_cast<ArrDesc*>(ptr_addr) :
+                  *reinterpret_cast<ArrDesc**>(ptr_addr);
+            ptr_addr = reinterpret_cast<void**>(&dvp->Base);
+        }
+
+        // Set pointer values
+        switch (type) {
+            case c_data_ptr_array:
+                {
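+                    // A pointer-array descriptor updates its element
+                    // descriptors (indices ptr_arr_offset .. +count) so
+                    // that each element's ptr/into points into the
+                    // target-side array via its own ptr_arr_offset.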
+                    int j = m_vars[i].ptr_arr_offset;
+                    int max_el = j + m_vars[i].count;
+                    char *dst_arr_ptr = (src_is_for_mic)?
+                        *(reinterpret_cast<char**>(m_vars[i].ptr)) :
+                        reinterpret_cast<char*>(m_vars[i].into);
+
+                    for (; j < max_el; j++) {
+                        if (src_is_for_mic) {
+                            m_vars[j].ptr =
+                                dst_arr_ptr + m_vars[j].ptr_arr_offset;
+                        }
+                        else {
+                            m_vars[j].into =
+                                dst_arr_ptr + m_vars[j].ptr_arr_offset;
+                        }
+                    }
+                }
+                break;
+            case c_data:
+            case c_void_ptr:
+            case c_cean_var:
+            case c_dv:
+                break;
+
+            case c_string_ptr:
+            case c_data_ptr:
+            case c_cean_var_ptr:
+            case c_dv_ptr:
+                if (m_vars[i].alloc_if) {
+                    void *buf;
+                    if (m_vars[i].flags.sink_addr) {
+                        m_in.receive_data(&buf, sizeof(buf));
+                    }
+                    else {
+                        buf = m_buffers.front();
+                        m_buffers.pop_front();
+                    }
+                    if (buf) {
+                        if (!is_static) {
+                            if (!m_vars[i].flags.sink_addr) {
+                                // increment buffer reference
+                                OFFLOAD_TIMER_START(c_offload_target_add_buffer_refs);
+                                BufferAddRef(buf);
+                                OFFLOAD_TIMER_STOP(c_offload_target_add_buffer_refs);
+                            }
+                            add_ref_count(buf, 0 == m_vars[i].flags.sink_addr);
+                        }
+                        ptr = static_cast<char*>(buf) +
+                                  m_vars[i].mic_offset +
+                                  (m_vars[i].flags.is_stack_buf ?
+                                   0 : m_vars[i].offset);
+                    }
+                    *ptr_addr = ptr;
+                }
+                else if (m_vars[i].flags.sink_addr) {
+                    void *buf;
+                    m_in.receive_data(&buf, sizeof(buf));
+                    ptr = static_cast<char*>(buf) +
+                              m_vars[i].mic_offset +
+                              (m_vars[i].flags.is_stack_buf ?
+                               0 : m_vars[i].offset);
+                    *ptr_addr = ptr;
+                }
+                break;
+
+            case c_func_ptr:
+                break;
+
+            case c_dv_data:
+            case c_dv_ptr_data:
+            case c_dv_data_slice:
+            case c_dv_ptr_data_slice:
+                if (m_vars[i].alloc_if) {
+                    void *buf;
+                    if (m_vars[i].flags.sink_addr) {
+                        m_in.receive_data(&buf, sizeof(buf));
+                    }
+                    else {
+                        buf = m_buffers.front();
+                        m_buffers.pop_front();
+                    }
+                    if (buf) {
+                        if (!is_static) {
+                            if (!m_vars[i].flags.sink_addr) {
+                                // increment buffer reference
+                                OFFLOAD_TIMER_START(c_offload_target_add_buffer_refs);
+                                BufferAddRef(buf);
+                                OFFLOAD_TIMER_STOP(c_offload_target_add_buffer_refs);
+                            }
+                            add_ref_count(buf, 0 == m_vars[i].flags.sink_addr);
+                        }
+                        ptr = static_cast<char*>(buf) +
+                            m_vars[i].mic_offset + m_vars[i].offset;
+                    }
+                    *ptr_addr = ptr;
+                }
+                else if (m_vars[i].flags.sink_addr) {
+                    void *buf;
+                    m_in.receive_data(&buf, sizeof(buf));
+                    ptr = static_cast<char*>(buf) +
+                          m_vars[i].mic_offset + m_vars[i].offset;
+                    *ptr_addr = ptr;
+                }
+                break;
+
+            default:
+                LIBOFFLOAD_ERROR(c_unknown_var_type, type);
+                abort();
+        }
+        // Release obsolete buffers for stack of persistent objects
+        if (type == c_data_ptr &&
+            m_vars[i].flags.is_stack_buf &&
+            !m_vars[i].direction.bits &&
+            m_vars[i].alloc_if &&
+            m_vars[i].size != 0) {
+                for (int j=0; j < m_vars[i].size; j++) {
+                    void *buf;
+                    m_in.receive_data(&buf, sizeof(buf));
+                    BufferReleaseRef(buf);
+                    ref_data.erase(buf);
+                }
+        }
+        // Do copyin
+        switch (m_vars[i].type.dst) {
+            case c_data_ptr_array:
+                break;
+            case c_data:
+            case c_void_ptr:
+            case c_cean_var:
+                if (m_vars[i].direction.in &&
+                    !m_vars[i].flags.is_static_dstn) {
+                    int64_t size;
+                    int64_t disp;
+                    char* ptr = m_vars[i].into ?
+                                 static_cast<char*>(m_vars[i].into) :
+                                 static_cast<char*>(m_vars[i].ptr);
+                    if (m_vars[i].type.dst == c_cean_var) {
+                        m_in.receive_data((&size), sizeof(int64_t));
+                        m_in.receive_data((&disp), sizeof(int64_t));
+                    }
+                    else {
+                        size = m_vars[i].size;
+                        disp = 0;
+                    }
+                    m_in.receive_data(ptr + disp, size);
+                }
+                break;
+
+            case c_dv:
+                if (m_vars[i].direction.bits ||
+                    m_vars[i].alloc_if ||
+                    m_vars[i].free_if) {
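+                    // Skip the first uint64_t of the dope vector (its Base
+                    // pointer is local) and receive the rest of the
+                    // descriptor from the host.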
+                    char* ptr = m_vars[i].into ?
+                                 static_cast<char*>(m_vars[i].into) :
+                                 static_cast<char*>(m_vars[i].ptr);
+                    m_in.receive_data(ptr + sizeof(uint64_t),
+                                      m_vars[i].size - sizeof(uint64_t));
+                }
+                break;
+
+            case c_string_ptr:
+            case c_data_ptr:
+            case c_cean_var_ptr:
+            case c_dv_ptr:
+            case c_dv_data:
+            case c_dv_ptr_data:
+            case c_dv_data_slice:
+            case c_dv_ptr_data_slice:
+                break;
+
+            case c_func_ptr:
+                if (m_vars[i].direction.in) {
+                    m_in.receive_func_ptr((const void**) m_vars[i].ptr);
+                }
+                break;
+
+            default:
+                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.dst);
+                abort();
+        }
+    }
+
+    OFFLOAD_TRACE(1, "Total copyin data received from host: [%lld] bytes\n",
+                  m_in.get_tfr_size());
+
+    OFFLOAD_TIMER_STOP(c_offload_target_scatter_inputs);
+
+    OFFLOAD_TIMER_START(c_offload_target_compute);
+}
+
+void OffloadDescriptor::gather_copyout_data()
+{
+    OFFLOAD_TIMER_STOP(c_offload_target_compute);
+
+    OFFLOAD_TIMER_START(c_offload_target_gather_outputs);
+
+    for (int i = 0; i < m_vars_total; i++) {
+        bool src_is_for_mic = (m_vars[i].direction.out ||
+                               m_vars[i].into == NULL);
+
+        switch (m_vars[i].type.src) {
+            case c_data_ptr_array:
+                break;
+            case c_data:
+            case c_void_ptr:
+            case c_cean_var:
+                if (m_vars[i].direction.out &&
+                    !m_vars[i].flags.is_static) {
+                    m_out.send_data(
+                        static_cast<char*>(m_vars[i].ptr) + m_vars[i].disp,
+                        m_vars[i].size);
+                }
+                break;
+
+            case c_dv:
+                break;
+
+            case c_string_ptr:
+            case c_data_ptr:
+            case c_cean_var_ptr:
+            case c_dv_ptr:
+                if (m_vars[i].free_if &&
+                    src_is_for_mic &&
+                    !m_vars[i].flags.is_static) {
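+                    // Recover the buffer start address by undoing the
+                    // offsets that were applied when the pointer was set
+                    // up during copyin.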
+                    void *buf = *static_cast<char**>(m_vars[i].ptr) -
+                                    m_vars[i].mic_offset -
+                                    (m_vars[i].flags.is_stack_buf?
+                                     0 : m_vars[i].offset);
+                    if (buf == NULL) {
+                        break;
+                    }
+                    // decrement buffer reference count
+                    OFFLOAD_TIMER_START(c_offload_target_release_buffer_refs);
+                    BufReleaseRef(buf);
+                    OFFLOAD_TIMER_STOP(c_offload_target_release_buffer_refs);
+                }
+                break;
+
+            case c_func_ptr:
+                if (m_vars[i].direction.out) {
+                    m_out.send_func_ptr(*((void**) m_vars[i].ptr));
+                }
+                break;
+
+            case c_dv_data:
+            case c_dv_ptr_data:
+            case c_dv_data_slice:
+            case c_dv_ptr_data_slice:
+                if (src_is_for_mic &&
+                    m_vars[i].free_if &&
+                    !m_vars[i].flags.is_static) {
+                    ArrDesc *dvp = (m_vars[i].type.src == c_dv_data ||
+                                    m_vars[i].type.src == c_dv_data_slice) ?
+                        static_cast<ArrDesc*>(m_vars[i].ptr) :
+                        *static_cast<ArrDesc**>(m_vars[i].ptr);
+
+                    void *buf = reinterpret_cast<char*>(dvp->Base) -
+                                m_vars[i].mic_offset -
+                                m_vars[i].offset;
+
+                    if (buf == NULL) {
+                        break;
+                    }
+
+                    // decrement buffer reference count
+                    OFFLOAD_TIMER_START(c_offload_target_release_buffer_refs);
+                    BufReleaseRef(buf);
+                    OFFLOAD_TIMER_STOP(c_offload_target_release_buffer_refs);
+                }
+                break;
+
+            default:
+                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.src);
+                abort();
+        }
+
+        if (m_vars[i].into) {
+            switch (m_vars[i].type.dst) {
+                case c_data_ptr_array:
+                    break;
+                case c_data:
+                case c_void_ptr:
+                case c_cean_var:
+                case c_dv:
+                    break;
+
+                case c_string_ptr:
+                case c_data_ptr:
+                case c_cean_var_ptr:
+                case c_dv_ptr:
+                    if (m_vars[i].direction.in &&
+                        m_vars[i].free_if &&
+                        !m_vars[i].flags.is_static_dstn) {
+                        void *buf = *static_cast<char**>(m_vars[i].into) -
+                                    m_vars[i].mic_offset -
+                                    (m_vars[i].flags.is_stack_buf?
+                                     0 : m_vars[i].offset);
+
+                        if (buf == NULL) {
+                            break;
+                        }
+                        // decrement buffer reference count
+                        OFFLOAD_TIMER_START(
+                            c_offload_target_release_buffer_refs);
+                        BufReleaseRef(buf);
+                        OFFLOAD_TIMER_STOP(
+                            c_offload_target_release_buffer_refs);
+                    }
+                    break;
+
+                case c_func_ptr:
+                    break;
+
+                case c_dv_data:
+                case c_dv_ptr_data:
+                case c_dv_data_slice:
+                case c_dv_ptr_data_slice:
+                    if (m_vars[i].free_if &&
+                        m_vars[i].direction.in &&
+                        !m_vars[i].flags.is_static_dstn) {
+                        ArrDesc *dvp =
+                            (m_vars[i].type.dst == c_dv_data_slice ||
+                             m_vars[i].type.dst == c_dv_data) ?
+                            static_cast<ArrDesc*>(m_vars[i].into) :
+                            *static_cast<ArrDesc**>(m_vars[i].into);
+                        void *buf = reinterpret_cast<char*>(dvp->Base) -
+                              m_vars[i].mic_offset -
+                              m_vars[i].offset;
+
+                        if (buf == NULL) {
+                            break;
+                        }
+                        // decrement buffer reference count
+                        OFFLOAD_TIMER_START(
+                            c_offload_target_release_buffer_refs);
+                        BufReleaseRef(buf);
+                        OFFLOAD_TIMER_STOP(
+                            c_offload_target_release_buffer_refs);
+                    }
+                    break;
+
+                default:
+                    LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.dst);
+                    abort();
+            }
+        }
+    }
+
+    OFFLOAD_DEBUG_TRACE(2, "OUT buffer @ p %p size %lld\n",
+                        m_out.get_buffer_start(),
+                        m_out.get_buffer_size());
+
+    OFFLOAD_DEBUG_DUMP_BYTES(2,
+                             m_out.get_buffer_start(),
+                             m_out.get_buffer_size());
+
+    OFFLOAD_DEBUG_TRACE_1(1, get_offload_number(), c_offload_copyout_data,
+                  "Total copyout data sent to host: [%lld] bytes\n",
+                  m_out.get_tfr_size());
+
+    OFFLOAD_TIMER_STOP(c_offload_target_gather_outputs);
+}
+
+void __offload_target_init(void)
+{
+#ifdef SEP_SUPPORT
+    const char* env_var = getenv(sep_monitor_env);
+    if (env_var != 0 && *env_var != '\0') {
+        sep_monitor = atoi(env_var);
+    }
+    env_var = getenv(sep_device_env);
+    if (env_var != 0 && *env_var != '\0') {
+        sep_device = env_var;
+    }
+#endif // SEP_SUPPORT
+
+    prefix = report_get_message_str(c_report_mic);
+
+    // init frequency
+    mic_frequency = COIPerfGetCycleFrequency();
+}
+
+// User-visible offload API
+
+int _Offload_number_of_devices(void)
+{
+    return mic_engines_total;
+}
+
+int _Offload_get_device_number(void)
+{
+    return mic_index;
+}
+
+int _Offload_get_physical_device_number(void)
+{
+    uint32_t index;
+    EngineGetIndex(&index);
+    return index;
+}
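+
+// Example (hypothetical user code, not part of this file): the API above
+// can be used to query the offload environment, e.g.
+//
+//   if (_Offload_number_of_devices() > 0) {
+//       int logical  = _Offload_get_device_number();
+//       int physical = _Offload_get_physical_device_number();
+//       ...
+//   }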
diff --git a/final/offload/src/offload_target.h b/final/offload/src/offload_target.h
new file mode 100644
index 0000000..7db3147
--- /dev/null
+++ b/final/offload/src/offload_target.h
@@ -0,0 +1,100 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+// The parts of the offload library used only on the target
+
+#ifndef OFFLOAD_TARGET_H_INCLUDED
+#define OFFLOAD_TARGET_H_INCLUDED
+
+#include "offload_common.h"
+#include "coi/coi_server.h"
+
+// The offload descriptor.
+class OffloadDescriptor
+{
+public:
+    ~OffloadDescriptor() {
+        if (m_vars != 0) {
+            free(m_vars);
+        }
+    }
+
+    // Entry point for COI. Synchronously execute offloaded region given
+    // the provided buffers, misc and return data.
+    static void offload(
+        uint32_t  buffer_count,
+        void**    buffers,
+        void*     misc_data,
+        uint16_t  misc_data_len,
+        void*     return_data,
+        uint16_t  return_data_len
+    );
+
+    // scatters input data from in buffer to target variables
+    void scatter_copyin_data();
+
+    // gathers output data to the buffer
+    void gather_copyout_data();
+
+    // merges local variable descriptors with the descriptors received from
+    // host
+    void merge_var_descs(VarDesc *vars, VarDesc2 *vars2, int vars_total);
+
+    int get_offload_number() const {
+        return m_offload_number;
+    }
+
+    void set_offload_number(int number) {
+        m_offload_number = number;
+    }
+
+private:
+    // Constructor
+    OffloadDescriptor() : m_vars(0), m_vars_total(0), m_offload_number(0)
+    {}
+
+private:
+    typedef std::list<void*> BufferList;
+
+    // The Marshaller for the inputs of the offloaded region.
+    Marshaller m_in;
+
+    // The Marshaller for the outputs of the offloaded region.
+    Marshaller m_out;
+
+    // List of buffers that are passed to dispatch call
+    BufferList m_buffers;
+
+    // Variable descriptors received from host
+    VarDesc* m_vars;
+    int      m_vars_total;
+    int      m_offload_number;
+};
+
+// one time target initialization in main
+extern void __offload_target_init(void);
+
+// logical device index
+extern int mic_index;
+
+// total number of available logical devices
+extern int mic_engines_total;
+
+// device frequency (from COI)
+extern uint64_t mic_frequency;
+
+struct RefInfo {
+    RefInfo(bool is_add, long amount):is_added(is_add),count(amount)
+    {}
+    bool is_added;
+    long count;
+};
+
+#endif // OFFLOAD_TARGET_H_INCLUDED
diff --git a/final/offload/src/offload_target_main.cpp b/final/offload/src/offload_target_main.cpp
new file mode 100644
index 0000000..a4921d2
--- /dev/null
+++ b/final/offload/src/offload_target_main.cpp
@@ -0,0 +1,17 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+extern "C" void __offload_target_main(void);
+
+int main(int argc, char ** argv)
+{
+    __offload_target_main();
+    return 0;
+}
diff --git a/final/offload/src/offload_timer.h b/final/offload/src/offload_timer.h
new file mode 100644
index 0000000..1401a9d
--- /dev/null
+++ b/final/offload/src/offload_timer.h
@@ -0,0 +1,172 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef OFFLOAD_TIMER_H_INCLUDED
+#define OFFLOAD_TIMER_H_INCLUDED
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include "liboffload_error_codes.h"
+
+extern int timer_enabled;
+
+#ifdef TIMING_SUPPORT
+
+struct OffloadTargetTimerData {
+    uint64_t frequency;
+    struct {
+        uint64_t start;
+        uint64_t total;
+    } phases[c_offload_target_max_phase];
+};
+
+struct OffloadHostTimerData {
+    // source file name and line number
+    const char* file;
+    int         line;
+
+    // host timer data
+    struct {
+        uint64_t start;
+        uint64_t total;
+    } phases[c_offload_host_max_phase];
+
+    uint64_t sent_bytes;
+    uint64_t received_bytes;
+    int card_number;
+    int offload_number;
+
+    // target timer data
+    OffloadTargetTimerData target;
+
+    // next element
+    OffloadHostTimerData *next;
+};
+
+#if HOST_LIBRARY
+
+extern int offload_report_level;
+extern int offload_report_enabled;
+#define OFFLOAD_REPORT_1 1
+#define OFFLOAD_REPORT_2 2
+#define OFFLOAD_REPORT_3 3
+#define OFFLOAD_REPORT_ON 1
+#define OFFLOAD_REPORT_OFF 0
+
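+// Size of the timer block the target appends to the out buffer: one
+// uint64_t for the target clock frequency plus one uint64_t total per
+// target phase (filled in by offload_timer_fill_target_data on the target).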
+#define OFFLOAD_TIMER_DATALEN() \
+    ((timer_enabled || (offload_report_level && offload_report_enabled)) ? \
+     ((1 + c_offload_target_max_phase) * sizeof(uint64_t)) : 0)
+
+#define OFFLOAD_TIMER_START(timer_data, pnode) \
+    if (timer_enabled || \
+        (offload_report_level && offload_report_enabled)) { \
+        offload_timer_start(timer_data, pnode); \
+    }
+
+#define OFFLOAD_TIMER_STOP(timer_data, pnode) \
+    if (timer_enabled || \
+        (offload_report_level && offload_report_enabled)) { \
+        offload_timer_stop(timer_data, pnode); \
+    }
+
+#define OFFLOAD_TIMER_INIT(file, line) \
+    offload_timer_init(file, line);
+
+#define OFFLOAD_TIMER_TARGET_DATA(timer_data, data) \
+    if (timer_enabled || \
+        (offload_report_level && offload_report_enabled)) { \
+        offload_timer_fill_target_data(timer_data, data); \
+    }
+
+#define OFFLOAD_TIMER_HOST_SDATA(timer_data, data) \
+    if (offload_report_level && offload_report_enabled) { \
+        offload_timer_fill_host_sdata(timer_data, data); \
+    }
+
+#define OFFLOAD_TIMER_HOST_RDATA(timer_data, data) \
+    if (offload_report_level && offload_report_enabled) { \
+        offload_timer_fill_host_rdata(timer_data, data); \
+    }
+
+#define OFFLOAD_TIMER_HOST_MIC_NUM(timer_data, data) \
+    if (offload_report_level && offload_report_enabled) { \
+        offload_timer_fill_host_mic_num(timer_data, data); \
+    }
+
+extern void offload_timer_start(OffloadHostTimerData *,
+                                OffloadHostPhase t_node);
+extern void offload_timer_stop(OffloadHostTimerData *,
+                               OffloadHostPhase t_node);
+extern OffloadHostTimerData * offload_timer_init(const char *file, int line);
+extern void offload_timer_fill_target_data(OffloadHostTimerData *,
+                                           void *data);
+extern void offload_timer_fill_host_sdata(OffloadHostTimerData *,
+                                          uint64_t sent_bytes);
+extern void offload_timer_fill_host_rdata(OffloadHostTimerData *,
+                                          uint64_t sent_bytes);
+extern void offload_timer_fill_host_mic_num(OffloadHostTimerData *,
+                                            int card_number);
+
+// Utility structure for starting/stopping timer
+struct OffloadTimer {
+    OffloadTimer(OffloadHostTimerData *data, OffloadHostPhase phase) :
+        m_data(data),
+        m_phase(phase)
+    {
+        OFFLOAD_TIMER_START(m_data, m_phase);
+    }
+
+    ~OffloadTimer()
+    {
+        OFFLOAD_TIMER_STOP(m_data, m_phase);
+    }
+
+private:
+    OffloadHostTimerData*   m_data;
+    OffloadHostPhase        m_phase;
+};
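+
+// Typical usage: construct an OffloadTimer at the top of a scope; the
+// destructor stops the timer when the scope is left, e.g.
+//
+//   {
+//       OffloadTimer timer(timer_data, c_offload_host_setup_buffers);
+//       ... // work being timed
+//   }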
+
+#else
+
+#define OFFLOAD_TIMER_DATALEN() \
+    ((timer_enabled) ? \
+     ((1 + c_offload_target_max_phase) * sizeof(uint64_t)) : 0)
+
+#define OFFLOAD_TIMER_START(pnode) \
+    if (timer_enabled) offload_timer_start(pnode);
+
+#define OFFLOAD_TIMER_STOP(pnode) \
+    if (timer_enabled) offload_timer_stop(pnode);
+
+#define OFFLOAD_TIMER_INIT() \
+    if (timer_enabled) offload_timer_init();
+
+#define OFFLOAD_TIMER_TARGET_DATA(data) \
+    if (timer_enabled) offload_timer_fill_target_data(data);
+
+extern void offload_timer_start(OffloadTargetPhase t_node);
+extern void offload_timer_stop(OffloadTargetPhase t_node);
+extern void offload_timer_init(void);
+extern void offload_timer_fill_target_data(void *data);
+
+#endif // HOST_LIBRARY
+
+#else // TIMING_SUPPORT
+
+#define OFFLOAD_TIMER_START(...)
+#define OFFLOAD_TIMER_STOP(...)
+#define OFFLOAD_TIMER_INIT(...)
+#define OFFLOAD_TIMER_TARGET_DATA(...)
+#define OFFLOAD_TIMER_DATALEN(...)      (0)
+
+#endif // TIMING_SUPPORT
+
+#endif // OFFLOAD_TIMER_H_INCLUDED
diff --git a/final/offload/src/offload_timer_host.cpp b/final/offload/src/offload_timer_host.cpp
new file mode 100644
index 0000000..fb27db0
--- /dev/null
+++ b/final/offload/src/offload_timer_host.cpp
@@ -0,0 +1,359 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_timer.h"
+
+#ifdef __INTEL_COMPILER
+#include <ia32intrin.h>
+#else // __INTEL_COMPILER
+#include <x86intrin.h>
+#endif // __INTEL_COMPILER
+
+#include "offload_host.h"
+#include <sstream>
+#include <iostream>
+#include <iomanip>
+
+int timer_enabled = 0;
+
+#ifdef TIMING_SUPPORT
+
+int offload_report_level = 0;
+int offload_report_enabled = 1;
+
+static const int host_timer_prefix_spaces[] = {
+    /*c_offload_host_total_offload*/         0,
+    /*c_offload_host_initialize*/            2,
+    /*c_offload_host_target_acquire*/        2,
+    /*c_offload_host_wait_deps*/             2,
+    /*c_offload_host_setup_buffers*/         2,
+    /*c_offload_host_alloc_buffers*/         4,
+    /*c_offload_host_setup_misc_data*/       2,
+    /*c_offload_host_alloc_data_buffer*/     4,
+    /*c_offload_host_send_pointers*/         2,
+    /*c_offload_host_gather_inputs*/         2,
+    /*c_offload_host_map_in_data_buffer*/    4,
+    /*c_offload_host_unmap_in_data_buffer*/  4,
+    /*c_offload_host_start_compute*/         2,
+    /*c_offload_host_wait_compute*/          2,
+    /*c_offload_host_start_buffers_reads*/   2,
+    /*c_offload_host_scatter_outputs*/       2,
+    /*c_offload_host_map_out_data_buffer*/   4,
+    /*c_offload_host_unmap_out_data_buffer*/ 4,
+    /*c_offload_host_wait_buffers_reads*/    2,
+    /*c_offload_host_destroy_buffers*/       2
+};
+
+static const int target_timer_prefix_spaces[] = {
+    /*c_offload_target_total_time*/          0,
+    /*c_offload_target_descriptor_setup*/    2,
+    /*c_offload_target_func_lookup*/         2,
+    /*c_offload_target_func_time*/           2,
+    /*c_offload_target_scatter_inputs*/      4,
+    /*c_offload_target_add_buffer_refs*/     6,
+    /*c_offload_target_compute*/             4,
+    /*c_offload_target_gather_outputs*/      4,
+    /*c_offload_target_release_buffer_refs*/ 6
+};
+
+static OffloadHostTimerData* timer_data_head;
+static OffloadHostTimerData* timer_data_tail;
+static mutex_t               timer_data_mutex;
+
+static void offload_host_phase_name(std::stringstream &ss, int p_node);
+static void offload_target_phase_name(std::stringstream &ss, int p_node);
+
+extern void Offload_Timer_Print(void)
+{
+    std::string       buf;
+    std::stringstream ss;
+    const char *stars =
+        "**************************************************************";
+
+    ss << "\n\n" << stars << "\n";
+    ss << "                             ";
+    ss << report_get_message_str(c_report_title) << "\n";
+    ss << stars << "\n";
+    double frequency = cpu_frequency;
+
+    for (OffloadHostTimerData *pnode = timer_data_head;
+         pnode != 0; pnode = pnode->next) {
+        ss << "      ";
+        ss << report_get_message_str(c_report_from_file) << " "<< pnode->file;
+        ss << report_get_message_str(c_report_line) << " " << pnode->line;
+        ss << "\n";
+        for (int i = 0; i < c_offload_host_max_phase ; i++) {
+            ss << "          ";
+            offload_host_phase_name(ss, i);
+            ss << "   " << std::fixed << std::setprecision(5);
+            ss << (double)pnode->phases[i].total / frequency << "\n";
+        }
+
+        for (int i = 0; i < c_offload_target_max_phase ; i++) {
+            double time = 0;
+            if (pnode->target.frequency != 0) {
+                time = (double) pnode->target.phases[i].total /
+                       (double) pnode->target.frequency;
+            }
+            ss << "          ";
+            offload_target_phase_name(ss, i);
+            ss << "   " << std::fixed << std::setprecision(5);
+            ss << time << "\n";
+        }
+    }
+
+    buf = ss.str();
+    fprintf(stdout, "%s", buf.data());
+    fflush(stdout);
+}
+
+extern void Offload_Report_Prolog(OffloadHostTimerData *pnode)
+{
+    std::string       buf;
+    std::stringstream ss;
+
+    if (pnode) {
+        // [Offload] [Mic 0] [File]          file.c
+        ss << "[" << report_get_message_str(c_report_offload) << "] [";
+        ss << report_get_message_str(c_report_mic) << " ";
+        ss << pnode->card_number << "] [";
+        ss << report_get_message_str(c_report_file);
+        ss << "]                    " << pnode->file << "\n";
+
+        // [Offload] [Mic 0] [Line]          1234
+        ss << "[" << report_get_message_str(c_report_offload) << "] [";
+        ss << report_get_message_str(c_report_mic) << " ";
+        ss << pnode->card_number << "] [";
+        ss << report_get_message_str(c_report_line);
+        ss << "]                    " << pnode->line << "\n";
+
+        // [Offload] [Mic 0] [Tag]          Tag 1
+        ss << "[" << report_get_message_str(c_report_offload) << "] [";
+        ss << report_get_message_str(c_report_mic) << " ";
+        ss << pnode->card_number << "] [";
+        ss << report_get_message_str(c_report_tag);
+        ss << "]                     " << report_get_message_str(c_report_tag);
+        ss << " " << pnode->offload_number << "\n";
+
+        buf = ss.str();
+        fprintf(stdout, "%s", buf.data());
+        fflush(stdout);
+    }
+}
+
+extern void Offload_Report_Epilog(OffloadHostTimerData * timer_data)
+{
+    double frequency = cpu_frequency;
+    std::string       buf;
+    std::stringstream ss;
+
+    OffloadHostTimerData *pnode = timer_data;
+
+    if (!pnode) {
+        return;
+    }
+    ss << "[" << report_get_message_str(c_report_offload) << "] [";
+    ss << report_get_message_str(c_report_host) << "]  [";
+    ss << report_get_message_str(c_report_tag) <<  " ";
+    ss << pnode->offload_number << "] [";
+    ss << report_get_message_str(c_report_cpu_time) << "]        ";
+    ss << std::fixed << std::setprecision(6);
+    ss << (double) pnode->phases[0].total / frequency;
+    ss << report_get_message_str(c_report_seconds) << "\n";
+
+    if (offload_report_level >= OFFLOAD_REPORT_2) {
+        ss << "[" << report_get_message_str(c_report_offload) << "] [";
+        ss << report_get_message_str(c_report_mic);
+        ss << " " << pnode->card_number;
+        ss << "] [" << report_get_message_str(c_report_tag) << " ";
+        ss <<  pnode->offload_number << "] [";
+        ss << report_get_message_str(c_report_cpu_to_mic_data) << "]   ";
+        ss << pnode->sent_bytes << " ";
+        ss << report_get_message_str(c_report_bytes) << "\n";
+    }
+
+    double time = 0;
+    if (pnode->target.frequency != 0) {
+        time = (double) pnode->target.phases[0].total /
+            (double) pnode->target.frequency;
+    }
+    ss << "[" << report_get_message_str(c_report_offload) << "] [";
+    ss << report_get_message_str(c_report_mic) << " ";
+    ss << pnode->card_number<< "] [";
+    ss << report_get_message_str(c_report_tag) <<  " ";
+    ss << pnode->offload_number << "] [";
+    ss << report_get_message_str(c_report_mic_time) << "]        ";
+    ss << std::fixed << std::setprecision(6) << time;
+    ss << report_get_message_str(c_report_seconds) << "\n";
+
+    if (offload_report_level >= OFFLOAD_REPORT_2) {
+        ss << "[" << report_get_message_str(c_report_offload) << "] [";
+        ss << report_get_message_str(c_report_mic);
+        ss << " " << pnode->card_number;
+        ss << "] [" << report_get_message_str(c_report_tag) << " ";
+        ss <<  pnode->offload_number << "] [";
+        ss << report_get_message_str(c_report_mic_to_cpu_data) << "]   ";
+        ss << pnode->received_bytes << " ";
+        ss << report_get_message_str(c_report_bytes) << "\n";
+    }
+    ss << "\n";
+
+    buf = ss.str();
+    fprintf(stdout, "%s", buf.data());
+    fflush(stdout);
+
+    offload_report_free_data(timer_data);
+}
+
+extern void offload_report_free_data(OffloadHostTimerData * timer_data)
+{
+    OffloadHostTimerData *pnode_last = NULL;
+
+    for (OffloadHostTimerData *pnode = timer_data_head;
+         pnode != 0; pnode = pnode->next) {
+        if (timer_data == pnode) {
+            if (pnode_last) {
+                pnode_last->next = pnode->next;
+            }
+            else {
+                timer_data_head = pnode->next;
+            }
+            OFFLOAD_FREE(pnode);
+            break;
+        }
+        pnode_last = pnode;
+    }
+}
+
+static void fill_buf_with_spaces(std::stringstream &ss, int num)
+{
+    for (; num > 0; num--) {
+        ss << " ";
+    }
+}
+
+static void offload_host_phase_name(std::stringstream &ss, int p_node)
+{
+    int prefix_spaces;
+    int str_length;
+    int tail_length;
+    const int message_length = 40;
+    char const *str;
+
+    str = report_get_host_stage_str(p_node);
+    prefix_spaces = host_timer_prefix_spaces[p_node];
+    fill_buf_with_spaces(ss, prefix_spaces);
+    str_length = strlen(str);
+    ss << str;
+    tail_length = message_length - prefix_spaces - str_length;
+    tail_length = tail_length > 0? tail_length : 1;
+    fill_buf_with_spaces(ss, tail_length);
+}
+
+static void offload_target_phase_name(std::stringstream &ss, int p_node)
+{
+    int prefix_spaces;
+    int str_length;
+    const int message_length = 40;
+    int tail_length;
+    char const *str;
+
+    str = report_get_target_stage_str(p_node);
+    prefix_spaces = target_timer_prefix_spaces[p_node];
+    fill_buf_with_spaces(ss, prefix_spaces);
+    str_length = strlen(str);
+    ss << str;
+    tail_length = message_length - prefix_spaces - str_length;
+    tail_length = (tail_length > 0)? tail_length : 1;
+    fill_buf_with_spaces(ss, tail_length);
+}
+
+void offload_timer_start(OffloadHostTimerData * timer_data,
+                         OffloadHostPhase p_type)
+{
+    timer_data->phases[p_type].start = _rdtsc();
+}
+
+void offload_timer_stop(OffloadHostTimerData * timer_data,
+                        OffloadHostPhase p_type)
+{
+    timer_data->phases[p_type].total += _rdtsc() -
+                                        timer_data->phases[p_type].start;
+}
+
+void offload_timer_fill_target_data(OffloadHostTimerData * timer_data,
+                                    void *buf)
+{
+    uint64_t *data = (uint64_t*) buf;
+
+    timer_data->target.frequency = *data++;
+    for (int i = 0; i < c_offload_target_max_phase; i++) {
+        timer_data->target.phases[i].total = *data++;
+    }
+}
+
+void offload_timer_fill_host_sdata(OffloadHostTimerData * timer_data,
+                                   uint64_t sent_bytes)
+{
+    if (timer_data) {
+        timer_data->sent_bytes += sent_bytes;
+    }
+}
+
+void offload_timer_fill_host_rdata(OffloadHostTimerData * timer_data,
+                                   uint64_t received_bytes)
+{
+    if (timer_data) {
+        timer_data->received_bytes += received_bytes;
+    }
+}
+
+void offload_timer_fill_host_mic_num(OffloadHostTimerData * timer_data,
+                                     int card_number)
+{
+    if (timer_data) {
+        timer_data->card_number = card_number;
+    }
+}
+
+OffloadHostTimerData* offload_timer_init(const char *file, int line)
+{
+    static bool first_time = true;
+    OffloadHostTimerData* timer_data = NULL;
+
+    timer_data_mutex.lock();
+    {
+        if (timer_enabled ||
+            (offload_report_level && offload_report_enabled)) {
+            timer_data = (OffloadHostTimerData*)
+                OFFLOAD_MALLOC(sizeof(OffloadHostTimerData), 0);
+            memset(timer_data, 0, sizeof(OffloadHostTimerData));
+
+            timer_data->offload_number = OFFLOAD_DEBUG_INCR_OFLD_NUM() - 1;
+
+            if (timer_data_head == 0) {
+                timer_data_head = timer_data;
+                timer_data_tail = timer_data;
+            }
+            else {
+                timer_data_tail->next = timer_data;
+                timer_data_tail = timer_data;
+            }
+
+            timer_data->file = file;
+            timer_data->line = line;
+        }
+    }
+    timer_data_mutex.unlock();
+    return timer_data;
+}
+
+#endif // TIMING_SUPPORT
diff --git a/final/offload/src/offload_timer_target.cpp b/final/offload/src/offload_timer_target.cpp
new file mode 100644
index 0000000..30a4c91
--- /dev/null
+++ b/final/offload/src/offload_timer_target.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_timer.h"
+#include "offload_target.h"
+
+#ifdef __INTEL_COMPILER
+#include <ia32intrin.h>
+#else // __INTEL_COMPILER
+#include <x86intrin.h>
+#endif // __INTEL_COMPILER
+
+int timer_enabled = 0;
+
+#ifdef TIMING_SUPPORT
+
+#if defined(LINUX) || defined(FREEBSD)
+static __thread OffloadTargetTimerData timer_data;
+#else // WINNT
+static __declspec(thread) OffloadTargetTimerData timer_data;
+#endif // defined(LINUX) || defined(FREEBSD)
+
+
+void offload_timer_start(
+    OffloadTargetPhase p_type
+)
+{
+    timer_data.phases[p_type].start = _rdtsc();
+}
+
+void offload_timer_stop(
+    OffloadTargetPhase p_type
+)
+{
+    timer_data.phases[p_type].total += _rdtsc() -
+                                       timer_data.phases[p_type].start;
+}
+
+void offload_timer_init()
+{
+    memset(&timer_data, 0, sizeof(OffloadTargetTimerData));
+}
+
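+// Serialize the collected timings into the caller's buffer: frequency
+// first, then one total per phase. The layout must match
+// OFFLOAD_TIMER_DATALEN() and the host-side reader in offload_timer_host.cpp.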
+void offload_timer_fill_target_data(
+    void *buf
+)
+{
+    uint64_t *data = (uint64_t*) buf;
+
+    timer_data.frequency = mic_frequency;
+    memcpy(data++, &(timer_data.frequency), sizeof(uint64_t));
+
+    for (int i = 0; i < c_offload_target_max_phase; i++) {
+        memcpy(data++, &(timer_data.phases[i].total), sizeof(uint64_t));
+    }
+}
+
+#endif // TIMING_SUPPORT
diff --git a/final/offload/src/offload_trace.cpp b/final/offload/src/offload_trace.cpp
new file mode 100644
index 0000000..0a06204
--- /dev/null
+++ b/final/offload/src/offload_trace.cpp
@@ -0,0 +1,309 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_trace.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <sstream>
+#include "liboffload_error_codes.h"
+
+extern const char *prefix;
+
+#if !HOST_LIBRARY
+extern int mic_index;
+#endif
+
+// The debug routines
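+//
+// Each trace line has roughly the form
+//   [Offload] [MIC <card>] [Tag <n>] [<tag>] <text>...
+// where the bracketed message strings come from report_get_message_str().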
+
+static const char * offload_stage(std::stringstream &ss,
+                                  int offload_number,
+                                  const char *tag,
+                                  const char *text,
+                                  bool print_tag)
+{
+    ss << "[" << report_get_message_str(c_report_offload) << "]";
+#if HOST_LIBRARY
+    ss << " [" << prefix << "]";
+    if (print_tag) {
+        ss << "  [" << report_get_message_str(c_report_tag);
+        ss << " " << offload_number << "]";
+    }
+    else {
+        ss << "         ";
+    }
+    ss << " [" << tag << "]";
+    ss << "           " << text;
+#else
+    ss << " [" << prefix << " " << mic_index << "]";
+    if (print_tag) {
+        ss << " [" << report_get_message_str(c_report_tag);
+        ss << " " << offload_number << "]";
+    }
+    ss << " [" << tag << "]";
+    ss << "           " << text;
+#endif
+    return 0;
+}
+
+static const char * offload_signal(std::stringstream &ss,
+                                  int offload_number,
+                                  const char *tag,
+                                  const char *text)
+{
+    ss << "[" << report_get_message_str(c_report_offload) << "]";
+    ss << " [" << prefix << "]";
+    ss << "  [" << report_get_message_str(c_report_tag);
+    ss << " " << offload_number << "]";
+    ss << " [" << tag << "]";
+    ss << "          " << text;
+    return 0;
+}
+
+void offload_stage_print(int stage, int offload_number, ...)
+{
+    std::string buf;
+    std::stringstream ss;
+    char const *str1;
+    char const *str2;
+    va_list va_args;
+    va_start(va_args, offload_number);
+    va_arg(va_args, char*);    // skip the first variadic argument (unused)
+
+    switch (stage) {
+        case c_offload_start:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_start);
+            offload_stage(ss, offload_number, str1, str2, true);
+            break;
+        case c_offload_init:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_init);
+            offload_stage(ss, offload_number, str1, str2, false);
+            ss << " " << report_get_message_str(c_report_logical_card);
+            ss << " " << va_arg(va_args, int);
+            ss << " = " << report_get_message_str(c_report_physical_card);
+            ss << " " << va_arg(va_args, int);
+            break;
+        case c_offload_register:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_register);
+            offload_stage(ss, offload_number, str1, str2, true);
+            break;
+        case c_offload_init_func:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_init_func);
+            offload_stage(ss, offload_number, str1, str2, true);
+            ss << ": " << va_arg(va_args, char*);
+            break;
+        case c_offload_create_buf_host:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_create_buf_host);
+            offload_stage(ss, offload_number, str1, str2, true);
+            ss << ": base=0x" << std::hex << va_arg(va_args, uint64_t);
+            ss << " length=" << std::dec << va_arg(va_args, uint64_t);
+            break;
+        case c_offload_create_buf_mic:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_create_buf_mic);
+            offload_stage(ss, offload_number, str1, str2, true);
+            ss << ": size=" << va_arg(va_args, uint64_t);
+            ss << " offset=" << va_arg(va_args, int);
+            if (va_arg(va_args,int))
+               ss << " (2M page)";
+            break;
+        case c_offload_send_pointer_data:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_send_pointer_data);
+            offload_stage(ss, offload_number, str1, str2, true);
+            break;
+        case c_offload_sent_pointer_data:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_sent_pointer_data);
+            offload_stage(ss, offload_number, str1, str2, true);
+            ss << " " << va_arg(va_args, uint64_t);
+            break;
+        case c_offload_gather_copyin_data:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_gather_copyin_data);
+            offload_stage(ss, offload_number, str1, str2, true);
+            break;
+        case c_offload_copyin_data:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_copyin_data);
+            offload_stage(ss, offload_number, str1, str2, true);
+            ss << " " << va_arg(va_args, uint64_t) << " ";
+            break;
+        case c_offload_compute:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_compute);
+            offload_stage(ss, offload_number, str1, str2, true);
+            break;
+        case c_offload_receive_pointer_data:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_receive_pointer_data);
+            offload_stage(ss, offload_number, str1, str2, true);
+            break;
+        case c_offload_received_pointer_data:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_received_pointer_data);
+            offload_stage(ss, offload_number, str1, str2, true);
+            ss << " " << va_arg(va_args, uint64_t);
+            break;
+        case c_offload_start_target_func:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_start_target_func);
+            offload_stage(ss, offload_number, str1, str2, true);
+            ss << ": " << va_arg(va_args, char*);
+            break;
+        case c_offload_var:
+            str1 = report_get_message_str(c_report_var);
+            offload_stage(ss, offload_number, str1, "  ", true);
+            va_arg(va_args, int);
+            ss << va_arg(va_args, char*);
+            ss << " " << " " << va_arg(va_args, char*);
+            break;
+        case c_offload_scatter_copyin_data:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_scatter_copyin_data);
+            offload_stage(ss, offload_number, str1, str2, true);
+            break;
+        case c_offload_gather_copyout_data:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_gather_copyout_data);
+            offload_stage(ss, offload_number, str1, str2, true);
+            break;
+        case c_offload_scatter_copyout_data:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_scatter_copyout_data);
+            offload_stage(ss, offload_number, str1, str2, true);
+            break;
+        case c_offload_copyout_data:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_copyout_data);
+            offload_stage(ss, offload_number, str1, str2, true);
+            ss << "   " << va_arg(va_args, uint64_t);
+            break;
+        case c_offload_signal:
+            {
+                uint64_t  *signal;
+                str1 = report_get_message_str(c_report_state_signal);
+                str2 = report_get_message_str(c_report_signal);
+                offload_signal(ss, offload_number, str1, str2);
+                signal = va_arg(va_args, uint64_t*);
+                if (signal)
+                    ss << " 0x" << std::hex << *signal;
+                else
+                    ss << " none";
+            }
+            break;
+        case c_offload_wait:
+            {
+                int count;
+                uint64_t  **signal;
+                str1 = report_get_message_str(c_report_state_signal);
+                str2 = report_get_message_str(c_report_wait);
+                offload_signal(ss, offload_number, str1, str2);
+                count = va_arg(va_args, int);
+                signal = va_arg(va_args, uint64_t**);
+                if (count) {
+                    while (count) {
+                        ss << " " << std::hex << signal[count-1];
+                        count--;
+                    }
+                }
+                else
+                    ss << " none";
+            }
+            break;
+        case c_offload_unregister:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_unregister);
+            offload_stage(ss, offload_number, str1, str2, false);
+            break;
+        case c_offload_destroy:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_destroy);
+            offload_stage(ss, offload_number, str1, str2, true);
+            break;
+        case c_offload_myoinit:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_myoinit);
+            offload_stage(ss, offload_number, str1, str2, false);
+            break;
+        case c_offload_myoregister:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_myoregister);
+            offload_stage(ss, offload_number, str1, str2, false);
+            break;
+        case c_offload_myofini:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_myofini);
+            offload_stage(ss, offload_number, str1, str2, false);
+            break;
+        case c_offload_mic_myo_shared:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_mic_myo_shared);
+            offload_stage(ss, offload_number, str1, str2, false);
+            ss << " " << va_arg(va_args, char*);
+            break;
+        case c_offload_mic_myo_fptr:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_mic_myo_fptr);
+            offload_stage(ss, offload_number, str1, str2, false);
+            ss << " " << va_arg(va_args, char*);
+            break;
+        case c_offload_myosharedmalloc:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_myosharedmalloc);
+            offload_stage(ss, offload_number, str1, str2, false);
+            va_arg(va_args, char*);
+            ss << " " << va_arg(va_args, size_t);
+            break;
+        case c_offload_myosharedfree:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_myosharedfree);
+            offload_stage(ss, offload_number, str1, str2, false);
+            break;
+        case c_offload_myosharedalignedmalloc:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_myosharedalignedmalloc);
+            offload_stage(ss, offload_number, str1, str2, false);
+            va_arg(va_args, char*);
+            ss << " " << va_arg(va_args, size_t);
+            ss << " " << va_arg(va_args, size_t);
+            break;
+        case c_offload_myosharedalignedfree:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_myosharedalignedfree);
+            offload_stage(ss, offload_number, str1, str2, false);
+            break;
+        case c_offload_myoacquire:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_myoacquire);
+            offload_stage(ss, offload_number, str1, str2, false);
+            break;
+        case c_offload_myorelease:
+            str1 = report_get_message_str(c_report_state);
+            str2 = report_get_message_str(c_report_myorelease);
+            offload_stage(ss, offload_number, str1, str2, false);
+            break;
+        default:
+            LIBOFFLOAD_ERROR(c_report_unknown_trace_node);
+            abort();
+    }
+    ss << "\n";
+    buf = ss.str();
+    // Print via "%s" so a '%' in the trace text is never treated as a format.
+    fprintf(stdout, "%s", buf.data());
+    fflush(stdout);
+
+    va_end(va_args);
+    return;
+}
diff --git a/final/offload/src/offload_trace.h b/final/offload/src/offload_trace.h
new file mode 100644
index 0000000..65c28a4
--- /dev/null
+++ b/final/offload/src/offload_trace.h
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+// The parts of the offload library common to host and target
+
+void offload_stage_print(int stage, int offload_number, ...);
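+// Usage sketch (illustrative only; ofld_num is hypothetical): emit the trace
+// line for one stage; the trailing varargs depend on the stage being traced:
+//     offload_stage_print(c_offload_start, ofld_num);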
+
+enum OffloadTraceStage {
+    // Total time spent on the target
+    c_offload_start = 0,
+    c_offload_init,
+    c_offload_register,
+    c_offload_init_func,
+    c_offload_create_buf_host,
+    c_offload_create_buf_mic,
+    c_offload_send_pointer_data,
+    c_offload_sent_pointer_data,
+    c_offload_gather_copyin_data,
+    c_offload_copyin_data,
+    c_offload_compute,
+    c_offload_receive_pointer_data,
+    c_offload_received_pointer_data,
+    c_offload_start_target_func,
+    c_offload_var,
+    c_offload_scatter_copyin_data,
+    c_offload_gather_copyout_data,
+    c_offload_scatter_copyout_data,
+    c_offload_copyout_data,
+    c_offload_signal,
+    c_offload_wait,
+    c_offload_unregister,
+    c_offload_destroy,
+    c_offload_finish,
+    c_offload_myoinit,
+    c_offload_myoregister,
+    c_offload_mic_myo_shared,
+    c_offload_mic_myo_fptr,
+    c_offload_myosharedmalloc,
+    c_offload_myosharedfree,
+    c_offload_myosharedalignedmalloc,
+    c_offload_myosharedalignedfree,
+    c_offload_myoacquire,
+    c_offload_myorelease,
+    c_offload_myofini
+};
diff --git a/final/offload/src/offload_util.cpp b/final/offload/src/offload_util.cpp
new file mode 100644
index 0000000..68462c8
--- /dev/null
+++ b/final/offload/src/offload_util.cpp
@@ -0,0 +1,206 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "offload_util.h"
+#include <errno.h>
+#include "liboffload_error_codes.h"
+
+#ifdef TARGET_WINNT
+void *thread_getspecific(pthread_key_t key)
+{
+    if (key == 0) {
+        return NULL;
+    }
+    else {
+        return TlsGetValue(key);
+    }
+}
+
+int thread_setspecific(pthread_key_t key, const void *value)
+{
+    return (TlsSetValue(key, (LPVOID)value)) ? 0 : GetLastError();
+}
+#endif // TARGET_WINNT
+
+bool __offload_parse_size_string(const char *str, uint64_t &new_size)
+{
+    uint64_t val;
+    char *suffix;
+
+    errno = 0;
+#ifdef TARGET_WINNT
+    val = strtoul(str, &suffix, 10);
+#else // TARGET_WINNT
+    val = strtoull(str, &suffix, 10);
+#endif // TARGET_WINNT
+    if (errno != 0 || suffix == str) {
+        return false;
+    }
+
+    if (suffix[0] == '\0') {
+        // default is Kilobytes
+        new_size = val * 1024;
+        return true;
+    }
+    else if (suffix[1] == '\0') {
+        // Optional suffixes: B (bytes), K (Kilobytes), M (Megabytes),
+        // G (Gigabytes), or T (Terabytes) specify the units.
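+        // e.g., "2M" -> 2 * 1024 * 1024    = 2097152 bytes,
+        //       "1g" -> 1024 * 1024 * 1024 = 1073741824 bytes (case-insensitive).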
+        switch (suffix[0]) {
+            case 'b':
+            case 'B':
+                new_size = val;
+                break;
+
+            case 'k':
+            case 'K':
+                new_size = val * 1024;
+                break;
+
+            case 'm':
+            case 'M':
+                new_size = val * 1024 * 1024;
+                break;
+
+            case 'g':
+            case 'G':
+                new_size = val * 1024 * 1024 * 1024;
+                break;
+
+            case 't':
+            case 'T':
+                new_size = val * 1024 * 1024 * 1024 * 1024;
+                break;
+
+            default:
+                return false;
+        }
+        return true;
+    }
+
+    return false;
+}
+
+bool __offload_parse_int_string(const char *str, int64_t &value)
+{
+    int64_t val;
+    char *suffix;
+
+    errno = 0;
+#ifdef TARGET_WINNT
+    val = strtol(str, &suffix, 0);
+#else
+    val = strtoll(str, &suffix, 0);
+#endif
+    if (errno == 0 && suffix != str && *suffix == '\0') {
+        value = val;
+        return true;
+    }
+    return false;
+}
+
+#ifdef TARGET_WINNT
+extern void* DL_open(const char *path)
+{
+    void *handle;
+    int error_mode;
+
+    /*
+     * Do not display a message box with an error if the call below fails
+     * to load the dynamic library.
+     */
+    error_mode = SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOOPENFILEERRORBOX);
+
+    /* load dynamic library */
+    handle = (void*) LoadLibrary(path);
+
+    /* restore error mode */
+    SetErrorMode(error_mode);
+
+    return handle;
+}
+
+extern int DL_addr(const void *addr, Dl_info *dl_info)
+{
+    MEMORY_BASIC_INFORMATION mem_info;
+    char mod_name[MAX_PATH];
+    HMODULE mod_handle;
+
+    /* Fill MEMORY_BASIC_INFORMATION struct */
+    if (!VirtualQuery(addr, &mem_info, sizeof(mem_info))) {
+        return 0;
+    }
+    mod_handle = (HMODULE)mem_info.AllocationBase;
+
+    /* ANSI file name for module */
+    if (!GetModuleFileNameA(mod_handle, (char*) mod_name, sizeof(mod_name))) {
+        return 0;
+    }
+    strcpy(dl_info->dli_fname, mod_name);
+    dl_info->dli_fbase = mem_info.BaseAddress;
+    dl_info->dli_saddr = addr;
+    strcpy(dl_info->dli_sname, mod_name);
+    return 1;
+}
+
+// Run once
+static BOOL CALLBACK __offload_run_once_wrapper(
+    PINIT_ONCE initOnce,
+    PVOID parameter,
+    PVOID *context
+)
+{
+    void (*init_routine)(void) = (void(*)(void)) parameter;
+    init_routine();
+    return true;
+}
+
+void __offload_run_once(OffloadOnceControl *ctrl, void (*func)(void))
+{
+    InitOnceExecuteOnce(ctrl, __offload_run_once_wrapper, (void*) func, 0);
+}
+#endif // TARGET_WINNT
+
+/* ARGSUSED */ // version is not used on windows
+void* DL_sym(void *handle, const char *name, const char *version)
+{
+#ifdef TARGET_WINNT
+    return GetProcAddress((HMODULE) handle, name);
+#else // TARGET_WINNT
+    if (version == 0) {
+        return dlsym(handle, name);
+    }
+    else {
+        return dlvsym(handle, name, version);
+    }
+#endif // TARGET_WINNT
+}
+
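+// Reads a 1-, 2-, 4- or 8-byte signed integer at base + offset and
+// sign-extends it to int64_t (any other size falls back to an 8-byte read).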
+int64_t get_el_value(
+                     char *base,
+                     int64_t offset,
+                     int64_t size)
+{
+    int64_t val = 0;
+    switch (size) {
+        case 1:
+            val = static_cast<int64_t>(*((char *)(base + offset)));
+            break;
+        case 2:
+            val = static_cast<int64_t>(*((short *)(base + offset)));
+            break;
+        case 4:
+            val = static_cast<int64_t>(*((int *)(base + offset)));
+            break;
+        default:
+            val = *((int64_t *)(base + offset));
+            break;
+    }
+    return val;
+}
diff --git a/final/offload/src/offload_util.h b/final/offload/src/offload_util.h
new file mode 100644
index 0000000..e50d77d
--- /dev/null
+++ b/final/offload/src/offload_util.h
@@ -0,0 +1,153 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef OFFLOAD_UTIL_H_INCLUDED
+#define OFFLOAD_UTIL_H_INCLUDED
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#ifdef TARGET_WINNT
+#include <windows.h>
+#include <process.h>
+#else // TARGET_WINNT
+#include <dlfcn.h>
+#include <pthread.h>
+#endif // TARGET_WINNT
+
+#ifdef TARGET_WINNT
+typedef unsigned pthread_key_t;
+typedef int pid_t;
+
+#define __func__ __FUNCTION__
+#define strtok_r(s,d,p) strtok_s(s,d,p)
+#define strcasecmp(a,b) stricmp(a,b)
+
+// Note: this layer treats key 0 as an invalid sentinel (see
+// thread_getspecific), and TlsAlloc reports failure via TLS_OUT_OF_INDEXES.
+#define thread_key_create(key, destructor) \
+    ((((*key = TlsAlloc()) > 0) && \
+      (*key != TLS_OUT_OF_INDEXES)) ? 0 : GetLastError())
+#define thread_key_delete(key) TlsFree(key)
+
+#ifndef S_ISREG
+#define S_ISREG(mode)  (((mode) & S_IFMT) == S_IFREG)
+#endif
+
+void*   thread_getspecific(pthread_key_t key);
+int     thread_setspecific(pthread_key_t key, const void *value);
+#else
+#define thread_key_create(key, destructor) \
+            pthread_key_create((key), (destructor))
+#define thread_key_delete(key)  pthread_key_delete(key)
+#define thread_getspecific(key) pthread_getspecific(key)
+#define thread_setspecific(key, value) pthread_setspecific(key, value)
+#endif // TARGET_WINNT
+
+// Mutex implementation
+struct mutex_t {
+    mutex_t() {
+#ifdef TARGET_WINNT
+        InitializeCriticalSection(&m_lock);
+#else // TARGET_WINNT
+        pthread_mutex_init(&m_lock, 0);
+#endif // TARGET_WINNT
+    }
+
+    ~mutex_t() {
+#ifdef TARGET_WINNT
+        DeleteCriticalSection(&m_lock);
+#else // TARGET_WINNT
+        pthread_mutex_destroy(&m_lock);
+#endif // TARGET_WINNT
+    }
+
+    void lock() {
+#ifdef TARGET_WINNT
+        EnterCriticalSection(&m_lock);
+#else // TARGET_WINNT
+        pthread_mutex_lock(&m_lock);
+#endif // TARGET_WINNT
+    }
+
+    void unlock() {
+#ifdef TARGET_WINNT
+        LeaveCriticalSection(&m_lock);
+#else // TARGET_WINNT
+        pthread_mutex_unlock(&m_lock);
+#endif // TARGET_WINNT
+    }
+
+private:
+#ifdef TARGET_WINNT
+    CRITICAL_SECTION    m_lock;
+#else
+    pthread_mutex_t     m_lock;
+#endif
+};
+
+struct mutex_locker_t {
+    mutex_locker_t(mutex_t &mutex) : m_mutex(mutex) {
+        m_mutex.lock();
+    }
+
+    ~mutex_locker_t() {
+        m_mutex.unlock();
+    }
+
+private:
+    mutex_t &m_mutex;
+};
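+// Usage sketch (illustrative only; my_lock is hypothetical): scope-based
+// locking via RAII:
+//     static mutex_t my_lock;
+//     {
+//         mutex_locker_t guard(my_lock);  // locks in the constructor
+//         /* ...critical section... */
+//     }                                   // unlocks in the destructor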
+
+// Dynamic loader interface
+#ifdef TARGET_WINNT
+struct Dl_info
+{
+    char        dli_fname[MAX_PATH];
+    void       *dli_fbase;
+    char        dli_sname[MAX_PATH];
+    const void *dli_saddr;
+};
+
+void*   DL_open(const char *path);
+#define DL_close(handle)        FreeLibrary((HMODULE) (handle))
+int     DL_addr(const void *addr, Dl_info *info);
+#else
+#define DL_open(path)           dlopen((path), RTLD_NOW)
+#define DL_close(handle)        dlclose(handle)
+#define DL_addr(addr, info)     dladdr((addr), (info))
+#endif // TARGET_WINNT
+
+extern void* DL_sym(void *handle, const char *name, const char *version);
+
+// One-time initialization API
+#ifdef TARGET_WINNT
+typedef INIT_ONCE                   OffloadOnceControl;
+#define OFFLOAD_ONCE_CONTROL_INIT   INIT_ONCE_STATIC_INIT
+
+extern void __offload_run_once(OffloadOnceControl *ctrl, void (*func)(void));
+#else
+typedef pthread_once_t              OffloadOnceControl;
+#define OFFLOAD_ONCE_CONTROL_INIT   PTHREAD_ONCE_INIT
+
+#define __offload_run_once(ctrl, func) pthread_once(ctrl, func)
+#endif // TARGET_WINNT
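+// Usage sketch (illustrative only; my_init is hypothetical):
+//     static OffloadOnceControl my_ctrl = OFFLOAD_ONCE_CONTROL_INIT;
+//     __offload_run_once(&my_ctrl, my_init);  // my_init runs exactly once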
+
+// Parses size specification string.
+extern bool __offload_parse_size_string(const char *str, uint64_t &new_size);
+
+// Parses string with integer value
+extern bool __offload_parse_int_string(const char *str, int64_t &value);
+
+// get value by its base, offset and size
+int64_t get_el_value(
+    char   *base,
+    int64_t offset,
+    int64_t size
+);
+#endif // OFFLOAD_UTIL_H_INCLUDED
diff --git a/final/offload/src/ofldbegin.cpp b/final/offload/src/ofldbegin.cpp
new file mode 100644
index 0000000..945f982
--- /dev/null
+++ b/final/offload/src/ofldbegin.cpp
@@ -0,0 +1,164 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#if HOST_LIBRARY
+#include "offload_host.h"
+#include "offload_myo_host.h"
+#else
+#include "compiler_if_target.h"
+#include "offload_target.h"
+#include "offload_myo_target.h"
+#endif
+
+#ifdef TARGET_WINNT
+#define ALLOCATE(name) __declspec(allocate(name))
+#define DLL_LOCAL
+#else // TARGET_WINNT
+#define ALLOCATE(name) __attribute__((section(name)))
+#define DLL_LOCAL  __attribute__((visibility("hidden")))
+#endif // TARGET_WINNT
+
+#if HOST_LIBRARY
+// the host program/shared library should always have __offload_target_image
+// symbol defined. This symbol specifies the beginning of the target program
+// image.
+extern "C" DLL_LOCAL const void* __offload_target_image;
+#else // HOST_LIBRARY
+// Define a weak main to be used on the target side in case the user's
+// source file containing main does not have offload code.
+#pragma weak main
+int main(void)
+{
+    OFFLOAD_TARGET_MAIN();
+    return 0;
+}
+
+#pragma weak MAIN__
+extern "C" int MAIN__(void)
+{
+    OFFLOAD_TARGET_MAIN();
+    return 0;
+}
+#endif // HOST_LIBRARY
+
+// offload section prolog
+ALLOCATE(OFFLOAD_ENTRY_TABLE_SECTION_START)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(FuncTable::Entry)))
+#endif // TARGET_WINNT
+static FuncTable::Entry __offload_entry_table_start = { 0 };
+
+// list element for the current module
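+// (its table pointer starts one entry past the section-start marker placed
+// above, hence the "+ 1")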
+static FuncList::Node __offload_entry_node = {
+    { &__offload_entry_table_start + 1, -1 },
+    0, 0
+};
+
+// offload fp section prolog
+ALLOCATE(OFFLOAD_FUNC_TABLE_SECTION_START)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(FuncTable::Entry)))
+#endif // TARGET_WINNT
+static FuncTable::Entry __offload_func_table_start = { 0 };
+
+// list element for the current module
+static FuncList::Node __offload_func_node = {
+    { &__offload_func_table_start + 1, -1 },
+    0, 0
+};
+
+// offload fp section prolog
+ALLOCATE(OFFLOAD_VAR_TABLE_SECTION_START)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(VarTable::Entry)))
+#endif // TARGET_WINNT
+static VarTable::Entry __offload_var_table_start = { 0 };
+
+// list element for the current module
+static VarList::Node __offload_var_node = {
+    { &__offload_var_table_start + 1 },
+    0, 0
+};
+
+#ifdef MYO_SUPPORT
+
+// offload myo shared var section prolog
+ALLOCATE(OFFLOAD_MYO_SHARED_TABLE_SECTION_START)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(SharedTableEntry)))
+#endif // TARGET_WINNT
+static SharedTableEntry __offload_myo_shared_table_start = { 0 };
+
+#if HOST_LIBRARY
+// offload myo shared var init section prolog
+ALLOCATE(OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_START)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(InitTableEntry)))
+#endif // TARGET_WINNT
+static InitTableEntry __offload_myo_shared_init_table_start = { 0 };
+#endif
+
+// offload myo fptr section prolog
+ALLOCATE(OFFLOAD_MYO_FPTR_TABLE_SECTION_START)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(FptrTableEntry)))
+#endif // TARGET_WINNT
+static FptrTableEntry __offload_myo_fptr_table_start = { 0 };
+
+#endif // MYO_SUPPORT
+
+// init/fini code which adds/removes local lookup data to/from the global list
+
+static void offload_fini();
+
+#ifndef TARGET_WINNT
+static void offload_init() __attribute__((constructor(101)));
+#else // TARGET_WINNT
+static void offload_init();
+
+// Place offload initialization before user constructors
+ALLOCATE(OFFLOAD_CRTINIT_SECTION_START)
+static void (*addressof_offload_init)() = offload_init;
+#endif // TARGET_WINNT
+
+static void offload_init()
+{
+    // register offload tables
+    __offload_register_tables(&__offload_entry_node,
+                              &__offload_func_node,
+                              &__offload_var_node);
+
+#if HOST_LIBRARY
+    __offload_register_image(&__offload_target_image);
+    atexit(offload_fini);
+#endif // HOST_LIBRARY
+
+#ifdef MYO_SUPPORT
+    __offload_myoRegisterTables(
+#if HOST_LIBRARY
+        &__offload_myo_shared_init_table_start + 1,
+#endif // HOST_LIBRARY
+        &__offload_myo_shared_table_start + 1,
+        &__offload_myo_fptr_table_start + 1
+    );
+#endif // MYO_SUPPORT
+}
+
+static void offload_fini()
+{
+#if HOST_LIBRARY
+    __offload_unregister_image(&__offload_target_image);
+#endif // HOST_LIBRARY
+
+    // unregister offload tables
+    __offload_unregister_tables(&__offload_entry_node,
+                                &__offload_func_node,
+                                &__offload_var_node);
+}
diff --git a/final/offload/src/ofldend.cpp b/final/offload/src/ofldend.cpp
new file mode 100644
index 0000000..f61fe59
--- /dev/null
+++ b/final/offload/src/ofldend.cpp
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#if HOST_LIBRARY
+#include "offload_host.h"
+#include "offload_myo_host.h"
+#else
+#include "offload_target.h"
+#include "offload_myo_target.h"
+#endif
+
+#ifdef TARGET_WINNT
+#define ALLOCATE(name) __declspec(allocate(name))
+#else // TARGET_WINNT
+#define ALLOCATE(name) __attribute__((section(name)))
+#endif // TARGET_WINNT
+
+// offload entry table
+ALLOCATE(OFFLOAD_ENTRY_TABLE_SECTION_END)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(FuncTable::Entry)))
+#endif // TARGET_WINNT
+static FuncTable::Entry __offload_entry_table_end = { (const char*)-1 };
+
+// offload function table
+ALLOCATE(OFFLOAD_FUNC_TABLE_SECTION_END)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(FuncTable::Entry)))
+#endif // TARGET_WINNT
+static FuncTable::Entry __offload_func_table_end = { (const char*)-1 };
+
+// data table
+ALLOCATE(OFFLOAD_VAR_TABLE_SECTION_END)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(VarTable::Entry)))
+#endif // TARGET_WINNT
+static VarTable::Entry __offload_var_table_end = { (const char*)-1 };
+
+#ifdef MYO_SUPPORT
+
+// offload myo shared var section epilog
+ALLOCATE(OFFLOAD_MYO_SHARED_TABLE_SECTION_END)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(SharedTableEntry)))
+static SharedTableEntry __offload_myo_shared_table_end = { (const char*)-1, 0 };
+#else // TARGET_WINNT
+static SharedTableEntry __offload_myo_shared_table_end = { 0 };
+#endif // TARGET_WINNT
+
+#if HOST_LIBRARY
+// offload myo shared var init section epilog
+ALLOCATE(OFFLOAD_MYO_SHARED_INIT_TABLE_SECTION_END)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(InitTableEntry)))
+static InitTableEntry __offload_myo_shared_init_table_end = { (const char*)-1, 0 };
+#else // TARGET_WINNT
+static InitTableEntry __offload_myo_shared_init_table_end = { 0 };
+#endif // TARGET_WINNT
+#endif // HOST_LIBRARY
+
+// offload myo fptr section epilog
+ALLOCATE(OFFLOAD_MYO_FPTR_TABLE_SECTION_END)
+#ifdef TARGET_WINNT
+__declspec(align(sizeof(FptrTableEntry)))
+static FptrTableEntry __offload_myo_fptr_table_end = { (const char*)-1, 0, 0 };
+#else // TARGET_WINNT
+static FptrTableEntry __offload_myo_fptr_table_end = { 0 };
+#endif // TARGET_WINNT
+
+#endif // MYO_SUPPORT
diff --git a/final/offload/src/orsl-lite/include/orsl-lite.h b/final/offload/src/orsl-lite/include/orsl-lite.h
new file mode 100644
index 0000000..f26a335
--- /dev/null
+++ b/final/offload/src/orsl-lite/include/orsl-lite.h
@@ -0,0 +1,221 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef _ORSL_LITE_H_
+#define _ORSL_LITE_H_
+
+#ifndef TARGET_WINNT
+#include <sched.h>
+#else
+#define cpu_set_t int
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** Type of an ORSLBusySet */
+typedef enum ORSLBusySetType {
+    BUSY_SET_EMPTY = 0,     /**< Empty set */
+    BUSY_SET_PARTIAL = 1,   /**< Non-empty set that omits some threads */
+    BUSY_SET_FULL = 2       /**< A set that includes all threads on the card */
+} BusySetType;
+
+/** ORSLBusySet encapsulation */
+typedef struct ORSLBusySet {
+    BusySetType type;   /**< Set type */
+#ifdef __linux__
+    cpu_set_t cpu_set;  /**< CPU mask (meaningful only for BUSY_SET_PARTIAL
+                           sets) represented by the standard Linux CPU set
+                           type, cpu_set_t. Threads are numbered starting
+                           from 0. The maximal possible thread number is
+                           system-specific. See the CPU_SET(3) family of
+                           macros for more details. Unused in ORSL Lite. */
+#endif
+} ORSLBusySet;
+
+/** Client tag */
+typedef char* ORSLTag;
+
+/** Maximal length of tag in characters */
+#define ORSL_MAX_TAG_LEN 128
+
+/** Maximal number of cards that can be managed by ORSL */
+#define ORSL_MAX_CARDS 32
+
+/** Reserves computational resources on a set of cards. Blocks.
+ *
+ * If any of the resources cannot be reserved, this function will block until
+ * they become available. Reservation can be recursive if performed by the
+ * same tag. A recursively reserved resource must be released the same number
+ * of times it was reserved.
+ *
+ * @see ORSLTryReserve
+ *
+ * @param[in]  n      Number of cards to reserve resources on. Cannot be < 0
+ *                    or > ORSL_MAX_CARDS.
+ *
+ * @param[in]  inds   Indices of the cards: an integer array with n elements.
+ *                    Cannot be NULL if n > 0. Valid card indices are from 0
+ *                    to ORSL_MAX_CARDS-1. Cannot contain duplicate elements.
+ *
+ * @param[in]  bsets  Requested resources on each of the cards. Cannot be NULL
+ *                    if n > 0.
+ *
+ * @param[in]  tag    ORSLTag of the calling client. Cannot be NULL. Length
+ *                    must not exceed ORSL_MAX_TAG_LEN.
+ *
+ * @returns    0      if the resources were successfully reserved
+ *
+ * @returns    EINVAL if any of the arguments is invalid
+ *
+ * @returns    EAGAIN limit of recursive reservations reached
+ *                    (not in ORSL Lite)
+ *
+ * @returns    ENOSYS (in ORSL Lite) if type of any of the busy sets is
+ *                    equal to BUSY_SET_PARTIAL
+ */
+int ORSLReserve(const int n, const int *__restrict inds,
+                const ORSLBusySet *__restrict bsets,
+                const ORSLTag __restrict tag);
+
+/** Reserves computational resources on a set of cards. Does not block.
+ *
+ * If any of the resources cannot be reserved, this function will return
+ * immediately. Reservation can be recursive if performed by the same tag.
+ * A recursively reserved resource must be released the same number of times
+ * it was reserved.
+ *
+ * @see ORSLReserve
+ *
+ * @param[in]  n      Number of cards to reserve resources on. Cannot be < 0
+ *                    or > ORSL_MAX_CARDS.
+ *
+ * @param[in]  inds     Indices of the cards: an integer array with n elements.
+ *                      Cannot be NULL if n > 0. Valid card indices are from 0
+ *                      to ORSL_MAX_CARDS-1. Cannot contain duplicate elements.
+ *
+ * @param[inout] bsets  Requested resources on each of the cards. Cannot be
+ *                      NULL if n > 0.
+ *
+ * @param[in]    tag    ORSLTag of the calling client. Cannot be NULL. Length
+ *                      must not exceed ORSL_MAX_TAG_LEN.
+ *
+ * @returns      0      if the resources were successfully reserved
+ *
+ * @returns      EBUSY  if some of the requested resources are busy
+ *
+ * @returns      EINVAL if any of the arguments is invalid
+ *
+ * @returns      EAGAIN limit of recursive reservations reached
+ *                      (not in ORSL Lite)
+ *
+ * @returns      ENOSYS (in ORSL Lite) if type of any of the busy sets is
+ *                      equal to BUSY_SET_PARTIAL
+ */
+int ORSLTryReserve(const int n, const int *__restrict inds,
+                   const ORSLBusySet *__restrict bsets,
+                   const ORSLTag __restrict tag);
+
+/** Granularity of partial reservation */
+typedef enum ORSLPartialGranularity {
+    GRAN_CARD = 0, /**< Card granularity */
+    GRAN_THREAD = 1 /**< Thread granularity */
+} ORSLPartialGranularity;
+
+/** Requests reservation of some of computational resources on a set of cards.
+ * Does not block. Updates user-provided bsets to indicate which resources
+ * were reserved.
+ *
+ * If any of the resources cannot be reserved, this function will update busy
+ * sets provided by the caller to reflect what resources were actually
+ * reserved. This function supports two granularity modes: 'card' and
+ * 'thread'.  When granularity is set to 'card', a failure to reserve a thread
+ * on the card will imply that reservation has failed for the whole card. When
+ * granularity is set to 'thread', reservation on a card will be considered
+ * successful as long as at least one thread on the card was successfully
+ * reserved. Reservation can be recursive if performed by the same tag. A
+ * recursively reserved resource must be released the same number of times it
+ * was reserved.
+ *
+ * @param[in]  gran   Reservation granularity
+ *
+ * @param[in]  n      Number of cards to reserve resources on. Cannot be < 0
+ *                    or > ORSL_MAX_CARDS.
+ *
+ * @param[in]  inds   Indices of the cards: an integer array with n elements.
+ *                    Cannot be NULL if n > 0. Valid card indices are from 0
+ *                    to ORSL_MAX_CARDS-1. Cannot contain duplicate elements.
+ *
+ * @param[in]  bsets  Requested resources on each of the cards. Cannot be NULL
+ *                    if n > 0.
+ *
+ * @param[in]  tag    ORSLTag of the calling client. Cannot be NULL. Length
+ *                    must not exceed ORSL_MAX_TAG_LEN.
+ *
+ * @returns    0      if at least some of the resources were successfully
+ *                    reserved
+ *
+ * @returns    EBUSY  if all of the requested resources are busy
+ *
+ * @returns    EINVAL if any of the arguments is invalid
+ *
+ * @returns    EAGAIN limit of recursive reservations reached
+ *                    (not in ORSL Lite)
+ *
+ * @returns    ENOSYS (in ORSL Lite) if type of any of the busy sets is
+ *                    equal to BUSY_SET_PARTIAL
+ */
+int ORSLReservePartial(const ORSLPartialGranularity gran, const int n,
+                       const int *__restrict inds,
+                       ORSLBusySet *__restrict bsets,
+                       const ORSLTag __restrict tag);
+
+/** Releases previously reserved computational resources on a set of cards.
+ *
+ * This function will fail if any of the resources to be released were not
+ * reserved by the calling client.
+ *
+ * @see ORSLReserve
+ * @see ORSLTryReserve
+ * @see ORSLReservePartial
+ *
+ * @param[in]  n      Number of cards to release resources on. Cannot be < 0
+ *                    or > ORSL_MAX_CARDS.
+ *
+ * @param[in]  inds   Indices of the cards: an integer array with n elements.
+ *                    Cannot be NULL if n > 0. Valid card indices are from 0
+ *                    to ORSL_MAX_CARDS-1. Cannot contain duplicate elements.
+ *
+ * @param[in]  bsets  Resources to release on each of the cards. Cannot be NULL
+ *                    if n > 0.
+ *
+ * @param[in]  tag    ORSLTag of the calling client. Cannot be NULL. Length
+ *                    must not exceed ORSL_MAX_TAG_LEN.
+ *
+ * @returns    0      if the resources were successfully released
+ *
+ * @returns    EINVAL if any of the arguments is invalid
+ *
+ * @returns    EPERM  the calling client did not reserve some of the
+ *                    resources it is trying to release.
+ *
+ * @returns    ENOSYS (in ORSL Lite) if type of any of the busy sets is
+ *                    equal to BUSY_SET_PARTIAL
+ */
+int ORSLRelease(const int n, const int *__restrict inds,
+                const ORSLBusySet *__restrict bsets,
+                const ORSLTag __restrict tag);
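+/* Usage sketch (illustrative only; "my-client" and the work loop are
+ * hypothetical): reserve all of card 0, do the work, then release it:
+ *
+ *     int ind = 0;
+ *     ORSLBusySet bset;
+ *     bset.type = BUSY_SET_FULL;
+ *     if (ORSLReserve(1, &ind, &bset, (ORSLTag)"my-client") == 0) {
+ *         ... offloaded work ...
+ *         ORSLRelease(1, &ind, &bset, (ORSLTag)"my-client");
+ *     }
+ */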
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/final/offload/src/orsl-lite/lib/orsl-lite.c b/final/offload/src/orsl-lite/lib/orsl-lite.c
new file mode 100644
index 0000000..221cda7
--- /dev/null
+++ b/final/offload/src/orsl-lite/lib/orsl-lite.c
@@ -0,0 +1,337 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include <errno.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "orsl-lite/include/orsl-lite.h"
+
+#define DISABLE_SYMBOL_VERSIONING
+
+#if defined(__linux__) && !defined(DISABLE_SYMBOL_VERSIONING)
+#define symver(src, tgt, verstr) __asm__(".symver " #src "," #tgt verstr)
+symver(ORSLReserve0, ORSLReserve, "@@ORSL_0.0");
+symver(ORSLTryReserve0, ORSLTryReserve, "@@ORSL_0.0");
+symver(ORSLReservePartial0, ORSLReservePartial, "@@ORSL_0.0");
+symver(ORSLRelease0, ORSLRelease, "@@ORSL_0.0");
+#else
+#define ORSLReserve0 ORSLReserve
+#define ORSLTryReserve0 ORSLTryReserve
+#define ORSLReservePartial0 ORSLReservePartial
+#define ORSLRelease0 ORSLRelease
+#endif
+
+#ifdef __linux__
+#include <pthread.h>
+static pthread_mutex_t global_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t release_cond = PTHREAD_COND_INITIALIZER;
+#endif
+
+#ifdef _WIN32
+#include <windows.h>
+#pragma intrinsic(_ReadWriteBarrier)
+static SRWLOCK global_mutex = SRWLOCK_INIT;
+static volatile int release_cond_initialized = 0;
+static CONDITION_VARIABLE release_cond;
+
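+// Lazily initialize the condition variable exactly once, using
+// double-checked locking under the SRW lock.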
+static void state_lazy_init_sync()
+{
+    if (!release_cond_initialized) {
+        AcquireSRWLockExclusive(&global_mutex);
+        _ReadWriteBarrier();
+        if (!release_cond_initialized) {
+            InitializeConditionVariable(&release_cond);
+            release_cond_initialized = 1;
+        }
+        ReleaseSRWLockExclusive(&global_mutex);
+    }
+}
+#endif
+
+static int state_lock()
+{
+#ifdef __linux__
+    return pthread_mutex_lock(&global_mutex);
+#endif
+
+#ifdef _WIN32
+    AcquireSRWLockExclusive(&global_mutex);
+    return 0;
+#endif
+}
+
+static int state_unlock()
+{
+#ifdef __linux__
+    return pthread_mutex_unlock(&global_mutex);
+#endif
+
+#ifdef _WIN32
+    ReleaseSRWLockExclusive(&global_mutex);
+    return 0;
+#endif
+}
+
+static int state_wait_for_release()
+{
+#ifdef __linux__
+    return pthread_cond_wait(&release_cond, &global_mutex);
+#endif
+
+#ifdef _WIN32
+    return SleepConditionVariableSRW(&release_cond,
+            &global_mutex, INFINITE, 0) == 0 ? 1 : 0;
+#endif
+}
+
+static int state_signal_release()
+{
+#ifdef __linux__
+    return pthread_cond_signal(&release_cond);
+#endif
+
+#ifdef _WIN32
+    WakeConditionVariable(&release_cond);
+    return 0;
+#endif
+}
+
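+/* Per-card reservation state: the owning client tag and a recursive
+ * reservation count. */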
+static struct {
+    char owner[ORSL_MAX_TAG_LEN + 1];
+    unsigned long rsrv_cnt;
+} rsrv_data[ORSL_MAX_CARDS];
+
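+/* Validate the common argument contract: tag present and within length,
+ * n in range, inds/bsets non-NULL when needed, no duplicate card indices. */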
+static int check_args(const int n, const int *__restrict inds,
+                      const ORSLBusySet *__restrict bsets,
+                      const ORSLTag __restrict tag)
+{
+    int i;
+    int card_specified[ORSL_MAX_CARDS];
+    if (tag == NULL) return -1;
+    if (strlen((char *)tag) > ORSL_MAX_TAG_LEN) return -1;
+    /* n == ORSL_MAX_CARDS (all cards) is valid per the documented contract */
+    if (n < 0 || n > ORSL_MAX_CARDS) return -1;
+    if (n != 0 && (inds == NULL || bsets == NULL)) return -1;
+    for (i = 0; i < ORSL_MAX_CARDS; i++)
+        card_specified[i] = 0;
+    for (i = 0; i < n; i++) {
+        int ind = inds[i];
+        if (ind < 0 || ind >= ORSL_MAX_CARDS) return -1;
+        if (card_specified[ind]) return -1;
+        card_specified[ind] = 1;
+    }
+    return 0;
+}
+
+static int check_bsets(const int n, const ORSLBusySet *bsets)
+{
+    int i;
+    for (i = 0; i < n; i++)
+        if (bsets[i].type == BUSY_SET_PARTIAL) return -1;
+    return 0;
+}
+
+static int can_reserve_card(int card, const ORSLBusySet *__restrict bset,
+                            const ORSLTag __restrict tag)
+{
+    assert(tag != NULL);
+    assert(bset != NULL);
+    assert(strlen((char *)tag) < ORSL_MAX_TAG_LEN);
+    assert(bset->type != BUSY_SET_PARTIAL);
+
+    return (bset->type == BUSY_SET_EMPTY ||
+            ((rsrv_data[card].rsrv_cnt == 0 ||
+            strncmp((char *)tag,
+                rsrv_data[card].owner, ORSL_MAX_TAG_LEN) == 0) &&
+            rsrv_data[card].rsrv_cnt < ULONG_MAX)) ? 0 : -1;
+}
+
+static void reserve_card(int card, const ORSLBusySet *__restrict bset,
+                         const ORSLTag __restrict tag)
+{
+    assert(tag != NULL);
+    assert(bset != NULL);
+    assert(strlen((char *)tag) < ORSL_MAX_TAG_LEN);
+    assert(bset->type != BUSY_SET_PARTIAL);
+
+    if (bset->type == BUSY_SET_EMPTY)
+        return;
+
+    assert(rsrv_data[card].rsrv_cnt == 0 ||
+            strncmp((char *)tag,
+                rsrv_data[card].owner, ORSL_MAX_TAG_LEN) == 0);
+    assert(rsrv_data[card].rsrv_cnt < ULONG_MAX);
+
+    if (rsrv_data[card].rsrv_cnt == 0)
+        strncpy(rsrv_data[card].owner, (char *)tag, ORSL_MAX_TAG_LEN);
+    rsrv_data[card].owner[ORSL_MAX_TAG_LEN] = '\0';
+    rsrv_data[card].rsrv_cnt++;
+}
+
+static int can_release_card(int card, const ORSLBusySet *__restrict bset,
+                            const ORSLTag __restrict tag)
+{
+    assert(tag != NULL);
+    assert(bset != NULL);
+    assert(strlen((char *)tag) < ORSL_MAX_TAG_LEN);
+    assert(bset->type != BUSY_SET_PARTIAL);
+
+    return (bset->type == BUSY_SET_EMPTY || (rsrv_data[card].rsrv_cnt > 0 &&
+                strncmp((char *)tag,
+                    rsrv_data[card].owner, ORSL_MAX_TAG_LEN) == 0)) ? 0 : -1;
+}
+
+static void release_card(int card, const ORSLBusySet *__restrict bset,
+                         const ORSLTag __restrict tag)
+{
+    assert(tag != NULL);
+    assert(bset != NULL);
+    assert(strlen((char *)tag) < ORSL_MAX_TAG_LEN);
+    assert(bset->type != BUSY_SET_PARTIAL);
+
+    if (bset->type == BUSY_SET_EMPTY)
+        return;
+
+    assert(strncmp((char *)tag,
+                rsrv_data[card].owner, ORSL_MAX_TAG_LEN) == 0);
+    assert(rsrv_data[card].rsrv_cnt > 0);
+
+    rsrv_data[card].rsrv_cnt--;
+}
+
+int ORSLReserve0(const int n, const int *__restrict inds,
+                const ORSLBusySet *__restrict bsets,
+                const ORSLTag __restrict tag)
+{
+    int i, ok;
+
+    if (n == 0) return 0;
+    if (check_args(n, inds, bsets, tag) != 0) return EINVAL;
+    if (check_bsets(n, bsets) != 0) return ENOSYS;
+
+    state_lock();
+
+    /* Loop until we find that all the resources we want are available */
+    do {
+        ok = 1;
+        for (i = 0; i < n; i++)
+            if (can_reserve_card(inds[i], &bsets[i], tag) != 0) {
+                ok = 0;
+                /* Wait for someone to release some resources */
+                state_wait_for_release();
+                break;
+            }
+    } while (!ok);
+
+    /* At this point we are good to reserve_card the resources we want */
+    for (i = 0; i < n; i++)
+        reserve_card(inds[i], &bsets[i], tag);
+
+    state_unlock();
+    return 0;
+}
+
+int ORSLTryReserve0(const int n, const int *__restrict inds,
+                   const ORSLBusySet *__restrict bsets,
+                   const ORSLTag __restrict tag)
+{
+    int i, rc = EBUSY;
+
+    if (n == 0) return 0;
+    if (check_args(n, inds, bsets, tag) != 0) return EINVAL;
+    if (check_bsets(n, bsets) != 0) return ENOSYS;
+
+    state_lock();
+
+    /* Check resource availability once */
+    for (i = 0; i < n; i++)
+        if (can_reserve_card(inds[i], &bsets[i], tag) != 0)
+            goto bail_out;
+
+    /* At this point we are good to reserve the resources we want */
+    for (i = 0; i < n; i++)
+        reserve_card(inds[i], &bsets[i], tag);
+
+    rc = 0;
+
+bail_out:
+    state_unlock();
+    return rc;
+}
+
+int ORSLReservePartial0(const ORSLPartialGranularity gran, const int n,
+                       const int *__restrict inds, ORSLBusySet *__restrict bsets,
+                       const ORSLTag __restrict tag)
+{
+    int rc = EBUSY;
+    int i, num_avail = n;
+
+    if (n == 0) return 0;
+    if (gran != GRAN_CARD && gran != GRAN_THREAD) return EINVAL;
+    /* ORSL Lite supports card granularity only */
+    if (gran != GRAN_CARD) return EINVAL;
+    if (check_args(n, inds, bsets, tag) != 0) return EINVAL;
+    if (check_bsets(n, bsets) != 0) return ENOSYS;
+
+    state_lock();
+
+    /* Check resource availability once; remove unavailable resources from the
+     * user-provided list */
+    for (i = 0; i < n; i++)
+        if (can_reserve_card(inds[i], &bsets[i], tag) != 0) {
+            num_avail--;
+            bsets[i].type = BUSY_SET_EMPTY;
+        }
+
+    if (num_avail == 0)
+        goto bail_out;
+
+    /* At this point we are good to reserve the resources we want */
+    for (i = 0; i < n; i++)
+        reserve_card(inds[i], &bsets[i], tag);
+
+    rc = 0;
+
+bail_out:
+    state_unlock();
+    return rc;
+}
+
+int ORSLRelease0(const int n, const int *__restrict inds,
+                const ORSLBusySet *__restrict bsets,
+                const ORSLTag __restrict tag)
+{
+    int i, rc = EPERM;
+
+    if (n == 0) return 0;
+    if (check_args(n, inds, bsets, tag) != 0) return EINVAL;
+    if (check_bsets(n, bsets) != 0) return ENOSYS;
+
+    state_lock();
+
+    /* Check that we can release all the resources */
+    for (i = 0; i < n; i++)
+        if (can_release_card(inds[i], &bsets[i], tag) != 0)
+            goto bail_out;
+
+    /* At this point we are good to release the resources we want */
+    for (i = 0; i < n; i++)
+        release_card(inds[i], &bsets[i], tag);
+
+    state_signal_release();
+
+    rc = 0;
+
+bail_out:
+    state_unlock();
+    return rc;
+}
+
+/* vim:set et: */
diff --git a/final/offload/src/orsl-lite/version.txt b/final/offload/src/orsl-lite/version.txt
new file mode 100644
index 0000000..ab5f599
--- /dev/null
+++ b/final/offload/src/orsl-lite/version.txt
@@ -0,0 +1 @@
+ORSL-lite 0.7
diff --git a/final/offload/src/rdtsc.h b/final/offload/src/rdtsc.h
new file mode 100644
index 0000000..da91d71
--- /dev/null
+++ b/final/offload/src/rdtsc.h
@@ -0,0 +1,17 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
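+// Reads the x86 time-stamp counter via the RDTSC instruction (GCC-style
+// inline asm; x86/x86_64 only).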
+static inline uint64_t _rdtsc()
+{
+  uint32_t eax, edx;
+  asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
+  return ((uint64_t)edx << 32) | eax;
+}
diff --git a/final/offload/src/use_mpss2.txt b/final/offload/src/use_mpss2.txt
new file mode 100644
index 0000000..948f483
--- /dev/null
+++ b/final/offload/src/use_mpss2.txt
@@ -0,0 +1 @@
+2.1.6720-13
diff --git a/final/offload/src/use_mpss_win.txt b/final/offload/src/use_mpss_win.txt
new file mode 100644
index 0000000..948f483
--- /dev/null
+++ b/final/offload/src/use_mpss_win.txt
@@ -0,0 +1 @@
+2.1.6720-13
diff --git a/final/runtime/Build_With_CMake.txt b/final/runtime/Build_With_CMake.txt
new file mode 100644
index 0000000..1eb8a35
--- /dev/null
+++ b/final/runtime/Build_With_CMake.txt
@@ -0,0 +1,207 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+==========================================================
+How to Build the LLVM* OpenMP* Runtime Library using CMake
+==========================================================
+
+==== Version of CMake required: v2.8.0 or above ====
+ 
+============================================
+How to call cmake initially, then repeatedly
+============================================
+- When calling cmake for the first time, all needed compiler options
+  must be specified on the command line.  After this initial call to
+  cmake, the compiler definitions must not be included for further calls
+  to cmake.  Other options can be specified on the command line multiple
+  times including all definitions in the Build options section below.
+- Example of configuring, building, reconfiguring, rebuilding:
+  $ mkdir build
+  $ cd build
+  $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLIBOMP_ARCH=i386 ..  # Initial configuration
+  $ make
+  ...
+  $ make clean
+  $ cmake -DLIBOMP_ARCH=x86_64 -DCMAKE_BUILD_TYPE=Debug ..                       # Second configuration
+  $ make
+  ...
+  $ rm -rf *
+  $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLIBOMP_ARCH=x86_64 .. # Third configuration
+  $ make
+- Notice in the example how the compiler definitions are only specified
+  for an empty build directory, but other Build options are used at any time.
+- The file CMakeCache.txt which is created after the first call to cmake
+  is a configuration file which holds all the values for the Build options.
+  These configuration values can be changed using a text editor to modify 
+  CMakeCache.txt as opposed to using definitions on the command line.
+- To have cmake create a particular type of build generator file simply
+  include the -G <Generator name> option:
+  $ cmake -G "Unix Makefiles" ...
+  You can see a list of generators cmake supports by executing cmake with
+  no arguments and a list will be printed.
+
+=====================
+Instructions to Build
+=====================
+ $ cd libomp_top_level/ [ directory with src/ , exports/ , tools/ , etc. ]
+ $ mkdir build
+ $ cd build
+
+ [ Unix* Libraries ]
+ $ cmake -DCMAKE_C_COMPILER=<C Compiler> -DCMAKE_CXX_COMPILER=<C++ Compiler> ..
+
+ [ Intel(R) Many Integrated Core Library (Intel(R) MIC Library) ]
+ $ cmake -DCMAKE_C_COMPILER=<C Compiler> -DCMAKE_CXX_COMPILER=<C++ Compiler> -DLIBOMP_ARCH=mic ..
+
+ [ Windows Libraries ]
+ $ cmake -G <Generator Type> -DCMAKE_C_COMPILER=<C Compiler> -DCMAKE_CXX_COMPILER=<C++ Compiler> -DCMAKE_ASM_MASM_COMPILER=[ml | ml64] -DCMAKE_BUILD_TYPE=Release ..
+
+ $ make
+ $ make install
+
+==================
+Mac* Fat Libraries
+==================
+On OS X* machines, it is possible to build universal (or fat) libraries which
+include both i386 and x86_64 architecture objects in a
+single archive.
+ $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_OSX_ARCHITECTURES='i386;x86_64' ..
+ $ make
+There is also a -DLIBOMP_OSX_ARCHITECTURES option for builds inside an LLVM
+source tree: it makes only the libomp library a universal fat library,
+without forcing the entire llvm/clang build to produce universal binaries.
+
+===========
+Micro tests
+===========
+After the library has been built, there are five optional microtests that
+can be performed.  Some will be skipped depending on the platform.
+To run the tests,
+$ make libomp-micro-tests
+
+=============
+CMake options
+=============
+-DCMAKE_C_COMPILER=<C compiler name>
+Specify the C compiler
+
+-DCMAKE_CXX_COMPILER=<C++ compiler name>
+Specify the C++ compiler
+
+-DCMAKE_Fortran_COMPILER=<Fortran compiler name>
+This option is only needed when -DLIBOMP_FORTRAN_MODULES is on.
+So typically, a Fortran compiler is not needed during the build.
+Specify the Fortran compiler
+
+-DCMAKE_ASM_MASM_COMPILER=[ml | ml64 ]
+This option is Windows* Only
+
+-DLIBOMP_ARCH=i386|x86_64|arm|ppc64|ppc64le|aarch64|mic
+The default for this option is chosen by probing the compiler for
+architecture macros (e.g., is __x86_64__ predefined by the compiler?).
+
+==== First values listed are the default value ====
+-DLIBOMP_LIB_TYPE=normal|profile|stubs
+Library type can be normal, profile, or stubs.
+
+-DCMAKE_BUILD_TYPE=Release|Debug|RelWithDebInfo
+Build type can be Release, Debug, or RelWithDebInfo.
+
+-DLIBOMP_VERSION=5|4
+libomp major version can be 5 or 4.
+
+-DLIBOMP_OMP_VERSION=41|40|30
+OpenMP version can be either 41, 40 or 30.
+
+-DLIBOMP_MIC_ARCH=knc|knf
+This value is ignored if LIBOMP_ARCH != mic
+Intel(R) MIC Architecture, can be knf or knc.
+  
+-DLIBOMP_FORTRAN_MODULES=off|on
+Should the Fortran modules be created (requires Fortran compiler)
+
+-DLIBOMP_USE_ADAPTIVE_LOCKS=on|off       
+Should adaptive (TSX-based) locks be included?  
+These are x86 specific.  This feature is turned on by default 
+for i386 and x86_64.  Otherwise, it is turned off.
+
+-DLIBOMP_USE_INTERNODE_ALIGNMENT=off|on
+Should 4096-byte alignment be used for certain data structures?
+This option is useful on multinode systems where a small CACHE_LINE
+setting leads to false sharing.  This option is off by default.
+
+-DLIBOMP_USE_VERSION_SYMBOLS=on|off
+Should versioned symbols be used for building the library?
+This option only makes sense for ELF based libraries where version
+symbols are supported (Linux, some BSD* variants).  It is off
+by default for Windows and Mac, but on for other Unix based operating
+systems.
+
+-DLIBOMP_OMPT_SUPPORT=off|on
+Should OMPT support be included in the build? (Not supported on Windows)
+If LIBOMP_OMPT_SUPPORT is off, then both ompt_blame and ompt_trace are ignored.
+
+-DLIBOMP_OMPT_BLAME=on|off
+Should OMPT blame functionality be included in the build?
+
+-DLIBOMP_OMPT_TRACE=on|off
+Should OMPT trace functionality be included in the build?
+
+-DLIBOMP_STATS=off|on
+Should stats-gathering code be included in the build?
+
+-DLIBOMP_USE_DEBUGGER=off|on
+Should the friendly debugger interface be included in the build?
+
+================================
+How to append flags to the build
+================================
+- These flags are *appended*.  They do not 
+  overwrite any of the preset flags.
+-DLIBOMP_CPPFLAGS=<space-separated flags> -- Additional C preprocessor flags
+-DLIBOMP_CFLAGS=<space-separated flags>   -- Additional C compiler flags
+-DLIBOMP_CXXFLAGS=<space-separated flags> -- Additional C++ compiler flags
+-DLIBOMP_ASMFLAGS=<space-separated flags> -- Additional assembly flags
+-DLIBOMP_LDFLAGS=<space-separated flags>  -- Additional linker flags
+-DLIBOMP_LIBFLAGS=<space-separated flags> -- Additional libraries to link
+-DLIBOMP_FFLAGS=<space-separated flags>   -- Additional Fortran compiler flags
+
+=======================
+Example usages of CMake
+=======================
+---- Typical usage ----
+cmake -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc ..
+cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ ..
+cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
+
+---- With Various Options ----
+- Build the i386 Linux library using GCC*
+cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLIBOMP_ARCH=i386 ..
+
+- Build the x86_64 debug Mac library using Clang*
+cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DLIBOMP_ARCH=x86_64 -DCMAKE_BUILD_TYPE=Debug ..
+
+- Build the library (architecture determined by probing the compiler) using the
+  Intel(R) C Compiler and the Intel(R) C++ Compiler. Also, create the Fortran
+  modules using the Intel(R) Fortran Compiler.
+cmake -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DCMAKE_Fortran_COMPILER=ifort -DLIBOMP_FORTRAN_MODULES=on ..
+
+- Have CMake Find the C/C++ compiler, and specify additional flags for the C compiler, preprocessor, and C++ compiler.
+cmake -DLIBOMP_CFLAGS='-specific-flag' -DLIBOMP_CPPFLAGS='-DNEW_FEATURE=1 -DOLD_FEATURE=0' -DLIBOMP_CXXFLAGS='--one-specific-flag --two-specific-flag' ..
+
+---- Build the stubs library ----
+cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLIBOMP_LIB_TYPE=stubs ..
+
+=========
+Footnotes
+=========
+[*] Other names and brands may be claimed as the property of others.
diff --git a/final/runtime/CMakeLists.txt b/final/runtime/CMakeLists.txt
new file mode 100644
index 0000000..3371248
--- /dev/null
+++ b/final/runtime/CMakeLists.txt
@@ -0,0 +1,311 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# CMAKE libomp
+cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
+
+# Add cmake directory to search for custom cmake functions
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+
+# Standalone build or part of LLVM?
+set(LIBOMP_STANDALONE_BUILD FALSE)
+if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}" OR
+   "${CMAKE_SOURCE_DIR}/runtime" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
+  project(libomp C CXX)
+  set(LIBOMP_STANDALONE_BUILD TRUE)
+endif()
+
+# These include files are in the cmake/ subdirectory
+include(LibompUtils)
+include(LibompGetArchitecture)
+include(LibompHandleFlags)
+include(LibompDefinitions)
+
+# Determine the target architecture
+if(${LIBOMP_STANDALONE_BUILD})
+  # If adding a new architecture, take a look at cmake/LibompGetArchitecture.cmake
+  libomp_get_architecture(LIBOMP_DETECTED_ARCH)
+  set(LIBOMP_ARCH ${LIBOMP_DETECTED_ARCH} CACHE STRING
+    "The architecture to build for (x86_64/i386/arm/ppc64/ppc64le/aarch64/mic).")
+  # Allow user to choose a suffix for the installation directory.
+  set(LIBOMP_LIBDIR_SUFFIX "" CACHE STRING
+    "suffix of lib installation directory e.g., 64 => lib64")
+  # Should assertions be enabled?  They are on by default.
+  set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL
+    "enable assertions?")
+  set(LIBOMP_ENABLE_WERROR FALSE CACHE BOOL
+    "Enable -Werror flags to turn warnings into errors for supporting compilers.")
+  # CMAKE_BUILD_TYPE was not defined, set default to Release
+  if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+  endif()
+else() # Part of LLVM build
+  # Determine the native architecture from LLVM.
+  string(TOLOWER "${LLVM_TARGET_ARCH}" LIBOMP_NATIVE_ARCH)
+  if( LIBOMP_NATIVE_ARCH STREQUAL "host" )
+    string(REGEX MATCH "^[^-]*" LIBOMP_NATIVE_ARCH ${LLVM_HOST_TRIPLE})
+  endif ()
+  if(LIBOMP_NATIVE_ARCH MATCHES "i[2-6]86")
+    set(LIBOMP_ARCH i386)
+  elseif(LIBOMP_NATIVE_ARCH STREQUAL "x86")
+    set(LIBOMP_ARCH i386)
+  elseif(LIBOMP_NATIVE_ARCH STREQUAL "amd64")
+    set(LIBOMP_ARCH x86_64)
+  elseif(LIBOMP_NATIVE_ARCH STREQUAL "x86_64")
+    set(LIBOMP_ARCH x86_64)
+  elseif(LIBOMP_NATIVE_ARCH MATCHES "powerpc")
+    set(LIBOMP_ARCH ppc64)
+  elseif(LIBOMP_NATIVE_ARCH MATCHES "aarch64")
+    set(LIBOMP_ARCH aarch64)
+  elseif(LIBOMP_NATIVE_ARCH MATCHES "arm64")
+    set(LIBOMP_ARCH aarch64)
+  elseif(LIBOMP_NATIVE_ARCH MATCHES "arm")
+    set(LIBOMP_ARCH arm)
+  else()
+    # last ditch effort
+    libomp_get_architecture(LIBOMP_ARCH)
+  endif ()
+  set(LIBOMP_LIBDIR_SUFFIX ${LLVM_LIBDIR_SUFFIX})
+  set(LIBOMP_ENABLE_ASSERTIONS ${LLVM_ENABLE_ASSERTIONS})
+  set(LIBOMP_ENABLE_WERROR ${LLVM_ENABLE_WERROR})
+endif()
+libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc64 ppc64le aarch64 mic)
+
+set(LIBOMP_LIB_TYPE normal CACHE STRING
+  "Performance,Profiling,Stubs library (normal/profile/stubs)")
+libomp_check_variable(LIBOMP_LIB_TYPE normal profile stubs)
+set(LIBOMP_VERSION 5 CACHE STRING
+  "Produce libguide (version 4) or libomp (version 5)")
+set(LIBOMP_OMP_VERSION 41 CACHE STRING
+  "The OpenMP version (41/40/30)")
+libomp_check_variable(LIBOMP_OMP_VERSION 41 40 30)
+set(LIBOMP_MIC_ARCH knc CACHE STRING
+  "Intel(R) Many Integrated Core Architecture (Intel(R) MIC Architecture) (knf/knc).  Ignored if not Intel(R) MIC Architecture build.")
+if("${LIBOMP_ARCH}" STREQUAL "mic")
+  libomp_check_variable(LIBOMP_MIC_ARCH knf knc)
+endif()
+set(LIBOMP_FORTRAN_MODULES FALSE CACHE BOOL
+  "Create Fortran module files? (requires fortran compiler)")
+
+# - Support for universal fat binary builds on Mac
+# - Having this extra variable allows people to build this library as a universal library
+#   without forcing a universal build of the llvm/clang compiler.
+set(LIBOMP_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}" CACHE STRING
+  "For Mac builds, semicolon separated list of architectures to build for universal fat binary.")
+set(CMAKE_OSX_ARCHITECTURES ${LIBOMP_OSX_ARCHITECTURES})
+
+# User specified flags.  These are appended to the configured flags.
+set(LIBOMP_CFLAGS "" CACHE STRING
+  "Appended user specified C compiler flags.")
+set(LIBOMP_CXXFLAGS "" CACHE STRING
+  "Appended user specified C++ compiler flags.")
+set(LIBOMP_CPPFLAGS "" CACHE STRING
+  "Appended user specified C preprocessor flags.")
+set(LIBOMP_ASMFLAGS "" CACHE STRING
+  "Appended user specified assembler flags.")
+set(LIBOMP_LDFLAGS "" CACHE STRING
+  "Appended user specified linker flags.")
+set(LIBOMP_LIBFLAGS "" CACHE STRING
+  "Appended user specified linked libs flags. (e.g., -lm)")
+set(LIBOMP_FFLAGS "" CACHE STRING
+  "Appended user specified Fortran compiler flags.  These are only used if LIBOMP_FORTRAN_MODULES==TRUE.")
+
+# Should the libomp library and generated headers be copied into the original source exports/ directory
+# Turning this to FALSE helps parallel builds avoid interfering with each other.
+# Currently, the testsuite module expects the just built OpenMP library to be located inside the exports/
+# directory.  TODO: have testsuite run under llvm-lit directly.  We can then get rid of copying to exports/
+set(LIBOMP_COPY_EXPORTS TRUE CACHE STRING
+  "Should exports be copied into source exports/ directory?")
+
+# Get the build number from kmp_version.c
+libomp_get_build_number("${CMAKE_CURRENT_SOURCE_DIR}" LIBOMP_BUILD_NUMBER)
+
+# Currently don't record any timestamps
+set(LIBOMP_DATE "No_Timestamp")
+
+# Architecture
+set(IA32 FALSE)
+set(INTEL64 FALSE)
+set(ARM FALSE)
+set(AARCH64 FALSE)
+set(PPC64BE FALSE)
+set(PPC64LE FALSE)
+set(PPC64 FALSE)
+set(MIC FALSE)
+if("${LIBOMP_ARCH}" STREQUAL "i386" OR "${LIBOMP_ARCH}" STREQUAL "32")    # IA-32 architecture
+  set(IA32 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "x86_64" OR "${LIBOMP_ARCH}" STREQUAL "32e") # Intel(R) 64 architecture
+  set(INTEL64 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "arm") # ARM architecture
+  set(ARM TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "ppc64") # PPC64BE architecture
+  set(PPC64BE TRUE)
+  set(PPC64 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "ppc64le") # PPC64LE architecture
+  set(PPC64LE TRUE)
+  set(PPC64 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "aarch64") # AARCH64 architecture
+  set(AARCH64 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "mic") # Intel(R) Many Integrated Core Architecture
+  set(MIC TRUE)
+endif()
+
+# Set some flags based on build_type
+set(RELEASE_BUILD FALSE)
+set(DEBUG_BUILD FALSE)
+set(RELWITHDEBINFO_BUILD FALSE)
+set(MINSIZEREL_BUILD FALSE)
+string(TOLOWER "${CMAKE_BUILD_TYPE}" libomp_build_type_lowercase)
+if("${libomp_build_type_lowercase}" STREQUAL "release")
+  set(RELEASE_BUILD TRUE)
+elseif("${libomp_build_type_lowercase}" STREQUAL "debug")
+  set(DEBUG_BUILD TRUE)
+elseif("${libomp_build_type_lowercase}" STREQUAL "relwithdebinfo")
+  set(RELWITHDEBINFO_BUILD TRUE)
+elseif("${libomp_build_type_lowercase}" STREQUAL "minsizerel")
+  set(MINSIZEREL_BUILD TRUE)
+endif()
+
+# Include itt notify interface? Right now, always.
+set(LIBOMP_USE_ITT_NOTIFY TRUE)
+
+# normal, profile, stubs library.
+set(NORMAL_LIBRARY FALSE)
+set(STUBS_LIBRARY FALSE)
+set(PROFILE_LIBRARY FALSE)
+if("${LIBOMP_LIB_TYPE}" STREQUAL "normal")
+  set(NORMAL_LIBRARY TRUE)
+elseif("${LIBOMP_LIB_TYPE}" STREQUAL "profile")
+  set(PROFILE_LIBRARY TRUE)
+elseif("${LIBOMP_LIB_TYPE}" STREQUAL "stubs")
+  set(STUBS_LIBRARY TRUE)
+endif()
+
+# Setting directory names
+set(LIBOMP_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(LIBOMP_SRC_DIR ${LIBOMP_BASE_DIR}/src)
+set(LIBOMP_TOOLS_DIR ${LIBOMP_BASE_DIR}/tools)
+set(LIBOMP_INC_DIR ${LIBOMP_SRC_DIR}/include/${LIBOMP_OMP_VERSION})
+
+# Enabling Fortran if it is needed
+if(${LIBOMP_FORTRAN_MODULES})
+  enable_language(Fortran)
+endif()
+# Enable MASM Compiler if it is needed (Windows only)
+if(WIN32)
+  enable_language(ASM_MASM)
+endif()
+
+# Getting legal type/arch
+libomp_get_legal_type(LIBOMP_LEGAL_TYPE)
+libomp_get_legal_arch(LIBOMP_LEGAL_ARCH)
+
+# Compiler flag checks, library checks, threading check, etc.
+include(config-ix)
+
+# Is there a quad precision data type available?
+# TODO: Make this a real feature check
+set(LIBOMP_USE_QUAD_PRECISION "${LIBOMP_HAVE_QUAD_PRECISION}" CACHE BOOL
+  "Should 128-bit precision entry points be built?")
+if(LIBOMP_USE_QUAD_PRECISION AND (NOT LIBOMP_HAVE_QUAD_PRECISION))
+  libomp_error_say("128-bit quad precision functionality requested but not available")
+endif()
+
+# libgomp drop-in compatibility requires versioned symbols
+set(LIBOMP_USE_VERSION_SYMBOLS "${LIBOMP_HAVE_VERSION_SYMBOLS}" CACHE BOOL
+  "Should version symbols be used? These provide binary compatibility with libgomp.")
+if(LIBOMP_USE_VERSION_SYMBOLS AND (NOT LIBOMP_HAVE_VERSION_SYMBOLS))
+  libomp_error_say("Version symbols functionality requested but not available")
+endif()
+
+# On multinode systems, larger alignment is desired to avoid false sharing
+set(LIBOMP_USE_INTERNODE_ALIGNMENT FALSE CACHE BOOL
+  "Should larger alignment (4096 bytes) be used for some locks and data structures?")
+
+# Build code that allows the OpenMP library to conveniently interface with debuggers
+set(LIBOMP_USE_DEBUGGER FALSE CACHE BOOL
+  "Enable debugger interface code?")
+
+# Should we link to C++ library?
+set(LIBOMP_USE_STDCPPLIB FALSE CACHE BOOL
+  "Should we link to C++ library?")
+
+# TSX-based (x86) locks have __asm code which can be troublesome for some compilers.
+# TODO: Make this a real feature check
+set(LIBOMP_USE_ADAPTIVE_LOCKS "${LIBOMP_HAVE_ADAPTIVE_LOCKS}" CACHE BOOL
+  "Should TSX-based locks be compiled (adaptive locks in kmp_lock.cpp)?  These are x86-specific.")
+if(LIBOMP_USE_ADAPTIVE_LOCKS AND (NOT LIBOMP_HAVE_ADAPTIVE_LOCKS))
+  libomp_error_say("Adaptive locks (TSX) functionality requested but not available")
+endif()
+
+# Stats-gathering enables OpenMP statistics: things like the number of parallel
+# regions and the clock ticks spent in particular OpenMP regions are recorded.
+# TODO: Make this a real feature check
+set(LIBOMP_STATS FALSE CACHE BOOL
+  "Stats-Gathering functionality?")
+if(LIBOMP_STATS AND (NOT LIBOMP_HAVE_STATS))
+  libomp_error_say("Stats-gathering functionality requested but not available")
+endif()
+
+# OMPT-support
+# TODO: Make this a real feature check
+set(LIBOMP_OMPT_SUPPORT FALSE CACHE BOOL
+  "OMPT-support?")
+set(LIBOMP_OMPT_BLAME TRUE CACHE BOOL
+  "OMPT-blame?")
+set(LIBOMP_OMPT_TRACE TRUE CACHE BOOL
+  "OMPT-trace?")
+if(LIBOMP_OMPT_SUPPORT AND (NOT LIBOMP_HAVE_OMPT_SUPPORT))
+  libomp_error_say("OpenMP Tools Interface requested but not available")
+endif()
+
+# Setting final library name
+set(LIBOMP_DEFAULT_LIB_NAME libomp)
+if(${PROFILE_LIBRARY})
+  set(LIBOMP_DEFAULT_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME}prof)
+endif()
+if(${STUBS_LIBRARY})
+  set(LIBOMP_DEFAULT_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME}stubs)
+endif()
+set(LIBOMP_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME} CACHE STRING "Base OMP library name")
+set(LIBOMP_LIB_FILE ${LIBOMP_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
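+# For example, on Linux (where CMAKE_SHARED_LIBRARY_SUFFIX is ".so") a normal
+# build yields LIBOMP_LIB_FILE = libomp.so and a stubs build yields libompstubs.so.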
+
+# Print configuration after all variables are set.
+if(${LIBOMP_STANDALONE_BUILD})
+  libomp_say("Operating System     -- ${CMAKE_SYSTEM_NAME}")
+  libomp_say("Target Architecture  -- ${LIBOMP_ARCH}")
+  if(${MIC})
+    libomp_say("Intel(R) MIC Architecture    -- ${LIBOMP_MIC_ARCH}")
+  endif()
+  libomp_say("Build Type           -- ${CMAKE_BUILD_TYPE}")
+  libomp_say("OpenMP Version       -- ${LIBOMP_OMP_VERSION}")
+  libomp_say("Lib Type             -- ${LIBOMP_LIB_TYPE}")
+  libomp_say("Fortran Modules      -- ${LIBOMP_FORTRAN_MODULES}")
+  # Will report "Development" if the build number is all zeros
+  if(${LIBOMP_BUILD_NUMBER} STREQUAL 00000000)
+    set(LIBOMP_BUILD Development)
+  else()
+    set(LIBOMP_BUILD ${LIBOMP_BUILD_NUMBER})
+  endif()
+  libomp_say("Build                -- ${LIBOMP_BUILD}")
+  libomp_say("Use Stats-gathering  -- ${LIBOMP_STATS}")
+  libomp_say("Use Debugger-support -- ${LIBOMP_USE_DEBUGGER}")
+  libomp_say("Use OMPT-support     -- ${LIBOMP_OMPT_SUPPORT}")
+  if(${LIBOMP_OMPT_SUPPORT})
+    libomp_say("Use OMPT-blame       -- ${LIBOMP_OMPT_BLAME}")
+    libomp_say("Use OMPT-trace       -- ${LIBOMP_OMPT_TRACE}")
+  endif()
+  libomp_say("Use Adaptive locks   -- ${LIBOMP_USE_ADAPTIVE_LOCKS}")
+  libomp_say("Use quad precision   -- ${LIBOMP_USE_QUAD_PRECISION}")
+endif()
+
+add_subdirectory(src)
+
diff --git a/final/runtime/Makefile b/final/runtime/Makefile
new file mode 100644
index 0000000..5f025b4
--- /dev/null
+++ b/final/runtime/Makefile
@@ -0,0 +1,78 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+omp_root?=.
+include $(omp_root)/tools/common.inc
+.PHONY: default all omp
+
+default: omp
+
+all: omp stubs
+
+omp: info mkdir
+	@echo $(omp_root)/tools/build.pl $(build_args) --arch=$(arch) --mode=$(mode) lib inc common -- -j$(jobs)
+	$(omp_root)/tools/build.pl $(build_args) --arch=$(arch) --mode=$(mode) lib inc common -- -j$(jobs)
+
+stubs: mkdir
+	@echo $(omp_root)/tools/build.pl $(build_args) --arch=$(arch) --mode=$(mode) --stubs lib inc common -- -j$(jobs)
+	$(omp_root)/tools/build.pl $(build_args) --arch=$(arch) --mode=$(mode) --stubs lib inc common -- -j$(jobs)
+
+.PHONY: clean info
+
+clean:
+	$(shell $(RM) -rf $(omp_root)$(SLASH)tmp)
+	@echo clean done
+
+mkdir:
+	$(shell $(MD) $(omp_root)$(SLASH)tmp >$(NUL) 2>$(NUL))
+	@echo Created $(omp_root)$(SLASH)tmp directory
+
+info:
+	@echo omp_root=$(omp_root)
+	@echo omp_os=$(omp_os)
+	@echo arch=$(arch)
+ifeq "$(arch)" "mic"
+	@echo mic_arch=$(mic_arch)	
+endif
+	@echo compiler=$(compiler)	
+	@echo mic=$(mic)
+	@echo mode=$(mode)
+	@echo jobs=$(jobs)
+
+
+
+libomp_path=$(shell $(omp_root)/tools/check-openmp.pl)
+test_path=$(shell $(omp_root)/tools/check-openmp-test.pl)
+
+# Please do not change this rule.
+# -------------------------------
+# Intentionally change directory into "testsuite" so that output results and errors
+# are generated there, keeping this directory clean.
+test: omp	
+	@$(Verb) if which llvm-lit &> /dev/null; then \
+        if [ -d "$(omp_root)$(SLASH)..$(SLASH)testsuite$(SLASH)LLVM-IR" ] ; then \
+			export TESTSUITE_TEMP=$(realpath $(omp_root))$(SLASH)tmp ; \
+			export LIBRARY_PATH=$(libomp_path):$(LIBRARY_PATH) ; \
+			export C_INCLUDE_PATH=$(libomp_path)$(SLASH)..$(SLASH)..$(SLASH)common$(SLASH)include:$(C_INCLUDE_PATH) ; \
+			export LD_LIBRARY_PATH=$(libomp_path):$(LD_LIBRARY_PATH) ; \
+			export DYLD_LIBRARY_PATH=$(libomp_path):$(DYLD_LIBRARY_PATH) ; \
+			cd $(omp_root)$(SLASH)..$(SLASH)testsuite ; \
+			make ctest ; \
+			python adding_xfails.py ; \
+			llvm-lit -j 1 $(realpath $(omp_root))$(SLASH)..$(SLASH)testsuite$(SLASH)LLVM-IR$(SLASH)$(test_path) -v ; \
+        else \
+        	echo "No test directory" ; exit 1; \
+        fi; else echo "No llvm-lit in $(PATH)"; exit 1; fi
+
+test-clean:
+	make -C ..$(SLASH)testsuite cleanall
+
+
diff --git a/final/runtime/README.txt b/final/runtime/README.txt
new file mode 100644
index 0000000..3b594bd
--- /dev/null
+++ b/final/runtime/README.txt
@@ -0,0 +1,143 @@
+
+               README for the LLVM* OpenMP* Runtime Library
+               ============================================
+
+How to Build Documentation
+==========================
+
+The main documentation is in Doxygen* format, and this distribution
+should come with pre-built PDF documentation in doc/Reference.pdf.  
+However, an HTML version can be built by executing:
+
+% doxygen doc/doxygen/config 
+
+in the runtime directory.
+
+That will produce HTML documentation in the doc/doxygen/generated
+directory, which can be accessed by pointing a web browser at the
+index.html file there.
+
+If you don't have Doxygen installed, you can download it from
+www.doxygen.org.
+
+
+How to Build the LLVM* OpenMP* Runtime Library
+==============================================
+
+The library can be built either using CMake, or using a makefile that
+in turn invokes various Perl scripts. For porting, for non-x86
+architectures, and for those already familiar with CMake, that may be
+an easier route to take than the one described here.
+
+Building with CMake
+===================
+The runtime/Build_With_CMake.txt file has a description of how to
+build with CMake.
+
+Building with the Makefile
+==========================
+The Makefile at the top-level will attempt to detect what it needs to
+build the LLVM* OpenMP* Runtime Library.  To see the default settings, 
+type:
+
+make info
+
+You can change the Makefile's behavior with the following options:
+
+omp_root:    The path to the top-level directory containing the top-level
+	     Makefile.  By default, this will take on the value of the 
+	     current working directory.
+
+omp_os:      Operating system.  By default, the build will attempt to 
+	     detect this. Currently supports "linux", "freebsd", "macos", and
+	     "windows".
+
+arch:        Architecture. By default, the build will attempt to 
+	     detect this if not specified by the user. Currently 
+	     supported values are
+                 "32" for IA-32 architecture 
+                 "32e" for Intel(R) 64 architecture
+                 "mic" for Intel(R) Many Integrated Core Architecture
+                 "arm" for ARM* architecture
+                 "aarch64" for Aarch64 (64-bit ARM) architecture
+                 "ppc64" for IBM(R) Power architecture (big endian)
+                 "ppc64le" for IBM(R) Power architecture (little endian)
+
+             If "mic" is specified then "icc" will be used as the
+	     compiler, and appropriate k1om binutils will be used. The
+	     necessary packages must be installed on the build machine
+	     for this to be possible (but an Intel(R) Xeon Phi(TM)
+	     coprocessor card is not required to build the library).
+
+compiler:    Which compiler to use for the build.  Defaults to "icc" 
+	     or "icl" depending on the value of omp_os. Also supports 
+	     some versions of "gcc"* when omp_os is "linux". The selected 
+	     compiler should be installed and in the user's path. The 
+	     corresponding Fortran compiler should also be in the path. 
+	     See "Supported RTL Build Configurations" below for more 
+	     information on compiler versions.
+
+mode:        Library mode: default is "release".  Also supports "debug".
+
+jobs:        The number of parallel jobs for the underlying call to make.
+             This value is sent as the parameter to the -j flag for make.
+             This value defaults to "1", but can be set to any positive integer.
+
+To use any of the options above, simply add <option_name>=<value>.  For 
+example, if you want to build with gcc instead of icc, type:
+
+make compiler=gcc
+
+On OS X* machines, it is possible to build universal (or fat) libraries which
+include both IA-32 architecture and Intel(R) 64 architecture objects in a
+single archive; just build the 32 and 32e libraries separately, then invoke 
+make again with a special argument as follows:
+
+make compiler=clang build_args=fat
+
+Supported RTL Build Configurations
+==================================
+
+Supported Architectures: IA-32 architecture, Intel(R) 64, and 
+Intel(R) Many Integrated Core Architecture
+
+              ----------------------------------------------
+              |   icc/icl     |    gcc      |   clang      |
+--------------|---------------|-------------|--------------|
+| Linux* OS   |   Yes(1,5)    |  Yes(2,4)   | Yes(4,6,7)   |
+| FreeBSD*    |   No          |  No         | Yes(4,6,7,8) |
+| OS X*       |   Yes(1,3,4)  |  No         | Yes(4,6,7)   |
+| Windows* OS |   Yes(1,4)    |  No         | No           |
+------------------------------------------------------------
+
+(1) On IA-32 architecture and Intel(R) 64, icc/icl versions 12.x are 
+    supported (12.1 is recommended).
+(2) GCC* version 4.6.2 is supported.
+(3) For icc on OS X*, OS X* version 10.5.8 is supported.
+(4) Intel(R) Many Integrated Core Architecture not supported.
+(5) On Intel(R) Many Integrated Core Architecture, icc/icl versions 13.0 
+    or later are required.
+(6) Clang* version 3.3 is supported.
+(7) Clang* currently does not offer a software-implemented 128-bit extended 
+    precision type.  Thus, all entry points reliant on this type are removed
+    from the library and cannot be called in the user program.  The following
+    functions are not available:
+    __kmpc_atomic_cmplx16_*
+    __kmpc_atomic_float16_*
+    __kmpc_atomic_*_fp
+(8) Community contribution provided AS IS, not tested by Intel.
+
+Front-end Compilers that work with this RTL
+===========================================
+
+The following compilers are known to do compatible code generation for
+this RTL: clang (from the OpenMP development branch at
+http://clang-omp.github.io/ ), Intel compilers, GCC.  See the documentation
+for more details.
+
+-----------------------------------------------------------------------
+
+Notices
+=======
+
+*Other names and brands may be claimed as the property of others.
diff --git a/final/runtime/cmake/LibompCheckFortranFlag.cmake b/final/runtime/cmake/LibompCheckFortranFlag.cmake
new file mode 100644
index 0000000..c37b3ad
--- /dev/null
+++ b/final/runtime/cmake/LibompCheckFortranFlag.cmake
@@ -0,0 +1,73 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Checking a Fortran compiler flag
+# There is no truly trivial way to do this in CMake, so we implement it here.
+# This will set ${boolean} = TRUE if the flag succeeds, otherwise FALSE.
+function(libomp_check_fortran_flag flag boolean)
+  if(NOT DEFINED "${boolean}")
+    set(retval TRUE)
+    set(fortran_source
+"      program hello
+           print *, \"Hello World!\"
+      end program hello")
+
+  set(failed_regexes "[Ee]rror;[Uu]nknown;[Ss]kipping")
+  if(CMAKE_VERSION VERSION_GREATER 3.1 OR CMAKE_VERSION VERSION_EQUAL 3.1)
+    include(CheckFortranSourceCompiles)
+    check_fortran_source_compiles("${fortran_source}" ${boolean} FAIL_REGEX "${failed_regexes}")
+    set(${boolean} ${${boolean}} PARENT_SCOPE)
+    return()
+  else()
+    # Our manual check for cmake versions that don't have CheckFortranSourceCompiles
+    set(base_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/fortran_flag_check)
+    file(MAKE_DIRECTORY ${base_dir})
+    file(WRITE ${base_dir}/fortran_source.f "${fortran_source}")
+
+    message(STATUS "Performing Test ${boolean}")
+    execute_process(
+      COMMAND ${CMAKE_Fortran_COMPILER} "${flag}" ${base_dir}/fortran_source.f
+      WORKING_DIRECTORY ${base_dir}
+      RESULT_VARIABLE exit_code
+      OUTPUT_VARIABLE OUTPUT
+      ERROR_VARIABLE OUTPUT
+    )
+
+    if(${exit_code} EQUAL 0)
+      foreach(regex IN LISTS failed_regexes)
+        if("${OUTPUT}" MATCHES ${regex})
+          set(retval FALSE)
+        endif()
+      endforeach()
+    else()
+      set(retval FALSE)
+    endif()
+
+    if(${retval})
+      set(${boolean} 1 CACHE INTERNAL "Test ${boolean}")
+      message(STATUS "Performing Test ${boolean} - Success")
+      file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+        "Performing Fortran Compiler Flag test ${boolean} succeeded with the following output:\n"
+        "${OUTPUT}\n"
+        "Source file was:\n${fortran_source}\n")
+    else()
+      set(${boolean} "" CACHE INTERNAL "Test ${boolean}")
+      message(STATUS "Performing Test ${boolean} - Failed")
+      file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+        "Performing Fortran Compiler Flag test ${boolean} failed with the following output:\n"
+        "${OUTPUT}\n"
+        "Source file was:\n${fortran_source}\n")
+    endif()
+  endif()
+
+  set(${boolean} ${retval} PARENT_SCOPE)
+  endif()
+endfunction()
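+
+# Usage sketch (hypothetical caller; the actual checks live in config-ix.cmake):
+#   libomp_check_fortran_flag(-m32 LIBOMP_HAVE_M32_FORTRAN_FLAG)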
diff --git a/final/runtime/cmake/LibompCheckLinkerFlag.cmake b/final/runtime/cmake/LibompCheckLinkerFlag.cmake
new file mode 100644
index 0000000..75a38e3
--- /dev/null
+++ b/final/runtime/cmake/LibompCheckLinkerFlag.cmake
@@ -0,0 +1,68 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Checking a linker flag to build a shared library
+# There is no truly trivial way to do this in CMake, so we implement it here.
+# This will set ${boolean} = TRUE if the flag succeeds, otherwise FALSE.
+function(libomp_check_linker_flag flag boolean)
+  if(NOT DEFINED "${boolean}")
+  set(retval TRUE)
+  set(library_source
+    "int foo(int a) { return a*a; }")
+  set(cmake_source
+    "cmake_minimum_required(VERSION 2.8)
+     project(foo C)
+     set(CMAKE_SHARED_LINKER_FLAGS \"${flag}\")
+     add_library(foo SHARED src_to_link.c)")
+  set(failed_regexes "[Ee]rror;[Uu]nknown;[Ss]kipping;LINK : warning")
+  set(base_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/link_flag_check_${boolean})
+  file(MAKE_DIRECTORY ${base_dir})
+  file(MAKE_DIRECTORY ${base_dir}/build)
+  file(WRITE ${base_dir}/src_to_link.c "${library_source}")
+  file(WRITE ${base_dir}/CMakeLists.txt "${cmake_source}")
+
+  message(STATUS "Performing Test ${boolean}")
+  try_compile(
+    try_compile_result
+    ${base_dir}/build
+    ${base_dir}
+    foo
+    OUTPUT_VARIABLE OUTPUT)
+
+  if(try_compile_result)
+    foreach(regex IN LISTS failed_regexes)
+      if("${OUTPUT}" MATCHES ${regex})
+        set(retval FALSE)
+      endif()
+    endforeach()
+  else()
+    set(retval FALSE)
+  endif()
+
+  if(${retval})
+    set(${boolean} 1 CACHE INTERNAL "Test ${boolean}")
+    message(STATUS "Performing Test ${boolean} - Success")
+    file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+      "Performing C Linker Flag test ${boolean} succeeded with the following output:\n"
+      "${OUTPUT}\n"
+      "Source file was:\n${library_source}\n")
+  else()
+    set(${boolean} "" CACHE INTERNAL "Test ${boolean}")
+    message(STATUS "Performing Test ${boolean} - Failed")
+    file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+      "Performing C Linker Flag test ${boolean} failed with the following output:\n"
+      "${OUTPUT}\n"
+      "Source file was:\n${library_source}\n")
+  endif()
+
+  set(${boolean} ${retval} PARENT_SCOPE)
+  endif()
+endfunction()
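+
+# Usage sketch (hypothetical caller; actual invocations are in config-ix.cmake):
+#   libomp_check_linker_flag("-Wl,--as-needed" LIBOMP_HAVE_AS_NEEDED_FLAG)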
diff --git a/final/runtime/cmake/LibompDefinitions.cmake b/final/runtime/cmake/LibompDefinitions.cmake
new file mode 100644
index 0000000..667110e
--- /dev/null
+++ b/final/runtime/cmake/LibompDefinitions.cmake
@@ -0,0 +1,100 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+function(libomp_get_definitions_flags cppflags)
+  set(cppflags_local)
+  libomp_append(cppflags_local "-D USE_ITT_BUILD")
+  # yes... you need 5 backslashes: each layer of CMake and shell quoting strips one level of escaping before the flag reaches the compiler.
+  libomp_append(cppflags_local "-D KMP_ARCH_STR=\"\\\\\"${LIBOMP_LEGAL_ARCH}\\\\\"\"")
+  libomp_append(cppflags_local "-D BUILD_I8")
+  libomp_append(cppflags_local "-D KMP_LIBRARY_FILE=\\\\\"${LIBOMP_LIB_FILE}\\\\\"")
+  libomp_append(cppflags_local "-D KMP_VERSION_MAJOR=${LIBOMP_VERSION}")
+  libomp_append(cppflags_local "-D KMP_NESTED_HOT_TEAMS")
+
+  # customize to 128 bytes for ppc64
+  if(${PPC64})
+    libomp_append(cppflags_local "-D CACHE_LINE=128")
+  else()
+    libomp_append(cppflags_local "-D CACHE_LINE=64")
+  endif()
+
+  libomp_append(cppflags_local "-D KMP_ADJUST_BLOCKTIME=1")
+  libomp_append(cppflags_local "-D BUILD_PARALLEL_ORDERED")
+  libomp_append(cppflags_local "-D KMP_ASM_INTRINS")
+  libomp_append(cppflags_local "-D USE_ITT_NOTIFY" IF_TRUE_1_0 LIBOMP_USE_ITT_NOTIFY)
+  libomp_append(cppflags_local "-D INTEL_NO_ITTNOTIFY_API" IF_FALSE LIBOMP_USE_ITT_NOTIFY)
+  libomp_append(cppflags_local "-D INTEL_ITTNOTIFY_PREFIX=__kmp_itt_")
+  libomp_append(cppflags_local "-D KMP_USE_VERSION_SYMBOLS" IF_TRUE LIBOMP_USE_VERSION_SYMBOLS)
+
+  if(WIN32)
+    libomp_append(cppflags_local "-D _CRT_SECURE_NO_WARNINGS")
+    libomp_append(cppflags_local "-D _CRT_SECURE_NO_DEPRECATE")
+    libomp_append(cppflags_local "-D _WINDOWS")
+    libomp_append(cppflags_local "-D _WINNT")
+    libomp_append(cppflags_local "-D _WIN32_WINNT=0x0501")
+    libomp_append(cppflags_local "-D KMP_WIN_CDECL")
+    libomp_append(cppflags_local "-D _USRDLL")
+    libomp_append(cppflags_local "-D _ITERATOR_DEBUG_LEVEL=0" IF_TRUE DEBUG_BUILD)
+  else()
+    libomp_append(cppflags_local "-D _GNU_SOURCE")
+    libomp_append(cppflags_local "-D _REENTRANT")
+    libomp_append(cppflags_local "-D BUILD_TV")
+    libomp_append(cppflags_local "-D USE_CBLKDATA")
+    libomp_append(cppflags_local "-D KMP_GOMP_COMPAT")
+  endif()
+
+  libomp_append(cppflags_local "-D USE_LOAD_BALANCE" IF_FALSE MIC)
+  if(NOT WIN32 AND NOT APPLE)
+    libomp_append(cppflags_local "-D KMP_TDATA_GTID")
+  endif()
+  libomp_append(cppflags_local "-D KMP_USE_ASSERT" IF_TRUE LIBOMP_ENABLE_ASSERTIONS)
+  libomp_append(cppflags_local "-D KMP_DYNAMIC_LIB")
+  libomp_append(cppflags_local "-D KMP_STUB" IF_TRUE STUBS_LIBRARY)
+
+  if(${DEBUG_BUILD} OR ${RELWITHDEBINFO_BUILD})
+    libomp_append(cppflags_local "-D KMP_DEBUG")
+  endif()
+  libomp_append(cppflags_local "-D _DEBUG" IF_TRUE DEBUG_BUILD)
+  libomp_append(cppflags_local "-D BUILD_DEBUG" IF_TRUE DEBUG_BUILD)
+  libomp_append(cppflags_local "-D KMP_STATS_ENABLED" IF_TRUE_1_0 LIBOMP_STATS)
+  libomp_append(cppflags_local "-D USE_DEBUGGER" IF_TRUE_1_0 LIBOMP_USE_DEBUGGER)
+  libomp_append(cppflags_local "-D OMPT_SUPPORT" IF_TRUE_1_0 LIBOMP_OMPT_SUPPORT)
+  libomp_append(cppflags_local "-D OMPT_BLAME" IF_TRUE_1_0 LIBOMP_OMPT_BLAME)
+  libomp_append(cppflags_local "-D OMPT_TRACE" IF_TRUE_1_0 LIBOMP_OMPT_TRACE)
+
+  # OpenMP version flags
+  set(libomp_have_omp_50 0)
+  set(libomp_have_omp_41 0)
+  set(libomp_have_omp_40 0)
+  set(libomp_have_omp_30 0)
+  if(${LIBOMP_OMP_VERSION} EQUAL 50 OR ${LIBOMP_OMP_VERSION} GREATER 50)
+    set(libomp_have_omp_50 1)
+  endif()
+  if(${LIBOMP_OMP_VERSION} EQUAL 41 OR ${LIBOMP_OMP_VERSION} GREATER 41)
+    set(libomp_have_omp_41 1)
+  endif()
+  if(${LIBOMP_OMP_VERSION} EQUAL 40 OR ${LIBOMP_OMP_VERSION} GREATER 40)
+    set(libomp_have_omp_40 1)
+  endif()
+  if(${LIBOMP_OMP_VERSION} EQUAL 30 OR ${LIBOMP_OMP_VERSION} GREATER 30)
+    set(libomp_have_omp_30 1)
+  endif()
+  libomp_append(cppflags_local "-D OMP_50_ENABLED=${libomp_have_omp_50}")
+  libomp_append(cppflags_local "-D OMP_41_ENABLED=${libomp_have_omp_41}")
+  libomp_append(cppflags_local "-D OMP_40_ENABLED=${libomp_have_omp_40}")
+  libomp_append(cppflags_local "-D OMP_30_ENABLED=${libomp_have_omp_30}")
+  libomp_append(cppflags_local "-D KMP_USE_ADAPTIVE_LOCKS" IF_TRUE_1_0 LIBOMP_USE_ADAPTIVE_LOCKS)
+  libomp_append(cppflags_local "-D KMP_DEBUG_ADAPTIVE_LOCKS=0")
+  libomp_append(cppflags_local "-D KMP_USE_INTERNODE_ALIGNMENT" IF_TRUE_1_0 LIBOMP_USE_INTERNODE_ALIGNMENT)
+  # CMake doesn't include CPPFLAGS from environment, but we will.
+  set(${cppflags} ${cppflags_local} ${LIBOMP_CPPFLAGS} $ENV{CPPFLAGS} PARENT_SCOPE)
+endfunction()
+
diff --git a/final/runtime/cmake/LibompExports.cmake b/final/runtime/cmake/LibompExports.cmake
new file mode 100644
index 0000000..16fcb20
--- /dev/null
+++ b/final/runtime/cmake/LibompExports.cmake
@@ -0,0 +1,98 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# LibompExports.cmake
+#   Copy library and header files into the exports/ subdirectory after library build
+
+# Create the suffix for the export directory
+# - Only add to suffix when not a default value
+# - Example suffix: .deb.30.s1
+#   final export directory: exports/lin_32e.deb.30.s1/lib
+# - These suffixes imply the build is a Debug, OpenMP 3.0, Stats-Gathering version of the library
+set(libomp_suffix)
+libomp_append(libomp_suffix .deb DEBUG_BUILD)
+libomp_append(libomp_suffix .dia RELWITHDEBINFO_BUILD)
+libomp_append(libomp_suffix .min MINSIZEREL_BUILD)
+if(NOT "${LIBOMP_OMP_VERSION}" STREQUAL "41")
+  libomp_append(libomp_suffix .${LIBOMP_OMP_VERSION})
+endif()
+libomp_append(libomp_suffix .s1 LIBOMP_STATS)
+libomp_append(libomp_suffix .ompt LIBOMP_OMPT_SUPPORT)
+if(${LIBOMP_OMPT_SUPPORT})
+  libomp_append(libomp_suffix .no-ompt-blame IF_FALSE LIBOMP_OMPT_BLAME)
+  libomp_append(libomp_suffix .no-ompt-trace IF_FALSE LIBOMP_OMPT_TRACE)
+endif()
+string(REPLACE ";" "" libomp_suffix "${libomp_suffix}")
+
+# Set exports locations
+if(${MIC})
+  set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_${LIBOMP_MIC_ARCH}") # e.g., lin_knf, lin_knc
+else()
+  if(${IA32})
+    set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_32")
+  elseif(${INTEL64})
+    set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_32e")
+  else()
+    set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_${LIBOMP_ARCH}") # e.g., lin_arm, lin_ppc64
+  endif()
+endif()
+set(LIBOMP_EXPORTS_DIR "${LIBOMP_BASE_DIR}/exports")
+set(LIBOMP_EXPORTS_PLATFORM_DIR "${LIBOMP_EXPORTS_DIR}/${libomp_platform}${libomp_suffix}")
+set(LIBOMP_EXPORTS_CMN_DIR "${LIBOMP_EXPORTS_DIR}/common${libomp_suffix}/include")
+set(LIBOMP_EXPORTS_INC_DIR "${LIBOMP_EXPORTS_PLATFORM_DIR}/include")
+set(LIBOMP_EXPORTS_MOD_DIR "${LIBOMP_EXPORTS_PLATFORM_DIR}/include_compat")
+set(LIBOMP_EXPORTS_LIB_DIR "${LIBOMP_EXPORTS_DIR}/${libomp_platform}${libomp_suffix}/lib")
+
+# Put headers in exports/ directory post build
+add_custom_command(TARGET omp POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_CMN_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy omp.h ${LIBOMP_EXPORTS_CMN_DIR}
+)
+if(${LIBOMP_OMPT_SUPPORT})
+  add_custom_command(TARGET omp POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ompt.h ${LIBOMP_EXPORTS_CMN_DIR}
+  )
+endif()
+if(${LIBOMP_FORTRAN_MODULES})
+  add_custom_command(TARGET libomp-mod POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_MOD_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy omp_lib.h ${LIBOMP_EXPORTS_CMN_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy omp_lib.mod ${LIBOMP_EXPORTS_MOD_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy omp_lib_kinds.mod ${LIBOMP_EXPORTS_MOD_DIR}
+  )
+endif()
+
+# Copy OpenMP library into exports/ directory post build
+if(WIN32)
+  get_target_property(LIBOMP_OUTPUT_DIRECTORY omp RUNTIME_OUTPUT_DIRECTORY)
+else()
+  get_target_property(LIBOMP_OUTPUT_DIRECTORY omp LIBRARY_OUTPUT_DIRECTORY)
+endif()
+if(NOT LIBOMP_OUTPUT_DIRECTORY)
+  set(LIBOMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+add_custom_command(TARGET omp POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_LIB_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} ${LIBOMP_EXPORTS_LIB_DIR}
+)
+
+# Copy Windows import library into exports/ directory post build
+if(WIN32)
+  get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ompimp ARCHIVE_OUTPUT_DIRECTORY)
+  if(NOT LIBOMPIMP_OUTPUT_DIRECTORY)
+    set(LIBOMPIMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  endif()
+  add_custom_command(TARGET ompimp POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_LIB_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMPIMP_OUTPUT_DIRECTORY}/${LIBOMP_IMP_LIB_FILE} ${LIBOMP_EXPORTS_LIB_DIR}
+  )
+endif()
+
diff --git a/final/runtime/cmake/LibompGetArchitecture.cmake b/final/runtime/cmake/LibompGetArchitecture.cmake
new file mode 100644
index 0000000..8f3c4c6
--- /dev/null
+++ b/final/runtime/cmake/LibompGetArchitecture.cmake
@@ -0,0 +1,66 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Determine the architecture from predefined compiler macros
+# The architecture name can only contain alphanumeric characters and underscores (i.e., C identifier)
+
+# void get_architecture(string* return_arch)
+# - Returns the architecture in return_arch
+function(libomp_get_architecture return_arch)
+  set(detect_arch_src_txt "
+    #if defined(__KNC__)
+      #error ARCHITECTURE=mic
+    #elif defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+      #error ARCHITECTURE=x86_64
+    #elif defined(__i386) || defined(__i386__) || defined(__IA32__) || defined(_M_I86) || defined(_M_IX86) || defined(__X86__) || defined(_X86_)
+      #error ARCHITECTURE=i386
+    #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7R__) ||  defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__)  || defined(__ARM_ARCH_7S__)
+      #error ARCHITECTURE=arm
+    #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__)  || defined(__ARM_ARCH_6Z__)  || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6ZK__)
+      #error ARCHITECTURE=arm
+    #elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5E__)  || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
+      #error ARCHITECTURE=arm
+    #elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+      #error ARCHITECTURE=arm
+    #elif defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__)
+      #error ARCHITECTURE=arm
+    #elif defined(__ARM_ARCH_2__)
+      #error ARCHITECTURE=arm
+    #elif defined(__arm__) || defined(_M_ARM) || defined(_ARM)
+      #error ARCHITECTURE=arm
+    #elif defined(__aarch64__)
+      #error ARCHITECTURE=aarch64
+    #elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__)
+      #error ARCHITECTURE=ppc64le
+    #elif defined(__powerpc64__)
+      #error ARCHITECTURE=ppc64
+    #else
+      #error ARCHITECTURE=UnknownArchitecture
+    #endif
+  ")
+  # Write out ${detect_arch_src_txt} to a file within the cmake/ subdirectory
+  file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/libomp_detect_arch.c" ${detect_arch_src_txt})
+
+  # Try to compile using the C Compiler.  It will always error out with an #error directive, so store error output to ${local_architecture}
+  try_run(run_dummy compile_dummy "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/libomp_detect_arch.c" COMPILE_OUTPUT_VARIABLE local_architecture)
+
+  # Match the important architecture line and store only that matching string in ${local_architecture}
+  string(REGEX MATCH "ARCHITECTURE=([a-zA-Z0-9_]+)" local_architecture "${local_architecture}")
+
+  # Get rid of the ARCHITECTURE= part of the string
+  string(REPLACE "ARCHITECTURE=" "" local_architecture "${local_architecture}")
+
+  # set the return value to the architecture detected (e.g., 32e, 32, arm, ppc64, etc.)
+  set(${return_arch} "${local_architecture}" PARENT_SCOPE)
+
+  # Remove ${detect_arch_src_txt} from cmake/ subdirectory
+  file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/libomp_detect_arch.c")
+endfunction()
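+
+# Usage sketch (hypothetical; a caller would detect the architecture when
+# LIBOMP_ARCH was not specified explicitly):
+#   libomp_get_architecture(LIBOMP_DETECTED_ARCH)
+#   set(LIBOMP_ARCH ${LIBOMP_DETECTED_ARCH})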
diff --git a/final/runtime/cmake/LibompHandleFlags.cmake b/final/runtime/cmake/LibompHandleFlags.cmake
new file mode 100644
index 0000000..570ce5c
--- /dev/null
+++ b/final/runtime/cmake/LibompHandleFlags.cmake
@@ -0,0 +1,228 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Set up the flags correctly for CMake (convert to string).
+# Pretty them up (strip any leading and trailing whitespace,
+# remove duplicates, remove empty entries)
+macro(libomp_setup_flags flags)
+  if(NOT "${${flags}}" STREQUAL "") # if flags are empty, don't do anything
+    set(flags_local)
+    list(REMOVE_DUPLICATES ${flags}) # remove duplicates
+    list(REMOVE_ITEM ${flags} "") # remove empty items
+    libomp_list_to_string("${${flags}}" flags_local)
+    string(STRIP "${flags_local}" flags_local)
+    set(${flags} "${flags_local}")
+  endif()
+endmacro()
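+# e.g., with flags = "-a;;-b;-a", libomp_setup_flags(flags) leaves flags = "-a -b".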
+
+# Gets flags common to both the C and C++ compiler
+function(libomp_get_c_and_cxxflags_common flags)
+  set(flags_local)
+  libomp_append(flags_local -std=c++11 LIBOMP_HAVE_STD_CPP11_FLAG)
+  libomp_append(flags_local -fno-exceptions LIBOMP_HAVE_FNO_EXCEPTIONS_FLAG)
+  if(${LIBOMP_ENABLE_WERROR})
+    libomp_append(flags_local -Werror LIBOMP_HAVE_WERROR_FLAG)
+  endif()
+  libomp_append(flags_local -Wno-sign-compare LIBOMP_HAVE_WNO_SIGN_COMPARE_FLAG)
+  libomp_append(flags_local -Wno-unused-function LIBOMP_HAVE_WNO_UNUSED_FUNCTION_FLAG)
+  libomp_append(flags_local -Wno-unused-local-typedef LIBOMP_HAVE_WNO_UNUSED_LOCAL_TYPEDEF_FLAG)
+  libomp_append(flags_local -Wno-unused-value LIBOMP_HAVE_WNO_UNUSED_VALUE_FLAG)
+  libomp_append(flags_local -Wno-unused-variable LIBOMP_HAVE_WNO_UNUSED_VARIABLE_FLAG)
+  libomp_append(flags_local -Wno-switch LIBOMP_HAVE_WNO_SWITCH_FLAG)
+  libomp_append(flags_local -Wno-covered-switch-default LIBOMP_HAVE_WNO_COVERED_SWITCH_DEFAULT_FLAG)
+  libomp_append(flags_local -Wno-deprecated-register LIBOMP_HAVE_WNO_DEPRECATED_REGISTER_FLAG)
+  libomp_append(flags_local -Wno-gnu-anonymous-struct LIBOMP_HAVE_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
+  libomp_append(flags_local -Wno-unknown-pragmas LIBOMP_HAVE_WNO_UNKNOWN_PRAGMAS_FLAG)
+  libomp_append(flags_local -Wno-missing-field-initializers LIBOMP_HAVE_WNO_MISSING_FIELD_INITIALIZERS_FLAG)
+  libomp_append(flags_local -Wno-missing-braces LIBOMP_HAVE_WNO_MISSING_BRACES_FLAG)
+  libomp_append(flags_local -Wno-comment LIBOMP_HAVE_WNO_COMMENT_FLAG)
+  libomp_append(flags_local -Wno-self-assign LIBOMP_HAVE_WNO_SELF_ASSIGN_FLAG)
+  libomp_append(flags_local -Wno-vla-extension LIBOMP_HAVE_WNO_VLA_EXTENSION_FLAG)
+  libomp_append(flags_local -Wno-format-pedantic LIBOMP_HAVE_WNO_FORMAT_PEDANTIC_FLAG)
+  libomp_append(flags_local /GS LIBOMP_HAVE_GS_FLAG)
+  libomp_append(flags_local /EHsc LIBOMP_HAVE_EHSC_FLAG)
+  libomp_append(flags_local /Oy- LIBOMP_HAVE_OY__FLAG)
+  # Intel(R) C Compiler flags
+  libomp_append(flags_local /Qsafeseh LIBOMP_HAVE_QSAFESEH_FLAG)
+  libomp_append(flags_local -Qoption,cpp,--extended_float_types LIBOMP_HAVE_EXTENDED_FLOAT_TYPES_FLAG)
+  libomp_append(flags_local -Qlong_double LIBOMP_HAVE_LONG_DOUBLE_FLAG)
+  libomp_append(flags_local -Qdiag-disable:177 LIBOMP_HAVE_DIAG_DISABLE_177_FLAG)
+  if(${RELEASE_BUILD} OR ${RELWITHDEBINFO_BUILD})
+    libomp_append(flags_local -Qinline-min-size=1 LIBOMP_HAVE_INLINE_MIN_SIZE_FLAG)
+  endif()
+  # Architectural C and C++ flags
+  if(${IA32})
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      libomp_append(flags_local -m32 LIBOMP_HAVE_M32_FLAG)
+    endif()
+    libomp_append(flags_local /arch:SSE2 LIBOMP_HAVE_ARCH_SSE2_FLAG)
+    libomp_append(flags_local -msse2 LIBOMP_HAVE_MSSE2_FLAG)
+    libomp_append(flags_local -falign-stack=maintain-16-byte LIBOMP_HAVE_FALIGN_STACK_FLAG)
+  elseif(${MIC})
+    libomp_append(flags_local -mmic LIBOMP_HAVE_MMIC_FLAG)
+    libomp_append(flags_local -ftls-model=initial-exec LIBOMP_HAVE_FTLS_MODEL_FLAG)
+    libomp_append(flags_local "-opt-streaming-stores never" LIBOMP_HAVE_OPT_STREAMING_STORES_FLAG)
+  endif()
+  set(${flags} ${flags_local} PARENT_SCOPE)
+endfunction()
+
+# C compiler flags
+function(libomp_get_cflags cflags)
+  set(cflags_local)
+  libomp_get_c_and_cxxflags_common(cflags_local)
+  # flags only for the C Compiler
+  libomp_append(cflags_local /TP LIBOMP_HAVE_TP_FLAG)
+  libomp_append(cflags_local "-x c++" LIBOMP_HAVE_X_CPP_FLAG)
+  set(cflags_local ${cflags_local} ${LIBOMP_CFLAGS})
+  libomp_setup_flags(cflags_local)
+  set(${cflags} ${cflags_local} PARENT_SCOPE)
+endfunction()
+
+# C++ compiler flags
+function(libomp_get_cxxflags cxxflags)
+  set(cxxflags_local)
+  libomp_get_c_and_cxxflags_common(cxxflags_local)
+  set(cxxflags_local ${cxxflags_local} ${LIBOMP_CXXFLAGS})
+  libomp_setup_flags(cxxflags_local)
+  set(${cxxflags} ${cxxflags_local} PARENT_SCOPE)
+endfunction()
+
+# Assembler flags
+function(libomp_get_asmflags asmflags)
+  set(asmflags_local)
+  libomp_append(asmflags_local "-x assembler-with-cpp" LIBOMP_HAVE_X_ASSEMBLER_WITH_CPP_FLAG)
+  # Architectural assembler flags
+  if(${IA32})
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      libomp_append(asmflags_local -m32 LIBOMP_HAVE_M32_FLAG)
+    endif()
+    libomp_append(asmflags_local /safeseh LIBOMP_HAVE_SAFESEH_MASM_FLAG)
+    libomp_append(asmflags_local /coff LIBOMP_HAVE_COFF_MASM_FLAG)
+  elseif(${MIC})
+    libomp_append(asmflags_local -mmic LIBOMP_HAVE_MMIC_FLAG)
+  endif()
+  set(asmflags_local ${asmflags_local} ${LIBOMP_ASMFLAGS})
+  libomp_setup_flags(asmflags_local)
+  set(${asmflags} ${asmflags_local} PARENT_SCOPE)
+endfunction()
+
+# Linker flags
+function(libomp_get_ldflags ldflags)
+  set(ldflags_local)
+  libomp_append(ldflags_local "${CMAKE_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_LIB_NAME}.def"
+    IF_DEFINED CMAKE_LINK_DEF_FILE_FLAG)
+  libomp_append(ldflags_local "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}${LIBOMP_VERSION}.0"
+    IF_DEFINED CMAKE_C_OSX_CURRENT_VERSION_FLAG)
+  libomp_append(ldflags_local "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}${LIBOMP_VERSION}.0"
+    IF_DEFINED CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG)
+  libomp_append(ldflags_local -Wl,--warn-shared-textrel LIBOMP_HAVE_WARN_SHARED_TEXTREL_FLAG)
+  libomp_append(ldflags_local -Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
+  libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
+  libomp_append(ldflags_local -static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
+  libomp_append(ldflags_local -Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
+  libomp_append(ldflags_local -Wl,-fini=__kmp_internal_end_fini LIBOMP_HAVE_FINI_FLAG)
+  libomp_append(ldflags_local -no-intel-extensions LIBOMP_HAVE_NO_INTEL_EXTENSIONS_FLAG)
+  libomp_append(ldflags_local -static-intel LIBOMP_HAVE_STATIC_INTEL_FLAG)
+  libomp_append(ldflags_local /SAFESEH LIBOMP_HAVE_SAFESEH_FLAG)
+  # Architectural linker flags
+  if(${IA32})
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      libomp_append(ldflags_local -m32 LIBOMP_HAVE_M32_FLAG)
+    endif()
+    libomp_append(ldflags_local -msse2 LIBOMP_HAVE_MSSE2_FLAG)
+  elseif(${MIC})
+    libomp_append(ldflags_local -mmic LIBOMP_HAVE_MMIC_FLAG)
+    libomp_append(ldflags_local -Wl,-x LIBOMP_HAVE_X_FLAG)
+  endif()
+  set(ldflags_local ${ldflags_local} ${LIBOMP_LDFLAGS})
+  libomp_setup_flags(ldflags_local)
+  set(${ldflags} ${ldflags_local} PARENT_SCOPE)
+endfunction()
+
+# Library flags
+function(libomp_get_libflags libflags)
+  set(libflags_local)
+  libomp_append(libflags_local "${CMAKE_THREAD_LIBS_INIT}")
+  if(${IA32})
+    libomp_append(libflags_local -lirc_pic LIBOMP_HAVE_IRC_PIC_LIBRARY)
+  endif()
+  set(libflags_local ${libflags_local} ${LIBOMP_LIBFLAGS})
+  libomp_setup_flags(libflags_local)
+  set(${libflags} ${libflags_local} PARENT_SCOPE)
+endfunction()
+
+# Fortran flags
+function(libomp_get_fflags fflags)
+  set(fflags_local)
+  if(${IA32})
+    libomp_append(fflags_local -m32 LIBOMP_HAVE_M32_FORTRAN_FLAG)
+  endif()
+  set(fflags_local ${fflags_local} ${LIBOMP_FFLAGS})
+  libomp_setup_flags(fflags_local)
+  set(${fflags} ${fflags_local} PARENT_SCOPE)
+endfunction()
+
+# Perl expand-vars.pl flags
+function(libomp_get_evflags evflags)
+  set(evflags_local)
+  libomp_append(evflags_local "-D KMP_TYPE=\"${LIBOMP_LEGAL_TYPE}\"")
+  libomp_append(evflags_local "-D KMP_ARCH=\"${LIBOMP_LEGAL_ARCH}\"")
+  libomp_append(evflags_local "-D KMP_VERSION_MAJOR=${LIBOMP_VERSION}")
+  libomp_append(evflags_local "-D KMP_VERSION_MINOR=0")
+  libomp_append(evflags_local "-D KMP_VERSION_BUILD=${LIBOMP_BUILD_NUMBER}")
+  libomp_append(evflags_local "-D KMP_BUILD_DATE=\"${LIBOMP_DATE}\"")
+  if(${DEBUG_BUILD} OR ${RELWITHDEBINFO_BUILD})
+    libomp_append(evflags_local "-D KMP_DIAG=1")
+    libomp_append(evflags_local "-D KMP_DEBUG_INFO=1")
+  else()
+    libomp_append(evflags_local "-D KMP_DIAG=0")
+    libomp_append(evflags_local "-D KMP_DEBUG_INFO=0")
+  endif()
+  if(${LIBOMP_OMP_VERSION} EQUAL 40 OR ${LIBOMP_OMP_VERSION} GREATER 40)
+    libomp_append(evflags_local "-D OMP_VERSION=201307")
+  elseif(${LIBOMP_OMP_VERSION} EQUAL 30 OR ${LIBOMP_OMP_VERSION} GREATER 30)
+    libomp_append(evflags_local "-D OMP_VERSION=201107")
+  else()
+    libomp_append(evflags_local "-D OMP_VERSION=200505")
+  endif()
+  set(${evflags} ${evflags_local} PARENT_SCOPE)
+endfunction()
+
+# Perl generate-defs.pl flags (For Windows only)
+function(libomp_get_gdflags gdflags)
+  set(gdflags_local)
+  if(${IA32})
+    set(libomp_gdflag_arch arch_32)
+  elseif(${INTEL64})
+    set(libomp_gdflag_arch arch_32e)
+  else()
+    set(libomp_gdflag_arch arch_${LIBOMP_ARCH})
+  endif()
+  libomp_append(gdflags_local "-D ${libomp_gdflag_arch}")
+  libomp_append(gdflags_local "-D msvc_compat")
+  libomp_append(gdflags_local "-D norm" NORMAL_LIBRARY)
+  libomp_append(gdflags_local "-D prof" PROFILE_LIBRARY)
+  libomp_append(gdflags_local "-D stub" STUBS_LIBRARY)
+  libomp_append(gdflags_local "-D HAVE_QUAD" LIBOMP_USE_QUAD_PRECISION)
+  if(${LIBOMP_OMP_VERSION} GREATER 41 OR ${LIBOMP_OMP_VERSION} EQUAL 41)
+    libomp_append(gdflags_local "-D OMP_41")
+  endif()
+  if(${LIBOMP_OMP_VERSION} GREATER 40 OR ${LIBOMP_OMP_VERSION} EQUAL 40)
+    libomp_append(gdflags_local "-D OMP_40")
+  endif()
+  if(${LIBOMP_OMP_VERSION} GREATER 30 OR ${LIBOMP_OMP_VERSION} EQUAL 30)
+    libomp_append(gdflags_local "-D OMP_30")
+  endif()
+  if(${DEBUG_BUILD} OR ${RELWITHDEBINFO_BUILD})
+    libomp_append(gdflags_local "-D KMP_DEBUG")
+  endif()
+  set(${gdflags} ${gdflags_local} PARENT_SCOPE)
+endfunction()
diff --git a/final/runtime/cmake/LibompMicroTests.cmake b/final/runtime/cmake/LibompMicroTests.cmake
new file mode 100644
index 0000000..bf5afab
--- /dev/null
+++ b/final/runtime/cmake/LibompMicroTests.cmake
@@ -0,0 +1,215 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# The following micro-tests are small tests to perform on the library just created.
+# There are currently five micro-tests:
+# (1) test-touch
+#  - Compile and run a small program using the newly created libomp library
+#  - Fails if test-touch.c does not compile or if test-touch.c does not run after compilation
+#  - Program dependencies: gcc or g++, grep, bourne shell
+#  - Available for all Unix, Mac, and Windows builds.  Not available on Intel(R) MIC Architecture builds.
+# (2) test-relo
+#  - Tests dynamic libraries for position-dependent code (they cannot contain any position-dependent code)
+#  - Fails if TEXTREL is in output of readelf -d libomp.so command
+#  - Program dependencies: readelf, grep, bourne shell
+#  - Available for Unix, Intel(R) MIC Architecture dynamic library builds. Not available otherwise.
+# (3) test-execstack
+#  - Tests if stack is executable
+#  - Fails if the stack is executable; it should only be readable and writable, not executable.
+#  - Program dependencies: perl, readelf
+#  - Available for Unix dynamic library builds. Not available otherwise.
+# (4) test-instr (Intel(R) MIC Architecture only)
+#  - Tests Intel(R) MIC Architecture libraries for a valid instruction set
+#  - Fails if it finds an invalid instruction for Intel(R) MIC Architecture (wasn't compiled with correct flags)
+#  - Program dependencies: perl, objdump
+#  - Available for Intel(R) MIC Architecture and i386 builds. Not available otherwise.
+# (5) test-deps
+#  - Tests newly created libomp for library dependencies
+#  - Fails if it sees a dependency not listed in the libomp_expected_library_deps variable below
+#  - Program dependencies: perl, (unix)readelf, (mac)otool[64], (windows)link.exe
+#  - Available for Unix, Mac, Windows, and Intel(R) MIC Architecture dynamic builds and Windows
+#    static builds. Not available otherwise.
+
+# get library location
+if(WIN32)
+  get_target_property(LIBOMP_OUTPUT_DIRECTORY omp RUNTIME_OUTPUT_DIRECTORY)
+  get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ompimp ARCHIVE_OUTPUT_DIRECTORY)
+  if(NOT LIBOMPIMP_OUTPUT_DIRECTORY)
+    set(LIBOMPIMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  endif()
+else()
+  get_target_property(LIBOMP_OUTPUT_DIRECTORY omp LIBRARY_OUTPUT_DIRECTORY)
+endif()
+if(NOT LIBOMP_OUTPUT_DIRECTORY)
+  set(LIBOMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+# test-touch
+find_program(LIBOMP_SHELL sh)
+if(WIN32)
+  if(LIBOMP_SHELL)
+    set(libomp_test_touch_targets test-touch-md/.success test-touch-mt/.success)
+  endif()
+  # pick test-touch compiler
+  set(libomp_test_touch_compiler ${CMAKE_C_COMPILER})
+  # test-touch compilation flags
+  libomp_append(libomp_test_touch_cflags /nologo)
+  libomp_append(libomp_test_touch_libs ${LIBOMPIMP_OUTPUT_DIRECTORY}/${LIBOMP_IMP_LIB_FILE})
+  if(${IA32})
+    libomp_append(libomp_test_touch_ldflags /safeseh)
+  endif()
+else() # (Unix based systems, Intel(R) MIC Architecture, and Mac)
+  if(LIBOMP_SHELL)
+    set(libomp_test_touch_targets test-touch-rt/.success)
+  endif()
+  # pick test-touch compiler
+  if(${LIBOMP_USE_STDCPPLIB})
+    set(libomp_test_touch_compiler ${CMAKE_CXX_COMPILER})
+  else()
+    set(libomp_test_touch_compiler ${CMAKE_C_COMPILER})
+  endif()
+  # test-touch compilation flags
+  libomp_append(libomp_test_touch_libs "${CMAKE_THREAD_LIBS_INIT}")
+  if(${IA32})
+    libomp_append(libomp_test_touch_cflags -m32 LIBOMP_HAVE_M32_FLAG)
+  endif()
+  libomp_append(libomp_test_touch_libs ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE})
+  if(APPLE)
+    set(libomp_test_touch_env "DYLD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{DYLD_LIBRARY_PATH}")
+  else()
+    set(libomp_test_touch_env "LD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{LD_LIBRARY_PATH}")
+  endif()
+endif()
+macro(libomp_test_touch_recipe test_touch_dir)
+  set(libomp_test_touch_dependencies ${LIBOMP_SRC_DIR}/test-touch.c omp)
+  set(libomp_test_touch_exe ${test_touch_dir}/test-touch${CMAKE_EXECUTABLE_SUFFIX})
+  set(libomp_test_touch_obj ${test_touch_dir}/test-touch${CMAKE_C_OUTPUT_EXTENSION})
+  if(WIN32)
+    if(${RELEASE_BUILD} OR ${RELWITHDEBINFO_BUILD})
+      if(${test_touch_dir} MATCHES "test-touch-mt")
+        libomp_append(libomp_test_touch_cflags /MT)
+      else()
+        libomp_append(libomp_test_touch_cflags /MD)
+      endif()
+    else()
+      if(${test_touch_dir} MATCHES "test-touch-mt")
+        libomp_append(libomp_test_touch_cflags /MTd)
+      else()
+        libomp_append(libomp_test_touch_cflags /MDd)
+      endif()
+    endif()
+    set(libomp_test_touch_out_flags -Fe${libomp_test_touch_exe} -Fo${libomp_test_touch_obj})
+    list(APPEND libomp_test_touch_dependencies ompimp)
+  else()
+    set(libomp_test_touch_out_flags -o ${libomp_test_touch_exe})
+  endif()
+  add_custom_command(
+    OUTPUT  ${test_touch_dir}/.success ${libomp_test_touch_exe} ${libomp_test_touch_obj}
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${test_touch_dir}
+    COMMAND ${CMAKE_COMMAND} -E remove -f ${test_touch_dir}/*
+    COMMAND ${libomp_test_touch_compiler} ${libomp_test_touch_out_flags} ${libomp_test_touch_cflags}
+      ${LIBOMP_SRC_DIR}/test-touch.c ${libomp_test_touch_ldflags} ${libomp_test_touch_libs}
+    COMMAND ${LIBOMP_SHELL} -c \"${libomp_test_touch_env} ${libomp_test_touch_exe}\"
+    COMMAND ${CMAKE_COMMAND} -E touch ${test_touch_dir}/.success
+    DEPENDS ${libomp_test_touch_dependencies}
+  )
+endmacro()
+libomp_append(libomp_test_touch_env "KMP_VERSION=1")
+add_custom_target(libomp-test-touch DEPENDS ${libomp_test_touch_targets})
+if(WIN32)
+  libomp_test_touch_recipe(test-touch-mt)
+  libomp_test_touch_recipe(test-touch-md)
+else()
+  libomp_test_touch_recipe(test-touch-rt)
+endif()
+
+# test-relo (grep exits 1 when TEXTREL is absent, so the chained "test $$? -eq 1" passes only for position-independent libraries)
+add_custom_target(libomp-test-relo DEPENDS test-relo/.success)
+add_custom_command(
+  OUTPUT  test-relo/.success test-relo/readelf.log
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-relo
+  COMMAND readelf -d ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} > test-relo/readelf.log
+  COMMAND grep -e TEXTREL test-relo/readelf.log \; test $$? -eq 1
+  COMMAND ${CMAKE_COMMAND} -E touch test-relo/.success
+  DEPENDS omp
+)
+
+# test-execstack
+add_custom_target(libomp-test-execstack DEPENDS test-execstack/.success)
+add_custom_command(
+  OUTPUT  test-execstack/.success
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-execstack
+  COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-execstack.pl --os=${LIBOMP_PERL_SCRIPT_OS}
+    --arch=${LIBOMP_ARCH} ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE}
+  COMMAND ${CMAKE_COMMAND} -E touch test-execstack/.success
+  DEPENDS omp
+)
+
+# test-instr
+add_custom_target(libomp-test-instr DEPENDS test-instr/.success)
+add_custom_command(
+  OUTPUT  test-instr/.success
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-instr
+  COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-instruction-set.pl --os=${LIBOMP_PERL_SCRIPT_OS}
+    --arch=${LIBOMP_ARCH} --show --mic-arch=${LIBOMP_MIC_ARCH} ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE}
+  COMMAND ${CMAKE_COMMAND} -E touch test-instr/.success
+  DEPENDS omp ${LIBOMP_TOOLS_DIR}/check-instruction-set.pl
+)
+
+# test-deps
+add_custom_target(libomp-test-deps DEPENDS test-deps/.success)
+set(libomp_expected_library_deps)
+if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+  set(libomp_expected_library_deps libc.so.7 libthr.so.3)
+elseif(APPLE)
+  set(libomp_expected_library_deps /usr/lib/libSystem.B.dylib)
+elseif(WIN32)
+  set(libomp_expected_library_deps kernel32.dll)
+else()
+  if(${MIC})
+    set(libomp_expected_library_deps libc.so.6 libpthread.so.0 libdl.so.2)
+    if("${LIBOMP_MIC_ARCH}" STREQUAL "knf")
+      libomp_append(libomp_expected_library_deps ld-linux-l1om.so.2)
+      libomp_append(libomp_expected_library_deps libgcc_s.so.1)
+    elseif("${LIBOMP_MIC_ARCH}" STREQUAL "knc")
+      libomp_append(libomp_expected_library_deps ld-linux-k1om.so.2)
+    endif()
+  else()
+    set(libomp_expected_library_deps libdl.so.2 libgcc_s.so.1)
+    if(${IA32})
+      libomp_append(libomp_expected_library_deps libc.so.6)
+      libomp_append(libomp_expected_library_deps ld-linux.so.2)
+    elseif(${INTEL64})
+      libomp_append(libomp_expected_library_deps libc.so.6)
+      libomp_append(libomp_expected_library_deps ld-linux-x86-64.so.2)
+    elseif(${ARM})
+      libomp_append(libomp_expected_library_deps libc.so.6)
+      libomp_append(libomp_expected_library_deps libffi.so.6)
+      libomp_append(libomp_expected_library_deps libffi.so.5)
+      libomp_append(libomp_expected_library_deps ld-linux-armhf.so.3)
+    elseif(${PPC64})
+      libomp_append(libomp_expected_library_deps libc.so.6)
+      libomp_append(libomp_expected_library_deps ld64.so.1)
+    endif()
+    libomp_append(libomp_expected_library_deps libpthread.so.0 IF_FALSE STUBS_LIBRARY)
+  endif()
+  libomp_append(libomp_expected_library_deps libstdc++.so.6 LIBOMP_USE_STDCPPLIB)
+endif()
+# Perl script expects comma separated list
+string(REPLACE ";" "," libomp_expected_library_deps "${libomp_expected_library_deps}")
+add_custom_command(
+  OUTPUT  test-deps/.success
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-deps
+  COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-depends.pl --os=${LIBOMP_PERL_SCRIPT_OS}
+    --arch=${LIBOMP_ARCH} --expected="${libomp_expected_library_deps}" ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE}
+  COMMAND ${CMAKE_COMMAND} -E touch test-deps/.success
+  DEPENDS omp ${LIBOMP_TOOLS_DIR}/check-depends.pl
+)
diff --git a/final/runtime/cmake/LibompUtils.cmake b/final/runtime/cmake/LibompUtils.cmake
new file mode 100644
index 0000000..ba81a6c
--- /dev/null
+++ b/final/runtime/cmake/LibompUtils.cmake
@@ -0,0 +1,191 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# void libomp_say(string message_to_user);
+# - prints out message_to_user
+macro(libomp_say message_to_user)
+  message(STATUS "LIBOMP: ${message_to_user}")
+endmacro()
+
+# void libomp_warning_say(string message_to_user);
+# - prints out message_to_user with a warning
+macro(libomp_warning_say message_to_user)
+  message(WARNING "LIBOMP: ${message_to_user}")
+endmacro()
+
+# void libomp_error_say(string message_to_user);
+# - prints out message_to_user with an error and exits cmake
+macro(libomp_error_say message_to_user)
+  message(FATAL_ERROR "LIBOMP: ${message_to_user}")
+endmacro()
+
+# libomp_append(<flag> <flags_list> [(IF_TRUE | IF_FALSE | IF_TRUE_1_0 ) BOOLEAN])
+#
+# libomp_append(<flag> <flags_list>)
+#   - unconditionally appends <flag> to the list of definitions
+#
+# libomp_append(<flag> <flags_list> <BOOLEAN>)
+#   - appends <flag> to the list of definitions if BOOLEAN is true
+#
+# libomp_append(<flag> <flags_list> IF_TRUE <BOOLEAN>)
+#   - appends <flag> to the list of definitions if BOOLEAN is true
+#
+# libomp_append(<flag> <flags_list> IF_FALSE <BOOLEAN>)
+#   - appends <flag> to the list of definitions if BOOLEAN is false
+#
+# libomp_append(<flag> <flags_list> IF_DEFINED <VARIABLE>)
+#   - appends <flag> to the list of definitions if VARIABLE is defined
+#
+# libomp_append(<flag> <flags_list> IF_TRUE_1_0 <BOOLEAN>)
+#   - appends <flag>=1 to the list of definitions if <BOOLEAN> is true, <flag>=0 otherwise
+# e.g., libomp_append("-D USE_FEATURE" IF_TRUE_1_0 HAVE_FEATURE)
+#     appends "-D USE_FEATURE=1" if HAVE_FEATURE is true
+#     or "-D USE_FEATURE=0" if HAVE_FEATURE is false
+macro(libomp_append flags flag)
+  if(NOT (${ARGC} EQUAL 2 OR ${ARGC} EQUAL 3 OR ${ARGC} EQUAL 4))
+    libomp_error_say("libomp_append: takes 2, 3, or 4 arguments")
+  endif()
+  if(${ARGC} EQUAL 2)
+    list(APPEND ${flags} "${flag}")
+  elseif(${ARGC} EQUAL 3)
+    if(${ARGV2})
+      list(APPEND ${flags} "${flag}")
+    endif()
+  else()
+    if(${ARGV2} STREQUAL "IF_TRUE")
+      if(${ARGV3})
+        list(APPEND ${flags} "${flag}")
+      endif()
+    elseif(${ARGV2} STREQUAL "IF_FALSE")
+      if(NOT ${ARGV3})
+        list(APPEND ${flags} "${flag}")
+      endif()
+    elseif(${ARGV2} STREQUAL "IF_DEFINED")
+      if(DEFINED ${ARGV3})
+        list(APPEND ${flags} "${flag}")
+      endif()
+    elseif(${ARGV2} STREQUAL "IF_TRUE_1_0")
+      if(${ARGV3})
+        list(APPEND ${flags} "${flag}=1")
+      else()
+        list(APPEND ${flags} "${flag}=0")
+      endif()
+    else()
+      libomp_error_say("libomp_append: third argument must be one of IF_TRUE, IF_FALSE, IF_DEFINED, IF_TRUE_1_0")
+    endif()
+  endif()
+endmacro()
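+# Real calls from LibompDefinitions.cmake above, for reference:
+#   libomp_append(cppflags_local "-D USE_ITT_BUILD")                                # unconditional
+#   libomp_append(cppflags_local "-D KMP_STUB" IF_TRUE STUBS_LIBRARY)               # stubs builds only
+#   libomp_append(cppflags_local "-D OMPT_SUPPORT" IF_TRUE_1_0 LIBOMP_OMPT_SUPPORT) # appended as =1 or =0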
+
+# void libomp_get_legal_arch(string* return_arch_string);
+# - returns (through return_arch_string) the formal architecture
+#   string or warns user of unknown architecture
+function(libomp_get_legal_arch return_arch_string)
+  if(${IA32})
+    set(${return_arch_string} "IA-32" PARENT_SCOPE)
+  elseif(${INTEL64})
+    set(${return_arch_string} "Intel(R) 64" PARENT_SCOPE)
+  elseif(${MIC})
+    set(${return_arch_string} "Intel(R) Many Integrated Core Architecture" PARENT_SCOPE)
+  elseif(${ARM})
+    set(${return_arch_string} "ARM" PARENT_SCOPE)
+  elseif(${PPC64BE})
+    set(${return_arch_string} "PPC64BE" PARENT_SCOPE)
+  elseif(${PPC64LE})
+    set(${return_arch_string} "PPC64LE" PARENT_SCOPE)
+  elseif(${AARCH64})
+    set(${return_arch_string} "AARCH64" PARENT_SCOPE)
+  else()
+    set(${return_arch_string} "${LIBOMP_ARCH}" PARENT_SCOPE)
+    libomp_warning_say("libomp_get_legal_arch(): Warning: Unknown architecture: Using ${LIBOMP_ARCH}")
+  endif()
+endfunction()
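+
+# For example (illustrative only), the caller receives the marketing name
+# rather than the CMake variable name:
+#   libomp_get_legal_arch(local_arch)  # local_arch == "Intel(R) 64" when INTEL64 is true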
+
+# void libomp_check_variable(string var, ...);
+# - runs through all values checking if ${var} == value
+# - uppercase and lowercase do not matter
+# - if the var is found, then just print it out
+# - if the var is not found, then error out
+function(libomp_check_variable var)
+  set(valid_flag 0)
+  string(TOLOWER "${${var}}" var_lower)
+  foreach(value IN LISTS ARGN)
+    string(TOLOWER "${value}" value_lower)
+    if("${var_lower}" STREQUAL "${value_lower}")
+      set(valid_flag 1)
+      set(the_value "${value}")
+    endif()
+  endforeach()
+  if(${valid_flag} EQUAL 0)
+    libomp_error_say("libomp_check_variable(): ${var} = ${${var}} is unknown")
+  endif()
+endfunction()
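+
+# Illustrative use (hypothetical values); matching is case-insensitive:
+#   set(LIBOMP_LIB_TYPE normal)
+#   libomp_check_variable(LIBOMP_LIB_TYPE NORMAL profile stubs)  # passes
+#   libomp_check_variable(LIBOMP_LIB_TYPE profile stubs)         # fatal error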
+
+# void libomp_get_build_number(string src_dir, string* return_build_number);
+# - grab the eight digit build number (or 00000000) from kmp_version.c
+function(libomp_get_build_number src_dir return_build_number)
+  # sets file_lines_list to a list of all lines in kmp_version.c
+  file(STRINGS "${src_dir}/src/kmp_version.c" file_lines_list)
+
+  # runs through each line in kmp_version.c
+  foreach(line IN LISTS file_lines_list)
+    # if the line begins with "#define KMP_VERSION_BUILD" then we take note of the build number
+    string(REGEX MATCH "^[ \t]*#define[ \t]+KMP_VERSION_BUILD" valid "${line}")
+    if(NOT "${valid}" STREQUAL "") # if we matched "#define KMP_VERSION_BUILD", then grab the build number
+      string(REGEX REPLACE "^[ \t]*#define[ \t]+KMP_VERSION_BUILD[ \t]+([0-9]+)" "\\1"
+           build_number "${line}"
+      )
+    endif()
+  endforeach()
+  set(${return_build_number} "${build_number}" PARENT_SCOPE) # return build number
+endfunction()
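+
+# For example (a sketch, assuming the usual source layout): given a line in
+# src/kmp_version.c such as
+#   #define KMP_VERSION_BUILD    20140926
+# the following sets local_build to "20140926":
+#   libomp_get_build_number("${CMAKE_CURRENT_SOURCE_DIR}" local_build)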
+
+# void libomp_get_legal_type(string* return_legal_type);
+# - set the legal type name Performance/Profiling/Stub
+function(libomp_get_legal_type return_legal_type)
+  if(${NORMAL_LIBRARY})
+    set(${return_legal_type} "Performance" PARENT_SCOPE)
+  elseif(${PROFILE_LIBRARY})
+    set(${return_legal_type} "Profiling" PARENT_SCOPE)
+  elseif(${STUBS_LIBRARY})
+    set(${return_legal_type} "Stub" PARENT_SCOPE)
+  endif()
+endfunction()
+
+# void libomp_add_suffix(string suffix, list<string>* list_of_items);
+# - returns list_of_items with suffix appended to all items
+# - original list is modified
+function(libomp_add_suffix suffix list_of_items)
+  set(local_list "")
+  foreach(item IN LISTS "${list_of_items}")
+    if(NOT "${item}" STREQUAL "")
+      list(APPEND local_list "${item}${suffix}")
+    endif()
+  endforeach()
+  set(${list_of_items} "${local_list}" PARENT_SCOPE)
+endfunction()
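+
+# Illustrative call (hypothetical list); note the original list is modified:
+#   set(libs "omp;ompstub")
+#   libomp_add_suffix(".lib" libs)  # libs is now "omp.lib;ompstub.lib"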
+
+# void libomp_list_to_string(list<string> list_of_things, string* return_string);
+# - converts a list to a space separated string
+function(libomp_list_to_string list_of_things return_string)
+  string(REPLACE ";" " " output_variable "${list_of_things}")
+  set(${return_string} "${output_variable}" PARENT_SCOPE)
+endfunction()
+
+# void libomp_string_to_list(string str, list<string>* return_list);
+# - converts a string to a semicolon separated list
+# - in practice it just replaces each run of whitespace with a single semicolon
+# - in cmake, a list is a string of items separated by semicolons, e.g., a four-item list is "item1;item2;item3;item4"
+function(libomp_string_to_list str return_list)
+  set(outstr)
+  string(REGEX REPLACE "[ \t]+" ";" outstr "${str}")
+  set(${return_list} "${outstr}" PARENT_SCOPE)
+endfunction()
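+
+# Illustrative round trip (hypothetical values) between CMake's
+# semicolon-separated lists and plain flag strings:
+#   libomp_list_to_string("-g;-O2;-Wall" flags_str)   # flags_str == "-g -O2 -Wall"
+#   libomp_string_to_list("-g -O2 -Wall" flags_list)  # flags_list == "-g;-O2;-Wall"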
+
diff --git a/final/runtime/cmake/config-ix.cmake b/final/runtime/cmake/config-ix.cmake
new file mode 100644
index 0000000..875b551
--- /dev/null
+++ b/final/runtime/cmake/config-ix.cmake
@@ -0,0 +1,194 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+include(CheckCCompilerFlag)
+include(CheckCSourceCompiles)
+include(CheckCXXCompilerFlag)
+include(CheckLibraryExists)
+include(LibompCheckLinkerFlag)
+include(LibompCheckFortranFlag)
+
+# Check for versioned symbols
+function(libomp_check_version_symbols retval)
+  set(source_code
+    "#include <stdio.h>
+    void func1() { printf(\"Hello\"); }
+    void func2() { printf(\"World\"); }
+    __asm__(\".symver func1, func@VER1\");
+    __asm__(\".symver func2, func@VER2\");
+    int main() {
+      func1();
+      func2();
+      return 0;
+    }")
+  set(version_script_source "VER1 { }; VER2 { } VER1;")
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/__version_script.txt "${version_script_source}")
+  set(CMAKE_REQUIRED_FLAGS -Wl,--version-script=${CMAKE_CURRENT_BINARY_DIR}/__version_script.txt)
+  check_c_source_compiles("${source_code}" ${retval})
+  set(${retval} ${${retval}} PARENT_SCOPE)
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/__version_script.txt)
+endfunction()
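+
+# A sketch of how the result might be consumed later (illustrative only):
+#   libomp_check_version_symbols(LIBOMP_HAVE_VERSION_SYMBOLS)
+#   if(LIBOMP_HAVE_VERSION_SYMBOLS)
+#     # .symver directives and a linker version script can be used safely
+#   endif()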
+
+# Includes the architecture flag in both compile and link phase
+function(libomp_check_architecture_flag flag retval)
+  set(CMAKE_REQUIRED_FLAGS "${flag}")
+  check_c_compiler_flag("${flag}" ${retval})
+  set(${retval} ${${retval}} PARENT_SCOPE)
+endfunction()
+
+# Checking C, CXX, Linker Flags
+check_cxx_compiler_flag(-std=c++11 LIBOMP_HAVE_STD_CPP11_FLAG)
+check_cxx_compiler_flag(-fno-exceptions LIBOMP_HAVE_FNO_EXCEPTIONS_FLAG)
+check_c_compiler_flag("-x c++" LIBOMP_HAVE_X_CPP_FLAG)
+check_c_compiler_flag(-Werror LIBOMP_HAVE_WERROR_FLAG)
+check_c_compiler_flag(-Wunused-function LIBOMP_HAVE_WNO_UNUSED_FUNCTION_FLAG)
+check_c_compiler_flag(-Wunused-local-typedef LIBOMP_HAVE_WNO_UNUSED_LOCAL_TYPEDEF_FLAG)
+check_c_compiler_flag(-Wunused-value LIBOMP_HAVE_WNO_UNUSED_VALUE_FLAG)
+check_c_compiler_flag(-Wunused-variable LIBOMP_HAVE_WNO_UNUSED_VARIABLE_FLAG)
+check_c_compiler_flag(-Wswitch LIBOMP_HAVE_WNO_SWITCH_FLAG)
+check_c_compiler_flag(-Wcovered-switch-default LIBOMP_HAVE_WNO_COVERED_SWITCH_DEFAULT_FLAG)
+check_c_compiler_flag(-Wdeprecated-register LIBOMP_HAVE_WNO_DEPRECATED_REGISTER_FLAG)
+check_c_compiler_flag(-Wsign-compare LIBOMP_HAVE_WNO_SIGN_COMPARE_FLAG)
+check_c_compiler_flag(-Wgnu-anonymous-struct LIBOMP_HAVE_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
+check_c_compiler_flag(-Wunknown-pragmas LIBOMP_HAVE_WNO_UNKNOWN_PRAGMAS_FLAG)
+check_c_compiler_flag(-Wmissing-field-initializers LIBOMP_HAVE_WNO_MISSING_FIELD_INITIALIZERS_FLAG)
+check_c_compiler_flag(-Wmissing-braces LIBOMP_HAVE_WNO_MISSING_BRACES_FLAG)
+check_c_compiler_flag(-Wcomment LIBOMP_HAVE_WNO_COMMENT_FLAG)
+check_c_compiler_flag(-Wself-assign LIBOMP_HAVE_WNO_SELF_ASSIGN_FLAG)
+check_c_compiler_flag(-Wvla-extension LIBOMP_HAVE_WNO_VLA_EXTENSION_FLAG)
+check_c_compiler_flag(-Wformat-pedantic LIBOMP_HAVE_WNO_FORMAT_PEDANTIC_FLAG)
+check_c_compiler_flag(-msse2 LIBOMP_HAVE_MSSE2_FLAG)
+check_c_compiler_flag(-ftls-model=initial-exec LIBOMP_HAVE_FTLS_MODEL_FLAG)
+libomp_check_architecture_flag(-mmic LIBOMP_HAVE_MMIC_FLAG)
+libomp_check_architecture_flag(-m32 LIBOMP_HAVE_M32_FLAG)
+if(WIN32)
+  # Check Windows MSVC style flags.
+  check_c_compiler_flag(/TP LIBOMP_HAVE_TP_FLAG)
+  check_cxx_compiler_flag(/EHsc LIBOMP_HAVE_EHSC_FLAG)
+  check_cxx_compiler_flag(/GS LIBOMP_HAVE_GS_FLAG)
+  check_cxx_compiler_flag(/Oy- LIBOMP_HAVE_Oy__FLAG)
+  check_cxx_compiler_flag(/arch:SSE2 LIBOMP_HAVE_ARCH_SSE2_FLAG)
+  check_cxx_compiler_flag(/Qsafeseh LIBOMP_HAVE_QSAFESEH_FLAG)
+  # It is difficult to create a dummy masm assembly file
+  # and then check the MASM assembler to see if these flags exist and work,
+  # so we assume they do for Windows.
+  set(LIBOMP_HAVE_SAFESEH_MASM_FLAG TRUE)
+  set(LIBOMP_HAVE_COFF_MASM_FLAG TRUE)
+  # Change Windows flags /MDx to /MTx
+  foreach(libomp_lang IN ITEMS C CXX)
+    foreach(libomp_btype IN ITEMS DEBUG RELWITHDEBINFO RELEASE MINSIZEREL)
+      string(REPLACE "/MD" "/MT"
+        CMAKE_${libomp_lang}_FLAGS_${libomp_btype}
+        "${CMAKE_${libomp_lang}_FLAGS_${libomp_btype}}"
+      )
+    endforeach()
+  endforeach()
+else()
+  # It is difficult to create a dummy assembly file that compiles into an
+  # executable for every architecture and then check the C compiler to
+  # see if -x assembler-with-cpp exists and works, so we assume it does for non-Windows.
+  set(LIBOMP_HAVE_X_ASSEMBLER_WITH_CPP_FLAG TRUE)
+endif()
+if(${LIBOMP_FORTRAN_MODULES})
+  libomp_check_fortran_flag(-m32 LIBOMP_HAVE_M32_FORTRAN_FLAG)
+endif()
+
+# Check linker flags
+if(WIN32)
+  libomp_check_linker_flag(/SAFESEH LIBOMP_HAVE_SAFESEH_FLAG)
+elseif(NOT APPLE)
+  libomp_check_linker_flag(-Wl,-x LIBOMP_HAVE_X_FLAG)
+  libomp_check_linker_flag(-Wl,--warn-shared-textrel LIBOMP_HAVE_WARN_SHARED_TEXTREL_FLAG)
+  libomp_check_linker_flag(-Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
+  libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
+  libomp_check_linker_flag(-static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
+  libomp_check_linker_flag(-Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
+  libomp_check_linker_flag(-Wl,-fini=__kmp_internal_end_fini LIBOMP_HAVE_FINI_FLAG)
+endif()
+
+# Check Intel(R) C Compiler specific flags
+if(CMAKE_C_COMPILER_ID STREQUAL "Intel")
+  check_cxx_compiler_flag(/Qlong_double LIBOMP_HAVE_LONG_DOUBLE_FLAG)
+  check_cxx_compiler_flag(/Qdiag-disable:177 LIBOMP_HAVE_DIAG_DISABLE_177_FLAG)
+  check_cxx_compiler_flag(/Qinline-min-size=1 LIBOMP_HAVE_INLINE_MIN_SIZE_FLAG)
+  check_cxx_compiler_flag(-Qoption,cpp,--extended_float_types LIBOMP_HAVE_EXTENDED_FLOAT_TYPES_FLAG)
+  check_cxx_compiler_flag(-falign-stack=maintain-16-byte LIBOMP_HAVE_FALIGN_STACK_FLAG)
+  check_cxx_compiler_flag("-opt-streaming-stores never" LIBOMP_HAVE_OPT_STREAMING_STORES_FLAG)
+  libomp_check_linker_flag(-static-intel LIBOMP_HAVE_STATIC_INTEL_FLAG)
+  libomp_check_linker_flag(-no-intel-extensions LIBOMP_HAVE_NO_INTEL_EXTENSIONS_FLAG)
+  check_library_exists(irc_pic _intel_fast_memcpy "" LIBOMP_HAVE_IRC_PIC_LIBRARY)
+endif()
+
+# Checking Threading requirements
+find_package(Threads REQUIRED)
+if(WIN32)
+  if(NOT CMAKE_USE_WIN32_THREADS_INIT)
+    libomp_error_say("Need Win32 thread interface on Windows.")
+  endif()
+else()
+  if(NOT CMAKE_USE_PTHREADS_INIT)
+    libomp_error_say("Need pthread interface on Unix-like systems.")
+  endif()
+endif()
+
+# Find perl executable
+# Perl is used to create omp.h (and other headers) along with kmp_i18n_id.inc and kmp_i18n_default.inc
+find_package(Perl REQUIRED)
+# The perl scripts take the --os= flag which expects a certain format for operating systems.  Until the
+# perl scripts are removed, the most portable way to handle this is to present every operating system
+# that is neither Windows nor Mac (most Unix flavors) to the perl scripts as lin.  This is because
+# all the Perl scripts check the operating system and will fail if it isn't "valid".  This temporary
+# solution lets us avoid trying to enumerate all the possible OS values inside the Perl modules.
+if(WIN32)
+  set(LIBOMP_PERL_SCRIPT_OS win)
+elseif(APPLE)
+  set(LIBOMP_PERL_SCRIPT_OS mac)
+else()
+  set(LIBOMP_PERL_SCRIPT_OS lin)
+endif()
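+
+# Downstream, this value is handed to the scripts through their --os= flag;
+# a hypothetical invocation (script name for illustration only) would be:
+#   perl <message-generator>.pl --os=${LIBOMP_PERL_SCRIPT_OS}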
+
+# Checking features
+# Check if version symbol assembler directives are supported
+libomp_check_version_symbols(LIBOMP_HAVE_VERSION_SYMBOLS)
+
+# Check if quad precision types are available
+if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
+  set(LIBOMP_HAVE_QUAD_PRECISION TRUE)
+elseif(CMAKE_C_COMPILER_ID STREQUAL "Intel")
+  if(LIBOMP_HAVE_EXTENDED_FLOAT_TYPES_FLAG)
+    set(LIBOMP_HAVE_QUAD_PRECISION TRUE)
+  else()
+    set(LIBOMP_HAVE_QUAD_PRECISION FALSE)
+  endif()
+else()
+  set(LIBOMP_HAVE_QUAD_PRECISION FALSE)
+endif()
+
+# Check if adaptive locks are available
+if((${IA32} OR ${INTEL64}) AND NOT MSVC)
+  set(LIBOMP_HAVE_ADAPTIVE_LOCKS TRUE)
+else()
+  set(LIBOMP_HAVE_ADAPTIVE_LOCKS FALSE)
+endif()
+
+# Check if stats-gathering is available
+if(NOT (WIN32 OR APPLE) AND (${IA32} OR ${INTEL64} OR ${MIC}))
+  set(LIBOMP_HAVE_STATS TRUE)
+else()
+  set(LIBOMP_HAVE_STATS FALSE)
+endif()
+
+# Check if OMPT support is available
+if(NOT WIN32)
+  set(LIBOMP_HAVE_OMPT_SUPPORT TRUE)
+else()
+  set(LIBOMP_HAVE_OMPT_SUPPORT FALSE)
+endif()
diff --git a/final/runtime/doc/Reference.pdf b/final/runtime/doc/Reference.pdf
new file mode 100644
index 0000000..d6faf7e
--- /dev/null
+++ b/final/runtime/doc/Reference.pdf
Binary files differ
diff --git a/final/runtime/doc/doxygen/config b/final/runtime/doc/doxygen/config
new file mode 100644
index 0000000..3c2c0d5
--- /dev/null
+++ b/final/runtime/doc/doxygen/config
@@ -0,0 +1,1822 @@
+# Doxyfile 1.8.2
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+#       TAG = value [value, ...]
+# For lists, items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME           = "LLVM OpenMP* Runtime Library"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give the viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = doc/doxygen/generated
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip. Note that you specify absolute paths here, but also
+# relative paths, which will be relative from the directory where doxygen is
+# started.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                = "other=<sup>*</sup>"
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding
+# "class=itcl::class" will allow you to use the command class in the
+# itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension,
+# and language is one of the parsers supported by doxygen: IDL, Java,
+# Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C,
+# C++. For instance to make doxygen treat .inc files as Fortran files (default
+# is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note
+# that for custom extensions you also need to set FILE_PATTERNS otherwise the
+# files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all
+# comments according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you
+# can mix doxygen, HTML, and XML commands with Markdown formatting.
+# Disable only in case of backward compatibility issues.
+
+MARKDOWN_SUPPORT       = YES
+
+# When enabled doxygen tries to link words that correspond to documented classes,
+# or namespaces to their corresponding documentation. Such a link can be
+# prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to
+# indicate getter and setter methods for a property. Setting this
+# option to YES (the default) will make doxygen replace the get and
+# set methods by a property in the documentation. This will only work
+# if the methods are indeed getting or setting a simple type. If this
+# is not the case, or you want to show the methods anyway, you should
+# set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+SYMBOL_CACHE_SIZE      = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appears multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = YES
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+# We probably will want this, but we have no file documentation yet so it's simpler to remove
+# it for now.
+SHOW_FILES             = NO
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            =
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = src  doc/doxygen/libomp_interface.h
+# The ittnotify code also has doxygen documentation, but if we include it here
+# it takes over from us!
+# src/thirdparty/ittnotify
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          = *.c *.h *.cpp
+# We may also want to include the asm files with appropriate ifdef to ensure
+# doxygen doesn't see the content, just the documentation...
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+# Only look in the one directory.
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                = src/test-touch.c
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C, C++ and Fortran comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------

+# configuration options related to the HTML output

+#---------------------------------------------------------------------------

+

+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will

+# generate HTML output.

+

+GENERATE_HTML          = YES

+

+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.

+# If a relative path is entered the value of OUTPUT_DIRECTORY will be

+# put in front of it. If left blank `html' will be used as the default path.

+

+HTML_OUTPUT            =

+

+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for

+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank

+# doxygen will generate files with .html extension.

+

+HTML_FILE_EXTENSION    = .html

+

+# The HTML_HEADER tag can be used to specify a personal HTML header for

+# each generated HTML page. If it is left blank doxygen will generate a

+# standard header. Note that when using a custom header you are responsible

+#  for the proper inclusion of any scripts and style sheets that doxygen

+# needs, which is dependent on the configuration options used.

+# It is advised to generate a default header using "doxygen -w html

+# header.html footer.html stylesheet.css YourConfigFile" and then modify

+# that header. Note that the header is subject to change so you typically

+# have to redo this when upgrading to a newer version of doxygen or when

+# changing the value of configuration settings such as GENERATE_TREEVIEW!

+

+HTML_HEADER            =

+

+# The HTML_FOOTER tag can be used to specify a personal HTML footer for

+# each generated HTML page. If it is left blank doxygen will generate a

+# standard footer.

+

+HTML_FOOTER            =

+

+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading

+# style sheet that is used by each HTML page. It can be used to

+# fine-tune the look of the HTML output. If left blank doxygen will

+# generate a default style sheet. Note that it is recommended to use

+# HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this

+# tag will in the future become obsolete.

+

+HTML_STYLESHEET        =

+

+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional

+# user-defined cascading style sheet that is included after the standard

+# style sheets created by doxygen. Using this option one can overrule

+# certain style aspects. This is preferred over using HTML_STYLESHEET

+# since it does not replace the standard style sheet and is therefor more

+# robust against future updates. Doxygen will copy the style sheet file to

+# the output directory.

+

+HTML_EXTRA_STYLESHEET  =

+

+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or

+# other source files which should be copied to the HTML output directory. Note

+# that these files will be copied to the base HTML output directory. Use the

+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these

+# files. In the HTML_STYLESHEET file, use the file name only. Also note that

+# the files will be copied as-is; there are no commands or markers available.

+

+HTML_EXTRA_FILES       =

+

+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.

+# Doxygen will adjust the colors in the style sheet and background images

+# according to this color. Hue is specified as an angle on a colorwheel,

+# see http://en.wikipedia.org/wiki/Hue for more information.

+# For instance the value 0 represents red, 60 is yellow, 120 is green,

+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.

+# The allowed range is 0 to 359.

+

+HTML_COLORSTYLE_HUE    = 220

+

+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of

+# the colors in the HTML output. For a value of 0 the output will use

+# grayscales only. A value of 255 will produce the most vivid colors.

+

+HTML_COLORSTYLE_SAT    = 100

+

+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to

+# the luminance component of the colors in the HTML output. Values below

+# 100 gradually make the output lighter, whereas values above 100 make

+# the output darker. The value divided by 100 is the actual gamma applied,

+# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2,

+# and 100 does not change the gamma.

+

+HTML_COLORSTYLE_GAMMA  = 80

+

+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML

+# page will contain the date and time when the page was generated. Setting

+# this to NO can help when comparing the output of multiple runs.

+

+HTML_TIMESTAMP         = YES

+

+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML

+# documentation will contain sections that can be hidden and shown after the

+# page has loaded.

+

+HTML_DYNAMIC_SECTIONS  = NO

+

+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of

+# entries shown in the various tree structured indices initially; the user

+# can expand and collapse entries dynamically later on. Doxygen will expand

+# the tree to such a level that at most the specified number of entries are

+# visible (unless a fully collapsed tree already exceeds this amount).

+# So setting the number of entries to 1 will produce a fully collapsed tree by

+# default. 0 is a special value representing an infinite number of entries

+# and will result in a fully expanded tree by default.

+

+HTML_INDEX_NUM_ENTRIES = 100

+

+# If the GENERATE_DOCSET tag is set to YES, additional index files

+# will be generated that can be used as input for Apple's Xcode 3

+# integrated development environment, introduced with OSX 10.5 (Leopard).

+# To create a documentation set, doxygen will generate a Makefile in the

+# HTML output directory. Running make will produce the docset in that

+# directory and running "make install" will install the docset in

+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find

+# it at startup.

+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html

+# for more information.

+

+GENERATE_DOCSET        = NO

+

+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the

+# feed. A documentation feed provides an umbrella under which multiple

+# documentation sets from a single provider (such as a company or product suite)

+# can be grouped.

+

+DOCSET_FEEDNAME        = "Doxygen generated docs"

+

+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that

+# should uniquely identify the documentation set bundle. This should be a

+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen

+# will append .docset to the name.

+

+DOCSET_BUNDLE_ID       = org.doxygen.Project

+

+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely

+# identify the documentation publisher. This should be a reverse domain-name

+# style string, e.g. com.mycompany.MyDocSet.documentation.

+

+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher

+

+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.

+

+DOCSET_PUBLISHER_NAME  = Publisher

+

+# If the GENERATE_HTMLHELP tag is set to YES, additional index files

+# will be generated that can be used as input for tools like the

+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)

+# of the generated HTML documentation.

+

+GENERATE_HTMLHELP      = NO

+

+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can

+# be used to specify the file name of the resulting .chm file. You

+# can add a path in front of the file if the result should not be

+# written to the html output directory.

+

+CHM_FILE               =

+

+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can

+# be used to specify the location (absolute path including file name) of

+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run

+# the HTML help compiler on the generated index.hhp.

+

+HHC_LOCATION           =

+

+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag

+# controls if a separate .chi index file is generated (YES) or that

+# it should be included in the master .chm file (NO).

+

+GENERATE_CHI           = NO

+

+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING

+# is used to encode HtmlHelp index (hhk), content (hhc) and project file

+# content.

+

+CHM_INDEX_ENCODING     =

+

+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag

+# controls whether a binary table of contents is generated (YES) or a

+# normal table of contents (NO) in the .chm file.

+

+BINARY_TOC             = NO

+

+# The TOC_EXPAND flag can be set to YES to add extra items for group members

+# to the contents of the HTML help documentation and to the tree view.

+

+TOC_EXPAND             = NO

+

+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and

+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated

+# that can be used as input for Qt's qhelpgenerator to generate a

+# Qt Compressed Help (.qch) of the generated HTML documentation.

+

+GENERATE_QHP           = NO

+

+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can

+# be used to specify the file name of the resulting .qch file.

+# The path specified is relative to the HTML output folder.

+

+QCH_FILE               =

+

+# The QHP_NAMESPACE tag specifies the namespace to use when generating

+# Qt Help Project output. For more information please see

+# http://doc.trolltech.com/qthelpproject.html#namespace

+

+QHP_NAMESPACE          = org.doxygen.Project

+

+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating

+# Qt Help Project output. For more information please see

+# http://doc.trolltech.com/qthelpproject.html#virtual-folders

+

+QHP_VIRTUAL_FOLDER     = doc

+

+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to

+# add. For more information please see

+# http://doc.trolltech.com/qthelpproject.html#custom-filters

+

+QHP_CUST_FILTER_NAME   =

+

+# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the

+# custom filter to add. For more information please see

+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">

+# Qt Help Project / Custom Filters</a>.

+

+QHP_CUST_FILTER_ATTRS  =

+

+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this

+# project's

+# filter section matches.

+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">

+# Qt Help Project / Filter Attributes</a>.

+

+QHP_SECT_FILTER_ATTRS  =

+

+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can

+# be used to specify the location of Qt's qhelpgenerator.

+# If non-empty doxygen will try to run qhelpgenerator on the generated

+# .qhp file.

+

+QHG_LOCATION           =

+

+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files

+# will be generated which, together with the HTML files, form an Eclipse help

+# plugin. To install this plugin and make it available under the help contents

+# menu in Eclipse, the contents of the directory containing the HTML and XML

+# files needs to be copied into the plugins directory of eclipse. The name of

+# the directory within the plugins directory should be the same as

+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before

+# the help appears.

+

+GENERATE_ECLIPSEHELP   = NO

+

+# A unique identifier for the eclipse help plugin. When installing the plugin

+# the directory name containing the HTML and XML files should also have

+# this name.

+

+ECLIPSE_DOC_ID         = org.doxygen.Project

+

+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)

+# at top of each HTML page. The value NO (the default) enables the index and

+# the value YES disables it. Since the tabs have the same information as the

+# navigation tree you can set this option to YES if you already set

+# GENERATE_TREEVIEW to YES.

+

+DISABLE_INDEX          = NO

+

+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index

+# structure should be generated to display hierarchical information.

+# If the tag value is set to YES, a side panel will be generated

+# containing a tree-like index structure (just like the one that

+# is generated for HTML Help). For this to work a browser that supports

+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).

+# Windows users are probably better off using the HTML help feature.

+# Since the tree basically has the same information as the tab index you

+# could consider setting DISABLE_INDEX to YES when enabling this option.

+

+GENERATE_TREEVIEW      = NO

+

+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values

+# (range [0,1..20]) that doxygen will group on one line in the generated HTML

+# documentation. Note that a value of 0 will completely suppress the enum

+# values from appearing in the overview section.

+

+ENUM_VALUES_PER_LINE   = 4

+

+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be

+# used to set the initial width (in pixels) of the frame in which the tree

+# is shown.

+

+TREEVIEW_WIDTH         = 250

+

+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open

+# links to external symbols imported via tag files in a separate window.

+

+EXT_LINKS_IN_WINDOW    = NO

+

+# Use this tag to change the font size of Latex formulas included

+# as images in the HTML documentation. The default is 10. Note that

+# when you change the font size after a successful doxygen run you need

+# to manually remove any form_*.png images from the HTML output directory

+# to force them to be regenerated.

+

+FORMULA_FONTSIZE       = 10

+

+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images

+# generated for formulas are transparent PNGs. Transparent PNGs are

+# not supported properly for IE 6.0, but are supported on all modern browsers.

+# Note that when changing this option you need to delete any form_*.png files

+# in the HTML output before the changes have effect.

+

+FORMULA_TRANSPARENT    = YES

+

+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax

+# (see http://www.mathjax.org) which uses client side Javascript for the

+# rendering instead of using prerendered bitmaps. Use this if you do not

+# have LaTeX installed or if you want the formulas to look prettier in the HTML

+# output. When enabled you may also need to install MathJax separately and

+# configure the path to it using the MATHJAX_RELPATH option.

+

+USE_MATHJAX            = NO

+

+# When MathJax is enabled you need to specify the location relative to the

+# HTML output directory using the MATHJAX_RELPATH option. The destination

+# directory should contain the MathJax.js script. For instance, if the mathjax

+# directory is located at the same level as the HTML output directory, then

+# MATHJAX_RELPATH should be ../mathjax. The default value points to

+# the MathJax Content Delivery Network so you can quickly see the result without

+# installing MathJax.

+# However, it is strongly recommended to install a local

+# copy of MathJax from http://www.mathjax.org before deployment.

+

+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest

+

+# The MATHJAX_EXTENSIONS tag can be used to specify one or MathJax extension

+# names that should be enabled during MathJax rendering.

+

+MATHJAX_EXTENSIONS     =

+

+# When the SEARCHENGINE tag is enabled doxygen will generate a search box

+# for the HTML output. The underlying search engine uses javascript

+# and DHTML and should work on any modern browser. Note that when using

+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets

+# (GENERATE_DOCSET) there is already a search function so this one should

+# typically be disabled. For large projects the javascript based search engine

+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.

+

+SEARCHENGINE           = YES

+

+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be

+# implemented using a PHP enabled web server instead of at the web client

+# using Javascript. Doxygen will generate the search PHP script and index

+# file to put on the web server. The advantage of the server

+# based approach is that it scales better to large projects and allows

+# full text search. The disadvantages are that it is more difficult to setup

+# and does not have live searching capabilities.

+

+SERVER_BASED_SEARCH    = NO

+

+#---------------------------------------------------------------------------

+# configuration options related to the LaTeX output

+#---------------------------------------------------------------------------

+

+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will

+# generate Latex output.

+

+GENERATE_LATEX         = YES

+

+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.

+# If a relative path is entered the value of OUTPUT_DIRECTORY will be

+# put in front of it. If left blank `latex' will be used as the default path.

+

+LATEX_OUTPUT           =

+

+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be

+# invoked. If left blank `latex' will be used as the default command name.

+# Note that when enabling USE_PDFLATEX this option is only used for

+# generating bitmaps for formulas in the HTML output, but not in the

+# Makefile that is written to the output directory.

+

+LATEX_CMD_NAME         = latex

+

+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to

+# generate index for LaTeX. If left blank `makeindex' will be used as the

+# default command name.

+

+MAKEINDEX_CMD_NAME     = makeindex

+

+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact

+# LaTeX documents. This may be useful for small projects and may help to

+# save some trees in general.

+

+COMPACT_LATEX          = NO

+

+# The PAPER_TYPE tag can be used to set the paper type that is used

+# by the printer. Possible values are: a4, letter, legal and

+# executive. If left blank a4wide will be used.

+

+PAPER_TYPE             = a4wide

+

+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX

+# packages that should be included in the LaTeX output.

+

+EXTRA_PACKAGES         =

+

+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for

+# the generated latex document. The header should contain everything until

+# the first chapter. If it is left blank doxygen will generate a

+# standard header. Notice: only use this tag if you know what you are doing!

+

+LATEX_HEADER           = doc/doxygen/header.tex

+

+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for

+# the generated latex document. The footer should contain everything after

+# the last chapter. If it is left blank doxygen will generate a

+# standard footer. Notice: only use this tag if you know what you are doing!

+

+LATEX_FOOTER           =

+

+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated

+# is prepared for conversion to pdf (using ps2pdf). The pdf file will

+# contain links (just like the HTML output) instead of page references

+# This makes the output suitable for online browsing using a pdf viewer.

+

+PDF_HYPERLINKS         = YES

+

+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of

+# plain latex in the generated Makefile. Set this option to YES to get a

+# higher quality PDF documentation.

+

+USE_PDFLATEX           = YES

+

+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode

+# command to the generated LaTeX files. This will instruct LaTeX to keep

+# running if errors occur, instead of asking the user for help.

+# This option is also used when generating formulas in HTML.

+

+LATEX_BATCHMODE        = NO

+

+# If LATEX_HIDE_INDICES is set to YES then doxygen will not

+# include the index chapters (such as File Index, Compound Index, etc.)

+# in the output.

+

+LATEX_HIDE_INDICES     = NO

+

+# If LATEX_SOURCE_CODE is set to YES then doxygen will include

+# source code with syntax highlighting in the LaTeX output.

+# Note that which sources are shown also depends on other settings

+# such as SOURCE_BROWSER.

+

+LATEX_SOURCE_CODE      = NO

+

+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the

+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See

+# http://en.wikipedia.org/wiki/BibTeX for more info.

+

+LATEX_BIB_STYLE        = plain

+

+#---------------------------------------------------------------------------

+# configuration options related to the RTF output

+#---------------------------------------------------------------------------

+

+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output

+# The RTF output is optimized for Word 97 and may not look very pretty with

+# other RTF readers or editors.

+

+GENERATE_RTF           = NO

+

+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.

+# If a relative path is entered the value of OUTPUT_DIRECTORY will be

+# put in front of it. If left blank `rtf' will be used as the default path.

+

+RTF_OUTPUT             =

+

+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact

+# RTF documents. This may be useful for small projects and may help to

+# save some trees in general.

+

+COMPACT_RTF            = NO

+

+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated

+# will contain hyperlink fields. The RTF file will

+# contain links (just like the HTML output) instead of page references.

+# This makes the output suitable for online browsing using WORD or other

+# programs which support those fields.

+# Note: wordpad (write) and others do not support links.

+

+RTF_HYPERLINKS         = NO

+

+# Load style sheet definitions from file. Syntax is similar to doxygen's

+# config file, i.e. a series of assignments. You only have to provide

+# replacements, missing definitions are set to their default value.

+

+RTF_STYLESHEET_FILE    =

+

+# Set optional variables used in the generation of an rtf document.

+# Syntax is similar to doxygen's config file.

+

+RTF_EXTENSIONS_FILE    =

+

+#---------------------------------------------------------------------------

+# configuration options related to the man page output

+#---------------------------------------------------------------------------

+

+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will

+# generate man pages

+

+GENERATE_MAN           = NO

+

+# The MAN_OUTPUT tag is used to specify where the man pages will be put.

+# If a relative path is entered the value of OUTPUT_DIRECTORY will be

+# put in front of it. If left blank `man' will be used as the default path.

+

+MAN_OUTPUT             =

+

+# The MAN_EXTENSION tag determines the extension that is added to

+# the generated man pages (default is the subroutine's section .3)

+

+MAN_EXTENSION          =

+

+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,

+# then it will generate one additional man file for each entity

+# documented in the real man page(s). These additional files

+# only source the real man page, but without them the man command

+# would be unable to find the correct page. The default is NO.

+

+MAN_LINKS              = NO

+

+#---------------------------------------------------------------------------

+# configuration options related to the XML output

+#---------------------------------------------------------------------------

+

+# If the GENERATE_XML tag is set to YES Doxygen will

+# generate an XML file that captures the structure of

+# the code including all documentation.

+

+GENERATE_XML           = NO

+

+# The XML_OUTPUT tag is used to specify where the XML pages will be put.

+# If a relative path is entered the value of OUTPUT_DIRECTORY will be

+# put in front of it. If left blank `xml' will be used as the default path.

+

+XML_OUTPUT             = xml

+

+# The XML_SCHEMA tag can be used to specify an XML schema,

+# which can be used by a validating XML parser to check the

+# syntax of the XML files.

+

+XML_SCHEMA             =

+

+# The XML_DTD tag can be used to specify an XML DTD,

+# which can be used by a validating XML parser to check the

+# syntax of the XML files.

+

+XML_DTD                =

+

+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will

+# dump the program listings (including syntax highlighting

+# and cross-referencing information) to the XML output. Note that

+# enabling this will significantly increase the size of the XML output.

+

+XML_PROGRAMLISTING     = YES

+

+#---------------------------------------------------------------------------

+# configuration options for the AutoGen Definitions output

+#---------------------------------------------------------------------------

+

+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will

+# generate an AutoGen Definitions (see autogen.sf.net) file

+# that captures the structure of the code including all

+# documentation. Note that this feature is still experimental

+# and incomplete at the moment.

+

+GENERATE_AUTOGEN_DEF   = NO

+

+#---------------------------------------------------------------------------

+# configuration options related to the Perl module output

+#---------------------------------------------------------------------------

+

+# If the GENERATE_PERLMOD tag is set to YES Doxygen will

+# generate a Perl module file that captures the structure of

+# the code including all documentation. Note that this

+# feature is still experimental and incomplete at the

+# moment.

+

+GENERATE_PERLMOD       = NO

+

+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate

+# the necessary Makefile rules, Perl scripts and LaTeX code to be able

+# to generate PDF and DVI output from the Perl module output.

+

+PERLMOD_LATEX          = NO

+

+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be

+# nicely formatted so it can be parsed by a human reader.

+# This is useful

+# if you want to understand what is going on.

+# On the other hand, if this

+# tag is set to NO the size of the Perl module output will be much smaller

+# and Perl will parse it just the same.

+

+PERLMOD_PRETTY         = YES

+

+# The names of the make variables in the generated doxyrules.make file

+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.

+# This is useful so different doxyrules.make files included by the same

+# Makefile don't overwrite each other's variables.

+

+PERLMOD_MAKEVAR_PREFIX =

+

+#---------------------------------------------------------------------------

+# Configuration options related to the preprocessor

+#---------------------------------------------------------------------------

+

+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will

+# evaluate all C-preprocessor directives found in the sources and include

+# files.

+

+ENABLE_PREPROCESSING   = YES

+

+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro

+# names in the source code. If set to NO (the default) only conditional

+# compilation will be performed. Macro expansion can be done in a controlled

+# way by setting EXPAND_ONLY_PREDEF to YES.

+

+MACRO_EXPANSION        = YES

+

+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES

+# then the macro expansion is limited to the macros specified with the

+# PREDEFINED and EXPAND_AS_DEFINED tags.

+

+EXPAND_ONLY_PREDEF     = YES

+

+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files

+# pointed to by INCLUDE_PATH will be searched when a #include is found.

+

+SEARCH_INCLUDES        = YES

+

+# The INCLUDE_PATH tag can be used to specify one or more directories that

+# contain include files that are not input files but should be processed by

+# the preprocessor.

+

+INCLUDE_PATH           =

+

+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard

+# patterns (like *.h and *.hpp) to filter out the header-files in the

+# directories. If left blank, the patterns specified with FILE_PATTERNS will

+# be used.

+

+INCLUDE_FILE_PATTERNS  =

+

+# The PREDEFINED tag can be used to specify one or more macro names that

+# are defined before the preprocessor is started (similar to the -D option of

+# gcc). The argument of the tag is a list of macros of the form: name

+# or name=definition (no spaces). If the definition and the = are

+# omitted =1 is assumed. To prevent a macro definition from being

+# undefined via #undef or recursively expanded use the := operator

+# instead of the = operator.
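+# As an illustration only (these macro names are hypothetical, not part of

+# this project), a list mixing the plain form, the name=definition form, and

+# the := operator could look like this:

+#

+#   PREDEFINED = MY_DEBUG MY_API(x)=x MY_EXPORT:=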

+

+PREDEFINED             = OMP_30_ENABLED=1, OMP_40_ENABLED=1, KMP_STATS_ENABLED=1

+

+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then

+# this tag can be used to specify a list of macro names that should be expanded.

+# The macro definition that is found in the sources will be used.

+# Use the PREDEFINED tag if you want to use a different macro definition that

+# overrules the definition found in the source code.

+

+EXPAND_AS_DEFINED      =

+

+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then

+# doxygen's preprocessor will remove all references to function-like macros

+# that are alone on a line, have an all uppercase name, and do not end with a

+# semicolon, because these will confuse the parser if not removed.

+

+SKIP_FUNCTION_MACROS   = YES

+

+#---------------------------------------------------------------------------

+# Configuration::additions related to external references

+#---------------------------------------------------------------------------

+

+# The TAGFILES option can be used to specify one or more tagfiles. For each

+# tag file the location of the external documentation should be added. The

+# format of a tag file without this location is as follows:

+#

+# TAGFILES = file1 file2 ...

+# Adding location for the tag files is done as follows:

+#

+# TAGFILES = file1=loc1 "file2 = loc2" ...

+# where "loc1" and "loc2" can be relative or absolute paths

+# or URLs. Note that each tag file must have a unique name (where the name does

+# NOT include the path). If a tag file is not located in the directory in which

+# doxygen is run, you must also specify the path to the tagfile here.

+

+TAGFILES               =

+

+# When a file name is specified after GENERATE_TAGFILE, doxygen will create

+# a tag file that is based on the input files it reads.

+

+GENERATE_TAGFILE       =

+

+# If the ALLEXTERNALS tag is set to YES all external classes will be listed

+# in the class index. If set to NO only the inherited external classes

+# will be listed.

+

+ALLEXTERNALS           = NO

+

+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed

+# in the modules index. If set to NO, only the current project's groups will

+# be listed.

+

+EXTERNAL_GROUPS        = YES

+

+# The PERL_PATH should be the absolute path and name of the perl script

+# interpreter (i.e. the result of `which perl').

+

+PERL_PATH              =

+

+#---------------------------------------------------------------------------

+# Configuration options related to the dot tool

+#---------------------------------------------------------------------------

+

+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will

+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base

+# or super classes. Setting the tag to NO turns the diagrams off. Note that

+# this option also works with HAVE_DOT disabled, but it is recommended to

+# install and use dot, since it yields more powerful graphs.

+

+CLASS_DIAGRAMS         = YES

+

+# You can define message sequence charts within doxygen comments using the \msc

+# command. Doxygen will then run the mscgen tool (see

+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the

+# documentation. The MSCGEN_PATH tag allows you to specify the directory where

+# the mscgen tool resides. If left empty the tool is assumed to be found in the

+# default search path.

+

+MSCGEN_PATH            =

+

+# If set to YES, the inheritance and collaboration graphs will hide

+# inheritance and usage relations if the target is undocumented

+# or is not a class.

+

+HIDE_UNDOC_RELATIONS   = YES

+

+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is

+# available from the path. This tool is part of Graphviz, a graph visualization

+# toolkit from AT&T and Lucent Bell Labs. The other options in this section

+# have no effect if this option is set to NO (the default)

+

+HAVE_DOT               = NO

+

+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is

+# allowed to run in parallel. When set to 0 (the default) doxygen will

+# base this on the number of processors available in the system. You can set it

+# explicitly to a value larger than 0 to get control over the balance

+# between CPU load and processing speed.

+

+DOT_NUM_THREADS        = 0

+

+# By default doxygen will use the Helvetica font for all dot files that

+# doxygen generates. When you want a differently looking font you can specify

+# the font name using DOT_FONTNAME. You need to make sure dot is able to find

+# the font, which can be done by putting it in a standard location or by setting

+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the

+# directory containing the font.

+

+DOT_FONTNAME           = Helvetica

+

+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.

+# The default size is 10pt.

+

+DOT_FONTSIZE           = 10

+

+# By default doxygen will tell dot to use the Helvetica font.

+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to

+# set the path where dot can find it.

+

+DOT_FONTPATH           =

+

+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen

+# will generate a graph for each documented class showing the direct and

+# indirect inheritance relations. Setting this tag to YES will force the

+# CLASS_DIAGRAMS tag to NO.

+

+CLASS_GRAPH            = YES

+

+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen

+# will generate a graph for each documented class showing the direct and

+# indirect implementation dependencies (inheritance, containment, and

+# class references variables) of the class with other documented classes.

+

+COLLABORATION_GRAPH    = YES

+

+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen

+# will generate a graph for groups, showing the direct groups dependencies

+

+GROUP_GRAPHS           = YES

+

+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and

+# collaboration diagrams in a style similar to the OMG's Unified Modeling

+# Language.

+

+UML_LOOK               = NO

+

+# If the UML_LOOK tag is enabled, the fields and methods are shown inside

+# the class node. If there are many fields or methods and many nodes the

+# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS

+# threshold limits the number of items for each type to make the size more

+# manageable. Set this to 0 for no limit. Note that the threshold may be

+# exceeded by 50% before the limit is enforced.

+

+UML_LIMIT_NUM_FIELDS   = 10

+

+# If set to YES, the inheritance and collaboration graphs will show the

+# relations between templates and their instances.

+

+TEMPLATE_RELATIONS     = YES

+

+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT

+# tags are set to YES then doxygen will generate a graph for each documented

+# file showing the direct and indirect include dependencies of the file with

+# other documented files.

+

+INCLUDE_GRAPH          = YES

+

+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and

+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each

+# documented header file showing the documented files that directly or

+# indirectly include this file.

+

+INCLUDED_BY_GRAPH      = YES

+

+# If the CALL_GRAPH and HAVE_DOT options are set to YES then

+# doxygen will generate a call dependency graph for every global function

+# or class method. Note that enabling this option will significantly increase

+# the time of a run. So in most cases it will be better to enable call graphs

+# for selected functions only using the \callgraph command.

+

+CALL_GRAPH             = NO

+

+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then

+# doxygen will generate a caller dependency graph for every global function

+# or class method. Note that enabling this option will significantly increase

+# the time of a run. So in most cases it will be better to enable caller

+# graphs for selected functions only using the \callergraph command.

+

+CALLER_GRAPH           = NO

+

+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen

+# will generate a graphical hierarchy of all classes instead of a textual one.

+

+GRAPHICAL_HIERARCHY    = YES

+

+# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES

+# then doxygen will show the dependencies a directory has on other directories

+# in a graphical way. The dependency relations are determined by the #include

+# relations between the files in the directories.

+

+DIRECTORY_GRAPH        = YES

+

+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images

+# generated by dot. Possible values are svg, png, jpg, or gif.

+# If left blank png will be used. If you choose svg you need to set

+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files

+# visible in IE 9+ (other browsers do not have this requirement).

+

+DOT_IMAGE_FORMAT       = png

+

+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to

+# enable generation of interactive SVG images that allow zooming and panning.

+# Note that this requires a modern browser other than Internet Explorer.

+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you

+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files

+# visible. Older versions of IE do not have SVG support.

+

+INTERACTIVE_SVG        = NO

+

+# The tag DOT_PATH can be used to specify the path where the dot tool can be

+# found. If left blank, it is assumed the dot tool can be found in the path.

+

+DOT_PATH               =

+

+# The DOTFILE_DIRS tag can be used to specify one or more directories that

+# contain dot files that are included in the documentation (see the

+# \dotfile command).

+

+DOTFILE_DIRS           =

+

+# The MSCFILE_DIRS tag can be used to specify one or more directories that

+# contain msc files that are included in the documentation (see the

+# \mscfile command).

+

+MSCFILE_DIRS           =

+

+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of

+# nodes that will be shown in the graph. If the number of nodes in a graph

+# becomes larger than this value, doxygen will truncate the graph, which is

+# visualized by representing a node as a red box. Note that if the

+# number of direct children of the root node in a graph is already larger than

+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note

+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.

+

+DOT_GRAPH_MAX_NODES    = 50

+

+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the

+# graphs generated by dot. A depth value of 3 means that only nodes reachable

+# from the root by following a path via at most 3 edges will be shown. Nodes

+# that lie further from the root node will be omitted. Note that setting this

+# option to 1 or 2 may greatly reduce the computation time needed for large

+# code bases. Also note that the size of a graph can be further restricted by

+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.

+

+MAX_DOT_GRAPH_DEPTH    = 0

+

+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent

+# background. This is disabled by default, because dot on Windows does not

+# seem to support this out of the box. Warning: Depending on the platform used,

+# enabling this option may lead to badly anti-aliased labels on the edges of

+# a graph (i.e. they become hard to read).

+

+DOT_TRANSPARENT        = NO

+

+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output

+# files in one run (i.e. multiple -o and -T options on the command line). This

+# makes dot run faster, but since only newer versions of dot (>1.8.10)

+# support this, this feature is disabled by default.

+

+DOT_MULTI_TARGETS      = NO

+

+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will

+# generate a legend page explaining the meaning of the various boxes and

+# arrows in the dot generated graphs.

+

+GENERATE_LEGEND        = YES

+

+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will

+# remove the intermediate dot files that are used to generate

+# the various graphs.

+

+DOT_CLEANUP            = YES

diff --git a/final/runtime/doc/doxygen/header.tex b/final/runtime/doc/doxygen/header.tex
new file mode 100644
index 0000000..8874077
--- /dev/null
+++ b/final/runtime/doc/doxygen/header.tex
@@ -0,0 +1,77 @@
+% Latex header for doxygen 1.8.3.1
+\documentclass{book}
+\usepackage[a4paper,top=2.5cm,bottom=2.5cm,left=2.5cm,right=2.5cm]{geometry}
+\usepackage{makeidx}
+\usepackage{natbib}
+\usepackage{graphicx}
+\usepackage{multicol}
+\usepackage{float}
+\usepackage{listings}
+\usepackage{color}
+\usepackage{ifthen}
+\usepackage[table]{xcolor}
+\usepackage{textcomp}
+\usepackage{alltt}
+\usepackage{ifpdf}
+\ifpdf
+\usepackage[pdftex,
+            pagebackref=true,
+            colorlinks=true,
+            linkcolor=blue,
+            unicode
+           ]{hyperref}
+\else
+\usepackage[ps2pdf,
+            pagebackref=true,
+            colorlinks=true,
+            linkcolor=blue,
+            unicode
+           ]{hyperref}
+\usepackage{pspicture}
+\fi
+\usepackage[utf8]{inputenc}
+\usepackage{mathptmx}
+\usepackage[scaled=.90]{helvet}
+\usepackage{courier}
+\usepackage{sectsty}
+\usepackage{amssymb}
+\usepackage[titles]{tocloft}
+\usepackage{doxygen}
+\lstset{language=C++,inputencoding=utf8,basicstyle=\footnotesize,breaklines=true,breakatwhitespace=true,tabsize=4,numbers=left }
+\makeindex
+\setcounter{tocdepth}{3}
+\renewcommand{\footrulewidth}{0.4pt}
+\renewcommand{\familydefault}{\sfdefault}
+\hfuzz=15pt
+\setlength{\emergencystretch}{15pt}
+\hbadness=750
+\tolerance=750
+\begin{document}
+\hypersetup{pageanchor=false,citecolor=blue}
+\begin{titlepage}
+\vspace*{7cm}
+\begin{center}
+{\Large LLVM OpenMP\textsuperscript{*} Runtime Library }\\
+\vspace*{1cm}
+{\large Generated by Doxygen $doxygenversion }\\
+\vspace*{0.5cm}
+{\small $datetime }\\
+\end{center}
+\end{titlepage}
+
+{\bf Trademarks}
+The OpenMP name and the OpenMP logo are registered trademarks of the OpenMP Architecture Review Board.
+
+Intel, Xeon, and Intel Xeon Phi are trademarks of Intel Corporation in the U.S. and/or other countries.
+
+This document is Copyright \textcopyright~\the\year the LLVM Project. It is
+subject to the same license terms as the LLVM OpenMP runtime.
+ 
+\textsuperscript{*} Other names and brands may be claimed as the property of others.
+
+\clearemptydoublepage
+\pagenumbering{roman}
+\tableofcontents
+\clearemptydoublepage
+\pagenumbering{arabic}
+\hypersetup{pageanchor=true,citecolor=blue}
diff --git a/final/runtime/doc/doxygen/libomp_interface.h b/final/runtime/doc/doxygen/libomp_interface.h
new file mode 100644
index 0000000..8223db7
--- /dev/null
+++ b/final/runtime/doc/doxygen/libomp_interface.h
@@ -0,0 +1,348 @@
+// This file does not contain any code; it just contains additional text and formatting

+// for doxygen.

+

+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+

+/*! @mainpage LLVM&nbsp; OpenMP* Runtime Library Interface

+@section sec_intro Introduction

+

+This document describes the interface provided by the

+LLVM &nbsp;OpenMP\other runtime library to the compiler.

+Routines that are directly called as simple functions by user code are

+not currently described here, since their definition is in the OpenMP

+specification available from http://openmp.org

+

+The aim here is to explain the interface from the compiler to the runtime.

+

+The overall design is described, and each function in the interface

+has its own description. (At least, that's the ambition; we may not be there yet.)

+

+@section sec_building Building the Runtime

+For the impatient, we cover building the runtime as the first topic here.

+

+A top-level Makefile is provided that attempts to derive a suitable

+configuration for the most commonly used environments.  To see the

+default settings, type:

+@code

+% make info

+@endcode

+

+You can change the Makefile's behavior with the following options:

+

+ - <b>omp_root</b>:    The path to the top-level directory containing the top-level

+             Makefile.  By default, this will take on the value of the

+             current working directory.

+

+ - <b>omp_os</b>:      Operating system.  By default, the build will attempt to

+             detect this. Currently supports "linux", "macos", and

+             "windows".

+

+ - <b>arch</b>:        Architecture. By default, the build will attempt to

+	     detect this if not specified by the user. Currently 

+	     supported values are

+             - "32" for IA-32 architecture 

+             - "32e" for Intel&reg;&nbsp;64 architecture

+             - "mic" for Intel&reg;&nbsp;Many Integrated Core Architecture (

+             If "mic" is specified then "icc" will be used as the

+             compiler, and appropriate k1om binutils will be used. The

+             necessary packages must be installed on the build machine

+             for this to be possible, but an

+	     Intel&reg;&nbsp;Xeon Phi&trade;&nbsp;

+             coprocessor is not required to build the library).

+

+ - <b>compiler</b>:    Which compiler to use for the build.  Defaults to "icc"

+             or "icl" depending on the value of omp_os. Also supports

+             "gcc" when omp_os is "linux" for gcc\other versions

+             4.6.2 and higher. For icc on OS X\other, OS X\other versions 

+	     greater than 10.6 are not supported currently. Also, icc

+	     version 13.0 is not supported. The selected compiler should be

+             installed and in the user's path. The corresponding

+             Fortran compiler should also be in the path.

+

+ - <b>mode</b>:        Library mode: default is "release".  Also supports "debug".

+

+To use any of the options above, simply add &lt;option_name&gt;=&lt;value&gt;.  For

+example, if you want to build with gcc instead of icc, type:

+@code

+% make compiler=gcc

+@endcode

+
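+Options can be combined on one command line. For instance (assuming the gcc

+toolchain is installed), a debug build with gcc for the Intel&reg;&nbsp;64

+architecture might be requested as:

+

+@code

+% make compiler=gcc arch=32e mode=debug

+@endcode

+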

+Underneath the hood of the top-level Makefile, the runtime is built by

+a perl script that in turn drives a detailed runtime system make.  The

+script can be found at <tt>tools/build.pl</tt>, and will print

+information about all its flags and controls if invoked as 

+@code 

+% tools/build.pl --help 

+@endcode

+

+If invoked with no arguments, it will try to build a set of libraries

+that are appropriate for the machine on which the build is happening. 

+There are many options for building out of tree, and configuring library

+features that can also be used. Consult the <tt>--help</tt> output for details.

+

+@section sec_supported Supported RTL Build Configurations

+

+The architectures supported are IA-32 architecture, Intel&reg;&nbsp; 64, and

+Intel&reg;&nbsp; Many Integrated Core Architecture.  The build configurations

+supported are shown in the table below.

+

+<table border=1>

+<tr><th> <th>icc/icl<th>gcc

+<tr><td>Linux\other OS<td>Yes(1,5)<td>Yes(2,4)

+<tr><td>OS X\other<td>Yes(1,3,4)<td>No

+<tr><td>Windows\other OS<td>Yes(1,4)<td>No

+</table>

+(1) On IA-32 architecture and Intel&reg;&nbsp; 64, icc/icl versions 12.x 

+    are supported (12.1 is recommended).<br>

+(2) gcc version 4.6.2 is supported.<br>

+(3) For icc on OS X\other, OS X\other version 10.5.8 is supported.<br>

+(4) Intel&reg;&nbsp; Many Integrated Core Architecture not supported.<br>

+(5) On Intel&reg;&nbsp; Many Integrated Core Architecture, icc/icl versions 13.0 or later are required.

+

+@section sec_frontend Front-end Compilers that work with this RTL

+

+The following compilers are known to do compatible code generation for

+this RTL: icc/icl, gcc.  Code generation is discussed in more detail

+later in this document.

+

+@section sec_outlining Outlining

+

+The runtime interface is based on the idea that the compiler

+"outlines" sections of code that are to run in parallel into separate

+functions that can then be invoked in multiple threads.  For instance,

+simple code like this

+

+@code

+void foo()

+{

+#pragma omp parallel

+    {

+        ... do something ...

+    }

+}

+@endcode

+is converted into something that looks conceptually like this (where

+the names used are merely illustrative; the real library function

+names will be used later after we've discussed some more issues...)

+

+@code

+static void outlinedFooBody()

+{

+    ... do something ...

+}

+

+void foo()

+{

+    __OMP_runtime_fork(outlinedFooBody, (void*)0);   // Not the real function name!

+}

+@endcode

+

+@subsection SEC_SHAREDVARS Addressing shared variables

+

+In real uses of the OpenMP\other API there are normally references 

+from the outlined code  to shared variables that are in scope in the containing function. 

+Therefore the containing function must be able to address 

+these variables. The runtime supports two alternate ways of doing

+this.

+

+@subsubsection SEC_SEC_OT Current Technique

+The technique currently supported by the runtime library is to receive

+a separate pointer to each shared variable that can be accessed from

+the outlined function.  This is what is shown in the example below.

+
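+As a minimal sketch of this convention (the function and variable names here

+are illustrative only), an outlined body for a region that accesses shared

+variables <tt>a</tt> and <tt>b</tt> receives one pointer per shared variable,

+after the standard thread-id arguments:

+

+@code

+static void outlinedBody( int *gtid, int *btid, int *a_shp, float *b_shp )

+{

+    *b_shp += *a_shp;   // shared variables are reached through the pointers

+}

+@endcode

+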

+We hope soon to provide an alternative interface to support the

+alternate implementation described in the next section. The

+alternative implementation has performance advantages for small

+parallel regions that have many shared variables.

+

+@subsubsection SEC_SEC_PT Future Technique

+The idea is to treat the outlined function as though it

+were a lexically nested function, and pass it a single argument which

+is the pointer to the parent's stack frame. Provided that the compiler

+knows the layout of the parent frame when it is generating the outlined

+function it can then access the up-level variables at appropriate

+offsets from the parent frame.  This is a classical compiler technique

+from the 1960s to support languages like Algol (and its descendants)

+that support lexically nested functions.

+

+The main benefit of this technique is that there is no code required

+at the fork point to marshal the arguments to the outlined function.

+Since the runtime knows statically how many arguments must be passed to the

+outlined function, it can easily copy them to the thread's stack

+frame.  Therefore the performance of the fork code is independent of

+the number of shared variables that are accessed by the outlined

+function.

+

+If it is hard to determine the stack layout of the parent while generating the

+outlined code, it is still possible to use this approach by collecting all of

+the variables in the parent that are accessed from outlined functions into

+a single `struct` which is placed on the stack, and whose address is passed

+to the outlined functions. In this way the offsets of the shared variables

+are known (since they are inside the struct) without needing to know

+the complete layout of the parent stack-frame. From the point of view

+of the runtime either of these techniques is equivalent, since in either

+case it only has to pass a single argument to the outlined function to allow 

+it to access shared variables.

+

+A scheme like this is how gcc\other generates outlined functions.

+
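+A minimal sketch of the struct-collection variant follows; the names, and the

+conceptual fork function reused from the outlining example above, are

+illustrative rather than the real interface:

+

+@code

+struct foo_shared { int a; float b; };    // all shared locals of foo()

+

+static void outlinedFooBody( struct foo_shared *shared )

+{

+    shared->b += shared->a;               // known offsets inside the struct

+}

+

+void foo()

+{

+    struct foo_shared shared = { 0, 0.0F };

+    __OMP_runtime_fork( outlinedFooBody, &shared );  // a single argument

+}

+@endcode

+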

+@section SEC_INTERFACES Library Interfaces

+The library functions used for specific parts of the OpenMP\other language implementation

+are documented in different modules.

+

+ - @ref BASIC_TYPES fundamental types used by the runtime in many places

+ - @ref DEPRECATED  functions that are in the library but are no longer required

+ - @ref STARTUP_SHUTDOWN functions for initializing and finalizing the runtime

+ - @ref PARALLEL functions for implementing `omp parallel`

+ - @ref THREAD_STATES functions for supporting thread state inquiries

+ - @ref WORK_SHARING functions for work sharing constructs such as `omp for`, `omp sections`

+ - @ref THREADPRIVATE functions to support thread private data, copyin etc

+ - @ref SYNCHRONIZATION functions to support `omp critical`, `omp barrier`, `omp master`, reductions etc

+ - @ref ATOMIC_OPS functions to support atomic operations

+ - @ref STATS_GATHERING macros to support developer profiling of libomp

+ - Documentation on tasking has still to be written...

+

+@section SEC_EXAMPLES Examples

+@subsection SEC_WORKSHARING_EXAMPLE Work Sharing Example

+This example shows the code generated for a parallel for with reduction and dynamic scheduling.

+

+@code

+extern float foo( void );

+

+int main () {

+    int i; 

+    float r = 0.0; 

+    #pragma omp parallel for schedule(dynamic) reduction(+:r) 

+    for ( i = 0; i < 10; i ++ ) {

+        r += foo(); 

+    }

+}

+@endcode

+

+The transformed code looks like this.

+@code

+extern float foo( void ); 

+

+int main () {

+    static int zero = 0; 

+    auto int gtid; 

+    auto float r = 0.0; 

+    __kmpc_begin( & loc3, 0 ); 

+    // The gtid is not actually required in this example so could be omitted;

+    // We show its initialization here because it is often required for calls into

+    // the runtime and should be locally cached like this.

+    gtid = __kmpc_global_thread_num( & loc3 ); 

+    __kmpc_fork_call( & loc7, 1, main_7_parallel_3, & r ); 

+    __kmpc_end( & loc0 ); 

+    return 0; 

+}

+

+struct main_10_reduction_t_5 { float r_10_rpr; }; 

+

+static kmp_critical_name lck = { 0 };

+static ident_t loc10; // loc10.flags should contain KMP_IDENT_ATOMIC_REDUCE bit set 

+                      // if compiler has generated an atomic reduction.

+

+void main_7_parallel_3( int *gtid, int *btid, float *r_7_shp ) {

+    auto int i_7_pr; 

+    auto int lower, upper, liter, incr; 

+    auto struct main_10_reduction_t_5 reduce; 

+    reduce.r_10_rpr = 0.F; 

+    liter = 0; 

+    __kmpc_dispatch_init_4( & loc7,*gtid, 35, 0, 9, 1, 1 ); 

+    while ( __kmpc_dispatch_next_4( & loc7, *gtid, & liter, & lower, & upper, & incr ) ) {

+        for( i_7_pr = lower; upper >= i_7_pr; i_7_pr ++ ) 

+          reduce.r_10_rpr += foo(); 

+    }

+    switch( __kmpc_reduce_nowait( & loc10, *gtid, 1, 4, & reduce, main_10_reduce_5, & lck ) ) {

+        case 1:

+           *r_7_shp += reduce.r_10_rpr;

+           __kmpc_end_reduce_nowait( & loc10, *gtid, & lck );

+           break;

+        case 2:

+           __kmpc_atomic_float4_add( & loc10, *gtid, r_7_shp, reduce.r_10_rpr );

+           break;

+        default:;

+    }

+} 

+

+void main_10_reduce_5( struct main_10_reduction_t_5 *reduce_lhs, 

+                       struct main_10_reduction_t_5 *reduce_rhs ) 

+{ 

+    reduce_lhs->r_10_rpr += reduce_rhs->r_10_rpr; 

+}

+@endcode

+

+@defgroup BASIC_TYPES Basic Types

+Types that are used throughout the runtime.

+

+@defgroup DEPRECATED Deprecated Functions

+Functions in this group are for backwards compatibility only, and

+should not be used in new code.

+

+@defgroup STARTUP_SHUTDOWN Startup and Shutdown

+These functions are for library initialization and shutdown.

+

+@defgroup PARALLEL Parallel (fork/join)

+These functions are used for implementing <tt>\#pragma omp parallel</tt>.

+

+@defgroup THREAD_STATES Thread Information

+These functions return information about the currently executing thread.

+

+@defgroup WORK_SHARING Work Sharing

+These functions are used for implementing 

+<tt>\#pragma omp for</tt>, <tt>\#pragma omp sections</tt>, <tt>\#pragma omp single</tt> and 

+<tt>\#pragma omp master</tt> constructs. 

+

+When handling loops, there are different functions for each of the signed and unsigned 32 and 64 bit integer types

+which have the name suffixes `_4`, `_4u`, `_8` and `_8u`. The semantics of each of the functions is the same,

+so they are only described once.

+

+Static loop scheduling is handled by  @ref __kmpc_for_static_init_4 and friends. Only a single call is needed,

+since the iterations to be executed by any give thread can be determined as soon as the loop parameters are known.

+

+Dynamic scheduling is handled by the @ref __kmpc_dispatch_init_4 and @ref __kmpc_dispatch_next_4 functions. 

+The init function is called once in each thread outside the loop, while the next function is called each

+time that the previous chunk of work has been exhausted. 

+
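+Stripped to its essentials (using the illustrative names from the worked

+example in @ref SEC_EXAMPLES above), the per-thread dynamic dispatch pattern

+looks like this:

+

+@code

+__kmpc_dispatch_init_4( & loc, gtid, schedule, lower_bound, upper_bound, stride, chunk );

+while ( __kmpc_dispatch_next_4( & loc, gtid, & liter, & lower, & upper, & incr ) ) {

+    for ( i = lower; i <= upper; i += incr )

+        ... body of one chunk ...

+}

+@endcode

+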

+@defgroup SYNCHRONIZATION Synchronization

+These functions are used for implementing barriers.

+

+@defgroup THREADPRIVATE Thread private data support

+These functions support copyin/out and thread private data.

+

+@defgroup STATS_GATHERING Statistics Gathering from OMPTB

+These macros support profiling the libomp library.  Use --stats=on when building with build.pl to enable

+and then use the KMP_* macros to profile (through counts or clock ticks) libomp during execution of an OpenMP program.

+
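+For example, a stats-enabled build could be requested like this (other

+build.pl options are listed by its <tt>--help</tt> output):

+

+@code

+% tools/build.pl --stats=on

+@endcode

+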

+@section sec_stats_env_vars Environment Variables

+

+This section describes the environment variables relevant to stats gathering in libomp.

+

+@code

+KMP_STATS_FILE

+@endcode

+If this environment variable is set, statistics are written to the named file; if the file already exists, the output is appended to it, *NOT OVERWRITTEN*.  If this environment variable is undefined, the statistics will be output to stderr.

+

+@code

+KMP_STATS_THREADS

+@endcode

+This environment variable indicates that thread-specific statistics should be printed as well as aggregate statistics.  Each thread's statistics will be shown as well as the collective sum over all threads.  The values "true", "on", "1", and "yes" all enable per-thread statistics.

+
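+As a usage sketch (the shell syntax, file name, and program name here are

+illustrative), both variables can be set for a single run like this:

+

+@code

+% KMP_STATS_FILE=stats.txt KMP_STATS_THREADS=on ./my_openmp_app

+@endcode

+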

+@defgroup TASKING Tasking support

+These functions support tasking constructs.

+

+@defgroup USER User visible functions

+These functions can be called directly by the user, but are runtime library specific, rather than being OpenMP interfaces.

+

+*/

+

diff --git a/final/runtime/src/CMakeLists.txt b/final/runtime/src/CMakeLists.txt
new file mode 100644
index 0000000..a4e8451
--- /dev/null
+++ b/final/runtime/src/CMakeLists.txt
@@ -0,0 +1,315 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Using expand-vars.pl to generate files
+# - 'file' is generated using expand-vars.pl and 'file'.var
+# - Any .var file should use this recipe
+# TODO: Use CMake's configure_file() instead
+macro(libomp_expand_vars_recipe file_dir filename)
+  get_source_file_property(libomp_extra_evflags ${filename} EV_COMPILE_DEFINITIONS)
+  if("${libomp_extra_evflags}" MATCHES "NOTFOUND")
+    set(libomp_extra_evflags)
+  else()
+    libomp_string_to_list("${libomp_extra_evflags}" libomp_extra_evflags)
+  endif()
+  if(NOT "${filename}" STREQUAL "")
+    add_custom_command(
+      OUTPUT  ${filename}
+      COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/expand-vars.pl
+        --strict ${LIBOMP_EVFLAGS} ${libomp_extra_evflags} ${file_dir}/${filename}.var ${filename}
+      DEPENDS ${file_dir}/${filename}.var kmp_version.c ${LIBOMP_TOOLS_DIR}/expand-vars.pl
+    )
+  endif()
+endmacro()
+libomp_get_evflags(LIBOMP_EVFLAGS)
+libomp_string_to_list("${LIBOMP_EVFLAGS}" LIBOMP_EVFLAGS)
+set_source_files_properties(omp_lib.h PROPERTIES EV_COMPILE_DEFINITIONS "-D KMP_INT_PTR_KIND=\"int_ptr_kind()\"")
+set_source_files_properties(libomp.rc PROPERTIES EV_COMPILE_DEFINITIONS "-D KMP_FILE=${LIBOMP_LIB_FILE}" GENERATED TRUE)
+libomp_expand_vars_recipe(${LIBOMP_INC_DIR} omp.h)
+libomp_expand_vars_recipe(${LIBOMP_INC_DIR} ompt.h)
+libomp_expand_vars_recipe(${LIBOMP_INC_DIR} omp_lib.h)
+libomp_expand_vars_recipe(${LIBOMP_INC_DIR} omp_lib.f)
+libomp_expand_vars_recipe(${LIBOMP_INC_DIR} omp_lib.f90)
+libomp_expand_vars_recipe(${LIBOMP_SRC_DIR} libomp.rc)
+
+# Generate message catalog files: kmp_i18n_id.inc and kmp_i18n_default.inc
+add_custom_command(
+  OUTPUT  kmp_i18n_id.inc
+  COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/message-converter.pl --os=${LIBOMP_PERL_SCRIPT_OS}
+    --arch=${LIBOMP_ARCH} --prefix=kmp_i18n --enum=kmp_i18n_id.inc ${LIBOMP_SRC_DIR}/i18n/en_US.txt
+  DEPENDS ${LIBOMP_SRC_DIR}/i18n/en_US.txt ${LIBOMP_TOOLS_DIR}/message-converter.pl
+)
+add_custom_command(
+  OUTPUT  kmp_i18n_default.inc
+  COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/message-converter.pl --os=${LIBOMP_PERL_SCRIPT_OS}
+    --arch=${LIBOMP_ARCH} --prefix=kmp_i18n --default=kmp_i18n_default.inc ${LIBOMP_SRC_DIR}/i18n/en_US.txt
+  DEPENDS ${LIBOMP_SRC_DIR}/i18n/en_US.txt ${LIBOMP_TOOLS_DIR}/message-converter.pl
+)
+
+# Set the -D definitions for all sources
+libomp_get_definitions_flags(LIBOMP_CONFIGURED_DEFINITIONS_FLAGS)
+add_definitions(${LIBOMP_CONFIGURED_DEFINITIONS_FLAGS})
+
+# Set the -I includes for all sources
+include_directories(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${LIBOMP_SRC_DIR}
+  ${LIBOMP_SRC_DIR}/i18n
+  ${LIBOMP_INC_DIR}
+  ${LIBOMP_SRC_DIR}/thirdparty/ittnotify
+)
+
+# Gather the correct source files to build the library
+set(LIBOMP_CFILES)
+set(LIBOMP_CXXFILES)
+set(LIBOMP_ASMFILES)
+if(${STUBS_LIBRARY})
+  set(LIBOMP_CFILES kmp_stub.c)
+else()
+  # Get C files
+  set(LIBOMP_CFILES
+    kmp_alloc.c
+    kmp_atomic.c
+    kmp_csupport.c
+    kmp_debug.c
+    kmp_itt.c
+    kmp_environment.c
+    kmp_error.c
+    kmp_global.c
+    kmp_i18n.c
+    kmp_io.c
+    kmp_runtime.c
+    kmp_settings.c
+    kmp_str.c
+    kmp_tasking.c
+    kmp_taskq.c
+    kmp_threadprivate.c
+    kmp_utility.c
+  )
+  if(WIN32)
+    # Windows specific files
+    libomp_append(LIBOMP_CFILES z_Windows_NT_util.c)
+    libomp_append(LIBOMP_CFILES z_Windows_NT-586_util.c)
+    libomp_append(LIBOMP_ASMFILES z_Windows_NT-586_asm.asm) # Windows assembly file
+  else()
+    # Unix specific files
+    libomp_append(LIBOMP_CFILES z_Linux_util.c)
+    libomp_append(LIBOMP_CFILES kmp_gsupport.c)
+    libomp_append(LIBOMP_ASMFILES z_Linux_asm.s) # Unix assembly file
+  endif()
+  libomp_append(LIBOMP_CFILES thirdparty/ittnotify/ittnotify_static.c LIBOMP_USE_ITT_NOTIFY)
+  libomp_append(LIBOMP_CFILES kmp_debugger.c LIBOMP_USE_DEBUGGER)
+  # Get C++ files
+  set(LIBOMP_CXXFILES
+    kmp_barrier.cpp
+    kmp_wait_release.cpp
+    kmp_affinity.cpp
+    kmp_dispatch.cpp
+    kmp_lock.cpp
+    kmp_sched.cpp
+  )
+  libomp_append(LIBOMP_CXXFILES kmp_stats.cpp LIBOMP_STATS)
+  libomp_append(LIBOMP_CXXFILES kmp_stats_timing.cpp LIBOMP_STATS)
+  if(${LIBOMP_OMP_VERSION} GREATER 40 OR ${LIBOMP_OMP_VERSION} EQUAL 40)
+    libomp_append(LIBOMP_CXXFILES kmp_taskdeps.cpp)
+    libomp_append(LIBOMP_CXXFILES kmp_cancel.cpp)
+  endif()
+endif()
+# Files common to stubs and normal library
+libomp_append(LIBOMP_CFILES kmp_ftn_cdecl.c)
+libomp_append(LIBOMP_CFILES kmp_ftn_extra.c)
+libomp_append(LIBOMP_CFILES kmp_version.c)
+libomp_append(LIBOMP_CFILES ompt-general.c IF_TRUE LIBOMP_OMPT_SUPPORT)
+
+set(LIBOMP_SOURCE_FILES ${LIBOMP_CFILES} ${LIBOMP_CXXFILES} ${LIBOMP_ASMFILES})
+# For Windows, there is a resource file (.rc -> .res) that is also compiled
+libomp_append(LIBOMP_SOURCE_FILES libomp.rc WIN32)
+
+# Get compiler and assembler flags
+libomp_get_cflags(LIBOMP_CONFIGURED_CFLAGS)
+libomp_get_cxxflags(LIBOMP_CONFIGURED_CXXFLAGS)
+libomp_get_asmflags(LIBOMP_CONFIGURED_ASMFLAGS)
+# Set the compiler flags for each type of source
+set_source_files_properties(${LIBOMP_CFILES} PROPERTIES COMPILE_FLAGS "${LIBOMP_CONFIGURED_CFLAGS}")
+set_source_files_properties(${LIBOMP_CXXFILES} PROPERTIES COMPILE_FLAGS "${LIBOMP_CONFIGURED_CXXFLAGS}")
+set_source_files_properties(${LIBOMP_ASMFILES} PROPERTIES COMPILE_FLAGS "${LIBOMP_CONFIGURED_ASMFLAGS}")
+# Let the compiler handle the assembly files on Unix-like systems
+if(NOT WIN32)
+  set_source_files_properties(${LIBOMP_ASMFILES} PROPERTIES LANGUAGE C)
+endif()
+
+# Remove any cmake-automatic linking of the standard C++ library.
+# We neither need (nor want) the standard C++ library dependency even though we compile C++ files.
+if(NOT ${LIBOMP_USE_STDCPPLIB})
+  set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES)
+endif()
+
+# Add the OpenMP library
+libomp_get_ldflags(LIBOMP_CONFIGURED_LDFLAGS)
+add_library(omp SHARED ${LIBOMP_SOURCE_FILES})
+set_target_properties(omp PROPERTIES
+  PREFIX "" SUFFIX "" OUTPUT_NAME "${LIBOMP_LIB_FILE}"
+  LINK_FLAGS "${LIBOMP_CONFIGURED_LDFLAGS}"
+  LINKER_LANGUAGE C # use C Compiler for linking step
+  SKIP_BUILD_RPATH true # have the Mac linker's -install_name be just "libomp.dylib"
+)
+
+# Linking command will include libraries in LIBOMP_CONFIGURED_LIBFLAGS
+libomp_get_libflags(LIBOMP_CONFIGURED_LIBFLAGS)
+target_link_libraries(omp ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS})
+
+# Create *.inc and omp.h before compiling any sources
+# objects depend on: .inc files and omp.h (and ompt.h if LIBOMP_OMPT_SUPPORT is on)
+# This way the *.inc and omp.h are generated before any compilations take place
+set(LIBOMP_NEEDED_HEADERS kmp_i18n_id.inc kmp_i18n_default.inc omp.h)
+libomp_append(LIBOMP_NEEDED_HEADERS ompt.h LIBOMP_OMPT_SUPPORT)
+add_custom_target(libomp-needed-headers DEPENDS ${LIBOMP_NEEDED_HEADERS})
+add_dependencies(omp libomp-needed-headers)
+
+# Windows specific build rules
+if(WIN32)
+  # Create .def and .rc file before compiling any sources
+  add_custom_target(libomp-needed-windows-files DEPENDS ${LIBOMP_LIB_NAME}.def libomp.rc)
+  add_dependencies(omp libomp-needed-windows-files)
+  # z_Windows_NT-586_asm.asm (masm file): pass it the i386 or x86_64 architecture definition flag
+  if(${IA32})
+    set_source_files_properties(z_Windows_NT-586_asm.asm PROPERTIES COMPILE_DEFINITIONS "_M_IA32")
+  elseif(${INTEL64})
+    set_source_files_properties(z_Windows_NT-586_asm.asm PROPERTIES COMPILE_DEFINITIONS "_M_AMD64")
+  endif()
+  set_source_files_properties(thirdparty/ittnotify/ittnotify_static.c PROPERTIES COMPILE_DEFINITIONS "UNICODE")
+
+  # Create Windows import library
+  # the import library is "re-linked" to include kmp_import.c, which prevents
+  # linking both the Visual Studio OpenMP library and the newly built OpenMP library
+  set_source_files_properties(kmp_import.c PROPERTIES COMPILE_FLAGS "${LIBOMP_CONFIGURED_CFLAGS}")
+  set(LIBOMP_IMP_LIB_FILE ${LIBOMP_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX})
+  set(LIBOMP_GENERATED_IMP_LIB_FILENAME ${LIBOMP_LIB_FILE}${CMAKE_STATIC_LIBRARY_SUFFIX})
+  set_target_properties(omp PROPERTIES
+    VERSION ${LIBOMP_VERSION}.0 # uses /version flag
+    IMPORT_PREFIX "" IMPORT_SUFFIX "" # control generated import library name when building omp
+    ARCHIVE_OUTPUT_NAME ${LIBOMP_GENERATED_IMP_LIB_FILENAME}
+  )
+  # Get generated import library from creating omp
+  get_target_property(LIBOMP_IMPORT_LIB_DIRECTORY omp ARCHIVE_OUTPUT_DIRECTORY)
+  if(LIBOMP_IMPORT_LIB_DIRECTORY)
+    set(LIBOMP_GENERATED_IMP_LIB ${LIBOMP_IMPORT_LIB_DIRECTORY}/${LIBOMP_GENERATED_IMP_LIB_FILENAME})
+  else()
+    set(LIBOMP_GENERATED_IMP_LIB ${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_GENERATED_IMP_LIB_FILENAME})
+  endif()
+  set_source_files_properties(${LIBOMP_GENERATED_IMP_LIB} PROPERTIES GENERATED TRUE EXTERNAL_OBJECT TRUE)
+  # Create new import library that is just the previously created one + kmp_import.c
+  add_library(ompimp STATIC ${LIBOMP_GENERATED_IMP_LIB} kmp_import.c)
+  set_target_properties(ompimp PROPERTIES
+    PREFIX "" SUFFIX "" OUTPUT_NAME "${LIBOMP_IMP_LIB_FILE}"
+    LINKER_LANGUAGE C
+    SKIP_BUILD_RPATH true
+  )
+  add_dependencies(ompimp omp) # ensure generated import library is created first
+
+  # Create def file to designate exported functions
+  libomp_get_gdflags(LIBOMP_GDFLAGS) # generate-def.pl flags (Windows only)
+  libomp_string_to_list("${LIBOMP_GDFLAGS}" LIBOMP_GDFLAGS)
+  add_custom_command(
+    OUTPUT  ${LIBOMP_LIB_NAME}.def
+    COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/generate-def.pl ${LIBOMP_GDFLAGS}
+      -o ${LIBOMP_LIB_NAME}.def ${CMAKE_CURRENT_SOURCE_DIR}/dllexports
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/dllexports ${LIBOMP_TOOLS_DIR}/generate-def.pl
+  )
+endif()
+
+# Building the Fortran module files
+# One compilation step creates both omp_lib.mod and omp_lib_kinds.mod
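+# (Fortran programs then consume these modules with a plain "use omp_lib".)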
+if(${LIBOMP_FORTRAN_MODULES})
+  add_custom_target(libomp-mod ALL DEPENDS omp_lib.mod omp_lib_kinds.mod)
+  libomp_get_fflags(LIBOMP_CONFIGURED_FFLAGS)
+  if(CMAKE_Fortran_COMPILER_SUPPORTS_F90)
+    set(LIBOMP_FORTRAN_SOURCE_FILE omp_lib.f90)
+  else()
+    set(LIBOMP_FORTRAN_SOURCE_FILE omp_lib.f)
+  endif()
+  add_custom_command(
+    OUTPUT omp_lib.mod omp_lib_kinds.mod
+    COMMAND ${CMAKE_Fortran_COMPILER} -c ${LIBOMP_CONFIGURED_FFLAGS} ${LIBOMP_FORTRAN_SOURCE_FILE}
+    DEPENDS ${LIBOMP_FORTRAN_SOURCE_FILE} omp_lib.h
+  )
+  set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES omp_lib${CMAKE_C_OUTPUT_EXTENSION})
+endif()
+
+# Move files to exports/ directory if requested
+if(${LIBOMP_COPY_EXPORTS})
+  include(LibompExports)
+endif()
+
+# Micro test rules for after library has been built (cmake/LibompMicroTests.cmake)
+include(LibompMicroTests)
+add_custom_target(libomp-micro-tests)
+if(NOT ${MIC} AND NOT CMAKE_CROSSCOMPILING)
+  add_dependencies(libomp-micro-tests libomp-test-touch)
+endif()
+if(NOT WIN32 AND NOT APPLE)
+  add_dependencies(libomp-micro-tests libomp-test-relo)
+  add_dependencies(libomp-micro-tests libomp-test-execstack)
+endif()
+if(${MIC})
+  add_dependencies(libomp-micro-tests libomp-test-instr)
+endif()
+add_dependencies(libomp-micro-tests libomp-test-deps)
+
+# Install rules
+# We want to install libomp in DESTDIR/CMAKE_INSTALL_PREFIX/lib
+# We want to install headers in DESTDIR/CMAKE_INSTALL_PREFIX/include
+if(${LIBOMP_STANDALONE_BUILD})
+  set(LIBOMP_HEADERS_INSTALL_PATH include)
+else()
+  string(REGEX MATCH "[0-9]+\\.[0-9]+(\\.[0-9]+)?" CLANG_VERSION ${PACKAGE_VERSION})
+  set(LIBOMP_HEADERS_INSTALL_PATH lib${LIBOMP_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
+endif()
+if(WIN32)
+  install(TARGETS omp RUNTIME DESTINATION bin)
+  install(TARGETS ompimp ARCHIVE DESTINATION lib${LIBOMP_LIBDIR_SUFFIX})
+  # Create aliases (regular copies) of the library for backwards compatibility
+  set(LIBOMP_ALIASES "libiomp5md")
+  foreach(alias IN LISTS LIBOMP_ALIASES)
+    install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E copy \"${LIBOMP_LIB_FILE}\"
+      \"${alias}${CMAKE_SHARED_LIBRARY_SUFFIX}\" WORKING_DIRECTORY \${CMAKE_INSTALL_PREFIX}/bin)")
+    install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E copy \"${LIBOMP_IMP_LIB_FILE}\"
+      \"${alias}${CMAKE_STATIC_LIBRARY_SUFFIX}\" WORKING_DIRECTORY \${CMAKE_INSTALL_PREFIX}/lib${LIBOMP_LIBDIR_SUFFIX})")
+  endforeach()
+else()
+  install(TARGETS omp LIBRARY DESTINATION lib${LIBOMP_LIBDIR_SUFFIX})
+  # Create aliases (symlinks) of the library for backwards compatibility
+  set(LIBOMP_ALIASES "libgomp;libiomp5")
+  foreach(alias IN LISTS LIBOMP_ALIASES)
+    install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E create_symlink \"${LIBOMP_LIB_FILE}\"
+      \"${alias}${CMAKE_SHARED_LIBRARY_SUFFIX}\" WORKING_DIRECTORY
+      \$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/lib${LIBOMP_LIBDIR_SUFFIX})")
+  endforeach()
+endif()
+install(
+  FILES
+  ${CMAKE_CURRENT_BINARY_DIR}/omp.h
+  DESTINATION ${LIBOMP_HEADERS_INSTALL_PATH}
+)
+if(${LIBOMP_OMPT_SUPPORT})
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ompt.h DESTINATION ${LIBOMP_HEADERS_INSTALL_PATH})
+endif()
+if(${LIBOMP_FORTRAN_MODULES})
+  install(FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/omp_lib.h
+    ${CMAKE_CURRENT_BINARY_DIR}/omp_lib.mod
+    ${CMAKE_CURRENT_BINARY_DIR}/omp_lib_kinds.mod
+    DESTINATION ${LIBOMP_HEADERS_INSTALL_PATH}
+  )
+endif()
+
diff --git a/final/runtime/src/defs.mk b/final/runtime/src/defs.mk
new file mode 100644
index 0000000..2bf82ac
--- /dev/null
+++ b/final/runtime/src/defs.mk
@@ -0,0 +1,67 @@
+# defs.mk
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# --------------------------------------------------------------------------------------------------
+# This file contains definitions common to OpenMP RTL and DSL makefiles.
+# --------------------------------------------------------------------------------------------------
+
+# Include really common definitions.
+include $(LIBOMP_WORK)tools/src/common-defs.mk
+
+#
+# Directories.
+#
+
+# Check and normalize LIBOMP_EXPORTS.
+ifeq "$(LIBOMP_EXPORTS)" ""
+    $(error LIBOMP_EXPORTS environment variable must be set)
+endif
+ifneq "$(words $(LIBOMP_EXPORTS))" "1"
+    $(error LIBOMP_EXPORTS must not contain spaces)
+endif
+override LIBOMP_EXPORTS := $(subst \,/,$(LIBOMP_EXPORTS))
+ifeq "$(filter %/,$(LIBOMP_EXPORTS))" ""
+    override LIBOMP_EXPORTS := $(LIBOMP_EXPORTS)/
+endif
+# Output directories.
+out_dir      = $(LIBOMP_EXPORTS)
+out_cmn_dir  = $(out_dir)common$(suffix)/
+out_ptf_dir  = $(out_dir)$(platform)$(suffix)/
+_out_lib_dir = $(out_dir)$(1)$(suffix)/lib$(if $(filter mac_%,$(1)),.thin)/
+out_lib_dir  = $(call _out_lib_dir,$(platform))
+ifneq "$(arch)" "mic"
+out_l10n_dir = $(out_lib_dir)$(if $(filter lin mac,$(os)),locale/)
+else
+out_l10n_dir = $(out_lib_dir)
+endif
+ifeq "$(os)" "mac"
+    _out_lib_fat_dir = $(out_dir)$(1)$(suffix)/lib/
+    out_lib_fat_dir  = $(call _out_lib_fat_dir,$(platform))
+    out_l10n_fat_dir = $(out_lib_fat_dir)locale/
+endif
+
+#
+# Retrieve build number.
+#
+
+ifeq "$(clean)" ""
+    # Parse kmp_version.c file, look for "#define KMP_VERSION_BUILD yyyymmdd" string,
+    # leave only "yyyymmdd". Note: Space after $$1 is important, it helps to detect possible errors.
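+    # For example (hypothetical date): the line "#define KMP_VERSION_BUILD 20150801"
+    # in kmp_version.c yields build = 20150801.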
+    build := $(strip $(shell $(perl) -p -e '$$_ =~ s{^(?:\s*\#define\s+KMP_VERSION_BUILD\s+([0-9]{8})|.*)\s*\n}{$$1 }' $(LIBOMP_WORK)src/kmp_version.c))
+    ifneq "$(words $(build))" "1"
+        $(error Failed to parse "kmp_version.c", cannot extract build number)
+    endif
+    $(call say,Build  : $(build)$(if $(filter 00000000,$(build)), (development)))
+endif
+
+# end of file #
diff --git a/final/runtime/src/dllexports b/final/runtime/src/dllexports
new file mode 100644
index 0000000..dd3c393
--- /dev/null
+++ b/final/runtime/src/dllexports
@@ -0,0 +1,976 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Deprecated entry points (numbers are reserved):
+- __kmpc_barrier_reduce_master              109
+- __kmpc_end_barrier_reduce_master          122
+- __kmpc_for_init_4                         131
+- __kmpc_for_init_8                         132
+- __kmpc_for_next_4                         133
+- __kmpc_for_next_8                         134
+- __kmpc_fork_call_bound                    139
+- __kmpc_reduce_master_nowait               149
+- __kmpc_omp_task_begin                     194
+- __kmpc_omp_task_complete                  195
+- kmpc_sharable_calloc                      218
+- kmpc_sharable_free                        219
+- kmpc_sharable_malloc                      220
+- kmpc_sharable_realloc                     221
+- kmpc_aligned_sharable_malloc              223
+- mpai4a                                    500
+- mpai8a                                    501
+- mpar4a                                    502
+- mpar8a                                    503
+- mpax4x                                    504
+- mpax8x                                    505
+- mpobar                                    506
+- mpoebr                                    507
+- mpofork                                   508
+- mpofrk                                    509
+- mpojoin                                   510
+- mpoxbr                                    511
+- mppadj                                    512
+- mppaff                                    513
+- mppbar                                    514
+- mppbeg                                    515
+- mppdeo                                    516
+- mppdnx                                    517
+- mppdnxd                                   518
+- mppdon                                    519
+- mppdxo                                    520
+- mppebr                                    521
+- mppecs                                    522
+- mppems                                    523
+- mppenc                                    524
+- mppend                                    525
+- mppepa                                    526
+- mppesp                                    527
+- mppfkd                                    528
+- mppfkt                                    529
+- mppfork                                   530
+- mppfrk                                    531
+- mppioa                                    532
+- mppiws                                    533
+- mppjoin                                   534
+- mppnth                                    535
+- mpppqa                                    536
+- mpppqc                                    537
+- mpppqs                                    538
+- mpptid                                    539
+- mpptpa                                    540
+- mpptpc                                    541
+- mpptpz                                    542
+- mppvsy                                    543
+- mppxbr                                    544
+- mppxcs                                    545
+- mppxms                                    546
+- mppxnc                                    547
+- mppxpa                                    548
+- mppxpr                                    549
+- mppxsp                                    550
+- mppxth                                    551
+- mpsbar                                    552
+- mpscpr                                    597
+- mpsebr                                    553
+- mpserd                                    554
+- mpsfd4                                    555
+- mpsfd8                                    556
+- mpsid4                                    557
+- mpsid8                                    558
+- mpsnd4                                    559
+- mpsnd8                                    560
+- mpsont                                    561
+- mpsred                                    562
+- mpsunt                                    563
+- mpsxbr                                    564
+- mpsxrd                                    565
+- mptadj                                    566
+- mptaff                                    567
+- mptbar                                    568
+- mptdeo                                    569
+- mptdin                                    570
+- mptdind                                   571
+- mptdnx                                    572
+- mptdnxd                                   573
+- mptdon                                    574
+- mptdxo                                    575
+- mptebr                                    576
+- mptecs                                    577
+- mptems                                    578
+- mptenc                                    579
+- mptepa                                    580
+- mptesp                                    581
+- mptfkd                                    582
+- mptppa                                    583
+- mptppc                                    584
+- mptpps                                    585
+- mpttpa                                    586
+- mpttpc                                    587
+- mpttpz                                    588
+- mptvsy                                    589
+- mptxbr                                    590
+- mptxcs                                    591
+- mptxms                                    592
+- mptxnc                                    593
+- mptxpa                                    594
+- mptxsp                                    595
+- mppcpr                                    596
+- ftn_set_library_gang                      736
+- kmp_set_library_gang
+- kmp_sharable_calloc                       760
+- kmp_sharable_free                         761
+- kmp_sharable_malloc                       762
+- kmp_sharable_realloc                      763
+- kmp_aligned_sharable_malloc               764
+- kmp_deferred_atomic_add_i4                765
+- kmp_deferred_atomic_add_i8                766
+- kmp_deferred_atomic_add_r4                767
+- kmp_deferred_atomic_add_r8                768
+- kmp_lock_cond_wait                        770
+- kmp_lock_cond_signal                      771
+- kmp_lock_cond_broadcast                   772
+- kmp_nest_lock_cond_wait                   773
+- kmp_nest_lock_cond_signal                 774
+- kmp_nest_lock_cond_broadcast              775
+- kmp_get_process_num                       781
+- kmp_get_num_processes                     782
+- kmp_get_process_thread_num                783
+- kmp_private_mmap                          784   # not implemented?
+- kmp_sharable_mmap                         785   # not implemented?
+- kmp_private_munmap                        786   # not implemented?
+- kmp_sharable_munmap                       787   # not implemented?
+- kmp_is_sharable                           788   # not implemented?
+
+%ifndef stub
+
+
+    #
+    # The following entry points are added so that the backtraces from
+    # the tools contain meaningful names for all the functions that might
+    # appear in a backtrace of a thread which is blocked in the RTL.
+    #
+
+    # Regular entry points
+        __kmp_wait_yield_4
+        __kmp_wait_yield_8
+        __kmp_fork_call
+        __kmp_invoke_microtask
+        __kmp_launch_monitor
+        __kmp_launch_worker
+        __kmp_reap_monitor
+        __kmp_reap_worker
+        __kmp_acquire_tas_lock
+        __kmp_acquire_nested_tas_lock
+        __kmp_acquire_ticket_lock
+        __kmp_acquire_nested_ticket_lock
+        __kmp_acquire_queuing_lock
+        __kmp_acquire_nested_queuing_lock
+        __kmp_acquire_drdpa_lock
+        __kmp_acquire_nested_drdpa_lock
+
+    %ifdef KMP_DEBUG
+        # allows console output capability for applications that don't have it
+        __kmp_printf
+    %endif
+
+
+#if USE_DEBUGGER
+        __kmp_debugging                         DATA
+        __kmp_omp_debug_struct_info             DATA
+#endif /* USE_DEBUGGER */
+
+        # Symbols for MS mutual detection:
+        _You_must_link_with_exactly_one_OpenMP_library    DATA
+        _You_must_link_with_Intel_OpenMP_library          DATA
+        %ifdef msvc_compat
+            _You_must_link_with_Microsoft_OpenMP_library  DATA
+        %endif
+
+        __kmp_wait_32
+        __kmp_wait_64
+        __kmp_wait_oncore
+        __kmp_release_32
+        __kmp_release_64
+        __kmp_release_oncore
+
+
+#    VT_getthid                              1
+#    vtgthid                                 2
+
+    __kmpc_atomic_4                         100
+    __kmpc_atomic_8                         101
+    __kmpc_atomic_fixed4_add                102
+    __kmpc_atomic_fixed8_add                103
+    __kmpc_atomic_float4_add                104
+    __kmpc_atomic_float8_add                105
+    __kmpc_barrier                          106
+    __kmpc_barrier_master                   107
+    __kmpc_barrier_master_nowait            108
+    __kmpc_begin                            110
+    __kmpc_bound_num_threads                111
+    __kmpc_bound_thread_num                 112
+    __kmpc_critical                         113
+    __kmpc_dispatch_fini_4                  114
+    __kmpc_dispatch_fini_8                  115
+    __kmpc_dispatch_init_4                  116
+    __kmpc_dispatch_init_8                  117
+    __kmpc_dispatch_next_4                  118
+    __kmpc_dispatch_next_8                  119
+    __kmpc_end                              120
+    __kmpc_end_barrier_master               121
+    __kmpc_end_critical                     123
+    __kmpc_end_master                       124
+    __kmpc_end_ordered                      125
+    __kmpc_end_serialized_parallel          126
+    __kmpc_end_single                       127
+    __kmpc_end_taskq                        128
+    __kmpc_end_taskq_task                   129
+    __kmpc_flush                            130
+    __kmpc_for_static_fini                  135
+    __kmpc_for_static_init_4                136
+    __kmpc_for_static_init_8                137
+    __kmpc_fork_call                        138
+    __kmpc_global_num_threads               140
+    __kmpc_global_thread_num                141
+    __kmpc_in_parallel                      142
+    __kmpc_invoke_task_func                 143
+    __kmpc_master                           144
+    __kmpc_ok_to_fork                       145
+    __kmpc_ordered                          146
+    __kmpc_pop_num_threads                  147
+    __kmpc_push_num_threads                 148
+    __kmpc_serialized_parallel              150
+    __kmpc_single                           151
+    __kmpc_task                             152
+    __kmpc_task_buffer                      153
+    __kmpc_taskq                            154
+    __kmpc_taskq_task                       155
+    __kmpc_threadprivate                    156
+    __kmpc_threadprivate_cached             157
+    __kmpc_threadprivate_register           158
+    __kmpc_threadprivate_register_vec       159
+#    __kmpc_ssp_begin                        160
+#    __kmpc_ssp_fork                         161
+#    __kmpc_ssp_end                          162
+#    __kmpc_ssp_post_4                       163
+#    __kmpc_ssp_post_8                       164
+#    __kmpc_ssp_wait_4                       165
+#    __kmpc_ssp_wait_8                       166
+#    __kmpc_ssp_distance_4                   167
+#    __kmpc_ssp_distance_8                   168
+#    __kmpc_in_ssp                           169
+#    __kmpc_ssp_thread_num                   170
+#    __kmpc_ssp_num_threads                  171
+    __kmpc_copyprivate                      172
+#    __kmpc_ssp_get_max_threads              173
+#    __kmpc_ssp_set_max_threads              174
+    __kmpc_init_lock                        175
+    __kmpc_destroy_lock                     176
+    __kmpc_set_lock                         177
+    __kmpc_unset_lock                       178
+    __kmpc_test_lock                        179
+    __kmpc_init_nest_lock                   180
+    __kmpc_destroy_nest_lock                181
+    __kmpc_set_nest_lock                    182
+    __kmpc_unset_nest_lock                  183
+    __kmpc_test_nest_lock                   184
+#    __kmpc_ssp_init_thread                  185
+#    __kmpc_ssp_set_event                    186
+    __kmpc_reduce_nowait                    187
+    __kmpc_end_reduce_nowait                188
+    __kmpc_reduce                           189
+    __kmpc_end_reduce                       190
+
+# OpenMP 3.0
+
+%ifdef OMP_30
+    __kmpc_omp_task_alloc                   191
+    __kmpc_omp_task                         192
+    __kmpc_omp_taskwait                     193
+    __kmpc_omp_task_begin_if0               196
+    __kmpc_omp_task_complete_if0            197
+    __kmpc_omp_task_parts                   198
+%endif # OMP_30
+
+#   __omp_collector_api                  199
+
+    # These functions are for testing purposes. There is no need for a stable ordinal number:
+    __kmp_get_reduce_method
+
+%endif  # not defined stub
+
+kmpc_calloc                                 200
+kmpc_free                                   201
+%ifndef stub
+    # These functions are exported from libguide, but declared neither in omp.h nor in omp_lib.h.
+#    kmpc_get_banner                         202
+#    kmpc_get_poolmode                       203
+#    kmpc_get_poolsize                       204
+#    kmpc_get_poolstat                       205
+#    kmpc_poolprint                          207
+#    kmpc_print_banner                       208
+#    kmpc_set_poolmode                       214
+#    kmpc_set_poolsize                       215
+%endif
+kmpc_malloc                                 206
+kmpc_realloc                                209
+kmpc_set_blocktime                          211
+kmpc_set_library                            212
+# kmpc_set_parallel_name                      213
+kmpc_set_stacksize                          216
+kmpc_set_stacksize_s                        222
+# kmpc_set_stats                              217
+kmpc_set_defaults                           224
+
+# OMP 3.0 entry points for unsigned loop iteration variables
+%ifndef stub
+    %ifdef OMP_30
+        __kmpc_for_static_init_8u           225
+        __kmpc_dispatch_init_8u             226
+        __kmpc_dispatch_next_8u             227
+        __kmpc_dispatch_fini_8u             228
+        __kmpc_for_static_init_4u           229
+        __kmpc_dispatch_init_4u             230
+        __kmpc_dispatch_next_4u             231
+        __kmpc_dispatch_fini_4u             232
+    %endif # OMP_30
+%endif
+
+%ifndef stub
+    __kmpc_get_taskid                       233
+    __kmpc_get_parent_taskid                234
+%endif
+
+# OpenMP 3.1 entry points
+%ifndef stub
+    %ifdef OMP_30
+        __kmpc_omp_taskyield                235
+    %endif # OMP_30
+    __kmpc_place_threads                    236
+%endif
+
+# OpenMP 4.0 entry points
+%ifndef stub
+    %ifdef OMP_40
+        __kmpc_push_proc_bind               237
+        __kmpc_taskgroup                    238
+        __kmpc_end_taskgroup                239
+        __kmpc_push_num_teams               240
+        __kmpc_fork_teams                   241
+        __kmpc_omp_task_with_deps           242
+        __kmpc_omp_wait_deps                243
+        __kmpc_cancel                       244
+        __kmpc_cancellationpoint            245
+        __kmpc_cancel_barrier               246
+        __kmpc_dist_for_static_init_4       247
+        __kmpc_dist_for_static_init_4u      248
+        __kmpc_dist_for_static_init_8       249
+        __kmpc_dist_for_static_init_8u      250
+        __kmpc_dist_dispatch_init_4         251
+        __kmpc_dist_dispatch_init_4u        252
+        __kmpc_dist_dispatch_init_8         253
+        __kmpc_dist_dispatch_init_8u        254
+        __kmpc_team_static_init_4           255
+        __kmpc_team_static_init_4u          256
+        __kmpc_team_static_init_8           257
+        __kmpc_team_static_init_8u          258
+    %endif # OMP_40
+%endif
+
+# OpenMP 4.1 entry points
+%ifndef stub
+    %ifdef OMP_41
+        __kmpc_proxy_task_completed         259
+        __kmpc_proxy_task_completed_ooo     260
+    %endif
+%endif
+
+# User API entry points that have both lower- and uppercase versions for Fortran.
+# The number for the lowercase version is indicated; the number for the uppercase version is obtained by adding 1000.
+# User API entry points are entry points that start with 'kmp_' or 'omp_'.
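+# For example, omp_get_wtime below has ordinal 709, so its uppercase Fortran variant
+# OMP_GET_WTIME is exported with ordinal 1709.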
+
+omp_destroy_lock                            700
+omp_destroy_nest_lock                       701
+omp_get_dynamic                             702
+omp_get_max_threads                         703
+omp_get_nested                              704
+omp_get_num_procs                           705
+omp_get_num_threads                         706
+omp_get_thread_num                          707
+omp_get_wtick                               708
+omp_get_wtime                               709
+omp_in_parallel                             710
+omp_init_lock                               711
+omp_init_nest_lock                          712
+omp_set_dynamic                             713
+omp_set_lock                                714
+omp_set_nest_lock                           715
+omp_set_nested                              716
+omp_set_num_threads                         717
+omp_test_lock                               718
+omp_test_nest_lock                          719
+omp_unset_lock                              720
+omp_unset_nest_lock                         721
+
+ompc_set_dynamic                            722
+ompc_set_nested                             723
+ompc_set_num_threads                        724
+
+kmp_calloc                                  725
+kmp_free                                    726
+kmp_get_blocktime                           727
+kmp_get_library                             728
+kmp_get_stacksize                           729
+kmp_malloc                                  730
+#kmp_print_banner                            731
+kmp_realloc                                 732
+kmp_set_blocktime                           734
+kmp_set_library                             735
+kmp_set_library_serial                      737
+kmp_set_library_throughput                  738
+kmp_set_library_turnaround                  739
+# kmp_set_parallel_name                       740
+kmp_set_stacksize                           741
+# kmp_set_stats                               742
+kmp_get_num_known_threads                   743
+kmp_set_stacksize_s                         744
+kmp_get_stacksize_s                         745
+kmp_set_defaults                            746
+kmp_set_warnings_on                         779
+kmp_set_warnings_off                        780
+
+%ifdef OMP_30
+    omp_get_active_level                    789
+    omp_get_level                           790
+    omp_get_ancestor_thread_num             791
+    omp_get_team_size                       792
+    omp_get_thread_limit                    793
+    omp_get_max_active_levels               794
+    omp_set_max_active_levels               795
+    omp_get_schedule                        796
+    omp_set_schedule                        797
+    ompc_set_max_active_levels              798
+    ompc_set_schedule                       799
+    ompc_get_ancestor_thread_num            800
+    ompc_get_team_size                      801
+    kmp_set_affinity                        850
+    kmp_get_affinity                        851
+    kmp_get_affinity_max_proc               852
+    kmp_create_affinity_mask                853
+    kmp_destroy_affinity_mask               854
+    kmp_set_affinity_mask_proc              855
+    kmpc_set_affinity_mask_proc             856
+    kmp_unset_affinity_mask_proc            857
+    kmpc_unset_affinity_mask_proc           858
+    kmp_get_affinity_mask_proc              859
+    kmpc_get_affinity_mask_proc             860
+%endif # OMP_30
+
+# OpenMP 3.1
+
+%ifdef OMP_30
+    omp_in_final                            861
+%endif # OMP_30
+
+# OpenMP 4.0
+
+%ifdef OMP_40
+    omp_get_proc_bind                       862
+   #omp_set_proc_bind                       863
+   #omp_curr_proc_bind                      864
+    omp_get_num_teams                       865
+    omp_get_team_num                        866
+    omp_get_cancellation                    867
+    kmp_get_cancellation_status             868
+    omp_is_initial_device                   869
+%endif # OMP_40
+
+%ifndef stub
+    # Ordinals between 900 and 999 are reserved
+
+    # Ordinals between 1000 and 1999 are reserved
+    # for user-callable uppercase Fortran entries.
+
+
+    # ATOMIC entries
+
+    %ifdef HAVE_QUAD
+    __kmpc_atomic_cmplx16_div              2000
+    %endif
+
+    __kmpc_atomic_fixed1_add               2001
+    __kmpc_atomic_fixed1_andb              2002
+    __kmpc_atomic_fixed1_div               2003
+    __kmpc_atomic_fixed1u_div              2004
+    __kmpc_atomic_fixed1_mul               2005
+    __kmpc_atomic_fixed1_orb               2006
+    __kmpc_atomic_fixed1_shl               2007
+    __kmpc_atomic_fixed1_shr               2008
+    __kmpc_atomic_fixed1u_shr              2009
+    __kmpc_atomic_fixed1_sub               2010
+    __kmpc_atomic_fixed1_xor               2011
+
+    __kmpc_atomic_fixed2_add               2012
+    __kmpc_atomic_fixed2_andb              2013
+    __kmpc_atomic_fixed2_div               2014
+    __kmpc_atomic_fixed2u_div              2015
+    __kmpc_atomic_fixed2_mul               2016
+    __kmpc_atomic_fixed2_orb               2017
+    __kmpc_atomic_fixed2_shl               2018
+    __kmpc_atomic_fixed2_shr               2019
+    __kmpc_atomic_fixed2u_shr              2020
+    __kmpc_atomic_fixed2_sub               2021
+    __kmpc_atomic_fixed2_xor               2022
+
+   #__kmpc_atomic_fixed4_add           # declared above #102
+    __kmpc_atomic_fixed4_sub               2024
+   #__kmpc_atomic_float4_add           # declared above #104
+    __kmpc_atomic_float4_sub               2026
+   #__kmpc_atomic_fixed8_add           # declared above #103
+    __kmpc_atomic_fixed8_sub               2028
+   #__kmpc_atomic_float8_add           # declared above #105
+    __kmpc_atomic_float8_sub               2030
+
+    __kmpc_atomic_fixed4_andb              2031
+    __kmpc_atomic_fixed4_div               2032
+    __kmpc_atomic_fixed4u_div              2033
+    __kmpc_atomic_fixed4_mul               2034
+    __kmpc_atomic_fixed4_orb               2035
+    __kmpc_atomic_fixed4_shl               2036
+    __kmpc_atomic_fixed4_shr               2037
+    __kmpc_atomic_fixed4u_shr              2038
+    __kmpc_atomic_fixed4_xor               2039
+    __kmpc_atomic_fixed8_andb              2040
+    __kmpc_atomic_fixed8_div               2041
+    __kmpc_atomic_fixed8u_div              2042
+    __kmpc_atomic_fixed8_mul               2043
+    __kmpc_atomic_fixed8_orb               2044
+    __kmpc_atomic_fixed8_shl               2045
+    __kmpc_atomic_fixed8_shr               2046
+    __kmpc_atomic_fixed8u_shr              2047
+    __kmpc_atomic_fixed8_xor               2048
+    __kmpc_atomic_float4_div               2049
+    __kmpc_atomic_float4_mul               2050
+    __kmpc_atomic_float8_div               2051
+    __kmpc_atomic_float8_mul               2052
+
+    __kmpc_atomic_fixed1_andl              2053
+    __kmpc_atomic_fixed1_orl               2054
+    __kmpc_atomic_fixed2_andl              2055
+    __kmpc_atomic_fixed2_orl               2056
+    __kmpc_atomic_fixed4_andl              2057
+    __kmpc_atomic_fixed4_orl               2058
+    __kmpc_atomic_fixed8_andl              2059
+    __kmpc_atomic_fixed8_orl               2060
+
+    __kmpc_atomic_fixed1_max               2061
+    __kmpc_atomic_fixed1_min               2062
+    __kmpc_atomic_fixed2_max               2063
+    __kmpc_atomic_fixed2_min               2064
+    __kmpc_atomic_fixed4_max               2065
+    __kmpc_atomic_fixed4_min               2066
+    __kmpc_atomic_fixed8_max               2067
+    __kmpc_atomic_fixed8_min               2068
+    __kmpc_atomic_float4_max               2069
+    __kmpc_atomic_float4_min               2070
+    __kmpc_atomic_float8_max               2071
+    __kmpc_atomic_float8_min               2072
+
+    __kmpc_atomic_fixed1_neqv              2073
+    __kmpc_atomic_fixed2_neqv              2074
+    __kmpc_atomic_fixed4_neqv              2075
+    __kmpc_atomic_fixed8_neqv              2076
+    __kmpc_atomic_fixed1_eqv               2077
+    __kmpc_atomic_fixed2_eqv               2078
+    __kmpc_atomic_fixed4_eqv               2079
+    __kmpc_atomic_fixed8_eqv               2080
+
+    __kmpc_atomic_float10_add              2081
+    __kmpc_atomic_float10_sub              2082
+    __kmpc_atomic_float10_mul              2083
+    __kmpc_atomic_float10_div              2084
+
+    __kmpc_atomic_cmplx4_add               2085
+    __kmpc_atomic_cmplx4_sub               2086
+    __kmpc_atomic_cmplx4_mul               2087
+    __kmpc_atomic_cmplx4_div               2088
+    __kmpc_atomic_cmplx8_add               2089
+    __kmpc_atomic_cmplx8_sub               2090
+    __kmpc_atomic_cmplx8_mul               2091
+    __kmpc_atomic_cmplx8_div               2092
+    __kmpc_atomic_cmplx10_add              2093
+    __kmpc_atomic_cmplx10_sub              2094
+    __kmpc_atomic_cmplx10_mul              2095
+    __kmpc_atomic_cmplx10_div              2096
+    %ifdef HAVE_QUAD
+    __kmpc_atomic_cmplx16_add              2097
+    __kmpc_atomic_cmplx16_sub              2098
+    __kmpc_atomic_cmplx16_mul              2099
+   #__kmpc_atomic_cmplx16_div              2000 # moved up because of mistake in number (supposed to be 2100)
+
+    __kmpc_atomic_float16_add              2101
+    __kmpc_atomic_float16_sub              2102
+    __kmpc_atomic_float16_mul              2103
+    __kmpc_atomic_float16_div              2104
+    __kmpc_atomic_float16_max              2105
+    __kmpc_atomic_float16_min              2106
+
+    __kmpc_atomic_fixed1_add_fp            2107
+    __kmpc_atomic_fixed1_sub_fp            2108
+    __kmpc_atomic_fixed1_mul_fp            2109
+    __kmpc_atomic_fixed1_div_fp            2110
+    __kmpc_atomic_fixed1u_div_fp           2111
+
+    __kmpc_atomic_fixed2_add_fp            2112
+    __kmpc_atomic_fixed2_sub_fp            2113
+    __kmpc_atomic_fixed2_mul_fp            2114
+    __kmpc_atomic_fixed2_div_fp            2115
+    __kmpc_atomic_fixed2u_div_fp           2116
+
+    __kmpc_atomic_fixed4_add_fp            2117
+    __kmpc_atomic_fixed4_sub_fp            2118
+    __kmpc_atomic_fixed4_mul_fp            2119
+    __kmpc_atomic_fixed4_div_fp            2120
+    __kmpc_atomic_fixed4u_div_fp           2121
+
+    __kmpc_atomic_fixed8_add_fp            2122
+    __kmpc_atomic_fixed8_sub_fp            2123
+    __kmpc_atomic_fixed8_mul_fp            2124
+    __kmpc_atomic_fixed8_div_fp            2125
+    __kmpc_atomic_fixed8u_div_fp           2126
+
+    __kmpc_atomic_float4_add_fp            2127
+    __kmpc_atomic_float4_sub_fp            2128
+    __kmpc_atomic_float4_mul_fp            2129
+    __kmpc_atomic_float4_div_fp            2130
+
+    __kmpc_atomic_float8_add_fp            2131
+    __kmpc_atomic_float8_sub_fp            2132
+    __kmpc_atomic_float8_mul_fp            2133
+    __kmpc_atomic_float8_div_fp            2134
+
+    __kmpc_atomic_float10_add_fp           2135
+    __kmpc_atomic_float10_sub_fp           2136
+    __kmpc_atomic_float10_mul_fp           2137
+    __kmpc_atomic_float10_div_fp           2138
+    %endif
+
+    __kmpc_atomic_fixed1_mul_float8        2169
+    __kmpc_atomic_fixed1_div_float8        2170
+
+    __kmpc_atomic_fixed2_mul_float8        2174
+    __kmpc_atomic_fixed2_div_float8        2175
+
+    __kmpc_atomic_fixed4_mul_float8        2179
+    __kmpc_atomic_fixed4_div_float8        2180
+
+    __kmpc_atomic_fixed8_mul_float8        2184
+    __kmpc_atomic_fixed8_div_float8        2185
+
+    __kmpc_atomic_float4_add_float8        2187
+    __kmpc_atomic_float4_sub_float8        2188
+    __kmpc_atomic_float4_mul_float8        2189
+    __kmpc_atomic_float4_div_float8        2190
+
+    __kmpc_atomic_cmplx4_add_cmplx8        2231
+    __kmpc_atomic_cmplx4_sub_cmplx8        2232
+    __kmpc_atomic_cmplx4_mul_cmplx8        2233
+    __kmpc_atomic_cmplx4_div_cmplx8        2234
+
+    __kmpc_atomic_1                        2247
+    __kmpc_atomic_2                        2248
+   #__kmpc_atomic_4                    # declared above #100
+   #__kmpc_atomic_8                    # declared above #101
+    __kmpc_atomic_10                       2251
+    __kmpc_atomic_16                       2252
+    __kmpc_atomic_20                       2253
+    __kmpc_atomic_32                       2254
+
+    %ifdef arch_32
+
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_float16_add_a16      2255
+        __kmpc_atomic_float16_sub_a16      2256
+        __kmpc_atomic_float16_mul_a16      2257
+        __kmpc_atomic_float16_div_a16      2258
+        __kmpc_atomic_float16_max_a16      2259
+        __kmpc_atomic_float16_min_a16      2260
+
+        __kmpc_atomic_cmplx16_add_a16      2261
+        __kmpc_atomic_cmplx16_sub_a16      2262
+        __kmpc_atomic_cmplx16_mul_a16      2263
+        __kmpc_atomic_cmplx16_div_a16      2264
+        %endif
+
+    %endif
+
+    %ifndef arch_64
+
+        # ATOMIC extensions for OpenMP 3.1 spec (x86 and x64 only)
+
+        __kmpc_atomic_fixed1_rd                2265
+        __kmpc_atomic_fixed2_rd                2266
+        __kmpc_atomic_fixed4_rd                2267
+        __kmpc_atomic_fixed8_rd                2268
+        __kmpc_atomic_float4_rd                2269
+        __kmpc_atomic_float8_rd                2270
+        __kmpc_atomic_float10_rd               2271
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_float16_rd               2272
+        %endif
+        __kmpc_atomic_cmplx4_rd                2273
+        __kmpc_atomic_cmplx8_rd                2274
+        __kmpc_atomic_cmplx10_rd               2275
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_cmplx16_rd               2276
+            %ifdef arch_32
+                __kmpc_atomic_float16_a16_rd       2277
+                __kmpc_atomic_cmplx16_a16_rd       2278
+            %endif
+        %endif
+        __kmpc_atomic_fixed1_wr                2279
+        __kmpc_atomic_fixed2_wr                2280
+        __kmpc_atomic_fixed4_wr                2281
+        __kmpc_atomic_fixed8_wr                2282
+        __kmpc_atomic_float4_wr                2283
+        __kmpc_atomic_float8_wr                2284
+        __kmpc_atomic_float10_wr               2285
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_float16_wr               2286
+        %endif
+        __kmpc_atomic_cmplx4_wr                2287
+        __kmpc_atomic_cmplx8_wr                2288
+        __kmpc_atomic_cmplx10_wr               2289
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_cmplx16_wr               2290
+        %ifdef arch_32
+            __kmpc_atomic_float16_a16_wr       2291
+            __kmpc_atomic_cmplx16_a16_wr       2292
+        %endif
+        %endif
+        __kmpc_atomic_fixed1_add_cpt           2293
+        __kmpc_atomic_fixed1_andb_cpt          2294
+        __kmpc_atomic_fixed1_div_cpt           2295
+        __kmpc_atomic_fixed1u_div_cpt          2296
+        __kmpc_atomic_fixed1_mul_cpt           2297
+        __kmpc_atomic_fixed1_orb_cpt           2298
+        __kmpc_atomic_fixed1_shl_cpt           2299
+        __kmpc_atomic_fixed1_shr_cpt           2300
+        __kmpc_atomic_fixed1u_shr_cpt          2301
+        __kmpc_atomic_fixed1_sub_cpt           2302
+        __kmpc_atomic_fixed1_xor_cpt           2303
+        __kmpc_atomic_fixed2_add_cpt           2304
+        __kmpc_atomic_fixed2_andb_cpt          2305
+        __kmpc_atomic_fixed2_div_cpt           2306
+        __kmpc_atomic_fixed2u_div_cpt          2307
+        __kmpc_atomic_fixed2_mul_cpt           2308
+        __kmpc_atomic_fixed2_orb_cpt           2309
+        __kmpc_atomic_fixed2_shl_cpt           2310
+        __kmpc_atomic_fixed2_shr_cpt           2311
+        __kmpc_atomic_fixed2u_shr_cpt          2312
+        __kmpc_atomic_fixed2_sub_cpt           2313
+        __kmpc_atomic_fixed2_xor_cpt           2314
+        __kmpc_atomic_fixed4_add_cpt           2315
+        __kmpc_atomic_fixed4_sub_cpt           2316
+        __kmpc_atomic_float4_add_cpt           2317
+        __kmpc_atomic_float4_sub_cpt           2318
+        __kmpc_atomic_fixed8_add_cpt           2319
+        __kmpc_atomic_fixed8_sub_cpt           2320
+        __kmpc_atomic_float8_add_cpt           2321
+        __kmpc_atomic_float8_sub_cpt           2322
+        __kmpc_atomic_fixed4_andb_cpt          2323
+        __kmpc_atomic_fixed4_div_cpt           2324
+        __kmpc_atomic_fixed4u_div_cpt          2325
+        __kmpc_atomic_fixed4_mul_cpt           2326
+        __kmpc_atomic_fixed4_orb_cpt           2327
+        __kmpc_atomic_fixed4_shl_cpt           2328
+        __kmpc_atomic_fixed4_shr_cpt           2329
+        __kmpc_atomic_fixed4u_shr_cpt          2330
+        __kmpc_atomic_fixed4_xor_cpt           2331
+        __kmpc_atomic_fixed8_andb_cpt          2332
+        __kmpc_atomic_fixed8_div_cpt           2333
+        __kmpc_atomic_fixed8u_div_cpt          2334
+        __kmpc_atomic_fixed8_mul_cpt           2335
+        __kmpc_atomic_fixed8_orb_cpt           2336
+        __kmpc_atomic_fixed8_shl_cpt           2337
+        __kmpc_atomic_fixed8_shr_cpt           2338
+        __kmpc_atomic_fixed8u_shr_cpt          2339
+        __kmpc_atomic_fixed8_xor_cpt           2340
+        __kmpc_atomic_float4_div_cpt           2341
+        __kmpc_atomic_float4_mul_cpt           2342
+        __kmpc_atomic_float8_div_cpt           2343
+        __kmpc_atomic_float8_mul_cpt           2344
+        __kmpc_atomic_fixed1_andl_cpt          2345
+        __kmpc_atomic_fixed1_orl_cpt           2346
+        __kmpc_atomic_fixed2_andl_cpt          2347
+        __kmpc_atomic_fixed2_orl_cpt           2348
+        __kmpc_atomic_fixed4_andl_cpt          2349
+        __kmpc_atomic_fixed4_orl_cpt           2350
+        __kmpc_atomic_fixed8_andl_cpt          2351
+        __kmpc_atomic_fixed8_orl_cpt           2352
+        __kmpc_atomic_fixed1_max_cpt           2353
+        __kmpc_atomic_fixed1_min_cpt           2354
+        __kmpc_atomic_fixed2_max_cpt           2355
+        __kmpc_atomic_fixed2_min_cpt           2356
+        __kmpc_atomic_fixed4_max_cpt           2357
+        __kmpc_atomic_fixed4_min_cpt           2358
+        __kmpc_atomic_fixed8_max_cpt           2359
+        __kmpc_atomic_fixed8_min_cpt           2360
+        __kmpc_atomic_float4_max_cpt           2361
+        __kmpc_atomic_float4_min_cpt           2362
+        __kmpc_atomic_float8_max_cpt           2363
+        __kmpc_atomic_float8_min_cpt           2364
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_float16_max_cpt          2365
+        __kmpc_atomic_float16_min_cpt          2366
+        %endif
+        __kmpc_atomic_fixed1_neqv_cpt          2367
+        __kmpc_atomic_fixed2_neqv_cpt          2368
+        __kmpc_atomic_fixed4_neqv_cpt          2369
+        __kmpc_atomic_fixed8_neqv_cpt          2370
+        __kmpc_atomic_fixed1_eqv_cpt           2371
+        __kmpc_atomic_fixed2_eqv_cpt           2372
+        __kmpc_atomic_fixed4_eqv_cpt           2373
+        __kmpc_atomic_fixed8_eqv_cpt           2374
+        __kmpc_atomic_float10_add_cpt          2375
+        __kmpc_atomic_float10_sub_cpt          2376
+        __kmpc_atomic_float10_mul_cpt          2377
+        __kmpc_atomic_float10_div_cpt          2378
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_float16_add_cpt          2379
+        __kmpc_atomic_float16_sub_cpt          2380
+        __kmpc_atomic_float16_mul_cpt          2381
+        __kmpc_atomic_float16_div_cpt          2382
+        %endif
+        __kmpc_atomic_cmplx4_add_cpt           2383
+        __kmpc_atomic_cmplx4_sub_cpt           2384
+        __kmpc_atomic_cmplx4_mul_cpt           2385
+        __kmpc_atomic_cmplx4_div_cpt           2386
+        __kmpc_atomic_cmplx8_add_cpt           2387
+        __kmpc_atomic_cmplx8_sub_cpt           2388
+        __kmpc_atomic_cmplx8_mul_cpt           2389
+        __kmpc_atomic_cmplx8_div_cpt           2390
+        __kmpc_atomic_cmplx10_add_cpt          2391
+        __kmpc_atomic_cmplx10_sub_cpt          2392
+        __kmpc_atomic_cmplx10_mul_cpt          2393
+        __kmpc_atomic_cmplx10_div_cpt          2394
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_cmplx16_add_cpt          2395
+        __kmpc_atomic_cmplx16_sub_cpt          2396
+        __kmpc_atomic_cmplx16_mul_cpt          2397
+        __kmpc_atomic_cmplx16_div_cpt          2398
+        %endif
+       #__kmpc_atomic_cmplx4_add_cpt_tmp            2409
+
+        %ifdef arch_32
+        %ifdef HAVE_QUAD
+            __kmpc_atomic_float16_add_a16_cpt  2399
+            __kmpc_atomic_float16_sub_a16_cpt  2400
+            __kmpc_atomic_float16_mul_a16_cpt  2401
+            __kmpc_atomic_float16_div_a16_cpt  2402
+            __kmpc_atomic_float16_max_a16_cpt  2403
+            __kmpc_atomic_float16_min_a16_cpt  2404
+            __kmpc_atomic_cmplx16_add_a16_cpt  2405
+            __kmpc_atomic_cmplx16_sub_a16_cpt  2406
+            __kmpc_atomic_cmplx16_mul_a16_cpt  2407
+            __kmpc_atomic_cmplx16_div_a16_cpt  2408
+        %endif
+        %endif
+
+        __kmpc_atomic_start                    2410
+        __kmpc_atomic_end                      2411
+
+        %ifdef OMP_40
+
+            # ATOMIC extensions for OpenMP 4.0 spec (x86 and x64 only)
+
+            __kmpc_atomic_fixed1_swp           2412
+            __kmpc_atomic_fixed2_swp           2413
+            __kmpc_atomic_fixed4_swp           2414
+            __kmpc_atomic_fixed8_swp           2415
+            __kmpc_atomic_float4_swp           2416
+            __kmpc_atomic_float8_swp           2417
+            __kmpc_atomic_float10_swp          2418
+            %ifdef HAVE_QUAD
+              __kmpc_atomic_float16_swp        2419
+            %endif
+            __kmpc_atomic_cmplx4_swp           2420
+            __kmpc_atomic_cmplx8_swp           2421
+            __kmpc_atomic_cmplx10_swp          2422
+            %ifdef HAVE_QUAD
+            __kmpc_atomic_cmplx16_swp          2423
+
+            %ifdef arch_32
+                __kmpc_atomic_float16_a16_swp  2424
+                __kmpc_atomic_cmplx16_a16_swp  2425
+            %endif
+            %endif
+
+            __kmpc_atomic_fixed1_sub_cpt_rev   2426
+            __kmpc_atomic_fixed1_div_cpt_rev   2427
+            __kmpc_atomic_fixed1u_div_cpt_rev  2428
+            __kmpc_atomic_fixed1_shl_cpt_rev   2429
+            __kmpc_atomic_fixed1_shr_cpt_rev   2430
+            __kmpc_atomic_fixed1u_shr_cpt_rev  2431
+            __kmpc_atomic_fixed2_sub_cpt_rev   2432
+            __kmpc_atomic_fixed2_div_cpt_rev   2433
+            __kmpc_atomic_fixed2u_div_cpt_rev  2434
+            __kmpc_atomic_fixed2_shl_cpt_rev   2435
+            __kmpc_atomic_fixed2_shr_cpt_rev   2436
+            __kmpc_atomic_fixed2u_shr_cpt_rev  2437
+            __kmpc_atomic_fixed4_sub_cpt_rev   2438
+            __kmpc_atomic_fixed4_div_cpt_rev   2439
+            __kmpc_atomic_fixed4u_div_cpt_rev  2440
+            __kmpc_atomic_fixed4_shl_cpt_rev   2441
+            __kmpc_atomic_fixed4_shr_cpt_rev   2442
+            __kmpc_atomic_fixed4u_shr_cpt_rev  2443
+            __kmpc_atomic_fixed8_sub_cpt_rev   2444
+            __kmpc_atomic_fixed8_div_cpt_rev   2445
+            __kmpc_atomic_fixed8u_div_cpt_rev  2446
+            __kmpc_atomic_fixed8_shl_cpt_rev   2447
+            __kmpc_atomic_fixed8_shr_cpt_rev   2448
+            __kmpc_atomic_fixed8u_shr_cpt_rev  2449
+            __kmpc_atomic_float4_sub_cpt_rev   2450
+            __kmpc_atomic_float4_div_cpt_rev   2451
+            __kmpc_atomic_float8_sub_cpt_rev   2452
+            __kmpc_atomic_float8_div_cpt_rev   2453
+            __kmpc_atomic_float10_sub_cpt_rev  2454
+            __kmpc_atomic_float10_div_cpt_rev  2455
+            %ifdef HAVE_QUAD
+            __kmpc_atomic_float16_sub_cpt_rev  2456
+            __kmpc_atomic_float16_div_cpt_rev  2457
+            %endif
+            __kmpc_atomic_cmplx4_sub_cpt_rev   2458
+            __kmpc_atomic_cmplx4_div_cpt_rev   2459
+            __kmpc_atomic_cmplx8_sub_cpt_rev   2460
+            __kmpc_atomic_cmplx8_div_cpt_rev   2461
+            __kmpc_atomic_cmplx10_sub_cpt_rev  2462
+            __kmpc_atomic_cmplx10_div_cpt_rev  2463
+            %ifdef HAVE_QUAD
+            __kmpc_atomic_cmplx16_sub_cpt_rev  2464
+            __kmpc_atomic_cmplx16_div_cpt_rev  2465
+
+            %ifdef arch_32
+                __kmpc_atomic_float16_sub_a16_cpt_rev  2466
+                __kmpc_atomic_float16_div_a16_cpt_rev  2467
+                __kmpc_atomic_cmplx16_sub_a16_cpt_rev  2468
+                __kmpc_atomic_cmplx16_div_a16_cpt_rev  2469
+            %endif
+            %endif
+
+        %endif   # OMP_40
+
+
+    %endif   # arch_64
+
+%endif
+
+# end of file #
diff --git a/final/runtime/src/exports_so.txt b/final/runtime/src/exports_so.txt
new file mode 100644
index 0000000..d7ade44
--- /dev/null
+++ b/final/runtime/src/exports_so.txt
@@ -0,0 +1,130 @@
+# exports_so.txt #
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# This is the version script for the OMP RTL shared library (libomp*.so)
+
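+# Symbols matching a pattern in the "global:" list below are exported from the
+# shared library; everything else falls through to "local:" and stays hidden.
+# One way to check the result after a build (illustrative command, not part of
+# the build system):
+#
+#     nm -D --defined-only libomp.so | grep ' omp_'
+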
+VERSION {
+
+    global: # Exported symbols.
+
+        #
+        # "Normal" symbols.
+        #
+        omp_*;     # Standard OpenMP functions.
+        ompt_initialize;     # OMPT initialization interface.
+        ompt_control;        # OMPT control interface.
+
+        #
+        # OMPT state placeholders
+        #
+        ompt_idle;
+        ompt_overhead;
+        ompt_barrier_wait;
+        ompt_task_wait;
+        ompt_mutex_wait;
+
+        ompc_*;    # omp.h renames some standard functions to ompc_*.
+        kmp_*;     # Intel extensions.
+        kmpc_*;    # Intel extensions.
+        __kmpc_*;  # Functions called by compiler-generated code.
+        GOMP_*;    # GNU C compatibility functions.
+
+        _You_must_link_with_*;     # Mutual detection/MS compatibility symbols.
+
+
+        #
+        # Debugger support.
+        #
+#if USE_DEBUGGER
+        __kmp_debugging;
+        __kmp_omp_debug_struct_info;
+#endif /* USE_DEBUGGER */
+
+        #
+        # Internal functions exported for testing purposes.
+        #
+        __kmp_get_reduce_method;
+        ___kmp_allocate;
+        ___kmp_free;
+        __kmp_thread_pool;
+        __kmp_thread_pool_nth;
+
+        __kmp_reset_stats;
+
+#if USE_ITT_BUILD
+        #
+        # ITT support.
+        #
+        # The following entry points are added so that the backtraces from
+        # the tools contain meaningful names for all the functions that might
+        # appear in a backtrace of a thread which is blocked in the RTL.
+        __kmp_acquire_drdpa_lock;
+        __kmp_acquire_nested_drdpa_lock;
+        __kmp_acquire_nested_queuing_lock;
+        __kmp_acquire_nested_tas_lock;
+        __kmp_acquire_nested_ticket_lock;
+        __kmp_acquire_queuing_lock;
+        __kmp_acquire_tas_lock;
+        __kmp_acquire_ticket_lock;
+        __kmp_fork_call;
+        __kmp_get_reduce_method;
+        __kmp_invoke_microtask;
+        __kmp_itt_fini_ittlib;
+        __kmp_itt_init_ittlib;
+        __kmp_launch_monitor;
+        __kmp_launch_worker;
+        __kmp_reap_monitor;
+        __kmp_reap_worker;
+        __kmp_release_32;
+        __kmp_release_64;
+        __kmp_release_oncore;
+        __kmp_wait_32;
+        __kmp_wait_64;
+        __kmp_wait_oncore;
+        __kmp_wait_yield_4;
+        __kmp_wait_yield_8;
+
+        # ittnotify symbols to be used by debugger
+        __kmp_itt_fini_ittlib;
+        __kmp_itt_init_ittlib;
+#endif /* USE_ITT_BUILD */
+
+    local: # Non-exported symbols.
+
+        *;         # All other symbols are not exported.
+
+}; # VERSION
+
+# sets up GCC OMP_ version dependency chain
+OMP_1.0 {
+};
+OMP_2.0 {
+} OMP_1.0;
+OMP_3.0 {
+} OMP_2.0;
+OMP_3.1 {
+} OMP_3.0;
+OMP_4.0 {
+} OMP_3.1;
+
+# sets up GCC GOMP_ version dependency chain
+GOMP_1.0 {
+};
+GOMP_2.0 {
+} GOMP_1.0;
+GOMP_3.0 {
+} GOMP_2.0;
+GOMP_4.0 {
+} GOMP_3.0;
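+
+# Note: the version nodes above are intentionally empty; they declare only the
+# version names and their dependency order, so that any symbols assigned these
+# versions elsewhere in the build have nodes to bind to.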
+
+# end of file #
diff --git a/final/runtime/src/extractExternal.cpp b/final/runtime/src/extractExternal.cpp
new file mode 100644
index 0000000..7a6fdb7
--- /dev/null
+++ b/final/runtime/src/extractExternal.cpp
@@ -0,0 +1,497 @@
+/*
+ * extractExternal.cpp
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include <stdio.h>     // printf, remove
+#include <stdlib.h>
+#include <iostream>
+#include <strstream>
+#include <fstream>
+#include <string>
+#include <set>
+#include <map>
+
+/* Given a set of n object files h ('external' object files) and a set of m
+   object files o ('internal' object files), this tool:
+   1. Determines r, the subset of h that o depends on, directly or indirectly;
+   2. Removes the files in h - r from the file system;
+   3. For each external symbol defined in some file in r, renames it in r U o
+      by prefixing it with "__kmp_external_".
+   Usage:
+   hide.exe <n> <filenames for h> <filenames for o>
+
+   Thus the renamed symbols are "hidden" only in the sense that their names now
+   carry a distinguishing prefix; they remain present in the object files.
+*/
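+/* For example, a hypothetical invocation (all file names are illustrative):
+
+       hide.exe 2 ext1.obj ext2.obj int1.obj int2.obj
+
+   keeps whichever of ext1.obj/ext2.obj the internal objects int1.obj and
+   int2.obj depend on, deletes the other external objects, and prefixes every
+   external symbol defined by the kept ones with "__kmp_external_".
+*/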
+
+using namespace std;
+
+void stop(const char* errorMsg) {
+    printf("%s\n", errorMsg);
+    exit(1);
+}
+
+// an entry in the symbol table of a .OBJ file
+class Symbol {
+public:
+    __int64 name;
+    unsigned value;
+    unsigned short sectionNum, type;
+    char storageClass, nAux;
+};
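+// Note: Symbol mirrors the 18-byte COFF symbol record (8-byte name, 4-byte
+// value, 2-byte section number, 2-byte type, 1-byte storage class, 1-byte
+// auxiliary-entry count); rstream reads exactly 18 bytes per record, so the
+// field order and sizes above must not change.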
+
+class _rstream : public istrstream {
+private:
+    const char *buf;
+protected:
+    _rstream(pair<const char*, streamsize> p):istrstream(p.first,p.second),buf(p.first){}
+    ~_rstream() {
+	delete[]buf;
+    }
+};
+
+/* A stream encapsulating the content of a file or the content of a string, overriding the
+   >> operator to read various integer types in binary form, as well as a symbol table
+   entry.
+*/
+class rstream : public _rstream {
+private:
+    template<class T>
+    inline rstream& doRead(T &x) {
+	read((char*)&x, sizeof(T));
+	return *this;
+    }
+    static pair<const char*, streamsize> getBuf(const char *fileName) {
+	ifstream raw(fileName,ios::binary | ios::in);
+	if(!raw.is_open())
+	    stop("rstream.getBuf: Error opening file");
+	raw.seekg(0,ios::end);
+	streampos fileSize = raw.tellg();
+	if(fileSize < 0)
+	    stop("rstream.getBuf: Error reading file");
+	char *buf = new char[fileSize];
+	raw.seekg(0,ios::beg);
+	raw.read(buf, fileSize);
+	return pair<const char*, streamsize>(buf,fileSize);
+    }
+public:
+    // construct from a string
+    rstream(const char *buf,streamsize size):_rstream(pair<const char*,streamsize>(buf, size)){}
+    /* construct from a file whose content is fully read once to initialize the
+       content of this stream
+    */
+    rstream(const char *fileName):_rstream(getBuf(fileName)){}
+    rstream& operator>>(int &x) {
+	return doRead(x);
+    }
+    rstream& operator>>(unsigned &x) {
+	return doRead(x);
+    }
+    rstream& operator>>(short &x) {
+	return doRead(x);
+    }
+    rstream& operator>>(unsigned short &x) {
+	return doRead(x);
+    }
+    rstream& operator>>(Symbol &e) {
+	read((char*)&e, 18);
+	return *this;
+    }
+};
+
+// string table in a .OBJ file
+class StringTable {
+private:
+    map<string, unsigned> directory;
+    size_t length;
+    char *data;
+
+    // make <directory> from <length> bytes in <data>
+    void makeDirectory(void) {
+	unsigned i = 4;
+	while(i < length) {
+	    string s = string(data + i);
+	    directory.insert(make_pair(s, i));
+	    i += s.size() + 1;
+	}
+    }
+    // initialize <length> and <data> with contents specified by the arguments
+    void init(const char *_data) {
+	unsigned _length = *(unsigned*)_data;
+
+	if(_length < sizeof(unsigned) || _length != *(unsigned*)_data)
+	    stop("StringTable.init: Invalid symbol table");
+	if(_data[_length - 1]) {
+	    // to prevent runaway strings, make sure the data ends with a zero
+	    data = new char[length = _length + 1];
+	    data[_length] = 0;
+	} else {
+	    data = new char[length = _length];
+	}
+	*(unsigned*)data = length;
+	KMP_MEMCPY(data + sizeof(unsigned), _data + sizeof(unsigned),
+	           length - sizeof(unsigned));
+	makeDirectory();
+    }
+public:
+    StringTable(rstream &f) {
+	/* Construct string table by reading from f.
+	 */
+	streampos s;
+	unsigned strSize;
+	char *strData;
+
+	s = f.tellg();
+	f>>strSize;
+	if(strSize < sizeof(unsigned))
+	    stop("StringTable: Invalid string table");
+	strData = new char[strSize];
+	*(unsigned*)strData = strSize;
+	// read the raw data into <strData>
+	f.read(strData + sizeof(unsigned), strSize - sizeof(unsigned));
+	s = f.tellg() - s;
+	if(s < strSize)
+	    stop("StringTable: Unexpected EOF");
+	init(strData);
+	delete[]strData;
+    }
+    StringTable(const set<string> &strings) {
+	/* Construct string table from given strings.
+	 */
+	char *p;
+	set<string>::const_iterator it;
+	size_t s;
+
+	// count required size for data
+	for(length = sizeof(unsigned), it = strings.begin(); it != strings.end(); ++it) {
+	    size_t l = (*it).size();
+
+	    if(l > (unsigned) 0xFFFFFFFF)
+		stop("StringTable: String too long");
+	    if(l > 8) {
+		length += l + 1;
+		if(length > (unsigned) 0xFFFFFFFF)
+		    stop("StringTable: String table too long");
+	    }
+	}
+	data = new char[length];
+	*(unsigned*)data = length;
+	// populate data and directory
+	for(p = data + sizeof(unsigned), it = strings.begin(); it != strings.end(); ++it) {
+	    const string &str = *it;
+	    size_t l = str.size();
+	    if(l > 8) {
+		directory.insert(make_pair(str, p - data));
+		KMP_MEMCPY(p, str.c_str(), l);
+		p[l] = 0;
+		p += l + 1;
+	    }
+	}
+    }
+    ~StringTable() {
+	delete[] data;
+    }
+    /* Returns the encoding for the given string based on this string table.
+       It is a fatal error (stop) if the string is longer than 8 characters but
+       is not present in the string table.
+    */
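+    /* Background note: COFF stores a symbol name of at most 8 characters
+       directly in the 8-byte name field (zero padded); longer names are
+       represented by a zero first word followed by a byte offset into the
+       string table, which is exactly the encoding produced below.
+    */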
+    __int64 encode(const string &str) {
+	__int64 r;
+
+	if(str.size() <= 8) {
+	    // encoded directly
+	    ((char*)&r)[7] = 0;
+	    KMP_STRNCPY_S((char*)&r, sizeof(r), str.c_str(), 8);
+	    return r;
+	} else {
+	    // represented as index into table
+	    map<string,unsigned>::const_iterator it = directory.find(str);
+	    if(it == directory.end())
+		stop("StringTable::encode: String not found in string table");
+	    ((unsigned*)&r)[0] = 0;
+	    ((unsigned*)&r)[1] = (*it).second;
+	    return r;
+	}
+    }
+    /* Returns the string represented by x based on this string table.
+       It is a fatal error (stop) if x references an invalid position in the
+       table.
+    */
+    string decode(__int64 x) const {
+	if(*(unsigned*)&x == 0) {
+	    // represented as index into table
+	    unsigned &p = ((unsigned*)&x)[1];
+	    if(p >= length)
+		stop("StringTable::decode: Invalid string table lookup");
+	    return string(data + p);
+	} else {
+	    // encoded directly
+	    char *p = (char*)&x;
+	    int i;
+
+	    for(i = 0; i < 8 && p[i]; ++i);
+	    return string(p, i);
+	}
+    }
+    void write(ostream &os) {
+	os.write(data, length);
+    }
+};
+
+/* for the named object file, determines the set of defined symbols and the set of undefined external symbols
+   and writes them to <defined> and <undefined> respectively
+*/
+void computeExternalSymbols(const char *fileName, set<string> *defined, set<string> *undefined){
+    streampos fileSize;
+    size_t strTabStart;
+    unsigned symTabStart, symNEntries;
+    rstream f(fileName);
+
+    f.seekg(0,ios::end);
+    fileSize = f.tellg();
+
+    f.seekg(8);
+    f >> symTabStart >> symNEntries;
+    // seek to the string table
+    f.seekg(strTabStart = symTabStart + 18 * (size_t)symNEntries);
+    if(f.eof()) {
+	printf("computeExternalSymbols: fileName='%s', fileSize = %lu, symTabStart = %u, symNEntries = %u\n",
+	       fileName, (unsigned long) fileSize, symTabStart, symNEntries);
+	stop("computeExternalSymbols: Unexpected EOF 1");
+    }
+    StringTable stringTable(f); // read the string table
+    if(f.tellg() != fileSize)
+	stop("computeExternalSymbols: Unexpected data after string table");
+
+    f.clear();
+    f.seekg(symTabStart); // seek to the symbol table
+
+    defined->clear(); undefined->clear();
+    for(int i = 0; i < symNEntries; ++i) {
+	// process each entry
+	Symbol e;
+
+	if(f.eof())
+	    stop("computeExternalSymbols: Unexpected EOF 2");
+	f>>e;
+	if(f.fail())
+	    stop("computeExternalSymbols: File read error");
+	if(e.nAux) { // auxiliary entry: skip
+	    f.seekg(e.nAux * 18, ios::cur);
+	    i += e.nAux;
+	}
+	// if symbol is extern and defined in the current file, insert it
+	if(e.storageClass == 2)
+	    if(e.sectionNum)
+		defined->insert(stringTable.decode(e.name));
+	    else
+		undefined->insert(stringTable.decode(e.name));
+    }
+}
+
+/* For each occurrence of an external symbol in the object file named by
+   <fileName> that is a member of <hide>, renames it by prefixing it with
+   "__kmp_external_", writing the file back in place.
+*/
+void hideSymbols(char *fileName, const set<string> &hide) {
+    static const string prefix("__kmp_external_");
+    set<string> strings; // set of all occurring symbols, appropriately prefixed
+    streampos fileSize;
+    size_t strTabStart;
+    unsigned symTabStart, symNEntries;
+    int i;
+    rstream in(fileName);
+
+    in.seekg(0,ios::end);
+    fileSize = in.tellg();
+
+    in.seekg(8);
+    in >> symTabStart >> symNEntries;
+    in.seekg(strTabStart = symTabStart + 18 * (size_t)symNEntries);
+    if(in.eof())
+	stop("hideSymbols: Unexpected EOF");
+    StringTable stringTableOld(in); // read original string table
+
+    if(in.tellg() != fileSize)
+	stop("hideSymbols: Unexpected data after string table");
+
+    // compute set of occurring strings with prefix added
+    for(i = 0; i < symNEntries; ++i) {
+	Symbol e;
+
+	in.seekg(symTabStart + i * 18);
+	if(in.eof())
+	    stop("hideSymbols: Unexpected EOF");
+	in >> e;
+	if(in.fail())
+	    stop("hideSymbols: File read error");
+	if(e.nAux)
+	    i += e.nAux;
+	const string &s = stringTableOld.decode(e.name);
+	// if symbol is extern and found in <hide>, prefix and insert into strings,
+	// otherwise, just insert into strings without prefix
+	strings.insert( (e.storageClass == 2 && hide.find(s) != hide.end()) ?
+			prefix + s : s);
+    }
+
+    ofstream out(fileName, ios::trunc | ios::out | ios::binary);
+    if(!out.is_open())
+	stop("hideSymbols: Error opening output file");
+
+    // make new string table from string set
+    StringTable stringTableNew = StringTable(strings);
+
+    // copy input file to output file up to just before the symbol table
+    in.seekg(0);
+    char *buf = new char[symTabStart];
+    in.read(buf, symTabStart);
+    out.write(buf, symTabStart);
+    delete []buf;
+
+    // copy input symbol table to output symbol table with name translation
+    for(i = 0; i < symNEntries; ++i) {
+	Symbol e;
+
+	in.seekg(symTabStart + i*18);
+	if(in.eof())
+	    stop("hideSymbols: Unexpected EOF");
+	in >> e;
+	if(in.fail())
+	    stop("hideSymbols: File read error");
+	const string &s = stringTableOld.decode(e.name);
+	out.seekp(symTabStart + i*18);
+	e.name = stringTableNew.encode( (e.storageClass == 2 && hide.find(s) != hide.end()) ?
+					prefix + s : s);
+	out.write((char*)&e, 18);
+	if(out.fail())
+	    stop("hideSymbols: File write error");
+	if(e.nAux) {
+	    // copy auxiliary symbol table entries
+	    int nAux = e.nAux;
+	    for(int j = 1; j <= nAux; ++j) {
+		in >> e;
+		out.seekp(symTabStart + (i + j) * 18);
+		out.write((char*)&e, 18);
+	    }
+	    i += nAux;
+	}
+    }
+    // output string table
+    stringTableNew.write(out);
+}
+
+// returns true iff <a> and <b> have no common element
+template <class T>
+bool isDisjoint(const set<T> &a, const set<T> &b) {
+    typename set<T>::const_iterator ita, itb;
+
+    for(ita = a.begin(), itb = b.begin(); ita != a.end() && itb != b.end();) {
+	const T &ta = *ita, &tb = *itb;
+	if(ta < tb)
+	    ++ita;
+	else if (tb < ta)
+	    ++itb;
+	else
+	    return false;
+    }
+    return true;
+}
+
+/* precondition: <defined> and <undefined> are arrays with <nTotal> elements, where
+   <nTotal> >= <nExternal>.  The first <nExternal> elements correspond to the external
+   object files and the rest correspond to the internal object files.
+   postcondition: file x is said to depend on file y if undefined[x] and defined[y] are
+   not disjoint.  Returns the transitive closure of the set of internal object files
+   under the 'depends on' relation, as a set of file indexes, minus the set of internal
+   object files themselves.
+*/
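+/* Example (illustrative): with nExternal = 2, if internal file 2 references a symbol
+   defined only in external file 0, and external file 0 in turn references a symbol
+   defined only in external file 1, the result is {0, 1}: dependencies are followed
+   transitively through external files via the two 'fresh' worklists below.
+*/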
+set<int> *findRequiredExternal(int nExternal, int nTotal, set<string> *defined, set<string> *undefined) {
+    set<int> *required = new set<int>;
+    set<int> fresh[2];
+    int i, cur = 0;
+    bool changed;
+
+    for(i = nTotal - 1; i >= nExternal; --i)
+	fresh[cur].insert(i);
+    do {
+	changed = false;
+	for(set<int>::iterator it = fresh[cur].begin(); it != fresh[cur].end(); ++it) {
+	    set<string> &s = undefined[*it];
+
+	    for(i = 0; i < nExternal; ++i) {
+		if(required->find(i) == required->end()) {
+		    if(!isDisjoint(defined[i], s)) {
+			// found a new qualifying element
+			required->insert(i);
+			fresh[1 - cur].insert(i);
+			changed = true;
+		    }
+		}
+	    }
+	}
+	fresh[cur].clear();
+	cur = 1 - cur;
+    } while(changed);
+    return required;
+}
+
+int main(int argc, char **argv) {
+    int nExternal, nInternal, i;
+    set<string> *defined, *undefined;
+    set<int>::iterator it;
+
+    if(argc < 3)
+	stop("Please specify a positive integer followed by a list of object filenames");
+    nExternal = atoi(argv[1]);
+    if(nExternal <= 0)
+	stop("Please specify a positive integer followed by a list of object filenames");
+    if(nExternal + 2 > argc)
+	stop("Too few external objects");
+    nInternal = argc - nExternal - 2;
+    defined = new set<string>[argc - 2];
+    undefined = new set<string>[argc - 2];
+
+    // determine the set of defined and undefined external symbols
+    for(i = 2; i < argc; ++i)
+	computeExternalSymbols(argv[i], defined + i - 2, undefined + i - 2);
+
+    // determine the set of required external files
+    set<int> *requiredExternal = findRequiredExternal(nExternal, argc - 2, defined, undefined);
+    set<string> hide;
+
+    /* determine the set of symbols to hide--namely defined external symbols of the
+       required external files
+    */
+    for(it = requiredExternal->begin(); it != requiredExternal->end(); ++it) {
+	int idx = *it;
+	set<string>::iterator it2;
+	/* We have to insert one element at a time instead of inserting a range because
+	   the insert member function taking a range doesn't exist on Windows* OS, at least
+	   at the time of this writing.
+	*/
+	for(it2 = defined[idx].begin(); it2 != defined[idx].end(); ++it2)
+	    hide.insert(*it2);
+    }
+
+    /* process the external files--removing those that are not required and hiding
+       the appropriate symbols in the others
+    */
+    for(i = 0; i < nExternal; ++i)
+	if(requiredExternal->find(i) != requiredExternal->end())
+	    hideSymbols(argv[2 + i], hide);
+	else
+	    remove(argv[2 + i]);
+    // hide the appropriate symbols in the internal files
+    for(i = nExternal + 2; i < argc; ++i)
+	hideSymbols(argv[i], hide);
+    return 0;
+}
diff --git a/final/runtime/src/i18n/en_US.txt b/final/runtime/src/i18n/en_US.txt
new file mode 100644
index 0000000..cc9eb8f
--- /dev/null
+++ b/final/runtime/src/i18n/en_US.txt
@@ -0,0 +1,469 @@
+# en_US.txt #
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Default messages, embedded into the OpenMP RTL, and source for English catalog.
+
+
+# Compatible changes (which do not require version bumping):
+#     * Editing a message (the number and type of placeholders must remain; the relative
+#       order of placeholders may be changed, e.g. "File %1$s line %2$d" may be safely
+#       edited to "Line %2$d file %1$s").
+#     * Adding a new message to the end of a section.
+# Incompatible changes (version must be bumped by 1):
+#     * Introducing new placeholders to existing messages.
+#     * Changing the type of placeholders (e.g. "line %1$d" -> "line %1$s").
+#     * Rearranging the order of messages.
+#     * Deleting messages.
+# Use the special "OBSOLETE" pseudoidentifier for obsolete entries, which are kept only
+# for backward compatibility. When the version is bumped, do not forget to delete all
+# obsolete entries.
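+#
+# For example (illustrative): rewording the existing entry
+#     CantOpenFileForReading  "Cannot open file \"%1$s\" for reading:"
+# to "File \"%1$s\" could not be opened for reading:" is a compatible change, while
+# appending a new placeholder such as " (errno %2$d)" would require bumping Version.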
+
+
+# --------------------------------------------------------------------------------------------------
+-*- META -*-
+# --------------------------------------------------------------------------------------------------
+
+# Meta information about message catalog.
+
+Language "English"
+Country  "USA"
+LangId   "1033"
+Version  "2"
+Revision "20140827"
+
+
+
+# --------------------------------------------------------------------------------------------------
+-*- STRINGS -*-
+# --------------------------------------------------------------------------------------------------
+
+# Strings are not complete messages, just fragments. We need to work on this and reduce
+# the number of strings (to zero?).
+
+Error                        "Error"
+UnknownFile                  "(unknown file)"
+NotANumber                   "not a number"
+BadUnit                      "bad unit"
+IllegalCharacters            "illegal characters"
+ValueTooLarge                "value too large"
+ValueTooSmall                "value too small"
+NotMultiple4K                "value is not a multiple of 4k"
+UnknownTopology              "Unknown processor topology"
+CantOpenCpuinfo              "Cannot open /proc/cpuinfo"
+ProcCpuinfo                  "/proc/cpuinfo"
+NoProcRecords                "cpuinfo file invalid (No processor records)"
+TooManyProcRecords           "cpuinfo file invalid (Too many processor records)"
+CantRewindCpuinfo            "Cannot rewind cpuinfo file"
+LongLineCpuinfo              "cpuinfo file invalid (long line)"
+TooManyEntries               "cpuinfo file contains too many entries"
+MissingProcField             "cpuinfo file missing processor field"
+MissingPhysicalIDField       "cpuinfo file missing physical id field"
+MissingValCpuinfo            "cpuinfo file invalid (missing val)"
+DuplicateFieldCpuinfo        "cpuinfo file invalid (duplicate field)"
+PhysicalIDsNotUnique         "Physical node/pkg/core/thread ids not unique"
+ApicNotPresent               "APIC not present"
+InvalidCpuidInfo             "Invalid cpuid info"
+OBSOLETE                     "APIC ids not unique"
+InconsistentCpuidInfo        "Inconsistent cpuid info"
+OutOfHeapMemory              "Out of heap memory"
+MemoryAllocFailed            "Memory allocation failed"
+Core                         "core"
+Thread                       "thread"
+Package                      "package"
+Node                         "node"
+OBSOLETE                     "<undef>"
+DecodingLegacyAPIC           "decoding legacy APIC ids"
+OBSOLETE                     "parsing /proc/cpuinfo"
+NotDefined                   "value is not defined"
+EffectiveSettings            "Effective settings:"
+UserSettings                 "User settings:"
+StorageMapWarning            "warning: pointers or size don't make sense"
+OBSOLETE                     "CPU"
+OBSOLETE                     "TPU"
+OBSOLETE                     "TPUs per package"
+OBSOLETE                     "HT enabled"
+OBSOLETE                     "HT disabled"
+Decodingx2APIC               "decoding x2APIC ids"
+NoLeaf11Support              "cpuid leaf 11 not supported"
+NoLeaf4Support               "cpuid leaf 4 not supported"
+ThreadIDsNotUnique           "thread ids not unique"
+UsingPthread                 "using pthread info"
+LegacyApicIDsNotUnique       "legacy APIC ids not unique"
+x2ApicIDsNotUnique           "x2APIC ids not unique"
+DisplayEnvBegin		     "OPENMP DISPLAY ENVIRONMENT BEGIN"
+DisplayEnvEnd		     "OPENMP DISPLAY ENVIRONMENT END"
+Device			     "[device]"
+Host			     "[host]"
+
+
+
+# --------------------------------------------------------------------------------------------------
+-*- FORMATS -*-
+# --------------------------------------------------------------------------------------------------
+
+Info                         "OMP: Info #%1$d: %2$s\n"
+Warning                      "OMP: Warning #%1$d: %2$s\n"
+Fatal                        "OMP: Error #%1$d: %2$s\n"
+SysErr                       "OMP: System error #%1$d: %2$s\n"
+Hint                         "OMP: Hint: %2$s\n"
+
+Pragma                       "%1$s pragma (at %2$s:%3$s():%4$s)"
+    # %1 is the pragma name (like "parallel" or "master"),
+    # %2 is file name,
+    # %3 is function (routine) name,
+    # %4 is the line number (as string, so "s" type specifier should be used).
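+
+    # For example (message number and text are illustrative only), the "Warning"
+    # format applied to number 95 and the text
+    # 'Cannot form a team with 8 threads, using 4 instead.' renders as:
+    #     OMP: Warning #95: Cannot form a team with 8 threads, using 4 instead.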
+
+
+
+# --------------------------------------------------------------------------------------------------
+-*- MESSAGES -*-
+# --------------------------------------------------------------------------------------------------
+
+# Messages of any severity: informational, warning, or fatal.
+# To maintain message numbers (they are visible to customers), add new messages to the end.
+
+# Use following prefixes for messages and hints when appropriate:
+#    Aff -- Affinity messages.
+#    Cns -- Consistency check failures (KMP_CONSISTENCY_CHECK).
+#    Itt -- ITT Notify-related messages.
+
+LibraryIsSerial              "Library is \"serial\"."
+CantOpenMessageCatalog       "Cannot open message catalog \"%1$s\":"
+WillUseDefaultMessages       "Default messages will be used."
+LockIsUninitialized          "%1$s: Lock is uninitialized"
+LockSimpleUsedAsNestable     "%1$s: Lock was initialized as simple, but used as nestable"
+LockNestableUsedAsSimple     "%1$s: Lock was initialized as nestable, but used as simple"
+LockIsAlreadyOwned           "%1$s: Lock is already owned by requesting thread"
+LockStillOwned               "%1$s: Lock is still owned by a thread"
+LockUnsettingFree            "%1$s: Attempt to release a lock not owned by any thread"
+LockUnsettingSetByAnother    "%1$s: Attempt to release a lock owned by another thread"
+StackOverflow                "Stack overflow detected for OpenMP thread #%1$d"
+StackOverlap                 "Stack overlap detected. "
+AssertionFailure             "Assertion failure at %1$s(%2$d)."
+CantRegisterNewThread        "Unable to register a new user thread."
+DuplicateLibrary             "Initializing %1$s, but found %2$s already initialized."
+CantOpenFileForReading       "Cannot open file \"%1$s\" for reading:"
+CantGetEnvVar                "Getting environment variable \"%1$s\" failed:"
+CantSetEnvVar                "Setting environment variable \"%1$s\" failed:"
+CantGetEnvironment           "Getting environment failed:"
+BadBoolValue                 "%1$s=\"%2$s\": Wrong value, boolean expected."
+SSPNotBuiltIn                "No Helper Thread support built into this OMP library."
+SPPSotfTerminateFailed       "Helper thread failed to soft terminate."
+BufferOverflow               "Buffer overflow detected."
+RealTimeSchedNotSupported    "Real-time scheduling policy is not supported."
+RunningAtMaxPriority         "OMP application is running at maximum priority with real-time scheduling policy. "
+CantChangeMonitorPriority    "Changing priority of the monitor thread failed:"
+MonitorWillStarve            "Deadlocks are likely due to monitor thread starvation."
+CantSetMonitorStackSize      "Unable to set monitor thread stack size to %1$lu bytes:"
+CantSetWorkerStackSize       "Unable to set OMP thread stack size to %1$lu bytes:"
+CantInitThreadAttrs          "Thread attribute initialization failed:"
+CantDestroyThreadAttrs       "Thread attribute destruction failed:"
+CantSetWorkerState           "Setting OMP thread joinable state failed:"
+CantSetMonitorState          "Setting monitor thread joinable state failed:"
+NoResourcesForWorkerThread   "System unable to allocate necessary resources for OMP thread:"
+NoResourcesForMonitorThread  "System unable to allocate necessary resources for the monitor thread:"
+CantTerminateWorkerThread    "Unable to terminate OMP thread:"
+ScheduleKindOutOfRange       "Wrong schedule type %1$d, see <omp.h> or <omp_lib.h> file for the list of values supported."
+UnknownSchedulingType        "Unknown scheduling type \"%1$d\"."
+InvalidValue                 "%1$s value \"%2$s\" is invalid."
+SmallValue                   "%1$s value \"%2$s\" is too small."
+LargeValue                   "%1$s value \"%2$s\" is too large."
+StgInvalidValue              "%1$s: \"%2$s\" is an invalid value; ignored."
+BarrReleaseValueInvalid      "%1$s release value \"%2$s\" is invalid."
+BarrGatherValueInvalid       "%1$s gather value \"%2$s\" is invalid."
+OBSOLETE                     "%1$s supported only on debug builds; ignored."
+ParRangeSyntax               "Syntax error: Usage: %1$s=[ routine=<func> | filename=<file> | range=<lb>:<ub> "
+                             "| excl_range=<lb>:<ub> ],..."
+UnbalancedQuotes             "Unbalanced quotes in %1$s."
+EmptyString                  "Empty string specified for %1$s; ignored."
+LongValue                    "%1$s value is too long; ignored."
+InvalidClause                "%1$s: Invalid clause in \"%2$s\"."
+EmptyClause                  "Empty clause in %1$s."
+InvalidChunk                 "%1$s value \"%2$s\" is invalid chunk size."
+LargeChunk                   "%1$s value \"%2$s\" is too large a chunk size."
+IgnoreChunk                  "%1$s value \"%2$s\" is ignored."
+CantGetProcFreq              "Cannot get processor frequency, using zero KMP_ITT_PREPARE_DELAY."
+EnvParallelWarn              "%1$s must be set prior to first parallel region; ignored."
+AffParamDefined              "%1$s: parameter has been specified already, ignoring \"%2$s\"."
+AffInvalidParam              "%1$s: parameter invalid, ignoring \"%2$s\"."
+AffManyParams                "%1$s: too many integer parameters specified, ignoring \"%2$s\"."
+AffManyParamsForLogic        "%1$s: too many integer parameters specified for logical or physical type, ignoring \"%2$d\"."
+AffNoParam                   "%1$s: '%2$s' type does not take any integer parameters, ignoring them."
+AffNoProcList                "%1$s: proclist not specified with explicit affinity type, using \"none\"."
+AffProcListNoType            "%1$s: proclist specified, setting affinity type to \"explicit\"."
+AffProcListNotExplicit       "%1$s: proclist specified without \"explicit\" affinity type, proclist ignored."
+AffSyntaxError               "%1$s: syntax error, not using affinity."
+AffZeroStride                "%1$s: range error (zero stride), not using affinity."
+AffStartGreaterEnd           "%1$s: range error (%2$d > %3$d), not using affinity."
+AffStrideLessZero            "%1$s: range error (%2$d < %3$d & stride < 0), not using affinity."
+AffRangeTooBig               "%1$s: range error ((%2$d-%3$d)/%4$d too big), not using affinity."
+OBSOLETE                     "%1$s: %2$s is defined. %3$s will be ignored."
+AffNotSupported              "%1$s: affinity not supported, using \"disabled\"."
+OBSOLETE                     "%1$s: affinity only supported for Intel(R) processors."
+GetAffSysCallNotSupported    "%1$s: getaffinity system call not supported."
+SetAffSysCallNotSupported    "%1$s: setaffinity system call not supported."
+OBSOLETE                     "%1$s: pthread_aff_set_np call not found."
+OBSOLETE                     "%1$s: pthread_get_num_resources_np call not found."
+OBSOLETE                     "%1$s: the OS kernel does not support affinity."
+OBSOLETE                     "%1$s: pthread_get_num_resources_np returned %2$d."
+AffCantGetMaskSize           "%1$s: cannot determine proper affinity mask size."
+ParseSizeIntWarn             "%1$s=\"%2$s\": %3$s."
+ParseExtraCharsWarn          "%1$s: extra trailing characters ignored: \"%2$s\"."
+UnknownForceReduction        "%1$s: unknown method \"%2$s\"."
+TimerUseGettimeofday         "KMP_STATS_TIMER: clock_gettime is undefined, using gettimeofday."
+TimerNeedMoreParam           "KMP_STATS_TIMER: \"%1$s\" needs additional parameter, e.g. 'clock_gettime,2'. Using gettimeofday."
+TimerInvalidParam            "KMP_STATS_TIMER: clock_gettime parameter \"%1$s\" is invalid, using gettimeofday."
+TimerGettimeFailed           "KMP_STATS_TIMER: clock_gettime failed, using gettimeofday."
+TimerUnknownFunction         "KMP_STATS_TIMER: clock function unknown (ignoring value \"%1$s\")."
+UnknownSchedTypeDetected     "Unknown scheduling type detected."
+DispatchManyThreads          "Too many threads to use analytical guided scheduling - switching to iterative guided scheduling."
+IttLookupFailed              "ittnotify: Lookup of \"%1$s\" function in \"%2$s\" library failed."
+IttLoadLibFailed             "ittnotify: Loading \"%1$s\" library failed."
+IttAllNotifDisabled          "ittnotify: All itt notifications disabled."
+IttObjNotifDisabled          "ittnotify: Object state itt notifications disabled."
+IttMarkNotifDisabled         "ittnotify: Mark itt notifications disabled."
+IttUnloadLibFailed           "ittnotify: Unloading \"%1$s\" library failed."
+CantFormThrTeam              "Cannot form a team with %1$d threads, using %2$d instead."
+ActiveLevelsNegative         "Requested number of active parallel levels \"%1$d\" is negative; ignored."
+ActiveLevelsExceedLimit      "Requested number of active parallel levels \"%1$d\" exceeds supported limit; "
+                             "the following limit value will be used: \"%2$d\"."
+SetLibraryIncorrectCall      "kmp_set_library must only be called from the top level serial thread; ignored."
+FatalSysError                "Fatal system error detected."
+OutOfHeapMemory              "Out of heap memory."
+OBSOLETE                     "Clearing __KMP_REGISTERED_LIB env var failed."
+OBSOLETE                     "Registering library with env var failed."
+Using_int_Value              "%1$s value \"%2$d\" will be used."
+Using_uint_Value             "%1$s value \"%2$u\" will be used."
+Using_uint64_Value           "%1$s value \"%2$s\" will be used."
+Using_str_Value              "%1$s value \"%2$s\" will be used."
+MaxValueUsing                "%1$s maximum value \"%2$d\" will be used."
+MinValueUsing                "%1$s minimum value \"%2$d\" will be used."
+MemoryAllocFailed            "Memory allocation failed."
+FileNameTooLong              "File name too long."
+OBSOLETE                     "Lock table overflow."
+ManyThreadsForTPDirective    "Too many threads to use threadprivate directive."
+AffinityInvalidMask          "%1$s: invalid mask."
+WrongDefinition              "Wrong definition."
+TLSSetValueFailed            "Windows* OS: TLS Set Value failed."
+TLSOutOfIndexes              "Windows* OS: TLS out of indexes."
+OBSOLETE                     "PDONE directive must be nested within a DO directive."
+CantGetNumAvailCPU           "Cannot get number of available CPUs."
+AssumedNumCPU                "Assumed number of CPUs is 2."
+ErrorInitializeAffinity      "Error initializing affinity - not using affinity."
+AffThreadsMayMigrate         "Threads may migrate across all available OS procs (granularity setting too coarse)."
+AffIgnoreInvalidProcID       "Ignoring invalid OS proc ID %1$d."
+AffNoValidProcID             "No valid OS proc IDs specified - not using affinity."
+UsingFlatOS                  "%1$s - using \"flat\" OS <-> physical proc mapping."
+UsingFlatOSFile              "%1$s: %2$s - using \"flat\" OS <-> physical proc mapping."
+UsingFlatOSFileLine          "%1$s, line %2$d: %3$s - using \"flat\" OS <-> physical proc mapping."
+FileMsgExiting               "%1$s: %2$s - exiting."
+FileLineMsgExiting           "%1$s, line %2$d: %3$s - exiting."
+ConstructIdentInvalid        "Construct identifier invalid."
+ThreadIdentInvalid           "Thread identifier invalid."
+RTLNotInitialized            "Runtime library not initialized."
+TPCommonBlocksInconsist      "Inconsistent THREADPRIVATE common block declarations are non-conforming "
+                             "and are unsupported. Either all threadprivate common blocks must be declared "
+                             "identically, or the largest instance of each threadprivate common block "
+                             "must be referenced first during the run."
+CantSetThreadAffMask         "Cannot set thread affinity mask."
+CantSetThreadPriority        "Cannot set thread priority."
+CantCreateThread             "Cannot create thread."
+CantCreateEvent              "Cannot create event."
+CantSetEvent                 "Cannot set event."
+CantCloseHandle              "Cannot close handle."
+UnknownLibraryType           "Unknown library type: %1$d."
+ReapMonitorError             "Monitor did not reap properly."
+ReapWorkerError              "Worker thread failed to join."
+ChangeThreadAffMaskError     "Cannot change thread affinity mask."
+ThreadsMigrate               "%1$s: Threads may migrate across %2$d innermost levels of machine"
+DecreaseToThreads            "%1$s: decrease to %2$d threads"
+IncreaseToThreads            "%1$s: increase to %2$d threads"
+OBSOLETE                     "%1$s: Internal thread %2$d bound to OS proc set %3$s"
+AffCapableUseCpuinfo         "%1$s: Affinity capable, using cpuinfo file"
+AffUseGlobCpuid              "%1$s: Affinity capable, using global cpuid info"
+AffCapableUseFlat            "%1$s: Affinity capable, using default \"flat\" topology"
+AffNotCapableUseLocCpuid     "%1$s: Affinity not capable, using local cpuid info"
+AffNotCapableUseCpuinfo      "%1$s: Affinity not capable, using cpuinfo file"
+AffFlatTopology              "%1$s: Affinity not capable, assuming \"flat\" topology"
+InitOSProcSetRespect         "%1$s: Initial OS proc set respected: %2$s"
+InitOSProcSetNotRespect      "%1$s: Initial OS proc set not respected: %2$s"
+AvailableOSProc              "%1$s: %2$d available OS procs"
+Uniform                      "%1$s: Uniform topology"
+NonUniform                   "%1$s: Nonuniform topology"
+Topology                     "%1$s: %2$d packages x %3$d cores/pkg x %4$d threads/core (%5$d total cores)"
+OBSOLETE                     "%1$s: OS proc to physical thread map ([] => level not in map):"
+OSProcToPackage              "%1$s: OS proc <n> maps to <n>th package core 0"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d [core %4$d] [thread %5$d]"
+OBSOLETE                     "%1$s: OS proc %2$d maps to [package %3$d] [core %4$d] [thread %5$d]"
+OBSOLETE                     "%1$s: OS proc %2$d maps to [package %3$d] [core %4$d] thread %5$d"
+OBSOLETE                     "%1$s: OS proc %2$d maps to [package %3$d] core %4$d [thread %5$d]"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d [core %4$d] [thread %5$d]"
+OBSOLETE                     "%1$s: OS proc %2$d maps to [package %3$d] core %4$d thread %5$d"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d core %4$d [thread %5$d]"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d [core %4$d] thread %5$d"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d core %4$d thread %5$d"
+OSProcMapToPack              "%1$s: OS proc %2$d maps to %3$s"
+OBSOLETE                     "%1$s: Internal thread %2$d changed affinity mask from %3$s to %4$s"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d, CPU %4$d, TPU %5$d"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d, CPU %4$d"
+OBSOLETE                     "%1$s: HT enabled; %2$d packages; %3$d TPU; %4$d TPUs per package"
+OBSOLETE                     "%1$s: HT disabled; %2$d packages"
+BarriersInDifferentOrder     "Threads encountered barriers in different order. "
+FunctionError                "Function %1$s failed:"
+TopologyExtra                "%1$s: %2$s packages x %3$d cores/pkg x %4$d threads/core (%5$d total cores)"
+WrongMessageCatalog          "Incompatible message catalog \"%1$s\": Version \"%2$s\" found, version \"%3$s\" expected."
+StgIgnored                   "%1$s: ignored because %2$s has been defined"
+                                 # %1 -- name of ignored variable, %2 -- name of variable with higher priority.
+OBSOLETE                     "%1$s: overrides %3$s specified before"
+                                 # %1, %2 -- name and value of the overriding variable, %3 -- name of overridden variable.
+
+# --- OpenMP errors detected at runtime ---
+#
+#    %1 is the name of OpenMP construct (formatted with "Pragma" format).
+#
+CnsBoundToWorksharing        "%1$s must be bound to a work-sharing or work-queuing construct with an \"ordered\" clause"
+CnsDetectedEnd               "Detected end of %1$s without first executing a corresponding beginning."
+CnsIterationRangeTooLarge    "Iteration range too large in %1$s."
+CnsLoopIncrZeroProhibited    "%1$s must not have a loop increment that evaluates to zero."
+#
+#    %1 is the name of the first OpenMP construct, %2 -- the name of the second one (both formatted with "Pragma" format).
+#
+CnsExpectedEnd               "Expected end of %1$s; %2$s, however, has most recently begun execution."
+CnsInvalidNesting            "%1$s is incorrectly nested within %2$s"
+CnsMultipleNesting           "%1$s cannot be executed multiple times during execution of one parallel iteration/section of %2$s"
+CnsNestingSameName           "%1$s is incorrectly nested within %2$s of the same name"
+CnsNoOrderedClause           "%1$s is incorrectly nested within %2$s that does not have an \"ordered\" clause"
+CnsNotInTaskConstruct        "%1$s is incorrectly nested within %2$s but not within any of its \"task\" constructs"
+CnsThreadsAtBarrier          "One thread at %1$s while another thread is at %2$s."
+
+# New errors
+CantConnect                  "Cannot connect to %1$s"
+CantConnectUsing             "Cannot connect to %1$s - Using %2$s"
+LibNotSupport                "%1$s does not support %2$s. Continuing without using %2$s."
+LibNotSupportFor             "%1$s does not support %2$s for %3$s. Continuing without using %2$s."
+StaticLibNotSupport          "Static %1$s does not support %2$s. Continuing without using %2$s."
+OBSOLETE                     "KMP_DYNAMIC_MODE=irml cannot be used with KMP_USE_IRML=0"
+IttUnknownGroup              "ittnotify: Unknown group \"%2$s\" specified in environment variable \"%1$s\"."
+IttEnvVarTooLong             "ittnotify: Environment variable \"%1$s\" too long: Actual length is %2$lu, max allowed length is %3$lu."
+AffUseGlobCpuidL11           "%1$s: Affinity capable, using global cpuid leaf 11 info"
+AffNotCapableUseLocCpuidL11  "%1$s: Affinity not capable, using local cpuid leaf 11 info"
+AffInfoStr                   "%1$s: %2$s."
+AffInfoStrStr                "%1$s: %2$s - %3$s."
+OSProcToPhysicalThreadMap    "%1$s: OS proc to physical thread map:"
+AffUsingFlatOS               "%1$s: using \"flat\" OS <-> physical proc mapping."
+AffParseFilename             "%1$s: parsing %2$s."
+MsgExiting                   "%1$s - exiting."
+IncompatibleLibrary          "Incompatible %1$s library with version %2$s found."
+IttFunctionError             "ittnotify: Function %1$s failed:"
+IttUnknownError              "ittnotify: Error #%1$d."
+EnvMiddleWarn                "%1$s must be set prior to first parallel region or certain API calls; ignored."
+CnsLockNotDestroyed          "Lock initialized at %1$s(%2$d) was not destroyed"
+                                 # %1, %2, %3, %4 -- file, line, func, col
+CantLoadBalUsing             "Cannot determine machine load balance - Using %1$s"
+AffNotCapableUsePthread      "%1$s: Affinity not capable, using pthread info"
+AffUsePthread                "%1$s: Affinity capable, using pthread info"
+OBSOLETE                     "Loading \"%1$s\" library failed:"
+OBSOLETE                     "Lookup of \"%1$s\" function failed:"
+OBSOLETE                     "Buffer too small."
+OBSOLETE                     "Error #%1$d."
+NthSyntaxError               "%1$s: Invalid symbols found. Check the value \"%2$s\"."
+NthSpacesNotAllowed          "%1$s: Spaces between digits are not allowed \"%2$s\"."
+AffStrParseFilename          "%1$s: %2$s - parsing %3$s."
+OBSOLETE                     "%1$s cannot be specified via kmp_set_defaults() on this machine because it has more than one processor group."
+AffTypeCantUseMultGroups     "Cannot use affinity type \"%1$s\" with multiple Windows* OS processor groups, using \"%2$s\"."
+AffGranCantUseMultGroups     "Cannot use affinity granularity \"%1$s\" with multiple Windows* OS processor groups, using \"%2$s\"."
+AffWindowsProcGroupMap       "%1$s: Mapping Windows* OS processor group <i> proc <j> to OS proc 64*<i>+<j>."
+AffOSProcToGroup             "%1$s: OS proc %2$d maps to Windows* OS processor group %3$d proc %4$d"
+AffBalancedNotAvail          "%1$s: Affinity balanced is not available."
+OBSOLETE                     "%1$s: granularity=core will be used."
+EnvLockWarn                  "%1$s must be set prior to first OMP lock call or critical section; ignored."
+FutexNotSupported            "futex system call not supported; %1$s=%2$s ignored."
+AffGranUsing                 "%1$s: granularity=%2$s will be used."
+AffThrPlaceInvalid           "%1$s: invalid value \"%2$s\", valid format is \"nC,mT[,kO]\"."
+AffThrPlaceUnsupported       "KMP_PLACE_THREADS ignored: unsupported architecture."
+AffThrPlaceManyCores         "KMP_PLACE_THREADS ignored: too many cores requested."
+SyntaxErrorUsing             "%1$s: syntax error, using %2$s."
+AdaptiveNotSupported         "%1$s: Adaptive locks are not supported; using queuing."
+EnvSyntaxError               "%1$s: Invalid symbols found. Check the value \"%2$s\"."
+EnvSpacesNotAllowed          "%1$s: Spaces between digits are not allowed \"%2$s\"."
+BoundToOSProcSet             "%1$s: pid %2$d thread %3$d bound to OS proc set %4$s"
+CnsLoopIncrIllegal           "%1$s error: parallel loop increment and condition are inconsistent."
+NoGompCancellation           "libgomp cancellation is not currently supported."
+AffThrPlaceNonUniform        "KMP_PLACE_THREADS ignored: non-uniform topology."
+AffThrPlaceNonThreeLevel     "KMP_PLACE_THREADS ignored: only three-level topology is supported."
+AffGranTopGroup              "%1$s: granularity=%2$s is not supported with KMP_TOPOLOGY_METHOD=group. Using \"granularity=fine\"."
+AffGranGroupType             "%1$s: granularity=group is not supported with KMP_AFFINITY=%2$s. Using \"granularity=core\"."
+
+
+# --------------------------------------------------------------------------------------------------
+-*- HINTS -*-
+# --------------------------------------------------------------------------------------------------
+
+# Hints. Hint may be printed after a message. Usually it is longer explanation text or suggestion.
+# To maintain hint numbers (they are visible to customers), add new hints to the end.
+
+SubmitBugReport              "Please submit a bug report with this message, compile and run "
+                             "commands used, and machine configuration info including native "
+                             "compiler and operating system versions. Faster response will be "
+                             "obtained by including all program sources. For information on "
+                             "submitting this issue, please see "
+                             "http://www.intel.com/software/products/support/."
+OBSOLETE                     "Check NLSPATH environment variable, its value is \"%1$s\"."
+ChangeStackLimit             "Please try changing the shell stack limit or adjusting the "
+                             "OMP_STACKSIZE environment variable."
+Unset_ALL_THREADS            "Consider unsetting KMP_ALL_THREADS and OMP_THREAD_LIMIT (if either is set)."
+Set_ALL_THREADPRIVATE        "Consider setting KMP_ALL_THREADPRIVATE to a value larger than %1$d."
+PossibleSystemLimitOnThreads "This could also be due to a system-related limit on the number of threads."
+DuplicateLibrary             "This means that multiple copies of the OpenMP runtime have been "
+                             "linked into the program. That is dangerous, since it can degrade "
+                             "performance or cause incorrect results. "
+                             "The best thing to do is to ensure that only a single OpenMP runtime is "
+                             "linked into the process, e.g. by avoiding static linking of the OpenMP "
+                             "runtime in any library. As an unsafe, unsupported, undocumented workaround "
+                             "you can set the environment variable KMP_DUPLICATE_LIB_OK=TRUE to allow "
+                             "the program to continue to execute, but that may cause crashes or "
+                             "silently produce incorrect results. "
+                             "For more information, please see http://www.intel.com/software/products/support/."
+NameComesFrom_CPUINFO_FILE   "This name is specified in environment variable KMP_CPUINFO_FILE."
+NotEnoughMemory              "It seems the application required too much memory."
+ValidBoolValues              "Use \"0\", \"FALSE\", \".F.\", \"off\", \"no\" as false values, "
+                             "\"1\", \"TRUE\", \".T.\", \"on\", \"yes\" as true values."
+BufferOverflow               "Perhaps too many threads."
+RunningAtMaxPriority         "Decrease the priority of the application. "
+                             "This will allow the monitor thread to run at a higher priority than other threads."
+ChangeMonitorStackSize       "Try changing KMP_MONITOR_STACKSIZE or the shell stack limit."
+ChangeWorkerStackSize        "Try changing OMP_STACKSIZE and/or the shell stack limit."
+IncreaseWorkerStackSize      "Try increasing OMP_STACKSIZE or the shell stack limit."
+DecreaseWorkerStackSize      "Try decreasing OMP_STACKSIZE."
+Decrease_NUM_THREADS         "Try decreasing the value of OMP_NUM_THREADS."
+IncreaseMonitorStackSize     "Try increasing KMP_MONITOR_STACKSIZE."
+DecreaseMonitorStackSize     "Try decreasing KMP_MONITOR_STACKSIZE."
+DecreaseNumberOfThreadsInUse "Try decreasing the number of threads in use simultaneously."
+DefaultScheduleKindUsed      "Will use default schedule type (%1$s)."
+GetNewerLibrary              "It could be the result of using an older OMP library with a newer "
+                             "compiler, or of memory corruption. You may check that the proper OMP "
+                             "library is linked to the application."
+CheckEnvVar                  "Check %1$s environment variable, its value is \"%2$s\"."
+OBSOLETE                     "You may want to use an %1$s library that supports %2$s interface with version %3$s."
+OBSOLETE                     "You may want to use an %1$s library with version %2$s."
+BadExeFormat                 "System error #193 is \"Bad format of EXE or DLL file\". "
+                             "Usually it means the file is found, but it is corrupted or "
+                             "a file for another architecture. "
+                             "Check whether \"%1$s\" is a file for %2$s architecture."
+SystemLimitOnThreads         "System-related limit on the number of threads."
+
+
+
+# --------------------------------------------------------------------------------------------------
+# end of file #
+# --------------------------------------------------------------------------------------------------
+
diff --git a/final/runtime/src/include/30/iomp.h.var b/final/runtime/src/include/30/iomp.h.var
new file mode 100644
index 0000000..c182486
--- /dev/null
+++ b/final/runtime/src/include/30/iomp.h.var
@@ -0,0 +1,96 @@
+/*
+ * include/30/iomp.h.var
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef __IOMP_H
+#   define __IOMP_H
+
+#   define KMP_VERSION_MAJOR    $KMP_VERSION_MAJOR
+#   define KMP_VERSION_MINOR    $KMP_VERSION_MINOR
+#   define KMP_VERSION_BUILD    $KMP_VERSION_BUILD
+#   define KMP_BUILD_DATE       "$KMP_BUILD_DATE"
+
+#   ifdef __cplusplus
+        extern "C" {
+#   endif
+
+#       define kmp_set_stacksize            kmpc_set_stacksize
+#       define kmp_set_stacksize_s          kmpc_set_stacksize_s
+#       define kmp_set_blocktime            kmpc_set_blocktime
+#       define kmp_set_library              kmpc_set_library
+#       define kmp_set_defaults             kmpc_set_defaults
+#       define kmp_set_affinity_mask_proc   kmpc_set_affinity_mask_proc
+#       define kmp_unset_affinity_mask_proc kmpc_unset_affinity_mask_proc
+#       define kmp_get_affinity_mask_proc   kmpc_get_affinity_mask_proc
+
+#       define kmp_malloc                   kmpc_malloc
+#       define kmp_calloc                   kmpc_calloc
+#       define kmp_realloc                  kmpc_realloc
+#       define kmp_free                     kmpc_free
+
+#   if defined(_WIN32)
+#       define __KAI_KMPC_CONVENTION __cdecl
+#   else
+#       define __KAI_KMPC_CONVENTION
+#   endif
+
+#   include <stdlib.h>
+    /* kmp API functions */
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_stacksize          (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize          (int);
+    extern size_t __KAI_KMPC_CONVENTION  kmp_get_stacksize_s        (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize_s        (size_t);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_blocktime          (void);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_library            (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_blocktime          (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library            (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_serial     (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_turnaround (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_throughput (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_defaults           (char const *);
+
+    /* affinity API functions */
+    typedef void * kmp_affinity_mask_t;
+
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_max_proc    (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_create_affinity_mask     (kmp_affinity_mask_t *);
+    extern void   __KAI_KMPC_CONVENTION  kmp_destroy_affinity_mask    (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_unset_affinity_mask_proc (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_mask_proc   (int, kmp_affinity_mask_t *);
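+
+    /* Illustrative sketch only (not part of the installed header): typical use
+       of the affinity API, with error handling elided.
+
+           kmp_affinity_mask_t mask;
+           kmp_create_affinity_mask(&mask);
+           kmp_set_affinity_mask_proc(0, &mask);   // add OS proc 0 to the mask
+           kmp_set_affinity(&mask);                // expected to return 0 on success
+           kmp_destroy_affinity_mask(&mask);
+    */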
+
+    extern void * __KAI_KMPC_CONVENTION  kmp_malloc  (size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_calloc  (size_t, size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_realloc (void *, size_t);
+    extern void   __KAI_KMPC_CONVENTION  kmp_free    (void *);
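+
+    /* Note: memory obtained from kmp_malloc/kmp_calloc/kmp_realloc should be
+       released with kmp_free rather than free().  Illustrative sketch only:
+
+           int *p = (int *) kmp_malloc(16 * sizeof(int));
+           kmp_free(p);
+    */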
+
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_on(void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_off(void);
+
+#   undef __KAI_KMPC_CONVENTION
+
+    /* Warning:
+       The following typedefs are not standard, are deprecated, and will be
+       removed in a future release.
+    */
+    typedef int     omp_int_t;
+    typedef double  omp_wtime_t;
+
+#   ifdef __cplusplus
+        }
+#   endif
+
+#endif /* __IOMP_H */
+
diff --git a/final/runtime/src/include/30/iomp_lib.h.var b/final/runtime/src/include/30/iomp_lib.h.var
new file mode 100644
index 0000000..9abbd60
--- /dev/null
+++ b/final/runtime/src/include/30/iomp_lib.h.var
@@ -0,0 +1,81 @@
+! include/30/iomp_lib.h.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+!***
+!*** omp_integer_kind and omp_logical_kind appear to be predefined by gcc and
+!*** gfortran (definitions do not appear in the omp.h / omp_lib.h / omp_lib.f).
+!*** omp_real_kind is not predefined, however.
+!***
+
+        integer, parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+        integer, parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+        integer, parameter :: kmp_version_build = $KMP_VERSION_BUILD
+        character(*)          kmp_build_date
+        parameter( kmp_build_date = '$KMP_BUILD_DATE' )
+
+        integer, parameter :: omp_real_kind = 4
+
+!***
+!*** kmp_* type extensions
+!***
+
+        integer, parameter :: kmp_pointer_kind       = $KMP_INT_PTR_KIND
+        integer, parameter :: kmp_size_t_kind        = $KMP_INT_PTR_KIND
+        integer, parameter :: kmp_affinity_mask_kind = $KMP_INT_PTR_KIND
+
+!***
+!*** kmp_* entry points
+!***
+
+        external kmp_set_stacksize
+        external kmp_set_stacksize_s
+        external kmp_set_blocktime
+        external kmp_set_library_serial
+        external kmp_set_library_turnaround
+        external kmp_set_library_throughput
+        external kmp_set_library
+        external kmp_set_defaults
+        external kmp_get_stacksize
+        integer kmp_get_stacksize
+        external kmp_get_stacksize_s
+        integer (kind = kmp_size_t_kind) kmp_get_stacksize_s
+        external kmp_get_blocktime
+        integer kmp_get_blocktime
+        external kmp_get_library
+        integer kmp_get_library
+        external kmp_set_affinity
+        integer kmp_set_affinity
+        external kmp_get_affinity
+        integer kmp_get_affinity
+        external kmp_get_affinity_max_proc
+        integer kmp_get_affinity_max_proc
+        external kmp_create_affinity_mask
+        external kmp_destroy_affinity_mask
+        external kmp_set_affinity_mask_proc
+        integer kmp_set_affinity_mask_proc
+        external kmp_unset_affinity_mask_proc
+        integer kmp_unset_affinity_mask_proc
+        external kmp_get_affinity_mask_proc
+        integer kmp_get_affinity_mask_proc
+        external kmp_malloc
+        integer (kind = kmp_pointer_kind) kmp_malloc
+        external kmp_calloc
+        integer (kind = kmp_pointer_kind) kmp_calloc
+        external kmp_realloc
+        integer (kind = kmp_pointer_kind) kmp_realloc
+        external kmp_free
+
+        external kmp_set_warnings_on
+        external kmp_set_warnings_off
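+
+!***
+!*** Illustrative use (a sketch, not part of this header): hold the result
+!*** of kmp_malloc in an integer of kmp_pointer_kind and release it with
+!*** kmp_free.
+!***
+!***       integer (kind=kmp_pointer_kind) p
+!***       p = kmp_malloc(1024)
+!***       call kmp_free(p)
+!***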
+
+
diff --git a/final/runtime/src/include/30/omp.h.var b/final/runtime/src/include/30/omp.h.var
new file mode 100644
index 0000000..02b049b
--- /dev/null
+++ b/final/runtime/src/include/30/omp.h.var
@@ -0,0 +1,164 @@
+/*
+ * include/30/omp.h.var
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef __OMP_H
+#   define __OMP_H
+
+#   define KMP_VERSION_MAJOR    $KMP_VERSION_MAJOR
+#   define KMP_VERSION_MINOR    $KMP_VERSION_MINOR
+#   define KMP_VERSION_BUILD    $KMP_VERSION_BUILD
+#   define KMP_BUILD_DATE       "$KMP_BUILD_DATE"
+
+#   ifdef __cplusplus
+    extern "C" {
+#   endif
+
+#       define omp_set_num_threads          ompc_set_num_threads
+#       define omp_set_dynamic              ompc_set_dynamic
+#       define omp_set_nested               ompc_set_nested
+#       define omp_set_max_active_levels    ompc_set_max_active_levels
+#       define omp_set_schedule             ompc_set_schedule
+#       define omp_get_ancestor_thread_num  ompc_get_ancestor_thread_num
+#       define omp_get_team_size            ompc_get_team_size
+
+
+#       define kmp_set_stacksize            kmpc_set_stacksize
+#       define kmp_set_stacksize_s          kmpc_set_stacksize_s
+#       define kmp_set_blocktime            kmpc_set_blocktime
+#       define kmp_set_library              kmpc_set_library
+#       define kmp_set_defaults             kmpc_set_defaults
+#       define kmp_set_affinity_mask_proc   kmpc_set_affinity_mask_proc
+#       define kmp_unset_affinity_mask_proc kmpc_unset_affinity_mask_proc
+#       define kmp_get_affinity_mask_proc   kmpc_get_affinity_mask_proc
+
+#       define kmp_malloc                   kmpc_malloc
+#       define kmp_calloc                   kmpc_calloc
+#       define kmp_realloc                  kmpc_realloc
+#       define kmp_free                     kmpc_free
+
+
+#   if defined(_WIN32)
+#       define __KAI_KMPC_CONVENTION __cdecl
+#   else
+#       define __KAI_KMPC_CONVENTION
+#   endif
+
+    /* schedule kind constants */
+    typedef enum omp_sched_t {
+        omp_sched_static  = 1,
+        omp_sched_dynamic = 2,
+        omp_sched_guided  = 3,
+        omp_sched_auto    = 4
+    } omp_sched_t;
+
+    /* set API functions */
+    extern void   __KAI_KMPC_CONVENTION  omp_set_num_threads (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_dynamic     (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_nested      (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_max_active_levels (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_schedule          (omp_sched_t, int);
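+
+    /* For example (a sketch; the meaning of the second argument depends on
+       the schedule kind -- for omp_sched_dynamic it is the chunk size):
+
+           omp_set_schedule(omp_sched_dynamic, 4);
+    */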
+
+    /* query API functions */
+    extern int    __KAI_KMPC_CONVENTION  omp_get_num_threads  (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_dynamic      (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_nested       (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_threads  (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_thread_num   (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_num_procs    (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_in_parallel      (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_in_final         (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_active_level        (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_level               (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_ancestor_thread_num (int);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_team_size           (int);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_thread_limit        (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_active_levels   (void);
+    extern void   __KAI_KMPC_CONVENTION  omp_get_schedule            (omp_sched_t *, int *);
+
+    /* lock API functions */
+    typedef struct omp_lock_t {
+        void * _lk;
+    } omp_lock_t;
+
+    extern void   __KAI_KMPC_CONVENTION  omp_init_lock    (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_lock     (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_unset_lock   (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_destroy_lock (omp_lock_t *);
+    extern int    __KAI_KMPC_CONVENTION  omp_test_lock    (omp_lock_t *);
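+
+    /* A minimal lock sketch: a lock must be initialized before use and
+       destroyed afterwards:
+
+           omp_lock_t lck;
+           omp_init_lock(&lck);
+           omp_set_lock(&lck);
+           ... critical work ...
+           omp_unset_lock(&lck);
+           omp_destroy_lock(&lck);
+    */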
+
+    /* nested lock API functions */
+    typedef struct omp_nest_lock_t {
+        void * _lk;
+    } omp_nest_lock_t;
+
+    extern void   __KAI_KMPC_CONVENTION  omp_init_nest_lock    (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_nest_lock     (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_unset_nest_lock   (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_destroy_nest_lock (omp_nest_lock_t *);
+    extern int    __KAI_KMPC_CONVENTION  omp_test_nest_lock    (omp_nest_lock_t *);
+
+    /* time API functions */
+    extern double __KAI_KMPC_CONVENTION  omp_get_wtime (void);
+    extern double __KAI_KMPC_CONVENTION  omp_get_wtick (void);
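+
+    /* Timing sketch: omp_get_wtime returns elapsed wall-clock seconds, so
+       differencing two calls times a region:
+
+           double t0 = omp_get_wtime();
+           ... timed region ...
+           double elapsed = omp_get_wtime() - t0;
+    */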
+
+#   include <stdlib.h>
+    /* kmp API functions */
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_stacksize          (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize          (int);
+    extern size_t __KAI_KMPC_CONVENTION  kmp_get_stacksize_s        (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize_s        (size_t);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_blocktime          (void);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_library            (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_blocktime          (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library            (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_serial     (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_turnaround (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_throughput (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_defaults           (char const *);
+
+    /* affinity API functions */
+    typedef void * kmp_affinity_mask_t;
+
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_max_proc    (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_create_affinity_mask     (kmp_affinity_mask_t *);
+    extern void   __KAI_KMPC_CONVENTION  kmp_destroy_affinity_mask    (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_unset_affinity_mask_proc (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+
+    extern void * __KAI_KMPC_CONVENTION  kmp_malloc  (size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_calloc  (size_t, size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_realloc (void *, size_t);
+    extern void   __KAI_KMPC_CONVENTION  kmp_free    (void *);
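+
+    /* Allocation sketch: memory obtained from kmp_malloc, kmp_calloc, or
+       kmp_realloc should be released with kmp_free rather than the C
+       library's free:
+
+           int *buf = (int *) kmp_malloc(64 * sizeof(int));
+           if (buf != NULL)
+               kmp_free(buf);
+    */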
+
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_on(void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_off(void);
+
+#   undef __KAI_KMPC_CONVENTION
+
+    /* Warning:
+       The following typedefs are nonstandard, are deprecated, and will be
+       removed in a future release.
+    */
+    typedef int     omp_int_t;
+    typedef double  omp_wtime_t;
+
+#   ifdef __cplusplus
+    }
+#   endif
+
+#endif /* __OMP_H */
+
diff --git a/final/runtime/src/include/30/omp_lib.f.var b/final/runtime/src/include/30/omp_lib.f.var
new file mode 100644
index 0000000..a825895
--- /dev/null
+++ b/final/runtime/src/include/30/omp_lib.f.var
@@ -0,0 +1,633 @@
+! include/30/omp_lib.f.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+!***
+!*** Some of the directives for the following routines extend past column 72,
+!*** so process this file in 132-column mode.
+!***
+
+!dec$ fixedformlinesize:132
+
+      module omp_lib_kinds
+
+        integer, parameter :: omp_integer_kind       = 4
+        integer, parameter :: omp_logical_kind       = 4
+        integer, parameter :: omp_real_kind          = 4
+        integer, parameter :: omp_lock_kind          = int_ptr_kind()
+        integer, parameter :: omp_nest_lock_kind     = int_ptr_kind()
+        integer, parameter :: omp_sched_kind         = omp_integer_kind
+        integer, parameter :: kmp_pointer_kind       = int_ptr_kind()
+        integer, parameter :: kmp_size_t_kind        = int_ptr_kind()
+        integer, parameter :: kmp_affinity_mask_kind = int_ptr_kind()
+
+      end module omp_lib_kinds
+
+      module omp_lib
+
+        use omp_lib_kinds
+
+        integer (kind=omp_integer_kind), parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_build = $KMP_VERSION_BUILD
+        character(*), parameter :: kmp_build_date    = '$KMP_BUILD_DATE'
+        integer (kind=omp_integer_kind), parameter :: openmp_version    = $OMP_VERSION
+
+        integer(kind=omp_sched_kind), parameter :: omp_sched_static  = 1
+        integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
+        integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
+        integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
+
+        interface
+
+!         ***
+!         *** omp_* entry points
+!         ***
+
+          subroutine omp_set_num_threads(nthreads)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) nthreads
+          end subroutine omp_set_num_threads
+
+          subroutine omp_set_dynamic(enable)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) enable
+          end subroutine omp_set_dynamic
+
+          subroutine omp_set_nested(enable)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) enable
+          end subroutine omp_set_nested
+
+          function omp_get_num_threads()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_threads
+          end function omp_get_num_threads
+
+          function omp_get_max_threads()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_threads
+          end function omp_get_max_threads
+
+          function omp_get_thread_num()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_num
+          end function omp_get_thread_num
+
+          function omp_get_num_procs()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_procs
+          end function omp_get_num_procs
+
+          function omp_in_parallel()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_in_parallel
+          end function omp_in_parallel
+
+          function omp_get_dynamic()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_dynamic
+          end function omp_get_dynamic
+
+          function omp_get_nested()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_nested
+          end function omp_get_nested
+
+          function omp_get_thread_limit()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_limit
+          end function omp_get_thread_limit
+
+          subroutine omp_set_max_active_levels(max_levels)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) max_levels
+          end subroutine omp_set_max_active_levels
+
+          function omp_get_max_active_levels()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_active_levels
+          end function omp_get_max_active_levels
+
+          function omp_get_level()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_level
+          end function omp_get_level
+
+          function omp_get_active_level()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_active_level
+          end function omp_get_active_level
+
+          function omp_get_ancestor_thread_num(level)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) level
+            integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+          end function omp_get_ancestor_thread_num
+
+          function omp_get_team_size(level)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) level
+            integer (kind=omp_integer_kind) omp_get_team_size
+          end function omp_get_team_size
+
+          subroutine omp_set_schedule(kind, modifier)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind) kind
+            integer (kind=omp_integer_kind) modifier
+          end subroutine omp_set_schedule
+
+          subroutine omp_get_schedule(kind, modifier)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind) kind
+            integer (kind=omp_integer_kind) modifier
+          end subroutine omp_get_schedule
+
+          function omp_get_wtime()
+            double precision omp_get_wtime
+          end function omp_get_wtime
+
+          function omp_get_wtick ()
+            double precision omp_get_wtick
+          end function omp_get_wtick
+
+          subroutine omp_init_lock(lockvar)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_init_lock
+
+          subroutine omp_destroy_lock(lockvar)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_destroy_lock
+
+          subroutine omp_set_lock(lockvar)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_set_lock
+
+          subroutine omp_unset_lock(lockvar)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_unset_lock
+
+          function omp_test_lock(lockvar)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_test_lock
+            integer (kind=omp_lock_kind) lockvar
+          end function omp_test_lock
+
+          subroutine omp_init_nest_lock(lockvar)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_init_nest_lock
+
+          subroutine omp_destroy_nest_lock(lockvar)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_destroy_nest_lock
+
+          subroutine omp_set_nest_lock(lockvar)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_set_nest_lock
+
+          subroutine omp_unset_nest_lock(lockvar)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_unset_nest_lock
+
+          function omp_test_nest_lock(lockvar)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_test_nest_lock
+            integer (kind=omp_nest_lock_kind) lockvar
+          end function omp_test_nest_lock
+
+!         ***
+!         *** kmp_* entry points
+!         ***
+
+          subroutine kmp_set_stacksize(size)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) size
+          end subroutine kmp_set_stacksize
+
+          subroutine kmp_set_stacksize_s(size)
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind) size
+          end subroutine kmp_set_stacksize_s
+
+          subroutine kmp_set_blocktime(msec)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) msec
+          end subroutine kmp_set_blocktime
+
+          subroutine kmp_set_library_serial()
+          end subroutine kmp_set_library_serial
+
+          subroutine kmp_set_library_turnaround()
+          end subroutine kmp_set_library_turnaround
+
+          subroutine kmp_set_library_throughput()
+          end subroutine kmp_set_library_throughput
+
+          subroutine kmp_set_library(libnum)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) libnum
+          end subroutine kmp_set_library
+
+          subroutine kmp_set_defaults(string)
+            character*(*) string
+          end subroutine kmp_set_defaults
+
+          function kmp_get_stacksize()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_stacksize
+          end function kmp_get_stacksize
+
+          function kmp_get_stacksize_s()
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind) kmp_get_stacksize_s
+          end function kmp_get_stacksize_s
+
+          function kmp_get_blocktime()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_blocktime
+          end function kmp_get_blocktime
+
+          function kmp_get_library()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_library
+          end function kmp_get_library
+
+          function kmp_set_affinity(mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity
+
+          function kmp_get_affinity(mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity
+
+          function kmp_get_affinity_max_proc()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_max_proc
+          end function kmp_get_affinity_max_proc
+
+          subroutine kmp_create_affinity_mask(mask)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_create_affinity_mask
+
+          subroutine kmp_destroy_affinity_mask(mask)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_destroy_affinity_mask
+
+          function kmp_set_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity_mask_proc
+
+          function kmp_unset_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_unset_affinity_mask_proc
+
+          function kmp_get_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity_mask_proc
+
+          function kmp_malloc(size)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_malloc
+            integer (kind=kmp_size_t_kind) size
+          end function kmp_malloc
+
+          function kmp_calloc(nelem, elsize)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_calloc
+            integer (kind=kmp_size_t_kind) nelem
+            integer (kind=kmp_size_t_kind) elsize
+          end function kmp_calloc
+
+          function kmp_realloc(ptr, size)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_realloc
+            integer (kind=kmp_pointer_kind) ptr
+            integer (kind=kmp_size_t_kind) size
+          end function kmp_realloc
+
+          subroutine kmp_free(ptr)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) ptr
+          end subroutine kmp_free
+
+          subroutine kmp_set_warnings_on()
+          end subroutine kmp_set_warnings_on
+
+          subroutine kmp_set_warnings_off()
+          end subroutine kmp_set_warnings_off
+
+        end interface
+
+!dec$ if defined(_WIN32)
+!dec$   if defined(_WIN64) .or. defined(_M_AMD64)
+
+!***
+!*** The Fortran entry points must be in uppercase, even if the /Qlowercase
+!*** option is specified.  The alias attribute ensures that the specified
+!*** string is used as the entry point.
+!***
+!*** On the Windows* OS IA-32 architecture, the Fortran entry points have an
+!*** underscore prepended.  On the Windows* OS Intel(R) 64
+!*** architecture, no underscore is prepended.
+!***
+
+!dec$ attributes alias:'OMP_SET_NUM_THREADS' :: omp_set_num_threads
+!dec$ attributes alias:'OMP_SET_DYNAMIC' :: omp_set_dynamic
+!dec$ attributes alias:'OMP_SET_NESTED' :: omp_set_nested
+!dec$ attributes alias:'OMP_GET_NUM_THREADS' :: omp_get_num_threads
+!dec$ attributes alias:'OMP_GET_MAX_THREADS' :: omp_get_max_threads
+!dec$ attributes alias:'OMP_GET_THREAD_NUM' :: omp_get_thread_num
+!dec$ attributes alias:'OMP_GET_NUM_PROCS' :: omp_get_num_procs
+!dec$ attributes alias:'OMP_IN_PARALLEL' :: omp_in_parallel
+!dec$ attributes alias:'OMP_GET_DYNAMIC' :: omp_get_dynamic
+!dec$ attributes alias:'OMP_GET_NESTED' :: omp_get_nested
+!dec$ attributes alias:'OMP_GET_THREAD_LIMIT' :: omp_get_thread_limit
+!dec$ attributes alias:'OMP_SET_MAX_ACTIVE_LEVELS' :: omp_set_max_active_levels
+!dec$ attributes alias:'OMP_GET_MAX_ACTIVE_LEVELS' :: omp_get_max_active_levels
+!dec$ attributes alias:'OMP_GET_LEVEL' :: omp_get_level
+!dec$ attributes alias:'OMP_GET_ACTIVE_LEVEL' :: omp_get_active_level
+!dec$ attributes alias:'OMP_GET_ANCESTOR_THREAD_NUM' :: omp_get_ancestor_thread_num
+!dec$ attributes alias:'OMP_GET_TEAM_SIZE' :: omp_get_team_size
+!dec$ attributes alias:'OMP_SET_SCHEDULE' :: omp_set_schedule
+!dec$ attributes alias:'OMP_GET_SCHEDULE' :: omp_get_schedule
+!dec$ attributes alias:'OMP_GET_WTIME' :: omp_get_wtime
+!dec$ attributes alias:'OMP_GET_WTICK' :: omp_get_wtick
+
+!dec$ attributes alias:'omp_init_lock' :: omp_init_lock
+!dec$ attributes alias:'omp_destroy_lock' :: omp_destroy_lock
+!dec$ attributes alias:'omp_set_lock' :: omp_set_lock
+!dec$ attributes alias:'omp_unset_lock' :: omp_unset_lock
+!dec$ attributes alias:'omp_test_lock' :: omp_test_lock
+!dec$ attributes alias:'omp_init_nest_lock' :: omp_init_nest_lock
+!dec$ attributes alias:'omp_destroy_nest_lock' :: omp_destroy_nest_lock
+!dec$ attributes alias:'omp_set_nest_lock' :: omp_set_nest_lock
+!dec$ attributes alias:'omp_unset_nest_lock' :: omp_unset_nest_lock
+!dec$ attributes alias:'omp_test_nest_lock' :: omp_test_nest_lock
+
+!dec$ attributes alias:'KMP_SET_STACKSIZE'::kmp_set_stacksize
+!dec$ attributes alias:'KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s
+!dec$ attributes alias:'KMP_SET_BLOCKTIME'::kmp_set_blocktime
+!dec$ attributes alias:'KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial
+!dec$ attributes alias:'KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround
+!dec$ attributes alias:'KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput
+!dec$ attributes alias:'KMP_SET_LIBRARY'::kmp_set_library
+!dec$ attributes alias:'KMP_GET_STACKSIZE'::kmp_get_stacksize
+!dec$ attributes alias:'KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s
+!dec$ attributes alias:'KMP_GET_BLOCKTIME'::kmp_get_blocktime
+!dec$ attributes alias:'KMP_GET_LIBRARY'::kmp_get_library
+!dec$ attributes alias:'KMP_SET_AFFINITY'::kmp_set_affinity
+!dec$ attributes alias:'KMP_GET_AFFINITY'::kmp_get_affinity
+!dec$ attributes alias:'KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask
+!dec$ attributes alias:'KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'KMP_MALLOC'::kmp_malloc
+!dec$ attributes alias:'KMP_CALLOC'::kmp_calloc
+!dec$ attributes alias:'KMP_REALLOC'::kmp_realloc
+!dec$ attributes alias:'KMP_FREE'::kmp_free
+
+!dec$ attributes alias:'KMP_SET_WARNINGS_ON'::kmp_set_warnings_on
+!dec$ attributes alias:'KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off
+
+!dec$   else
+
+!***
+!*** On Windows* OS IA-32 architecture, the Fortran entry points have an underscore prepended.
+!***
+
+!dec$ attributes alias:'_OMP_SET_NUM_THREADS' :: omp_set_num_threads
+!dec$ attributes alias:'_OMP_SET_DYNAMIC' :: omp_set_dynamic
+!dec$ attributes alias:'_OMP_SET_NESTED' :: omp_set_nested
+!dec$ attributes alias:'_OMP_GET_NUM_THREADS' :: omp_get_num_threads
+!dec$ attributes alias:'_OMP_GET_MAX_THREADS' :: omp_get_max_threads
+!dec$ attributes alias:'_OMP_GET_THREAD_NUM' :: omp_get_thread_num
+!dec$ attributes alias:'_OMP_GET_NUM_PROCS' :: omp_get_num_procs
+!dec$ attributes alias:'_OMP_IN_PARALLEL' :: omp_in_parallel
+!dec$ attributes alias:'_OMP_GET_DYNAMIC' :: omp_get_dynamic
+!dec$ attributes alias:'_OMP_GET_NESTED' :: omp_get_nested
+!dec$ attributes alias:'_OMP_GET_THREAD_LIMIT' :: omp_get_thread_limit
+!dec$ attributes alias:'_OMP_SET_MAX_ACTIVE_LEVELS' :: omp_set_max_active_levels
+!dec$ attributes alias:'_OMP_GET_MAX_ACTIVE_LEVELS' :: omp_get_max_active_levels
+!dec$ attributes alias:'_OMP_GET_LEVEL' :: omp_get_level
+!dec$ attributes alias:'_OMP_GET_ACTIVE_LEVEL' :: omp_get_active_level
+!dec$ attributes alias:'_OMP_GET_ANCESTOR_THREAD_NUM' :: omp_get_ancestor_thread_num
+!dec$ attributes alias:'_OMP_GET_TEAM_SIZE' :: omp_get_team_size
+!dec$ attributes alias:'_OMP_SET_SCHEDULE' :: omp_set_schedule
+!dec$ attributes alias:'_OMP_GET_SCHEDULE' :: omp_get_schedule
+!dec$ attributes alias:'_OMP_GET_WTIME' :: omp_get_wtime
+!dec$ attributes alias:'_OMP_GET_WTICK' :: omp_get_wtick
+
+!dec$ attributes alias:'_omp_init_lock' :: omp_init_lock
+!dec$ attributes alias:'_omp_destroy_lock' :: omp_destroy_lock
+!dec$ attributes alias:'_omp_set_lock' :: omp_set_lock
+!dec$ attributes alias:'_omp_unset_lock' :: omp_unset_lock
+!dec$ attributes alias:'_omp_test_lock' :: omp_test_lock
+!dec$ attributes alias:'_omp_init_nest_lock' :: omp_init_nest_lock
+!dec$ attributes alias:'_omp_destroy_nest_lock' :: omp_destroy_nest_lock
+!dec$ attributes alias:'_omp_set_nest_lock' :: omp_set_nest_lock
+!dec$ attributes alias:'_omp_unset_nest_lock' :: omp_unset_nest_lock
+!dec$ attributes alias:'_omp_test_nest_lock' :: omp_test_nest_lock
+
+!dec$ attributes alias:'_KMP_SET_STACKSIZE'::kmp_set_stacksize
+!dec$ attributes alias:'_KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s
+!dec$ attributes alias:'_KMP_SET_BLOCKTIME'::kmp_set_blocktime
+!dec$ attributes alias:'_KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial
+!dec$ attributes alias:'_KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround
+!dec$ attributes alias:'_KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput
+!dec$ attributes alias:'_KMP_SET_LIBRARY'::kmp_set_library
+!dec$ attributes alias:'_KMP_GET_STACKSIZE'::kmp_get_stacksize
+!dec$ attributes alias:'_KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s
+!dec$ attributes alias:'_KMP_GET_BLOCKTIME'::kmp_get_blocktime
+!dec$ attributes alias:'_KMP_GET_LIBRARY'::kmp_get_library
+!dec$ attributes alias:'_KMP_SET_AFFINITY'::kmp_set_affinity
+!dec$ attributes alias:'_KMP_GET_AFFINITY'::kmp_get_affinity
+!dec$ attributes alias:'_KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'_KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask
+!dec$ attributes alias:'_KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'_KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'_KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'_KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'_KMP_MALLOC'::kmp_malloc
+!dec$ attributes alias:'_KMP_CALLOC'::kmp_calloc
+!dec$ attributes alias:'_KMP_REALLOC'::kmp_realloc
+!dec$ attributes alias:'_KMP_FREE'::kmp_free
+
+!dec$ attributes alias:'_KMP_SET_WARNINGS_ON'::kmp_set_warnings_on
+!dec$ attributes alias:'_KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off
+
+!dec$   endif
+!dec$ endif
+
+!dec$ if defined(__linux)
+
+!***
+!*** The Linux* OS entry points are in lowercase, with an underscore appended.
+!***
+
+!dec$ attributes alias:'omp_set_num_threads_'::omp_set_num_threads
+!dec$ attributes alias:'omp_set_dynamic_'::omp_set_dynamic
+!dec$ attributes alias:'omp_set_nested_'::omp_set_nested
+!dec$ attributes alias:'omp_get_num_threads_'::omp_get_num_threads
+!dec$ attributes alias:'omp_get_max_threads_'::omp_get_max_threads
+!dec$ attributes alias:'omp_get_thread_num_'::omp_get_thread_num
+!dec$ attributes alias:'omp_get_num_procs_'::omp_get_num_procs
+!dec$ attributes alias:'omp_in_parallel_'::omp_in_parallel
+!dec$ attributes alias:'omp_get_dynamic_'::omp_get_dynamic
+!dec$ attributes alias:'omp_get_nested_'::omp_get_nested
+!dec$ attributes alias:'omp_get_thread_limit_'::omp_get_thread_limit
+!dec$ attributes alias:'omp_set_max_active_levels_'::omp_set_max_active_levels
+!dec$ attributes alias:'omp_get_max_active_levels_'::omp_get_max_active_levels
+!dec$ attributes alias:'omp_get_level_'::omp_get_level
+!dec$ attributes alias:'omp_get_active_level_'::omp_get_active_level
+!dec$ attributes alias:'omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'omp_get_team_size_'::omp_get_team_size
+!dec$ attributes alias:'omp_set_schedule_'::omp_set_schedule
+!dec$ attributes alias:'omp_get_schedule_'::omp_get_schedule
+!dec$ attributes alias:'omp_get_wtime_'::omp_get_wtime
+!dec$ attributes alias:'omp_get_wtick_'::omp_get_wtick
+
+!dec$ attributes alias:'omp_init_lock_'::omp_init_lock
+!dec$ attributes alias:'omp_destroy_lock_'::omp_destroy_lock
+!dec$ attributes alias:'omp_set_lock_'::omp_set_lock
+!dec$ attributes alias:'omp_unset_lock_'::omp_unset_lock
+!dec$ attributes alias:'omp_test_lock_'::omp_test_lock
+!dec$ attributes alias:'omp_init_nest_lock_'::omp_init_nest_lock
+!dec$ attributes alias:'omp_destroy_nest_lock_'::omp_destroy_nest_lock
+!dec$ attributes alias:'omp_set_nest_lock_'::omp_set_nest_lock
+!dec$ attributes alias:'omp_unset_nest_lock_'::omp_unset_nest_lock
+!dec$ attributes alias:'omp_test_nest_lock_'::omp_test_nest_lock
+
+!dec$ attributes alias:'kmp_set_stacksize_'::kmp_set_stacksize
+!dec$ attributes alias:'kmp_set_stacksize_s_'::kmp_set_stacksize_s
+!dec$ attributes alias:'kmp_set_blocktime_'::kmp_set_blocktime
+!dec$ attributes alias:'kmp_set_library_serial_'::kmp_set_library_serial
+!dec$ attributes alias:'kmp_set_library_turnaround_'::kmp_set_library_turnaround
+!dec$ attributes alias:'kmp_set_library_throughput_'::kmp_set_library_throughput
+!dec$ attributes alias:'kmp_set_library_'::kmp_set_library
+!dec$ attributes alias:'kmp_get_stacksize_'::kmp_get_stacksize
+!dec$ attributes alias:'kmp_get_stacksize_s_'::kmp_get_stacksize_s
+!dec$ attributes alias:'kmp_get_blocktime_'::kmp_get_blocktime
+!dec$ attributes alias:'kmp_get_library_'::kmp_get_library
+!dec$ attributes alias:'kmp_set_affinity_'::kmp_set_affinity
+!dec$ attributes alias:'kmp_get_affinity_'::kmp_get_affinity
+!dec$ attributes alias:'kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'kmp_create_affinity_mask_'::kmp_create_affinity_mask
+!dec$ attributes alias:'kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'kmp_malloc_'::kmp_malloc
+!dec$ attributes alias:'kmp_calloc_'::kmp_calloc
+!dec$ attributes alias:'kmp_realloc_'::kmp_realloc
+!dec$ attributes alias:'kmp_free_'::kmp_free
+
+!dec$ attributes alias:'kmp_set_warnings_on_'::kmp_set_warnings_on
+!dec$ attributes alias:'kmp_set_warnings_off_'::kmp_set_warnings_off
+
+!dec$ endif
+
+!dec$ if defined(__APPLE__)
+
+!***
+!*** The Mac entry points are in lowercase, with both an underscore
+!*** appended and an underscore prepended.
+!***
+
+!dec$ attributes alias:'_omp_set_num_threads_'::omp_set_num_threads
+!dec$ attributes alias:'_omp_set_dynamic_'::omp_set_dynamic
+!dec$ attributes alias:'_omp_set_nested_'::omp_set_nested
+!dec$ attributes alias:'_omp_get_num_threads_'::omp_get_num_threads
+!dec$ attributes alias:'_omp_get_max_threads_'::omp_get_max_threads
+!dec$ attributes alias:'_omp_get_thread_num_'::omp_get_thread_num
+!dec$ attributes alias:'_omp_get_num_procs_'::omp_get_num_procs
+!dec$ attributes alias:'_omp_in_parallel_'::omp_in_parallel
+!dec$ attributes alias:'_omp_get_dynamic_'::omp_get_dynamic
+!dec$ attributes alias:'_omp_get_nested_'::omp_get_nested
+!dec$ attributes alias:'_omp_get_thread_limit_'::omp_get_thread_limit
+!dec$ attributes alias:'_omp_set_max_active_levels_'::omp_set_max_active_levels
+!dec$ attributes alias:'_omp_get_max_active_levels_'::omp_get_max_active_levels
+!dec$ attributes alias:'_omp_get_level_'::omp_get_level
+!dec$ attributes alias:'_omp_get_active_level_'::omp_get_active_level
+!dec$ attributes alias:'_omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'_omp_get_team_size_'::omp_get_team_size
+!dec$ attributes alias:'_omp_set_schedule_'::omp_set_schedule
+!dec$ attributes alias:'_omp_get_schedule_'::omp_get_schedule
+!dec$ attributes alias:'_omp_get_wtime_'::omp_get_wtime
+!dec$ attributes alias:'_omp_get_wtick_'::omp_get_wtick
+
+!dec$ attributes alias:'_omp_init_lock_'::omp_init_lock
+!dec$ attributes alias:'_omp_destroy_lock_'::omp_destroy_lock
+!dec$ attributes alias:'_omp_set_lock_'::omp_set_lock
+!dec$ attributes alias:'_omp_unset_lock_'::omp_unset_lock
+!dec$ attributes alias:'_omp_test_lock_'::omp_test_lock
+!dec$ attributes alias:'_omp_init_nest_lock_'::omp_init_nest_lock
+!dec$ attributes alias:'_omp_destroy_nest_lock_'::omp_destroy_nest_lock
+!dec$ attributes alias:'_omp_set_nest_lock_'::omp_set_nest_lock
+!dec$ attributes alias:'_omp_unset_nest_lock_'::omp_unset_nest_lock
+!dec$ attributes alias:'_omp_test_nest_lock_'::omp_test_nest_lock
+
+!dec$ attributes alias:'_kmp_set_stacksize_'::kmp_set_stacksize
+!dec$ attributes alias:'_kmp_set_stacksize_s_'::kmp_set_stacksize_s
+!dec$ attributes alias:'_kmp_set_blocktime_'::kmp_set_blocktime
+!dec$ attributes alias:'_kmp_set_library_serial_'::kmp_set_library_serial
+!dec$ attributes alias:'_kmp_set_library_turnaround_'::kmp_set_library_turnaround
+!dec$ attributes alias:'_kmp_set_library_throughput_'::kmp_set_library_throughput
+!dec$ attributes alias:'_kmp_set_library_'::kmp_set_library
+!dec$ attributes alias:'_kmp_get_stacksize_'::kmp_get_stacksize
+!dec$ attributes alias:'_kmp_get_stacksize_s_'::kmp_get_stacksize_s
+!dec$ attributes alias:'_kmp_get_blocktime_'::kmp_get_blocktime
+!dec$ attributes alias:'_kmp_get_library_'::kmp_get_library
+!dec$ attributes alias:'_kmp_set_affinity_'::kmp_set_affinity
+!dec$ attributes alias:'_kmp_get_affinity_'::kmp_get_affinity
+!dec$ attributes alias:'_kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'_kmp_create_affinity_mask_'::kmp_create_affinity_mask
+!dec$ attributes alias:'_kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'_kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'_kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'_kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'_kmp_malloc_'::kmp_malloc
+!dec$ attributes alias:'_kmp_calloc_'::kmp_calloc
+!dec$ attributes alias:'_kmp_realloc_'::kmp_realloc
+!dec$ attributes alias:'_kmp_free_'::kmp_free
+
+!dec$ attributes alias:'_kmp_set_warnings_on_'::kmp_set_warnings_on
+!dec$ attributes alias:'_kmp_set_warnings_off_'::kmp_set_warnings_off
+
+!dec$ endif
+
+      end module omp_lib
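+
+!***
+!*** Illustrative use of the module (a sketch, not part of the module):
+!***
+!***       use omp_lib
+!***       call omp_set_num_threads(4)
+!***       print *, omp_get_max_threads()
+!***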
+
diff --git a/final/runtime/src/include/30/omp_lib.f90.var b/final/runtime/src/include/30/omp_lib.f90.var
new file mode 100644
index 0000000..8e86697
--- /dev/null
+++ b/final/runtime/src/include/30/omp_lib.f90.var
@@ -0,0 +1,358 @@
+! include/30/omp_lib.f90.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+      module omp_lib_kinds
+
+        use, intrinsic :: iso_c_binding
+
+        integer, parameter :: omp_integer_kind       = c_int
+        integer, parameter :: omp_logical_kind       = 4
+        integer, parameter :: omp_real_kind          = c_float
+        integer, parameter :: kmp_double_kind        = c_double
+        integer, parameter :: omp_lock_kind          = c_intptr_t
+        integer, parameter :: omp_nest_lock_kind     = c_intptr_t
+        integer, parameter :: omp_sched_kind         = omp_integer_kind
+        integer, parameter :: kmp_pointer_kind       = c_intptr_t
+        integer, parameter :: kmp_size_t_kind        = c_size_t
+        integer, parameter :: kmp_affinity_mask_kind = c_intptr_t
+
+      end module omp_lib_kinds
+
+      module omp_lib
+
+        use omp_lib_kinds
+
+        integer (kind=omp_integer_kind), parameter :: openmp_version    = $OMP_VERSION
+        integer (kind=omp_integer_kind), parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_build = $KMP_VERSION_BUILD
+        character(*)               kmp_build_date
+        parameter( kmp_build_date = '$KMP_BUILD_DATE' )
+
+        integer(kind=omp_sched_kind), parameter :: omp_sched_static  = 1
+        integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
+        integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
+        integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
+
+        interface
+
+!         ***
+!         *** omp_* entry points
+!         ***
+
+          subroutine omp_set_num_threads(nthreads) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: nthreads
+          end subroutine omp_set_num_threads
+
+          subroutine omp_set_dynamic(enable) bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind), value :: enable
+          end subroutine omp_set_dynamic
+
+          subroutine omp_set_nested(enable) bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind), value :: enable
+          end subroutine omp_set_nested
+
+          function omp_get_num_threads() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_threads
+          end function omp_get_num_threads
+
+          function omp_get_max_threads() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_threads
+          end function omp_get_max_threads
+
+          function omp_get_thread_num() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_num
+          end function omp_get_thread_num
+
+          function omp_get_num_procs() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_procs
+          end function omp_get_num_procs
+
+          function omp_in_parallel() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_in_parallel
+          end function omp_in_parallel
+
+          function omp_in_final() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_in_final
+          end function omp_in_final
+
+          function omp_get_dynamic() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_dynamic
+          end function omp_get_dynamic
+
+          function omp_get_nested() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_nested
+          end function omp_get_nested
+
+          function omp_get_thread_limit() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_limit
+          end function omp_get_thread_limit
+
+          subroutine omp_set_max_active_levels(max_levels) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: max_levels
+          end subroutine omp_set_max_active_levels
+
+          function omp_get_max_active_levels() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_active_levels
+          end function omp_get_max_active_levels
+
+          function omp_get_level() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) :: omp_get_level
+          end function omp_get_level
+
+          function omp_get_active_level() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) :: omp_get_active_level
+          end function omp_get_active_level
+
+          function omp_get_ancestor_thread_num(level) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+            integer (kind=omp_integer_kind), value :: level
+          end function omp_get_ancestor_thread_num
+
+          function omp_get_team_size(level) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_team_size
+            integer (kind=omp_integer_kind), value :: level
+          end function omp_get_team_size
+
+          subroutine omp_set_schedule(kind, modifier) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind), value :: kind
+            integer (kind=omp_integer_kind), value :: modifier
+          end subroutine omp_set_schedule
+
+          subroutine omp_get_schedule(kind, modifier) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind)   :: kind
+            integer (kind=omp_integer_kind) :: modifier
+          end subroutine omp_get_schedule
+
+          function omp_get_wtime() bind(c)
+            use omp_lib_kinds
+            real (kind=kmp_double_kind) omp_get_wtime
+          end function omp_get_wtime
+
+          function omp_get_wtick() bind(c)
+            use omp_lib_kinds
+            real (kind=kmp_double_kind) omp_get_wtick
+          end function omp_get_wtick
+
+          subroutine omp_init_lock(lockvar) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_init_lock
+
+          subroutine omp_destroy_lock(lockvar) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_destroy_lock
+
+          subroutine omp_set_lock(lockvar) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_set_lock
+
+          subroutine omp_unset_lock(lockvar) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_unset_lock
+
+          function omp_test_lock(lockvar) bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_test_lock
+            integer (kind=omp_lock_kind) lockvar
+          end function omp_test_lock
+
+          subroutine omp_init_nest_lock(lockvar) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_init_nest_lock
+
+          subroutine omp_destroy_nest_lock(lockvar) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_destroy_nest_lock
+
+          subroutine omp_set_nest_lock(lockvar) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_set_nest_lock
+
+          subroutine omp_unset_nest_lock(lockvar) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_unset_nest_lock
+
+          function omp_test_nest_lock(lockvar) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_test_nest_lock
+            integer (kind=omp_nest_lock_kind) lockvar
+          end function omp_test_nest_lock
+
+!         ***
+!         *** kmp_* entry points
+!         ***
+
+          subroutine kmp_set_stacksize(size) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: size
+          end subroutine kmp_set_stacksize
+
+          subroutine kmp_set_stacksize_s(size) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind), value :: size
+          end subroutine kmp_set_stacksize_s
+
+          subroutine kmp_set_blocktime(msec) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: msec
+          end subroutine kmp_set_blocktime
+
+          subroutine kmp_set_library_serial() bind(c)
+          end subroutine kmp_set_library_serial
+
+          subroutine kmp_set_library_turnaround() bind(c)
+          end subroutine kmp_set_library_turnaround
+
+          subroutine kmp_set_library_throughput() bind(c)
+          end subroutine kmp_set_library_throughput
+
+          subroutine kmp_set_library(libnum) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: libnum
+          end subroutine kmp_set_library
+
+          subroutine kmp_set_defaults(string) bind(c)
+            use, intrinsic :: iso_c_binding
+            character (kind=c_char) :: string(*)
+          end subroutine kmp_set_defaults
+
+          function kmp_get_stacksize() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_stacksize
+          end function kmp_get_stacksize
+
+          function kmp_get_stacksize_s() bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind) kmp_get_stacksize_s
+          end function kmp_get_stacksize_s
+
+          function kmp_get_blocktime() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_blocktime
+          end function kmp_get_blocktime
+
+          function kmp_get_library() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_library
+          end function kmp_get_library
+
+          function kmp_set_affinity(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity
+
+          function kmp_get_affinity(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity
+
+          function kmp_get_affinity_max_proc() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_max_proc
+          end function kmp_get_affinity_max_proc
+
+          subroutine kmp_create_affinity_mask(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_create_affinity_mask
+
+          subroutine kmp_destroy_affinity_mask(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_destroy_affinity_mask
+
+          function kmp_set_affinity_mask_proc(proc, mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc
+            integer (kind=omp_integer_kind), value :: proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity_mask_proc
+
+          function kmp_unset_affinity_mask_proc(proc, mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc
+            integer (kind=omp_integer_kind), value :: proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_unset_affinity_mask_proc
+
+          function kmp_get_affinity_mask_proc(proc, mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc
+            integer (kind=omp_integer_kind), value :: proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity_mask_proc
+
+          function kmp_malloc(size) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_malloc
+            integer (kind=kmp_size_t_kind), value :: size
+          end function kmp_malloc
+
+          function kmp_calloc(nelem, elsize) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_calloc
+            integer (kind=kmp_size_t_kind), value :: nelem
+            integer (kind=kmp_size_t_kind), value :: elsize
+          end function kmp_calloc
+
+          function kmp_realloc(ptr, size) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_realloc
+            integer (kind=kmp_pointer_kind), value :: ptr
+            integer (kind=kmp_size_t_kind), value :: size
+          end function kmp_realloc
+
+          subroutine kmp_free(ptr) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind), value :: ptr
+          end subroutine kmp_free
+
+          subroutine kmp_set_warnings_on() bind(c)
+          end subroutine kmp_set_warnings_on
+
+          subroutine kmp_set_warnings_off() bind(c)
+          end subroutine kmp_set_warnings_off
+
+        end interface
+
+      end module omp_lib
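+
+! Illustrative lock usage (a sketch, not part of the module):
+!
+!     use omp_lib
+!     integer (kind=omp_lock_kind) :: lck
+!     call omp_init_lock(lck)
+!     call omp_set_lock(lck)
+!     ! ... critical work ...
+!     call omp_unset_lock(lck)
+!     call omp_destroy_lock(lck)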
diff --git a/final/runtime/src/include/30/omp_lib.h.var b/final/runtime/src/include/30/omp_lib.h.var
new file mode 100644
index 0000000..f8677cf
--- /dev/null
+++ b/final/runtime/src/include/30/omp_lib.h.var
@@ -0,0 +1,638 @@
+! include/30/omp_lib.h.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+!***
+!*** Some of the directives for the following routines extend past column 72,
+!*** so process this file in 132-column mode.
+!***
+
+!dec$ fixedformlinesize:132
+
+      integer, parameter :: omp_integer_kind       = 4
+      integer, parameter :: omp_logical_kind       = 4
+      integer, parameter :: omp_real_kind          = 4
+      integer, parameter :: omp_lock_kind          = int_ptr_kind()
+      integer, parameter :: omp_nest_lock_kind     = int_ptr_kind()
+      integer, parameter :: omp_sched_kind         = omp_integer_kind
+      integer, parameter :: kmp_pointer_kind       = int_ptr_kind()
+      integer, parameter :: kmp_size_t_kind        = int_ptr_kind()
+      integer, parameter :: kmp_affinity_mask_kind = int_ptr_kind()
+
+      integer(kind=omp_sched_kind), parameter :: omp_sched_static  = 1
+      integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
+      integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
+      integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
+
+      integer (kind=omp_integer_kind), parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+      integer (kind=omp_integer_kind), parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+      integer (kind=omp_integer_kind), parameter :: kmp_version_build = $KMP_VERSION_BUILD
+      character(*)               kmp_build_date
+      parameter( kmp_build_date = '$KMP_BUILD_DATE' )
+      integer (kind=omp_integer_kind), parameter :: openmp_version    = $OMP_VERSION
+
+      interface
+
+!       ***
+!       *** omp_* entry points
+!       ***
+
+        subroutine omp_set_num_threads(nthreads)
+          import
+          integer (kind=omp_integer_kind) nthreads
+        end subroutine omp_set_num_threads
+
+        subroutine omp_set_dynamic(enable)
+          import
+          logical (kind=omp_logical_kind) enable
+        end subroutine omp_set_dynamic
+
+        subroutine omp_set_nested(enable)
+          import
+          logical (kind=omp_logical_kind) enable
+        end subroutine omp_set_nested
+
+        function omp_get_num_threads()
+          import
+          integer (kind=omp_integer_kind) omp_get_num_threads
+        end function omp_get_num_threads
+
+        function omp_get_max_threads()
+          import
+          integer (kind=omp_integer_kind) omp_get_max_threads
+        end function omp_get_max_threads
+
+        function omp_get_thread_num()
+          import
+          integer (kind=omp_integer_kind) omp_get_thread_num
+        end function omp_get_thread_num
+
+        function omp_get_num_procs()
+          import
+          integer (kind=omp_integer_kind) omp_get_num_procs
+        end function omp_get_num_procs
+
+        function omp_in_parallel()
+          import
+          logical (kind=omp_logical_kind) omp_in_parallel
+        end function omp_in_parallel
+
+        function omp_in_final()
+          import
+          logical (kind=omp_logical_kind) omp_in_final
+        end function omp_in_final
+
+        function omp_get_dynamic()
+          import
+          logical (kind=omp_logical_kind) omp_get_dynamic
+        end function omp_get_dynamic
+
+        function omp_get_nested()
+          import
+          logical (kind=omp_logical_kind) omp_get_nested
+        end function omp_get_nested
+
+        function omp_get_thread_limit()
+          import
+          integer (kind=omp_integer_kind) omp_get_thread_limit
+        end function omp_get_thread_limit
+
+        subroutine omp_set_max_active_levels(max_levels)
+          import
+          integer (kind=omp_integer_kind) max_levels
+        end subroutine omp_set_max_active_levels
+
+        function omp_get_max_active_levels()
+          import
+          integer (kind=omp_integer_kind) omp_get_max_active_levels
+        end function omp_get_max_active_levels
+
+        function omp_get_level()
+          import
+          integer (kind=omp_integer_kind) omp_get_level
+        end function omp_get_level
+
+        function omp_get_active_level()
+          import
+          integer (kind=omp_integer_kind) omp_get_active_level
+        end function omp_get_active_level
+
+        function omp_get_ancestor_thread_num(level)
+          import
+          integer (kind=omp_integer_kind) level
+          integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+        end function omp_get_ancestor_thread_num
+
+        function omp_get_team_size(level)
+          import
+          integer (kind=omp_integer_kind) level
+          integer (kind=omp_integer_kind) omp_get_team_size
+        end function omp_get_team_size
+
+        subroutine omp_set_schedule(kind, modifier)
+          import
+          integer (kind=omp_sched_kind) kind
+          integer (kind=omp_integer_kind) modifier
+        end subroutine omp_set_schedule
+
+        subroutine omp_get_schedule(kind, modifier)
+          import
+          integer (kind=omp_sched_kind) kind
+          integer (kind=omp_integer_kind) modifier
+        end subroutine omp_get_schedule
+
+        function omp_get_wtime()
+          double precision omp_get_wtime
+        end function omp_get_wtime
+
+        function omp_get_wtick()
+          double precision omp_get_wtick
+        end function omp_get_wtick
+
+        subroutine omp_init_lock(lockvar)
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_init_lock
+
+        subroutine omp_destroy_lock(lockvar)
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_destroy_lock
+
+        subroutine omp_set_lock(lockvar)
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_set_lock
+
+        subroutine omp_unset_lock(lockvar)
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_unset_lock
+
+        function omp_test_lock(lockvar)
+          import
+          logical (kind=omp_logical_kind) omp_test_lock
+          integer (kind=omp_lock_kind) lockvar
+        end function omp_test_lock
+
+        subroutine omp_init_nest_lock(lockvar)
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_init_nest_lock
+
+        subroutine omp_destroy_nest_lock(lockvar)
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_destroy_nest_lock
+
+        subroutine omp_set_nest_lock(lockvar)
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_set_nest_lock
+
+        subroutine omp_unset_nest_lock(lockvar)
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_unset_nest_lock
+
+        function omp_test_nest_lock(lockvar)
+          import
+          integer (kind=omp_integer_kind) omp_test_nest_lock
+          integer (kind=omp_nest_lock_kind) lockvar
+        end function omp_test_nest_lock
+
+!       ***
+!       *** kmp_* entry points
+!       ***
+
+        subroutine kmp_set_stacksize(size)
+          import
+          integer (kind=omp_integer_kind) size
+        end subroutine kmp_set_stacksize
+
+        subroutine kmp_set_stacksize_s(size)
+          import
+          integer (kind=kmp_size_t_kind) size
+        end subroutine kmp_set_stacksize_s
+
+        subroutine kmp_set_blocktime(msec)
+          import
+          integer (kind=omp_integer_kind) msec
+        end subroutine kmp_set_blocktime
+
+        subroutine kmp_set_library_serial()
+        end subroutine kmp_set_library_serial
+
+        subroutine kmp_set_library_turnaround()
+        end subroutine kmp_set_library_turnaround
+
+        subroutine kmp_set_library_throughput()
+        end subroutine kmp_set_library_throughput
+
+        subroutine kmp_set_library(libnum)
+          import
+          integer (kind=omp_integer_kind) libnum
+        end subroutine kmp_set_library
+
+        subroutine kmp_set_defaults(string)
+          character*(*) string
+        end subroutine kmp_set_defaults
+
+        function kmp_get_stacksize()
+          import
+          integer (kind=omp_integer_kind) kmp_get_stacksize
+        end function kmp_get_stacksize
+
+        function kmp_get_stacksize_s()
+          import
+          integer (kind=kmp_size_t_kind) kmp_get_stacksize_s
+        end function kmp_get_stacksize_s
+
+        function kmp_get_blocktime()
+          import
+          integer (kind=omp_integer_kind) kmp_get_blocktime
+        end function kmp_get_blocktime
+
+        function kmp_get_library()
+          import
+          integer (kind=omp_integer_kind) kmp_get_library
+        end function kmp_get_library
+
+        function kmp_set_affinity(mask)
+          import
+          integer (kind=omp_integer_kind) kmp_set_affinity
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_set_affinity
+
+        function kmp_get_affinity(mask)
+          import
+          integer (kind=omp_integer_kind) kmp_get_affinity
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_get_affinity
+
+        function kmp_get_affinity_max_proc()
+          import
+          integer (kind=omp_integer_kind) kmp_get_affinity_max_proc
+        end function kmp_get_affinity_max_proc
+
+        subroutine kmp_create_affinity_mask(mask)
+          import
+          integer (kind=kmp_affinity_mask_kind) mask
+        end subroutine kmp_create_affinity_mask
+
+        subroutine kmp_destroy_affinity_mask(mask)
+          import
+          integer (kind=kmp_affinity_mask_kind) mask
+        end subroutine kmp_destroy_affinity_mask
+
+        function kmp_set_affinity_mask_proc(proc, mask)
+          import
+          integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc
+          integer (kind=omp_integer_kind) proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_set_affinity_mask_proc
+
+        function kmp_unset_affinity_mask_proc(proc, mask)
+          import
+          integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc
+          integer (kind=omp_integer_kind) proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_unset_affinity_mask_proc
+
+        function kmp_get_affinity_mask_proc(proc, mask)
+          import
+          integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc
+          integer (kind=omp_integer_kind) proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_get_affinity_mask_proc
+
+        function kmp_malloc(size)
+          import
+          integer (kind=kmp_pointer_kind) kmp_malloc
+          integer (kind=kmp_size_t_kind) size
+        end function kmp_malloc
+
+        function kmp_calloc(nelem, elsize)
+          import
+          integer (kind=kmp_pointer_kind) kmp_calloc
+          integer (kind=kmp_size_t_kind) nelem
+          integer (kind=kmp_size_t_kind) elsize
+        end function kmp_calloc
+
+        function kmp_realloc(ptr, size)
+          import
+          integer (kind=kmp_pointer_kind) kmp_realloc
+          integer (kind=kmp_pointer_kind) ptr
+          integer (kind=kmp_size_t_kind) size
+        end function kmp_realloc
+
+        subroutine kmp_free(ptr)
+          import
+          integer (kind=kmp_pointer_kind) ptr
+        end subroutine kmp_free
+
+        subroutine kmp_set_warnings_on()
+        end subroutine kmp_set_warnings_on
+
+        subroutine kmp_set_warnings_off()
+        end subroutine kmp_set_warnings_off
+
+      end interface
+
+!dec$ if defined(_WIN32)
+!dec$   if defined(_WIN64) .or. defined(_M_AMD64)
+
+!***
+!*** The Fortran entry points must be in uppercase, even if the /Qlowercase
+!*** option is specified.  The alias attribute ensures that the specified
+!*** string is used as the entry point.
+!***
+!*** On the Windows* OS IA-32 architecture, the Fortran entry points have an
+!*** underscore prepended.  On the Windows* OS Intel(R) 64
+!*** architecture, no underscore is prepended.
+!***
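+!*** For example, a call to omp_get_thread_num resolves here to the entry
+!*** point 'OMP_GET_THREAD_NUM', and on IA-32 to '_OMP_GET_THREAD_NUM'.
+!***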
+
+!dec$ attributes alias:'OMP_SET_NUM_THREADS'::omp_set_num_threads
+!dec$ attributes alias:'OMP_SET_DYNAMIC'::omp_set_dynamic
+!dec$ attributes alias:'OMP_SET_NESTED'::omp_set_nested
+!dec$ attributes alias:'OMP_GET_NUM_THREADS'::omp_get_num_threads
+!dec$ attributes alias:'OMP_GET_MAX_THREADS'::omp_get_max_threads
+!dec$ attributes alias:'OMP_GET_THREAD_NUM'::omp_get_thread_num
+!dec$ attributes alias:'OMP_GET_NUM_PROCS'::omp_get_num_procs
+!dec$ attributes alias:'OMP_IN_PARALLEL'::omp_in_parallel
+!dec$ attributes alias:'OMP_IN_FINAL'::omp_in_final
+!dec$ attributes alias:'OMP_GET_DYNAMIC'::omp_get_dynamic
+!dec$ attributes alias:'OMP_GET_NESTED'::omp_get_nested
+!dec$ attributes alias:'OMP_GET_THREAD_LIMIT'::omp_get_thread_limit
+!dec$ attributes alias:'OMP_SET_MAX_ACTIVE_LEVELS'::omp_set_max_active_levels
+!dec$ attributes alias:'OMP_GET_MAX_ACTIVE_LEVELS'::omp_get_max_active_levels
+!dec$ attributes alias:'OMP_GET_LEVEL'::omp_get_level
+!dec$ attributes alias:'OMP_GET_ACTIVE_LEVEL'::omp_get_active_level
+!dec$ attributes alias:'OMP_GET_ANCESTOR_THREAD_NUM'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'OMP_GET_TEAM_SIZE'::omp_get_team_size
+!dec$ attributes alias:'OMP_SET_SCHEDULE'::omp_set_schedule
+!dec$ attributes alias:'OMP_GET_SCHEDULE'::omp_get_schedule
+!dec$ attributes alias:'OMP_GET_WTIME'::omp_get_wtime
+!dec$ attributes alias:'OMP_GET_WTICK'::omp_get_wtick
+
+!dec$ attributes alias:'omp_init_lock'::omp_init_lock
+!dec$ attributes alias:'omp_destroy_lock'::omp_destroy_lock
+!dec$ attributes alias:'omp_set_lock'::omp_set_lock
+!dec$ attributes alias:'omp_unset_lock'::omp_unset_lock
+!dec$ attributes alias:'omp_test_lock'::omp_test_lock
+!dec$ attributes alias:'omp_init_nest_lock'::omp_init_nest_lock
+!dec$ attributes alias:'omp_destroy_nest_lock'::omp_destroy_nest_lock
+!dec$ attributes alias:'omp_set_nest_lock'::omp_set_nest_lock
+!dec$ attributes alias:'omp_unset_nest_lock'::omp_unset_nest_lock
+!dec$ attributes alias:'omp_test_nest_lock'::omp_test_nest_lock
+
+!dec$ attributes alias:'KMP_SET_STACKSIZE'::kmp_set_stacksize
+!dec$ attributes alias:'KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s
+!dec$ attributes alias:'KMP_SET_BLOCKTIME'::kmp_set_blocktime
+!dec$ attributes alias:'KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial
+!dec$ attributes alias:'KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround
+!dec$ attributes alias:'KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput
+!dec$ attributes alias:'KMP_SET_LIBRARY'::kmp_set_library
+!dec$ attributes alias:'KMP_SET_DEFAULTS'::kmp_set_defaults
+!dec$ attributes alias:'KMP_GET_STACKSIZE'::kmp_get_stacksize
+!dec$ attributes alias:'KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s
+!dec$ attributes alias:'KMP_GET_BLOCKTIME'::kmp_get_blocktime
+!dec$ attributes alias:'KMP_GET_LIBRARY'::kmp_get_library
+!dec$ attributes alias:'KMP_SET_AFFINITY'::kmp_set_affinity
+!dec$ attributes alias:'KMP_GET_AFFINITY'::kmp_get_affinity
+!dec$ attributes alias:'KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask
+!dec$ attributes alias:'KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'KMP_MALLOC'::kmp_malloc
+!dec$ attributes alias:'KMP_CALLOC'::kmp_calloc
+!dec$ attributes alias:'KMP_REALLOC'::kmp_realloc
+!dec$ attributes alias:'KMP_FREE'::kmp_free
+
+!dec$ attributes alias:'KMP_SET_WARNINGS_ON'::kmp_set_warnings_on
+!dec$ attributes alias:'KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off
+
+!dec$   else
+
+!***
+!*** On Windows* OS IA-32 architecture, the Fortran entry points have an underscore prepended.
+!***
+
+!dec$ attributes alias:'_OMP_SET_NUM_THREADS'::omp_set_num_threads
+!dec$ attributes alias:'_OMP_SET_DYNAMIC'::omp_set_dynamic
+!dec$ attributes alias:'_OMP_SET_NESTED'::omp_set_nested
+!dec$ attributes alias:'_OMP_GET_NUM_THREADS'::omp_get_num_threads
+!dec$ attributes alias:'_OMP_GET_MAX_THREADS'::omp_get_max_threads
+!dec$ attributes alias:'_OMP_GET_THREAD_NUM'::omp_get_thread_num
+!dec$ attributes alias:'_OMP_GET_NUM_PROCS'::omp_get_num_procs
+!dec$ attributes alias:'_OMP_IN_PARALLEL'::omp_in_parallel
+!dec$ attributes alias:'_OMP_IN_FINAL'::omp_in_final
+!dec$ attributes alias:'_OMP_GET_DYNAMIC'::omp_get_dynamic
+!dec$ attributes alias:'_OMP_GET_NESTED'::omp_get_nested
+!dec$ attributes alias:'_OMP_GET_THREAD_LIMIT'::omp_get_thread_limit
+!dec$ attributes alias:'_OMP_SET_MAX_ACTIVE_LEVELS'::omp_set_max_active_levels
+!dec$ attributes alias:'_OMP_GET_MAX_ACTIVE_LEVELS'::omp_get_max_active_levels
+!dec$ attributes alias:'_OMP_GET_LEVEL'::omp_get_level
+!dec$ attributes alias:'_OMP_GET_ACTIVE_LEVEL'::omp_get_active_level
+!dec$ attributes alias:'_OMP_GET_ANCESTOR_THREAD_NUM'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'_OMP_GET_TEAM_SIZE'::omp_get_team_size
+!dec$ attributes alias:'_OMP_SET_SCHEDULE'::omp_set_schedule
+!dec$ attributes alias:'_OMP_GET_SCHEDULE'::omp_get_schedule
+!dec$ attributes alias:'_OMP_GET_WTIME'::omp_get_wtime
+!dec$ attributes alias:'_OMP_GET_WTICK'::omp_get_wtick
+
+!dec$ attributes alias:'_omp_init_lock'::omp_init_lock
+!dec$ attributes alias:'_omp_destroy_lock'::omp_destroy_lock
+!dec$ attributes alias:'_omp_set_lock'::omp_set_lock
+!dec$ attributes alias:'_omp_unset_lock'::omp_unset_lock
+!dec$ attributes alias:'_omp_test_lock'::omp_test_lock
+!dec$ attributes alias:'_omp_init_nest_lock'::omp_init_nest_lock
+!dec$ attributes alias:'_omp_destroy_nest_lock'::omp_destroy_nest_lock
+!dec$ attributes alias:'_omp_set_nest_lock'::omp_set_nest_lock
+!dec$ attributes alias:'_omp_unset_nest_lock'::omp_unset_nest_lock
+!dec$ attributes alias:'_omp_test_nest_lock'::omp_test_nest_lock
+
+!dec$ attributes alias:'_KMP_SET_STACKSIZE'::kmp_set_stacksize
+!dec$ attributes alias:'_KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s
+!dec$ attributes alias:'_KMP_SET_BLOCKTIME'::kmp_set_blocktime
+!dec$ attributes alias:'_KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial
+!dec$ attributes alias:'_KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround
+!dec$ attributes alias:'_KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput
+!dec$ attributes alias:'_KMP_SET_LIBRARY'::kmp_set_library
+!dec$ attributes alias:'_KMP_SET_DEFAULTS'::kmp_set_defaults
+!dec$ attributes alias:'_KMP_GET_STACKSIZE'::kmp_get_stacksize
+!dec$ attributes alias:'_KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s
+!dec$ attributes alias:'_KMP_GET_BLOCKTIME'::kmp_get_blocktime
+!dec$ attributes alias:'_KMP_GET_LIBRARY'::kmp_get_library
+!dec$ attributes alias:'_KMP_SET_AFFINITY'::kmp_set_affinity
+!dec$ attributes alias:'_KMP_GET_AFFINITY'::kmp_get_affinity
+!dec$ attributes alias:'_KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'_KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask
+!dec$ attributes alias:'_KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'_KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'_KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'_KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'_KMP_MALLOC'::kmp_malloc
+!dec$ attributes alias:'_KMP_CALLOC'::kmp_calloc
+!dec$ attributes alias:'_KMP_REALLOC'::kmp_realloc
+!dec$ attributes alias:'_KMP_FREE'::kmp_free
+
+!dec$ attributes alias:'_KMP_SET_WARNINGS_ON'::kmp_set_warnings_on
+!dec$ attributes alias:'_KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off
+
+!dec$   endif
+!dec$ endif
+
+!dec$ if defined(__linux)
+
+!***
+!*** The Linux* OS entry points are in lowercase, with an underscore appended.
+!***
+
+!dec$ attributes alias:'omp_set_num_threads_'::omp_set_num_threads
+!dec$ attributes alias:'omp_set_dynamic_'::omp_set_dynamic
+!dec$ attributes alias:'omp_set_nested_'::omp_set_nested
+!dec$ attributes alias:'omp_get_num_threads_'::omp_get_num_threads
+!dec$ attributes alias:'omp_get_max_threads_'::omp_get_max_threads
+!dec$ attributes alias:'omp_get_thread_num_'::omp_get_thread_num
+!dec$ attributes alias:'omp_get_num_procs_'::omp_get_num_procs
+!dec$ attributes alias:'omp_in_parallel_'::omp_in_parallel
+!dec$ attributes alias:'omp_in_final_'::omp_in_final
+!dec$ attributes alias:'omp_get_dynamic_'::omp_get_dynamic
+!dec$ attributes alias:'omp_get_nested_'::omp_get_nested
+!dec$ attributes alias:'omp_get_thread_limit_'::omp_get_thread_limit
+!dec$ attributes alias:'omp_set_max_active_levels_'::omp_set_max_active_levels
+!dec$ attributes alias:'omp_get_max_active_levels_'::omp_get_max_active_levels
+!dec$ attributes alias:'omp_get_level_'::omp_get_level
+!dec$ attributes alias:'omp_get_active_level_'::omp_get_active_level
+!dec$ attributes alias:'omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'omp_get_team_size_'::omp_get_team_size
+!dec$ attributes alias:'omp_set_schedule_'::omp_set_schedule
+!dec$ attributes alias:'omp_get_schedule_'::omp_get_schedule
+!dec$ attributes alias:'omp_get_wtime_'::omp_get_wtime
+!dec$ attributes alias:'omp_get_wtick_'::omp_get_wtick
+
+!dec$ attributes alias:'omp_init_lock_'::omp_init_lock
+!dec$ attributes alias:'omp_destroy_lock_'::omp_destroy_lock
+!dec$ attributes alias:'omp_set_lock_'::omp_set_lock
+!dec$ attributes alias:'omp_unset_lock_'::omp_unset_lock
+!dec$ attributes alias:'omp_test_lock_'::omp_test_lock
+!dec$ attributes alias:'omp_init_nest_lock_'::omp_init_nest_lock
+!dec$ attributes alias:'omp_destroy_nest_lock_'::omp_destroy_nest_lock
+!dec$ attributes alias:'omp_set_nest_lock_'::omp_set_nest_lock
+!dec$ attributes alias:'omp_unset_nest_lock_'::omp_unset_nest_lock
+!dec$ attributes alias:'omp_test_nest_lock_'::omp_test_nest_lock
+
+!dec$ attributes alias:'kmp_set_stacksize_'::kmp_set_stacksize
+!dec$ attributes alias:'kmp_set_stacksize_s_'::kmp_set_stacksize_s
+!dec$ attributes alias:'kmp_set_blocktime_'::kmp_set_blocktime
+!dec$ attributes alias:'kmp_set_library_serial_'::kmp_set_library_serial
+!dec$ attributes alias:'kmp_set_library_turnaround_'::kmp_set_library_turnaround
+!dec$ attributes alias:'kmp_set_library_throughput_'::kmp_set_library_throughput
+!dec$ attributes alias:'kmp_set_library_'::kmp_set_library
+!dec$ attributes alias:'kmp_set_defaults_'::kmp_set_defaults
+!dec$ attributes alias:'kmp_get_stacksize_'::kmp_get_stacksize
+!dec$ attributes alias:'kmp_get_stacksize_s_'::kmp_get_stacksize_s
+!dec$ attributes alias:'kmp_get_blocktime_'::kmp_get_blocktime
+!dec$ attributes alias:'kmp_get_library_'::kmp_get_library
+!dec$ attributes alias:'kmp_set_affinity_'::kmp_set_affinity
+!dec$ attributes alias:'kmp_get_affinity_'::kmp_get_affinity
+!dec$ attributes alias:'kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'kmp_create_affinity_mask_'::kmp_create_affinity_mask
+!dec$ attributes alias:'kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'kmp_malloc_'::kmp_malloc
+!dec$ attributes alias:'kmp_calloc_'::kmp_calloc
+!dec$ attributes alias:'kmp_realloc_'::kmp_realloc
+!dec$ attributes alias:'kmp_free_'::kmp_free
+
+!dec$ attributes alias:'kmp_set_warnings_on_'::kmp_set_warnings_on
+!dec$ attributes alias:'kmp_set_warnings_off_'::kmp_set_warnings_off
+
+!dec$ endif
+
+!dec$ if defined(__APPLE__)
+
+!***
+!*** The Mac entry points are in lowercase, with both an underscore
+!*** prepended and an underscore appended.
+!***
+
+!dec$ attributes alias:'_omp_set_num_threads_'::omp_set_num_threads
+!dec$ attributes alias:'_omp_set_dynamic_'::omp_set_dynamic
+!dec$ attributes alias:'_omp_set_nested_'::omp_set_nested
+!dec$ attributes alias:'_omp_get_num_threads_'::omp_get_num_threads
+!dec$ attributes alias:'_omp_get_max_threads_'::omp_get_max_threads
+!dec$ attributes alias:'_omp_get_thread_num_'::omp_get_thread_num
+!dec$ attributes alias:'_omp_get_num_procs_'::omp_get_num_procs
+!dec$ attributes alias:'_omp_in_parallel_'::omp_in_parallel
+!dec$ attributes alias:'_omp_in_final_'::omp_in_final
+!dec$ attributes alias:'_omp_get_dynamic_'::omp_get_dynamic
+!dec$ attributes alias:'_omp_get_nested_'::omp_get_nested
+!dec$ attributes alias:'_omp_get_thread_limit_'::omp_get_thread_limit
+!dec$ attributes alias:'_omp_set_max_active_levels_'::omp_set_max_active_levels
+!dec$ attributes alias:'_omp_get_max_active_levels_'::omp_get_max_active_levels
+!dec$ attributes alias:'_omp_get_level_'::omp_get_level
+!dec$ attributes alias:'_omp_get_active_level_'::omp_get_active_level
+!dec$ attributes alias:'_omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'_omp_get_team_size_'::omp_get_team_size
+!dec$ attributes alias:'_omp_set_schedule_'::omp_set_schedule
+!dec$ attributes alias:'_omp_get_schedule_'::omp_get_schedule
+!dec$ attributes alias:'_omp_get_wtime_'::omp_get_wtime
+!dec$ attributes alias:'_omp_get_wtick_'::omp_get_wtick
+
+!dec$ attributes alias:'_omp_init_lock_'::omp_init_lock
+!dec$ attributes alias:'_omp_destroy_lock_'::omp_destroy_lock
+!dec$ attributes alias:'_omp_set_lock_'::omp_set_lock
+!dec$ attributes alias:'_omp_unset_lock_'::omp_unset_lock
+!dec$ attributes alias:'_omp_test_lock_'::omp_test_lock
+!dec$ attributes alias:'_omp_init_nest_lock_'::omp_init_nest_lock
+!dec$ attributes alias:'_omp_destroy_nest_lock_'::omp_destroy_nest_lock
+!dec$ attributes alias:'_omp_set_nest_lock_'::omp_set_nest_lock
+!dec$ attributes alias:'_omp_unset_nest_lock_'::omp_unset_nest_lock
+!dec$ attributes alias:'_omp_test_nest_lock_'::omp_test_nest_lock
+
+!dec$ attributes alias:'_kmp_set_stacksize_'::kmp_set_stacksize
+!dec$ attributes alias:'_kmp_set_stacksize_s_'::kmp_set_stacksize_s
+!dec$ attributes alias:'_kmp_set_blocktime_'::kmp_set_blocktime
+!dec$ attributes alias:'_kmp_set_library_serial_'::kmp_set_library_serial
+!dec$ attributes alias:'_kmp_set_library_turnaround_'::kmp_set_library_turnaround
+!dec$ attributes alias:'_kmp_set_library_throughput_'::kmp_set_library_throughput
+!dec$ attributes alias:'_kmp_set_library_'::kmp_set_library
+!dec$ attributes alias:'_kmp_set_defaults_'::kmp_set_defaults
+!dec$ attributes alias:'_kmp_get_stacksize_'::kmp_get_stacksize
+!dec$ attributes alias:'_kmp_get_stacksize_s_'::kmp_get_stacksize_s
+!dec$ attributes alias:'_kmp_get_blocktime_'::kmp_get_blocktime
+!dec$ attributes alias:'_kmp_get_library_'::kmp_get_library
+!dec$ attributes alias:'_kmp_set_affinity_'::kmp_set_affinity
+!dec$ attributes alias:'_kmp_get_affinity_'::kmp_get_affinity
+!dec$ attributes alias:'_kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'_kmp_create_affinity_mask_'::kmp_create_affinity_mask
+!dec$ attributes alias:'_kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'_kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'_kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'_kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'_kmp_malloc_'::kmp_malloc
+!dec$ attributes alias:'_kmp_calloc_'::kmp_calloc
+!dec$ attributes alias:'_kmp_realloc_'::kmp_realloc
+!dec$ attributes alias:'_kmp_free_'::kmp_free
+
+!dec$ attributes alias:'_kmp_set_warnings_on_'::kmp_set_warnings_on
+!dec$ attributes alias:'_kmp_set_warnings_off_'::kmp_set_warnings_off
+
+!dec$ endif
+
+
diff --git a/final/runtime/src/include/30/ompt.h.var b/final/runtime/src/include/30/ompt.h.var
new file mode 100644
index 0000000..3d3b537
--- /dev/null
+++ b/final/runtime/src/include/30/ompt.h.var
@@ -0,0 +1,472 @@
+/*
+ * include/30/ompt.h.var
+ */
+
+#ifndef __OMPT__
+#define __OMPT__
+
+/*****************************************************************************
+ * system include files
+ *****************************************************************************/
+
+#include <stdint.h>
+
+
+
+/*****************************************************************************
+ * iteration macros
+ *****************************************************************************/
+
+#define FOREACH_OMPT_INQUIRY_FN(macro)  \
+    macro (ompt_enumerate_state)        \
+                                        \
+    macro (ompt_set_callback)           \
+    macro (ompt_get_callback)           \
+                                        \
+    macro (ompt_get_idle_frame)         \
+    macro (ompt_get_task_frame)         \
+                                        \
+    macro (ompt_get_state)              \
+                                        \
+    macro (ompt_get_parallel_id)        \
+    macro (ompt_get_parallel_team_size) \
+    macro (ompt_get_task_id)            \
+    macro (ompt_get_thread_id)
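+
+/* A minimal sketch (hypothetical tool-side code, not part of this interface):
+ * the FOREACH_* lists are X-macros, so a tool can expand them to generate one
+ * declaration per entry point, e.g.
+ *
+ *     #define ompt_fn_ptr(fn) static OMPT_API_FNTYPE(fn) fn ## _ptr;
+ *     FOREACH_OMPT_INQUIRY_FN(ompt_fn_ptr)
+ *     #undef ompt_fn_ptr
+ *
+ * which declares ompt_enumerate_state_ptr, ompt_set_callback_ptr, and so on,
+ * using the <fn>_t function-pointer typedefs defined later in this file.
+ */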
+
+#define FOREACH_OMPT_PLACEHOLDER_FN(macro)  \
+    macro (ompt_idle)                       \
+    macro (ompt_overhead)                   \
+    macro (ompt_barrier_wait)               \
+    macro (ompt_task_wait)                  \
+    macro (ompt_mutex_wait)
+
+#define FOREACH_OMPT_STATE(macro)                                                               \
+                                                                                                \
+    /* first */                                                                                 \
+    macro (ompt_state_first, 0x71)          /* initial enumeration state */                     \
+                                                                                                \
+    /* work states (0..15) */                                                                   \
+    macro (ompt_state_work_serial, 0x00)    /* working outside parallel */                      \
+    macro (ompt_state_work_parallel, 0x01)  /* working within parallel */                       \
+    macro (ompt_state_work_reduction, 0x02) /* performing a reduction */                        \
+                                                                                                \
+    /* idle (16..31) */                                                                         \
+    macro (ompt_state_idle, 0x10)            /* waiting for work */                             \
+                                                                                                \
+    /* overhead states (32..63) */                                                              \
+    macro (ompt_state_overhead, 0x20)        /* overhead excluding wait states */               \
+                                                                                                \
+    /* barrier wait states (64..79) */                                                          \
+    macro (ompt_state_wait_barrier, 0x40)    /* waiting at a barrier */                         \
+    macro (ompt_state_wait_barrier_implicit, 0x41)    /* implicit barrier */                    \
+    macro (ompt_state_wait_barrier_explicit, 0x42)    /* explicit barrier */                    \
+                                                                                                \
+    /* task wait states (80..95) */                                                             \
+    macro (ompt_state_wait_taskwait, 0x50)   /* waiting at a taskwait */                        \
+    macro (ompt_state_wait_taskgroup, 0x51)  /* waiting at a taskgroup */                       \
+                                                                                                \
+    /* mutex wait states (96..111) */                                                           \
+    macro (ompt_state_wait_lock, 0x60)       /* waiting for lock */                             \
+    macro (ompt_state_wait_nest_lock, 0x61)  /* waiting for nest lock */                        \
+    macro (ompt_state_wait_critical, 0x62)   /* waiting for critical */                         \
+    macro (ompt_state_wait_atomic, 0x63)     /* waiting for atomic */                           \
+    macro (ompt_state_wait_ordered, 0x64)    /* waiting for ordered */                          \
+    macro (ompt_state_wait_single, 0x6F)     /* waiting for single region (non-standard!) */    \
+                                                                                                \
+    /* misc (112..127) */                                                                       \
+    macro (ompt_state_undefined, 0x70)       /* undefined thread state */
+
+
+#define FOREACH_OMPT_EVENT(macro)                                                                               \
+                                                                                                                \
+    /*--- Mandatory Events ---*/                                                                                \
+    macro (ompt_event_parallel_begin,           ompt_new_parallel_callback_t,   1) /* parallel begin */         \
+    macro (ompt_event_parallel_end,             ompt_parallel_callback_t,       2) /* parallel end */           \
+                                                                                                                \
+    macro (ompt_event_task_begin,               ompt_new_task_callback_t,       3) /* task begin */             \
+    macro (ompt_event_task_end,                 ompt_task_callback_t,           4) /* task destroy */           \
+                                                                                                                \
+    macro (ompt_event_thread_begin,             ompt_thread_type_callback_t,    5) /* thread begin */           \
+    macro (ompt_event_thread_end,               ompt_thread_type_callback_t,    6) /* thread end */             \
+                                                                                                                \
+    macro (ompt_event_control,                  ompt_control_callback_t,        7) /* support control calls */  \
+                                                                                                                \
+    macro (ompt_event_runtime_shutdown,         ompt_callback_t,                8) /* runtime shutdown */       \
+                                                                                                                \
+    /*--- Optional Events (blame shifting, ompt_event_unimplemented) ---*/                                      \
+    macro (ompt_event_idle_begin,               ompt_thread_callback_t,         9) /* begin idle state */       \
+    macro (ompt_event_idle_end,                 ompt_thread_callback_t,        10) /* end idle state */         \
+                                                                                                                \
+    macro (ompt_event_wait_barrier_begin,       ompt_parallel_callback_t,      11) /* begin wait at barrier */  \
+    macro (ompt_event_wait_barrier_end,         ompt_parallel_callback_t,      12) /* end wait at barrier */    \
+                                                                                                                \
+    macro (ompt_event_wait_taskwait_begin,      ompt_parallel_callback_t,      13) /* begin wait at taskwait */ \
+    macro (ompt_event_wait_taskwait_end,        ompt_parallel_callback_t,      14) /* end wait at taskwait */   \
+                                                                                                                \
+    macro (ompt_event_wait_taskgroup_begin,     ompt_parallel_callback_t,      15) /* begin wait at taskgroup */\
+    macro (ompt_event_wait_taskgroup_end,       ompt_parallel_callback_t,      16) /* end wait at taskgroup */  \
+                                                                                                                \
+    macro (ompt_event_release_lock,             ompt_wait_callback_t,          17) /* lock release */           \
+    macro (ompt_event_release_nest_lock_last,   ompt_wait_callback_t,          18) /* last nest lock release */ \
+    macro (ompt_event_release_critical,         ompt_wait_callback_t,          19) /* critical release */       \
+                                                                                                                \
+    macro (ompt_event_release_atomic,           ompt_wait_callback_t,          20) /* atomic release */         \
+                                                                                                                \
+    macro (ompt_event_release_ordered,          ompt_wait_callback_t,          21) /* ordered release */        \
+                                                                                                                \
+    /*--- Optional Events (synchronous events, ompt_event_unimplemented) ---*/                                  \
+    macro (ompt_event_implicit_task_begin,      ompt_parallel_callback_t,      22) /* implicit task begin   */  \
+    macro (ompt_event_implicit_task_end,        ompt_parallel_callback_t,      23) /* implicit task end  */     \
+                                                                                                                \
+    macro (ompt_event_initial_task_begin,       ompt_parallel_callback_t,      24) /* initial task begin   */   \
+    macro (ompt_event_initial_task_end,         ompt_parallel_callback_t,      25) /* initial task end  */      \
+                                                                                                                \
+    macro (ompt_event_task_switch,              ompt_task_switch_callback_t,   26) /* task switch */            \
+                                                                                                                \
+    macro (ompt_event_loop_begin,               ompt_new_workshare_callback_t, 27) /* task at loop begin */     \
+    macro (ompt_event_loop_end,                 ompt_parallel_callback_t,      28) /* task at loop end */       \
+                                                                                                                \
+    macro (ompt_event_sections_begin,           ompt_new_workshare_callback_t, 29) /* task at sections begin  */\
+    macro (ompt_event_sections_end,             ompt_parallel_callback_t,      30) /* task at sections end */   \
+                                                                                                                \
+    macro (ompt_event_single_in_block_begin,    ompt_new_workshare_callback_t, 31) /* task at single begin*/    \
+    macro (ompt_event_single_in_block_end,      ompt_parallel_callback_t,      32) /* task at single end */     \
+                                                                                                                \
+    macro (ompt_event_single_others_begin,      ompt_parallel_callback_t,      33) /* task at single begin */   \
+    macro (ompt_event_single_others_end,        ompt_parallel_callback_t,      34) /* task at single end */     \
+                                                                                                                \
+    macro (ompt_event_workshare_begin,          ompt_new_workshare_callback_t, 35) /* task at workshare begin */\
+    macro (ompt_event_workshare_end,            ompt_parallel_callback_t,      36) /* task at workshare end */  \
+                                                                                                                \
+    macro (ompt_event_master_begin,             ompt_parallel_callback_t,      37) /* task at master begin */   \
+    macro (ompt_event_master_end,               ompt_parallel_callback_t,      38) /* task at master end */     \
+                                                                                                                \
+    macro (ompt_event_barrier_begin,            ompt_parallel_callback_t,      39) /* task at barrier begin  */ \
+    macro (ompt_event_barrier_end,              ompt_parallel_callback_t,      40) /* task at barrier end */    \
+                                                                                                                \
+    macro (ompt_event_taskwait_begin,           ompt_parallel_callback_t,      41) /* task at taskwait begin */ \
+    macro (ompt_event_taskwait_end,             ompt_parallel_callback_t,      42) /* task at taskwait end */   \
+                                                                                                                \
+    macro (ompt_event_taskgroup_begin,          ompt_parallel_callback_t,      43) /* task at taskgroup begin */\
+    macro (ompt_event_taskgroup_end,            ompt_parallel_callback_t,      44) /* task at taskgroup end */  \
+                                                                                                                \
+    macro (ompt_event_release_nest_lock_prev,   ompt_wait_callback_t,          45) /* prev nest lock release */ \
+                                                                                                                \
+    macro (ompt_event_wait_lock,                ompt_wait_callback_t,          46) /* lock wait */              \
+    macro (ompt_event_wait_nest_lock,           ompt_wait_callback_t,          47) /* nest lock wait */         \
+    macro (ompt_event_wait_critical,            ompt_wait_callback_t,          48) /* critical wait */          \
+    macro (ompt_event_wait_atomic,              ompt_wait_callback_t,          49) /* atomic wait */            \
+    macro (ompt_event_wait_ordered,             ompt_wait_callback_t,          50) /* ordered wait */           \
+                                                                                                                \
+    macro (ompt_event_acquired_lock,            ompt_wait_callback_t,          51) /* lock acquired */          \
+    macro (ompt_event_acquired_nest_lock_first, ompt_wait_callback_t,          52) /* 1st nest lock acquired */ \
+    macro (ompt_event_acquired_nest_lock_next,  ompt_wait_callback_t,          53) /* next nest lock acquired*/ \
+    macro (ompt_event_acquired_critical,        ompt_wait_callback_t,          54) /* critical acquired */      \
+    macro (ompt_event_acquired_atomic,          ompt_wait_callback_t,          55) /* atomic acquired */        \
+    macro (ompt_event_acquired_ordered,         ompt_wait_callback_t,          56) /* ordered acquired */       \
+                                                                                                                \
+    macro (ompt_event_init_lock,                ompt_wait_callback_t,          57) /* lock init */              \
+    macro (ompt_event_init_nest_lock,           ompt_wait_callback_t,          58) /* nest lock init */         \
+                                                                                                                \
+    macro (ompt_event_destroy_lock,             ompt_wait_callback_t,          59) /* lock destruction */       \
+    macro (ompt_event_destroy_nest_lock,        ompt_wait_callback_t,          60) /* nest lock destruction */  \
+                                                                                                                \
+    macro (ompt_event_flush,                    ompt_callback_t,               61) /* after executing flush */
+
+
+
+/*****************************************************************************
+ * data types
+ *****************************************************************************/
+
+/*---------------------
+ * identifiers
+ *---------------------*/
+
+typedef uint64_t ompt_thread_id_t;
+#define ompt_thread_id_none ((ompt_thread_id_t) 0)     /* non-standard */
+
+typedef uint64_t ompt_task_id_t;
+#define ompt_task_id_none ((ompt_task_id_t) 0)         /* non-standard */
+
+typedef uint64_t ompt_parallel_id_t;
+#define ompt_parallel_id_none ((ompt_parallel_id_t) 0) /* non-standard */
+
+typedef uint64_t ompt_wait_id_t;
+#define ompt_wait_id_none ((ompt_wait_id_t) 0)         /* non-standard */
+
+
+/*---------------------
+ * ompt_frame_t
+ *---------------------*/
+
+typedef struct ompt_frame_s {
+    void *exit_runtime_frame;    /* next frame is user code     */
+    void *reenter_runtime_frame; /* previous frame is user code */
+} ompt_frame_t;
+
+
+/*****************************************************************************
+ * enumerations for thread states and runtime events
+ *****************************************************************************/
+
+/*---------------------
+ * runtime states
+ *---------------------*/
+
+typedef enum {
+#define ompt_state_macro(state, code) state = code,
+    FOREACH_OMPT_STATE(ompt_state_macro)
+#undef ompt_state_macro
+} ompt_state_t;
+
+
+/*---------------------
+ * runtime events
+ *---------------------*/
+
+typedef enum {
+#define ompt_event_macro(event, callback, eventid) event = eventid,
+    FOREACH_OMPT_EVENT(ompt_event_macro)
+#undef ompt_event_macro
+} ompt_event_t;
+
+
+/*---------------------
+ * set callback results
+ *---------------------*/
+typedef enum {
+    ompt_set_result_registration_error              = 0,
+    ompt_set_result_event_may_occur_no_callback     = 1,
+    ompt_set_result_event_never_occurs              = 2,
+    ompt_set_result_event_may_occur_callback_some   = 3,
+    ompt_set_result_event_may_occur_callback_always = 4,
+} ompt_set_result_t;
+
+
+
+/*****************************************************************************
+ * callback signatures
+ *****************************************************************************/
+
+/* initialization */
+typedef void (*ompt_interface_fn_t)(void);
+
+typedef ompt_interface_fn_t (*ompt_function_lookup_t)(
+    const char *                      /* entry point to look up       */
+);
+
+/* threads */
+typedef void (*ompt_thread_callback_t) (
+    ompt_thread_id_t thread_id        /* ID of thread                 */
+);
+
+typedef enum {
+    ompt_thread_initial = 1, // start the enumeration at 1
+    ompt_thread_worker  = 2,
+    ompt_thread_other   = 3
+} ompt_thread_type_t;
+
+typedef void (*ompt_thread_type_callback_t) (
+    ompt_thread_type_t thread_type,   /* type of thread               */
+    ompt_thread_id_t thread_id        /* ID of thread                 */
+);
+
+typedef void (*ompt_wait_callback_t) (
+    ompt_wait_id_t wait_id            /* wait id                      */
+);
+
+/* parallel and workshares */
+typedef void (*ompt_parallel_callback_t) (
+    ompt_parallel_id_t parallel_id,    /* id of parallel region       */
+    ompt_task_id_t task_id             /* id of task                  */
+);
+
+typedef void (*ompt_new_workshare_callback_t) (
+    ompt_parallel_id_t parallel_id,   /* id of parallel region        */
+    ompt_task_id_t parent_task_id,    /* id of parent task            */
+    void *workshare_function          /* pointer to outlined function */
+);
+
+typedef void (*ompt_new_parallel_callback_t) (
+    ompt_task_id_t parent_task_id,    /* id of parent task            */
+    ompt_frame_t *parent_task_frame,  /* frame data of parent task    */
+    ompt_parallel_id_t parallel_id,   /* id of parallel region        */
+    uint32_t requested_team_size,     /* number of threads in team    */
+    void *parallel_function           /* pointer to outlined function */
+);
+
+/* tasks */
+typedef void (*ompt_task_callback_t) (
+    ompt_task_id_t task_id            /* id of task                   */
+);
+
+typedef void (*ompt_task_switch_callback_t) (
+    ompt_task_id_t suspended_task_id, /* id of suspended task         */
+    ompt_task_id_t resumed_task_id    /* id of resumed task           */
+);
+
+typedef void (*ompt_new_task_callback_t) (
+    ompt_task_id_t parent_task_id,    /* id of parent task            */
+    ompt_frame_t *parent_task_frame,  /* frame data for parent task   */
+    ompt_task_id_t  new_task_id,      /* id of created task           */
+    void *task_function               /* pointer to outlined function */
+);
+
+/* program */
+typedef void (*ompt_control_callback_t) (
+    uint64_t command,                 /* command of control call      */
+    uint64_t modifier                 /* modifier of control call     */
+);
+
+typedef void (*ompt_callback_t)(void);
+
+
+/****************************************************************************
+ * ompt API
+ ***************************************************************************/
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define OMPT_API_FNTYPE(fn) fn##_t
+
+#define OMPT_API_FUNCTION(return_type, fn, args)  \
+    typedef return_type (*OMPT_API_FNTYPE(fn)) args
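+
+/* For instance, OMPT_API_FUNCTION(ompt_state_t, ompt_get_state, (...)) below
+ * expands to
+ *
+ *     typedef ompt_state_t (*ompt_get_state_t)(...);
+ *
+ * so every entry point declared this way gets a <fn>_t typedef suitable for
+ * use with ompt_function_lookup_t.
+ */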
+
+
+
+/****************************************************************************
+ * INQUIRY FUNCTIONS
+ ***************************************************************************/
+
+/* state */
+OMPT_API_FUNCTION(ompt_state_t, ompt_get_state, (
+    ompt_wait_id_t *ompt_wait_id
+));
+
+/* thread */
+OMPT_API_FUNCTION(ompt_thread_id_t, ompt_get_thread_id, (void));
+
+OMPT_API_FUNCTION(void *, ompt_get_idle_frame, (void));
+
+/* parallel region */
+OMPT_API_FUNCTION(ompt_parallel_id_t, ompt_get_parallel_id, (
+    int ancestor_level
+));
+
+OMPT_API_FUNCTION(int, ompt_get_parallel_team_size, (
+    int ancestor_level
+));
+
+/* task */
+OMPT_API_FUNCTION(ompt_task_id_t, ompt_get_task_id, (
+    int depth
+));
+
+OMPT_API_FUNCTION(ompt_frame_t *, ompt_get_task_frame, (
+    int depth
+));
+
+
+
+/****************************************************************************
+ * PLACEHOLDERS FOR PERFORMANCE REPORTING
+ ***************************************************************************/
+
+/* idle */
+OMPT_API_FUNCTION(void, ompt_idle, (
+    void
+));
+
+/* overhead */
+OMPT_API_FUNCTION(void, ompt_overhead, (
+    void
+));
+
+/* barrier wait */
+OMPT_API_FUNCTION(void, ompt_barrier_wait, (
+    void
+));
+
+/* task wait */
+OMPT_API_FUNCTION(void, ompt_task_wait, (
+    void
+));
+
+/* mutex wait */
+OMPT_API_FUNCTION(void, ompt_mutex_wait, (
+    void
+));
+
+
+
+/****************************************************************************
+ * INITIALIZATION FUNCTIONS
+ ***************************************************************************/
+
+/* initialization interface to be defined by tool */
+int ompt_initialize(
+    ompt_function_lookup_t ompt_fn_lookup,
+    const char *runtime_version,
+    unsigned int ompt_version
+);
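+
+/* A minimal sketch of a tool (hypothetical code; the my_* names are
+ * illustrative, not part of this interface). The runtime invokes
+ * ompt_initialize, and the tool binds entry points via the lookup function:
+ *
+ *     static void my_thread_begin(ompt_thread_type_t type,
+ *                                 ompt_thread_id_t id) { ... }
+ *
+ *     int ompt_initialize(ompt_function_lookup_t lookup,
+ *                         const char *runtime_version,
+ *                         unsigned int ompt_version)
+ *     {
+ *         ompt_set_callback_t set_cb =
+ *             (ompt_set_callback_t) lookup("ompt_set_callback");
+ *         set_cb(ompt_event_thread_begin, (ompt_callback_t) my_thread_begin);
+ *         return 1;    (nonzero indicates a tool is present)
+ *     }
+ */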
+
+typedef enum ompt_init_mode_e {
+    ompt_init_mode_never  = 0,
+    ompt_init_mode_false  = 1,
+    ompt_init_mode_true   = 2,
+    ompt_init_mode_always = 3
+} ompt_init_mode_t;
+
+OMPT_API_FUNCTION(int, ompt_set_callback, (
+    ompt_event_t event,
+    ompt_callback_t callback
+));
+
+typedef enum ompt_set_callback_rc_e {  /* non-standard */
+    ompt_set_callback_error      = 0,
+    ompt_has_event_no_callback   = 1,
+    ompt_no_event_no_callback    = 2,
+    ompt_has_event_may_callback  = 3,
+    ompt_has_event_must_callback = 4,
+} ompt_set_callback_rc_t;
+
+
+OMPT_API_FUNCTION(int, ompt_get_callback, (
+    ompt_event_t event,
+    ompt_callback_t *callback
+));
+
+
+
+/****************************************************************************
+ * MISCELLANEOUS FUNCTIONS
+ ***************************************************************************/
+
+/* control */
+#if defined(_OPENMP) && (_OPENMP >= 201307)
+#pragma omp declare target
+#endif
+void ompt_control(
+    uint64_t command,
+    uint64_t modifier
+);
+#if defined(_OPENMP) && (_OPENMP >= 201307)
+#pragma omp end declare target
+#endif
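+
+/* A minimal sketch (user code): an application can pass information to an
+ * attached tool at runtime, e.g.
+ *
+ *     ompt_control(1, 0);
+ *
+ * where the meaning of command and modifier is agreed between application
+ * and tool.
+ */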
+
+/* state enumeration */
+OMPT_API_FUNCTION(int, ompt_enumerate_state, (
+    int current_state,
+    int *next_state,
+    const char **next_state_name
+));
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/final/runtime/src/include/40/iomp.h.var b/final/runtime/src/include/40/iomp.h.var
new file mode 100644
index 0000000..860c18e
--- /dev/null
+++ b/final/runtime/src/include/40/iomp.h.var
@@ -0,0 +1,106 @@
+/*
+ * include/40/iomp.h.var
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef __IOMP_H
+#   define __IOMP_H
+
+#   define KMP_VERSION_MAJOR    $KMP_VERSION_MAJOR
+#   define KMP_VERSION_MINOR    $KMP_VERSION_MINOR
+#   define KMP_VERSION_BUILD    $KMP_VERSION_BUILD
+#   define KMP_BUILD_DATE       "$KMP_BUILD_DATE"
+
+#   ifdef __cplusplus
+        extern "C" {
+#   endif
+
+#       define kmp_set_stacksize            kmpc_set_stacksize
+#       define kmp_set_stacksize_s          kmpc_set_stacksize_s
+#       define kmp_set_blocktime            kmpc_set_blocktime
+#       define kmp_set_library              kmpc_set_library
+#       define kmp_set_defaults             kmpc_set_defaults
+#       define kmp_set_affinity_mask_proc   kmpc_set_affinity_mask_proc
+#       define kmp_unset_affinity_mask_proc kmpc_unset_affinity_mask_proc
+#       define kmp_get_affinity_mask_proc   kmpc_get_affinity_mask_proc
+
+#       define kmp_malloc                   kmpc_malloc
+#       define kmp_calloc                   kmpc_calloc
+#       define kmp_realloc                  kmpc_realloc
+#       define kmp_free                     kmpc_free
+
+#   if defined(_WIN32)
+#       define __KAI_KMPC_CONVENTION __cdecl
+#   else
+#       define __KAI_KMPC_CONVENTION
+#   endif
+
+#   include <stdlib.h>
+    /* kmp API functions */
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_stacksize          (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize          (int);
+    extern size_t __KAI_KMPC_CONVENTION  kmp_get_stacksize_s        (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize_s        (size_t);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_blocktime          (void);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_library            (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_blocktime          (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library            (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_serial     (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_turnaround (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_throughput (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_defaults           (char const *);
+
+    /* affinity API functions */
+    typedef void * kmp_affinity_mask_t;
+
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_max_proc    (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_create_affinity_mask     (kmp_affinity_mask_t *);
+    extern void   __KAI_KMPC_CONVENTION  kmp_destroy_affinity_mask    (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_unset_affinity_mask_proc (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+
+    extern void * __KAI_KMPC_CONVENTION  kmp_malloc  (size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_calloc  (size_t, size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_realloc (void *, size_t);
+    extern void   __KAI_KMPC_CONVENTION  kmp_free    (void *);
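+
+    /* A minimal sketch (user code): the kmp_* allocation entry points mirror
+     * the C library allocator, e.g.
+     *
+     *     double *buf = (double *) kmp_malloc(n * sizeof(double));
+     *     ...
+     *     kmp_free(buf);
+     */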
+
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_on(void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_off(void);
+
+    /* cancellation kind constants */
+    typedef enum kmp_cancel_kind_t {
+        kmp_cancel_parallel  = 1,
+        kmp_cancel_loop      = 2,
+        kmp_cancel_sections  = 3,
+        kmp_cancel_taskgroup = 4
+    } kmp_cancel_kind_t;
+
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_cancellation_status(kmp_cancel_kind_t);
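+
+    /* A minimal sketch (user code): querying whether cancellation is active
+     * for a given construct, e.g.
+     *
+     *     if (kmp_get_cancellation_status(kmp_cancel_loop)) { ... }
+     */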
+
+#   undef __KAI_KMPC_CONVENTION
+
+    /* Warning:
+       The following typedefs are not standard; they are deprecated and will
+       be removed in a future release.
+    */
+    typedef int     omp_int_t;
+    typedef double  omp_wtime_t;
+
+#   ifdef __cplusplus
+        }
+#   endif
+
+#endif /* __IOMP_H */
+
diff --git a/final/runtime/src/include/40/iomp_lib.h.var b/final/runtime/src/include/40/iomp_lib.h.var
new file mode 100644
index 0000000..1804a31
--- /dev/null
+++ b/final/runtime/src/include/40/iomp_lib.h.var
@@ -0,0 +1,81 @@
+! include/40/iomp_lib.h.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+!***
+!*** omp_integer_kind and omp_logical_kind appear to be predefined by gcc and
+!*** gfortran (definitions do not appear in omp.h / omp_lib.h / omp_lib.f).
+!*** omp_real_kind is not predefined, however.
+!***
+
+        integer, parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+        integer, parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+        integer, parameter :: kmp_version_build = $KMP_VERSION_BUILD
+        character(*)          kmp_build_date
+        parameter( kmp_build_date = '$KMP_BUILD_DATE' )
+
+        integer, parameter :: omp_real_kind = 4
+
+!***
+!*** kmp_* type extensions
+!***
+
+        integer, parameter :: kmp_pointer_kind       = $KMP_INT_PTR_KIND
+        integer, parameter :: kmp_size_t_kind        = $KMP_INT_PTR_KIND
+        integer, parameter :: kmp_affinity_mask_kind = $KMP_INT_PTR_KIND
+
+!***
+!*** kmp_* entry points
+!***
+
+        external kmp_set_stacksize
+        external kmp_set_stacksize_s
+        external kmp_set_blocktime
+        external kmp_set_library_serial
+        external kmp_set_library_turnaround
+        external kmp_set_library_throughput
+        external kmp_set_library
+        external kmp_set_defaults
+        external kmp_get_stacksize
+        integer kmp_get_stacksize
+        external kmp_get_stacksize_s
+        integer (kind = kmp_size_t_kind) kmp_get_stacksize_s
+        external kmp_get_blocktime
+        integer kmp_get_blocktime
+        external kmp_get_library
+        integer kmp_get_library
+        external kmp_set_affinity
+        integer kmp_set_affinity
+        external kmp_get_affinity
+        integer kmp_get_affinity
+        external kmp_get_affinity_max_proc
+        integer kmp_get_affinity_max_proc
+        external kmp_create_affinity_mask
+        external kmp_destroy_affinity_mask
+        external kmp_set_affinity_mask_proc
+        integer kmp_set_affinity_mask_proc
+        external kmp_unset_affinity_mask_proc
+        integer kmp_unset_affinity_mask_proc
+        external kmp_get_affinity_mask_proc
+        integer kmp_get_affinity_mask_proc
+        external kmp_malloc
+        integer (kind = kmp_pointer_kind) kmp_malloc
+        external kmp_calloc
+        integer (kind = kmp_pointer_kind) kmp_calloc
+        external kmp_realloc
+        integer (kind = kmp_pointer_kind) kmp_realloc
+        external kmp_free
+
+        external kmp_set_warnings_on
+        external kmp_set_warnings_off
+
+
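+!***
+!*** Usage sketch: the kmp_malloc/kmp_free pair declared above allocates
+!*** through the runtime's allocator.  Illustrative only; the 1024-byte
+!*** size is arbitrary, and the include name iomp_lib.h (the generated
+!*** form of this file) is an assumption.
+!***
+!***       include 'iomp_lib.h'
+!***       integer (kind=kmp_pointer_kind) p
+!***       integer (kind=kmp_size_t_kind) n
+!***       n = 1024
+!***       p = kmp_malloc(n)
+!***       call kmp_free(p)
+!***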
diff --git a/final/runtime/src/include/40/omp.h.var b/final/runtime/src/include/40/omp.h.var
new file mode 100644
index 0000000..0fd3d65
--- /dev/null
+++ b/final/runtime/src/include/40/omp.h.var
@@ -0,0 +1,174 @@
+/*
+ * include/40/omp.h.var
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef __OMP_H
+#   define __OMP_H
+
+#   define KMP_VERSION_MAJOR    $KMP_VERSION_MAJOR
+#   define KMP_VERSION_MINOR    $KMP_VERSION_MINOR
+#   define KMP_VERSION_BUILD    $KMP_VERSION_BUILD
+#   define KMP_BUILD_DATE       "$KMP_BUILD_DATE"
+
+#   ifdef __cplusplus
+    extern "C" {
+#   endif
+
+#   if defined(_WIN32)
+#       define __KAI_KMPC_CONVENTION __cdecl
+#   else
+#       define __KAI_KMPC_CONVENTION
+#   endif
+
+    /* schedule kind constants */
+    typedef enum omp_sched_t {
+        omp_sched_static  = 1,
+        omp_sched_dynamic = 2,
+        omp_sched_guided  = 3,
+        omp_sched_auto    = 4
+    } omp_sched_t;
+
+    /* set API functions */
+    extern void   __KAI_KMPC_CONVENTION  omp_set_num_threads (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_dynamic     (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_nested      (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_max_active_levels (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_schedule          (omp_sched_t, int);
+
+    /* query API functions */
+    extern int    __KAI_KMPC_CONVENTION  omp_get_num_threads  (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_dynamic      (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_nested       (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_threads  (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_thread_num   (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_num_procs    (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_in_parallel      (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_in_final         (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_active_level        (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_level               (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_ancestor_thread_num (int);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_team_size           (int);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_thread_limit        (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_active_levels   (void);
+    extern void   __KAI_KMPC_CONVENTION  omp_get_schedule            (omp_sched_t *, int *);
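+
+    /* Usage sketch: choose the schedule applied to "schedule(runtime)"
+       loops and read it back.  Illustrative only; the dynamic kind and the
+       chunk size of 4 are arbitrary choices, not defaults.
+
+           omp_sched_t kind;
+           int chunk;
+           omp_set_schedule(omp_sched_dynamic, 4);
+           omp_get_schedule(&kind, &chunk);  // expect omp_sched_dynamic, 4
+    */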
+
+    /* lock API functions */
+    typedef struct omp_lock_t {
+        void * _lk;
+    } omp_lock_t;
+
+    extern void   __KAI_KMPC_CONVENTION  omp_init_lock    (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_lock     (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_unset_lock   (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_destroy_lock (omp_lock_t *);
+    extern int    __KAI_KMPC_CONVENTION  omp_test_lock    (omp_lock_t *);
+
+    /* nested lock API functions */
+    typedef struct omp_nest_lock_t {
+        void * _lk;
+    } omp_nest_lock_t;
+
+    extern void   __KAI_KMPC_CONVENTION  omp_init_nest_lock    (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_nest_lock     (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_unset_nest_lock   (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_destroy_nest_lock (omp_nest_lock_t *);
+    extern int    __KAI_KMPC_CONVENTION  omp_test_nest_lock    (omp_nest_lock_t *);
+
+    /* lock hint type for dynamic user lock */
+    typedef enum kmp_lock_hint_t {
+        kmp_lock_hint_none = 0,
+        kmp_lock_hint_uncontended,
+        kmp_lock_hint_contended,
+        kmp_lock_hint_nonspeculative,
+        kmp_lock_hint_speculative,
+        kmp_lock_hint_adaptive
+    } kmp_lock_hint_t;
+
+    /* hinted lock initializers */
+    extern void __KAI_KMPC_CONVENTION kmp_init_lock_hinted(omp_lock_t *, kmp_lock_hint_t);
+    extern void __KAI_KMPC_CONVENTION kmp_init_nest_lock_hinted(omp_nest_lock_t *, kmp_lock_hint_t);
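+
+    /* Usage sketch: a hint-initialized lock is used exactly like one from
+       omp_init_lock.  Illustrative only; the speculative hint is an
+       arbitrary choice.
+
+           omp_lock_t lck;
+           kmp_init_lock_hinted(&lck, kmp_lock_hint_speculative);
+           omp_set_lock(&lck);
+           // guarded work
+           omp_unset_lock(&lck);
+           omp_destroy_lock(&lck);
+    */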
+
+    /* time API functions */
+    extern double __KAI_KMPC_CONVENTION  omp_get_wtime (void);
+    extern double __KAI_KMPC_CONVENTION  omp_get_wtick (void);
+
+    /* OpenMP 4.0 */
+    extern int  __KAI_KMPC_CONVENTION  omp_get_default_device (void);
+    extern void __KAI_KMPC_CONVENTION  omp_set_default_device (int);
+    extern int  __KAI_KMPC_CONVENTION  omp_is_initial_device (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_num_devices (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_num_teams (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_team_num (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_cancellation (void);
+
+#   include <stdlib.h>
+    /* kmp API functions */
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_stacksize          (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize          (int);
+    extern size_t __KAI_KMPC_CONVENTION  kmp_get_stacksize_s        (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize_s        (size_t);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_blocktime          (void);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_library            (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_blocktime          (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library            (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_serial     (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_turnaround (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_throughput (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_defaults           (char const *);
+
+    /* Intel affinity API */
+    typedef void * kmp_affinity_mask_t;
+
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_max_proc    (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_create_affinity_mask     (kmp_affinity_mask_t *);
+    extern void   __KAI_KMPC_CONVENTION  kmp_destroy_affinity_mask    (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_unset_affinity_mask_proc (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+
+    /* OpenMP 4.0 affinity API */
+    typedef enum omp_proc_bind_t {
+        omp_proc_bind_false = 0,
+        omp_proc_bind_true = 1,
+        omp_proc_bind_master = 2,
+        omp_proc_bind_close = 3,
+        omp_proc_bind_spread = 4
+    } omp_proc_bind_t;
+
+    extern omp_proc_bind_t __KAI_KMPC_CONVENTION omp_get_proc_bind (void);
+
+    extern void * __KAI_KMPC_CONVENTION  kmp_malloc  (size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_calloc  (size_t, size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_realloc (void *, size_t);
+    extern void   __KAI_KMPC_CONVENTION  kmp_free    (void *);
+
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_on(void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_off(void);
+
+#   undef __KAI_KMPC_CONVENTION
+
+    /* Warning:
+       The following typedefs are non-standard and deprecated, and will be removed in a future release.
+    */
+    typedef int     omp_int_t;
+    typedef double  omp_wtime_t;
+
+#   ifdef __cplusplus
+    }
+#   endif
+
+#endif /* __OMP_H */
+
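+/*
+ * Usage sketch: bind the calling thread to logical processor 0 with the
+ * Intel affinity API declared above.  Illustrative only; the processor
+ * number is arbitrary, and the return-code convention (0 taken as success)
+ * is an assumption.
+ *
+ *     #include <omp.h>
+ *
+ *     int pin_to_proc0(void)
+ *     {
+ *         kmp_affinity_mask_t mask;
+ *         int rc;
+ *         kmp_create_affinity_mask(&mask);
+ *         kmp_set_affinity_mask_proc(0, &mask);
+ *         rc = kmp_set_affinity(&mask);
+ *         kmp_destroy_affinity_mask(&mask);
+ *         return rc;
+ *     }
+ */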
diff --git a/final/runtime/src/include/40/omp_lib.f.var b/final/runtime/src/include/40/omp_lib.f.var
new file mode 100644
index 0000000..f7df393
--- /dev/null
+++ b/final/runtime/src/include/40/omp_lib.f.var
@@ -0,0 +1,790 @@
+! include/40/omp_lib.f.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+!***
+!*** Some of the directives for the following routine extend past column 72,
+!*** so process this file in 132-column mode.
+!***
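+!*** (With non-Intel compilers this typically means a command-line option
+!***  such as gfortran's -ffixed-line-length-132; the !dec$ directive below
+!***  covers Intel compilers.)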
+
+!dec$ fixedformlinesize:132
+
+      module omp_lib_kinds
+
+        integer, parameter :: omp_integer_kind       = 4
+        integer, parameter :: omp_logical_kind       = 4
+        integer, parameter :: omp_real_kind          = 4
+        integer, parameter :: omp_lock_kind          = int_ptr_kind()
+        integer, parameter :: omp_nest_lock_kind     = int_ptr_kind()
+        integer, parameter :: omp_sched_kind         = omp_integer_kind
+        integer, parameter :: omp_proc_bind_kind     = omp_integer_kind
+        integer, parameter :: kmp_pointer_kind       = int_ptr_kind()
+        integer, parameter :: kmp_size_t_kind        = int_ptr_kind()
+        integer, parameter :: kmp_affinity_mask_kind = int_ptr_kind()
+        integer, parameter :: kmp_cancel_kind        = omp_integer_kind
+        integer, parameter :: kmp_lock_hint_kind     = omp_integer_kind
+
+      end module omp_lib_kinds
+
+      module omp_lib
+
+        use omp_lib_kinds
+
+        integer (kind=omp_integer_kind), parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_build = $KMP_VERSION_BUILD
+        character(*), parameter :: kmp_build_date    = '$KMP_BUILD_DATE'
+        integer (kind=omp_integer_kind), parameter :: openmp_version    = $OMP_VERSION
+
+        integer(kind=omp_sched_kind), parameter :: omp_sched_static  = 1
+        integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
+        integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
+        integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
+
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4
+
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_parallel = 1
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_loop = 2
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_sections = 3
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_taskgroup = 4
+
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_none           = 0
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_uncontended    = 1
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_contended      = 2
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_nonspeculative = 3
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_speculative    = 4
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive       = 5
+
+        interface
+
+!         ***
+!         *** omp_* entry points
+!         ***
+
+          subroutine omp_set_num_threads(nthreads)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) nthreads
+          end subroutine omp_set_num_threads
+
+          subroutine omp_set_dynamic(enable)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) enable
+          end subroutine omp_set_dynamic
+
+          subroutine omp_set_nested(enable)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) enable
+          end subroutine omp_set_nested
+
+          function omp_get_num_threads()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_threads
+          end function omp_get_num_threads
+
+          function omp_get_max_threads()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_threads
+          end function omp_get_max_threads
+
+          function omp_get_thread_num()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_num
+          end function omp_get_thread_num
+
+          function omp_get_num_procs()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_procs
+          end function omp_get_num_procs
+
+          function omp_in_parallel()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_in_parallel
+          end function omp_in_parallel
+
+          function omp_get_dynamic()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_dynamic
+          end function omp_get_dynamic
+
+          function omp_get_nested()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_nested
+          end function omp_get_nested
+
+          function omp_get_thread_limit()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_limit
+          end function omp_get_thread_limit
+
+          subroutine omp_set_max_active_levels(max_levels)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) max_levels
+          end subroutine omp_set_max_active_levels
+
+          function omp_get_max_active_levels()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_active_levels
+          end function omp_get_max_active_levels
+
+          function omp_get_level()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_level
+          end function omp_get_level
+
+          function omp_get_active_level()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_active_level
+          end function omp_get_active_level
+
+          function omp_get_ancestor_thread_num(level)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) level
+            integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+          end function omp_get_ancestor_thread_num
+
+          function omp_get_team_size(level)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) level
+            integer (kind=omp_integer_kind) omp_get_team_size
+          end function omp_get_team_size
+
+          subroutine omp_set_schedule(kind, modifier)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind) kind
+            integer (kind=omp_integer_kind) modifier
+          end subroutine omp_set_schedule
+
+          subroutine omp_get_schedule(kind, modifier)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind) kind
+            integer (kind=omp_integer_kind) modifier
+          end subroutine omp_get_schedule
+
+          function omp_get_proc_bind()
+            use omp_lib_kinds
+            integer (kind=omp_proc_bind_kind) omp_get_proc_bind
+          end function omp_get_proc_bind
+
+          function omp_get_wtime()
+            double precision omp_get_wtime
+          end function omp_get_wtime
+
+          function omp_get_wtick()
+            double precision omp_get_wtick
+          end function omp_get_wtick
+
+          function omp_get_default_device()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_default_device
+          end function omp_get_default_device
+
+          subroutine omp_set_default_device(dflt_device)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) dflt_device
+          end subroutine omp_set_default_device
+
+          function omp_get_num_devices()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_devices
+          end function omp_get_num_devices
+
+          function omp_get_num_teams()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_teams
+          end function omp_get_num_teams
+
+          function omp_get_team_num()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_team_num
+          end function omp_get_team_num
+
+          function omp_get_cancellation()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_cancellation
+          end function omp_get_cancellation
+
+          function omp_is_initial_device()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_is_initial_device
+          end function omp_is_initial_device
+
+          subroutine omp_init_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_init_lock
+
+          subroutine omp_destroy_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_destroy_lock
+
+          subroutine omp_set_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_set_lock
+
+          subroutine omp_unset_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_unset_lock
+
+          function omp_test_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_test_lock
+            integer (kind=omp_lock_kind) lockvar
+          end function omp_test_lock
+
+          subroutine omp_init_nest_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_init_nest_lock
+
+          subroutine omp_destroy_nest_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_destroy_nest_lock
+
+          subroutine omp_set_nest_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_set_nest_lock
+
+          subroutine omp_unset_nest_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_unset_nest_lock
+
+          function omp_test_nest_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_test_nest_lock
+            integer (kind=omp_nest_lock_kind) lockvar
+          end function omp_test_nest_lock
+
+!         ***
+!         *** kmp_* entry points
+!         ***
+
+          subroutine kmp_set_stacksize(size)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) size
+          end subroutine kmp_set_stacksize
+
+          subroutine kmp_set_stacksize_s(size)
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind) size
+          end subroutine kmp_set_stacksize_s
+
+          subroutine kmp_set_blocktime(msec)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) msec
+          end subroutine kmp_set_blocktime
+
+          subroutine kmp_set_library_serial()
+          end subroutine kmp_set_library_serial
+
+          subroutine kmp_set_library_turnaround()
+          end subroutine kmp_set_library_turnaround
+
+          subroutine kmp_set_library_throughput()
+          end subroutine kmp_set_library_throughput
+
+          subroutine kmp_set_library(libnum)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) libnum
+          end subroutine kmp_set_library
+
+          subroutine kmp_set_defaults(string)
+            character*(*) string
+          end subroutine kmp_set_defaults
+
+          function kmp_get_stacksize()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_stacksize
+          end function kmp_get_stacksize
+
+          function kmp_get_stacksize_s()
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind) kmp_get_stacksize_s
+          end function kmp_get_stacksize_s
+
+          function kmp_get_blocktime()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_blocktime
+          end function kmp_get_blocktime
+
+          function kmp_get_library()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_library
+          end function kmp_get_library
+
+          function kmp_set_affinity(mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity
+
+          function kmp_get_affinity(mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity
+
+          function kmp_get_affinity_max_proc()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_max_proc
+          end function kmp_get_affinity_max_proc
+
+          subroutine kmp_create_affinity_mask(mask)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_create_affinity_mask
+
+          subroutine kmp_destroy_affinity_mask(mask)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_destroy_affinity_mask
+
+          function kmp_set_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity_mask_proc
+
+          function kmp_unset_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_unset_affinity_mask_proc
+
+          function kmp_get_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity_mask_proc
+
+          function kmp_malloc(size)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_malloc
+            integer (kind=kmp_size_t_kind) size
+          end function kmp_malloc
+
+          function kmp_calloc(nelem, elsize)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_calloc
+            integer (kind=kmp_size_t_kind) nelem
+            integer (kind=kmp_size_t_kind) elsize
+          end function kmp_calloc
+
+          function kmp_realloc(ptr, size)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_realloc
+            integer (kind=kmp_pointer_kind) ptr
+            integer (kind=kmp_size_t_kind) size
+          end function kmp_realloc
+
+          subroutine kmp_free(ptr)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) ptr
+          end subroutine kmp_free
+
+          subroutine kmp_set_warnings_on()
+          end subroutine kmp_set_warnings_on
+
+          subroutine kmp_set_warnings_off()
+          end subroutine kmp_set_warnings_off
+
+          function kmp_get_cancellation_status(cancelkind)
+            use omp_lib_kinds
+            integer (kind=kmp_cancel_kind) cancelkind
+            logical (kind=omp_logical_kind) kmp_get_cancellation_status
+          end function kmp_get_cancellation_status
+
+          subroutine kmp_init_lock_hinted(lockvar, lockhint)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+            integer (kind=kmp_lock_hint_kind) lockhint
+          end subroutine kmp_init_lock_hinted
+
+          subroutine kmp_init_nest_lock_hinted(lockvar, lockhint)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+            integer (kind=kmp_lock_hint_kind) lockhint
+          end subroutine kmp_init_nest_lock_hinted
+
+        end interface
+
+!dec$ if defined(_WIN32)
+!dec$   if defined(_WIN64) .or. defined(_M_AMD64)
+
+!***
+!*** The Fortran entry points must be in uppercase, even if the /Qlowercase
+!*** option is specified.  The alias attribute ensures that the specified
+!*** string is used as the entry point.
+!***
+!*** On the Windows* OS IA-32 architecture, the Fortran entry points have an
+!*** underscore prepended.  On the Windows* OS Intel(R) 64
+!*** architecture, no underscore is prepended.
+!***
+
+!dec$ attributes alias:'OMP_SET_NUM_THREADS' :: omp_set_num_threads
+!dec$ attributes alias:'OMP_SET_DYNAMIC' :: omp_set_dynamic
+!dec$ attributes alias:'OMP_SET_NESTED' :: omp_set_nested
+!dec$ attributes alias:'OMP_GET_NUM_THREADS' :: omp_get_num_threads
+!dec$ attributes alias:'OMP_GET_MAX_THREADS' :: omp_get_max_threads
+!dec$ attributes alias:'OMP_GET_THREAD_NUM' :: omp_get_thread_num
+!dec$ attributes alias:'OMP_GET_NUM_PROCS' :: omp_get_num_procs
+!dec$ attributes alias:'OMP_IN_PARALLEL' :: omp_in_parallel
+!dec$ attributes alias:'OMP_GET_DYNAMIC' :: omp_get_dynamic
+!dec$ attributes alias:'OMP_GET_NESTED' :: omp_get_nested
+!dec$ attributes alias:'OMP_GET_THREAD_LIMIT' :: omp_get_thread_limit
+!dec$ attributes alias:'OMP_SET_MAX_ACTIVE_LEVELS' :: omp_set_max_active_levels
+!dec$ attributes alias:'OMP_GET_MAX_ACTIVE_LEVELS' :: omp_get_max_active_levels
+!dec$ attributes alias:'OMP_GET_LEVEL' :: omp_get_level
+!dec$ attributes alias:'OMP_GET_ACTIVE_LEVEL' :: omp_get_active_level
+!dec$ attributes alias:'OMP_GET_ANCESTOR_THREAD_NUM' :: omp_get_ancestor_thread_num
+!dec$ attributes alias:'OMP_GET_TEAM_SIZE' :: omp_get_team_size
+!dec$ attributes alias:'OMP_SET_SCHEDULE' :: omp_set_schedule
+!dec$ attributes alias:'OMP_GET_SCHEDULE' :: omp_get_schedule
+!dec$ attributes alias:'OMP_GET_PROC_BIND' :: omp_get_proc_bind
+!dec$ attributes alias:'OMP_GET_WTIME' :: omp_get_wtime
+!dec$ attributes alias:'OMP_GET_WTICK' :: omp_get_wtick
+!dec$ attributes alias:'OMP_GET_DEFAULT_DEVICE' :: omp_get_default_device
+!dec$ attributes alias:'OMP_SET_DEFAULT_DEVICE' :: omp_set_default_device
+!dec$ attributes alias:'OMP_GET_NUM_DEVICES' :: omp_get_num_devices
+!dec$ attributes alias:'OMP_GET_NUM_TEAMS' :: omp_get_num_teams
+!dec$ attributes alias:'OMP_GET_TEAM_NUM' :: omp_get_team_num
+!dec$ attributes alias:'OMP_GET_CANCELLATION' :: omp_get_cancellation
+!dec$ attributes alias:'OMP_IS_INITIAL_DEVICE' :: omp_is_initial_device
+
+!dec$ attributes alias:'omp_init_lock' :: omp_init_lock
+!dec$ attributes alias:'omp_destroy_lock' :: omp_destroy_lock
+!dec$ attributes alias:'omp_set_lock' :: omp_set_lock
+!dec$ attributes alias:'omp_unset_lock' :: omp_unset_lock
+!dec$ attributes alias:'omp_test_lock' :: omp_test_lock
+!dec$ attributes alias:'omp_init_nest_lock' :: omp_init_nest_lock
+!dec$ attributes alias:'omp_destroy_nest_lock' :: omp_destroy_nest_lock
+!dec$ attributes alias:'omp_set_nest_lock' :: omp_set_nest_lock
+!dec$ attributes alias:'omp_unset_nest_lock' :: omp_unset_nest_lock
+!dec$ attributes alias:'omp_test_nest_lock' :: omp_test_nest_lock
+
+!dec$ attributes alias:'KMP_SET_STACKSIZE'::kmp_set_stacksize
+!dec$ attributes alias:'KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s
+!dec$ attributes alias:'KMP_SET_BLOCKTIME'::kmp_set_blocktime
+!dec$ attributes alias:'KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial
+!dec$ attributes alias:'KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround
+!dec$ attributes alias:'KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput
+!dec$ attributes alias:'KMP_SET_LIBRARY'::kmp_set_library
+!dec$ attributes alias:'KMP_GET_STACKSIZE'::kmp_get_stacksize
+!dec$ attributes alias:'KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s
+!dec$ attributes alias:'KMP_GET_BLOCKTIME'::kmp_get_blocktime
+!dec$ attributes alias:'KMP_GET_LIBRARY'::kmp_get_library
+!dec$ attributes alias:'KMP_SET_AFFINITY'::kmp_set_affinity
+!dec$ attributes alias:'KMP_GET_AFFINITY'::kmp_get_affinity
+!dec$ attributes alias:'KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask
+!dec$ attributes alias:'KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'KMP_MALLOC'::kmp_malloc
+!dec$ attributes alias:'KMP_CALLOC'::kmp_calloc
+!dec$ attributes alias:'KMP_REALLOC'::kmp_realloc
+!dec$ attributes alias:'KMP_FREE'::kmp_free
+
+!dec$ attributes alias:'KMP_SET_WARNINGS_ON'::kmp_set_warnings_on
+!dec$ attributes alias:'KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off
+
+!dec$ attributes alias:'KMP_GET_CANCELLATION_STATUS' :: kmp_get_cancellation_status
+
+!dec$ attributes alias:'KMP_INIT_LOCK_HINTED'::kmp_init_lock_hinted
+!dec$ attributes alias:'KMP_INIT_NEST_LOCK_HINTED'::kmp_init_nest_lock_hinted
+
+!dec$   else
+
+!***
+!*** On Windows* OS IA-32 architecture, the Fortran entry points have an underscore prepended.
+!***
+
+!dec$ attributes alias:'_OMP_SET_NUM_THREADS' :: omp_set_num_threads
+!dec$ attributes alias:'_OMP_SET_DYNAMIC' :: omp_set_dynamic
+!dec$ attributes alias:'_OMP_SET_NESTED' :: omp_set_nested
+!dec$ attributes alias:'_OMP_GET_NUM_THREADS' :: omp_get_num_threads
+!dec$ attributes alias:'_OMP_GET_MAX_THREADS' :: omp_get_max_threads
+!dec$ attributes alias:'_OMP_GET_THREAD_NUM' :: omp_get_thread_num
+!dec$ attributes alias:'_OMP_GET_NUM_PROCS' :: omp_get_num_procs
+!dec$ attributes alias:'_OMP_IN_PARALLEL' :: omp_in_parallel
+!dec$ attributes alias:'_OMP_GET_DYNAMIC' :: omp_get_dynamic
+!dec$ attributes alias:'_OMP_GET_NESTED' :: omp_get_nested
+!dec$ attributes alias:'_OMP_GET_THREAD_LIMIT' :: omp_get_thread_limit
+!dec$ attributes alias:'_OMP_SET_MAX_ACTIVE_LEVELS' :: omp_set_max_active_levels
+!dec$ attributes alias:'_OMP_GET_MAX_ACTIVE_LEVELS' :: omp_get_max_active_levels
+!dec$ attributes alias:'_OMP_GET_LEVEL' :: omp_get_level
+!dec$ attributes alias:'_OMP_GET_ACTIVE_LEVEL' :: omp_get_active_level
+!dec$ attributes alias:'_OMP_GET_ANCESTOR_THREAD_NUM' :: omp_get_ancestor_thread_num
+!dec$ attributes alias:'_OMP_GET_TEAM_SIZE' :: omp_get_team_size
+!dec$ attributes alias:'_OMP_SET_SCHEDULE' :: omp_set_schedule
+!dec$ attributes alias:'_OMP_GET_SCHEDULE' :: omp_get_schedule
+!dec$ attributes alias:'_OMP_GET_PROC_BIND' :: omp_get_proc_bind
+!dec$ attributes alias:'_OMP_GET_WTIME' :: omp_get_wtime
+!dec$ attributes alias:'_OMP_GET_WTICK' :: omp_get_wtick
+!dec$ attributes alias:'_OMP_GET_DEFAULT_DEVICE' :: omp_get_default_device
+!dec$ attributes alias:'_OMP_SET_DEFAULT_DEVICE' :: omp_set_default_device
+!dec$ attributes alias:'_OMP_GET_NUM_DEVICES' :: omp_get_num_devices
+!dec$ attributes alias:'_OMP_GET_NUM_TEAMS' :: omp_get_num_teams
+!dec$ attributes alias:'_OMP_GET_TEAM_NUM' :: omp_get_team_num
+!dec$ attributes alias:'_OMP_GET_CANCELLATION' :: omp_get_cancellation
+!dec$ attributes alias:'_OMP_IS_INITIAL_DEVICE' :: omp_is_initial_device
+
+!dec$ attributes alias:'_omp_init_lock' :: omp_init_lock
+!dec$ attributes alias:'_omp_destroy_lock' :: omp_destroy_lock
+!dec$ attributes alias:'_omp_set_lock' :: omp_set_lock
+!dec$ attributes alias:'_omp_unset_lock' :: omp_unset_lock
+!dec$ attributes alias:'_omp_test_lock' :: omp_test_lock
+!dec$ attributes alias:'_omp_init_nest_lock' :: omp_init_nest_lock
+!dec$ attributes alias:'_omp_destroy_nest_lock' :: omp_destroy_nest_lock
+!dec$ attributes alias:'_omp_set_nest_lock' :: omp_set_nest_lock
+!dec$ attributes alias:'_omp_unset_nest_lock' :: omp_unset_nest_lock
+!dec$ attributes alias:'_omp_test_nest_lock' :: omp_test_nest_lock
+
+!dec$ attributes alias:'_KMP_SET_STACKSIZE'::kmp_set_stacksize
+!dec$ attributes alias:'_KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s
+!dec$ attributes alias:'_KMP_SET_BLOCKTIME'::kmp_set_blocktime
+!dec$ attributes alias:'_KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial
+!dec$ attributes alias:'_KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround
+!dec$ attributes alias:'_KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput
+!dec$ attributes alias:'_KMP_SET_LIBRARY'::kmp_set_library
+!dec$ attributes alias:'_KMP_GET_STACKSIZE'::kmp_get_stacksize
+!dec$ attributes alias:'_KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s
+!dec$ attributes alias:'_KMP_GET_BLOCKTIME'::kmp_get_blocktime
+!dec$ attributes alias:'_KMP_GET_LIBRARY'::kmp_get_library
+!dec$ attributes alias:'_KMP_SET_AFFINITY'::kmp_set_affinity
+!dec$ attributes alias:'_KMP_GET_AFFINITY'::kmp_get_affinity
+!dec$ attributes alias:'_KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'_KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask
+!dec$ attributes alias:'_KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'_KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'_KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'_KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'_KMP_MALLOC'::kmp_malloc
+!dec$ attributes alias:'_KMP_CALLOC'::kmp_calloc
+!dec$ attributes alias:'_KMP_REALLOC'::kmp_realloc
+!dec$ attributes alias:'_KMP_FREE'::kmp_free
+
+!dec$ attributes alias:'_KMP_SET_WARNINGS_ON'::kmp_set_warnings_on
+!dec$ attributes alias:'_KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off
+
+!dec$ attributes alias:'_KMP_GET_CANCELLATION_STATUS' :: kmp_get_cancellation_status
+
+!dec$ attributes alias:'_KMP_INIT_LOCK_HINTED'::kmp_init_lock_hinted
+!dec$ attributes alias:'_KMP_INIT_NEST_LOCK_HINTED'::kmp_init_nest_lock_hinted
+
+!dec$   endif
+!dec$ endif
+
+!dec$ if defined(__linux)
+
+!***
+!*** The Linux* OS entry points are in lowercase, with an underscore appended.
+!***
+
+!dec$ attributes alias:'omp_set_num_threads_'::omp_set_num_threads
+!dec$ attributes alias:'omp_set_dynamic_'::omp_set_dynamic
+!dec$ attributes alias:'omp_set_nested_'::omp_set_nested
+!dec$ attributes alias:'omp_get_num_threads_'::omp_get_num_threads
+!dec$ attributes alias:'omp_get_max_threads_'::omp_get_max_threads
+!dec$ attributes alias:'omp_get_thread_num_'::omp_get_thread_num
+!dec$ attributes alias:'omp_get_num_procs_'::omp_get_num_procs
+!dec$ attributes alias:'omp_in_parallel_'::omp_in_parallel
+!dec$ attributes alias:'omp_get_dynamic_'::omp_get_dynamic
+!dec$ attributes alias:'omp_get_nested_'::omp_get_nested
+!dec$ attributes alias:'omp_get_thread_limit_'::omp_get_thread_limit
+!dec$ attributes alias:'omp_set_max_active_levels_'::omp_set_max_active_levels
+!dec$ attributes alias:'omp_get_max_active_levels_'::omp_get_max_active_levels
+!dec$ attributes alias:'omp_get_level_'::omp_get_level
+!dec$ attributes alias:'omp_get_active_level_'::omp_get_active_level
+!dec$ attributes alias:'omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'omp_get_team_size_'::omp_get_team_size
+!dec$ attributes alias:'omp_set_schedule_'::omp_set_schedule
+!dec$ attributes alias:'omp_get_schedule_'::omp_get_schedule
+!dec$ attributes alias:'omp_get_proc_bind_' :: omp_get_proc_bind
+!dec$ attributes alias:'omp_get_wtime_'::omp_get_wtime
+!dec$ attributes alias:'omp_get_wtick_'::omp_get_wtick
+!dec$ attributes alias:'omp_get_default_device_'::omp_get_default_device
+!dec$ attributes alias:'omp_set_default_device_'::omp_set_default_device
+!dec$ attributes alias:'omp_get_num_devices_'::omp_get_num_devices
+!dec$ attributes alias:'omp_get_num_teams_'::omp_get_num_teams
+!dec$ attributes alias:'omp_get_team_num_'::omp_get_team_num
+!dec$ attributes alias:'omp_get_cancellation_'::omp_get_cancellation
+!dec$ attributes alias:'omp_is_initial_device_'::omp_is_initial_device
+
+!dec$ attributes alias:'omp_init_lock_'::omp_init_lock
+!dec$ attributes alias:'omp_destroy_lock_'::omp_destroy_lock
+!dec$ attributes alias:'omp_set_lock_'::omp_set_lock
+!dec$ attributes alias:'omp_unset_lock_'::omp_unset_lock
+!dec$ attributes alias:'omp_test_lock_'::omp_test_lock
+!dec$ attributes alias:'omp_init_nest_lock_'::omp_init_nest_lock
+!dec$ attributes alias:'omp_destroy_nest_lock_'::omp_destroy_nest_lock
+!dec$ attributes alias:'omp_set_nest_lock_'::omp_set_nest_lock
+!dec$ attributes alias:'omp_unset_nest_lock_'::omp_unset_nest_lock
+!dec$ attributes alias:'omp_test_nest_lock_'::omp_test_nest_lock
+
+!dec$ attributes alias:'kmp_set_stacksize_'::kmp_set_stacksize
+!dec$ attributes alias:'kmp_set_stacksize_s_'::kmp_set_stacksize_s
+!dec$ attributes alias:'kmp_set_blocktime_'::kmp_set_blocktime
+!dec$ attributes alias:'kmp_set_library_serial_'::kmp_set_library_serial
+!dec$ attributes alias:'kmp_set_library_turnaround_'::kmp_set_library_turnaround
+!dec$ attributes alias:'kmp_set_library_throughput_'::kmp_set_library_throughput
+!dec$ attributes alias:'kmp_set_library_'::kmp_set_library
+!dec$ attributes alias:'kmp_get_stacksize_'::kmp_get_stacksize
+!dec$ attributes alias:'kmp_get_stacksize_s_'::kmp_get_stacksize_s
+!dec$ attributes alias:'kmp_get_blocktime_'::kmp_get_blocktime
+!dec$ attributes alias:'kmp_get_library_'::kmp_get_library
+!dec$ attributes alias:'kmp_set_affinity_'::kmp_set_affinity
+!dec$ attributes alias:'kmp_get_affinity_'::kmp_get_affinity
+!dec$ attributes alias:'kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'kmp_create_affinity_mask_'::kmp_create_affinity_mask
+!dec$ attributes alias:'kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'kmp_malloc_'::kmp_malloc
+!dec$ attributes alias:'kmp_calloc_'::kmp_calloc
+!dec$ attributes alias:'kmp_realloc_'::kmp_realloc
+!dec$ attributes alias:'kmp_free_'::kmp_free
+
+!dec$ attributes alias:'kmp_set_warnings_on_'::kmp_set_warnings_on
+!dec$ attributes alias:'kmp_set_warnings_off_'::kmp_set_warnings_off
+!dec$ attributes alias:'kmp_get_cancellation_status_'::kmp_get_cancellation_status
+
+!dec$ attributes alias:'kmp_init_lock_hinted_'::kmp_init_lock_hinted
+!dec$ attributes alias:'kmp_init_nest_lock_hinted_'::kmp_init_nest_lock_hinted
+
+!dec$ endif
+
+!dec$ if defined(__APPLE__)
+
+!***
+!*** The Mac entry points are in lowercase, with both an underscore
+!*** appended and an underscore prepended.
+!***
+
+!dec$ attributes alias:'_omp_set_num_threads_'::omp_set_num_threads
+!dec$ attributes alias:'_omp_set_dynamic_'::omp_set_dynamic
+!dec$ attributes alias:'_omp_set_nested_'::omp_set_nested
+!dec$ attributes alias:'_omp_get_num_threads_'::omp_get_num_threads
+!dec$ attributes alias:'_omp_get_max_threads_'::omp_get_max_threads
+!dec$ attributes alias:'_omp_get_thread_num_'::omp_get_thread_num
+!dec$ attributes alias:'_omp_get_num_procs_'::omp_get_num_procs
+!dec$ attributes alias:'_omp_in_parallel_'::omp_in_parallel
+!dec$ attributes alias:'_omp_get_dynamic_'::omp_get_dynamic
+!dec$ attributes alias:'_omp_get_nested_'::omp_get_nested
+!dec$ attributes alias:'_omp_get_thread_limit_'::omp_get_thread_limit
+!dec$ attributes alias:'_omp_set_max_active_levels_'::omp_set_max_active_levels
+!dec$ attributes alias:'_omp_get_max_active_levels_'::omp_get_max_active_levels
+!dec$ attributes alias:'_omp_get_level_'::omp_get_level
+!dec$ attributes alias:'_omp_get_active_level_'::omp_get_active_level
+!dec$ attributes alias:'_omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'_omp_get_team_size_'::omp_get_team_size
+!dec$ attributes alias:'_omp_set_schedule_'::omp_set_schedule
+!dec$ attributes alias:'_omp_get_schedule_'::omp_get_schedule
+!dec$ attributes alias:'_omp_get_proc_bind_' :: omp_get_proc_bind
+!dec$ attributes alias:'_omp_get_wtime_'::omp_get_wtime
+!dec$ attributes alias:'_omp_get_wtick_'::omp_get_wtick
+!dec$ attributes alias:'_omp_get_num_teams_'::omp_get_num_teams
+!dec$ attributes alias:'_omp_get_team_num_'::omp_get_team_num
+!dec$ attributes alias:'_omp_get_cancellation_'::omp_get_cancellation
+!dec$ attributes alias:'_omp_is_initial_device_'::omp_is_initial_device
+
+!dec$ attributes alias:'_omp_init_lock_'::omp_init_lock
+!dec$ attributes alias:'_omp_destroy_lock_'::omp_destroy_lock
+!dec$ attributes alias:'_omp_set_lock_'::omp_set_lock
+!dec$ attributes alias:'_omp_unset_lock_'::omp_unset_lock
+!dec$ attributes alias:'_omp_test_lock_'::omp_test_lock
+!dec$ attributes alias:'_omp_init_nest_lock_'::omp_init_nest_lock
+!dec$ attributes alias:'_omp_destroy_nest_lock_'::omp_destroy_nest_lock
+!dec$ attributes alias:'_omp_set_nest_lock_'::omp_set_nest_lock
+!dec$ attributes alias:'_omp_unset_nest_lock_'::omp_unset_nest_lock
+!dec$ attributes alias:'_omp_test_nest_lock_'::omp_test_nest_lock
+
+!dec$ attributes alias:'_kmp_set_stacksize_'::kmp_set_stacksize
+!dec$ attributes alias:'_kmp_set_stacksize_s_'::kmp_set_stacksize_s
+!dec$ attributes alias:'_kmp_set_blocktime_'::kmp_set_blocktime
+!dec$ attributes alias:'_kmp_set_library_serial_'::kmp_set_library_serial
+!dec$ attributes alias:'_kmp_set_library_turnaround_'::kmp_set_library_turnaround
+!dec$ attributes alias:'_kmp_set_library_throughput_'::kmp_set_library_throughput
+!dec$ attributes alias:'_kmp_set_library_'::kmp_set_library
+!dec$ attributes alias:'_kmp_get_stacksize_'::kmp_get_stacksize
+!dec$ attributes alias:'_kmp_get_stacksize_s_'::kmp_get_stacksize_s
+!dec$ attributes alias:'_kmp_get_blocktime_'::kmp_get_blocktime
+!dec$ attributes alias:'_kmp_get_library_'::kmp_get_library
+!dec$ attributes alias:'_kmp_set_affinity_'::kmp_set_affinity
+!dec$ attributes alias:'_kmp_get_affinity_'::kmp_get_affinity
+!dec$ attributes alias:'_kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'_kmp_create_affinity_mask_'::kmp_create_affinity_mask
+!dec$ attributes alias:'_kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'_kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'_kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'_kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'_kmp_malloc_'::kmp_malloc
+!dec$ attributes alias:'_kmp_calloc_'::kmp_calloc
+!dec$ attributes alias:'_kmp_realloc_'::kmp_realloc
+!dec$ attributes alias:'_kmp_free_'::kmp_free
+
+!dec$ attributes alias:'_kmp_set_warnings_on_'::kmp_set_warnings_on
+!dec$ attributes alias:'_kmp_set_warnings_off_'::kmp_set_warnings_off
+
+!dec$ attributes alias:'_kmp_get_cancellation_status_'::kmp_get_cancellation_status
+
+!dec$ attributes alias:'_kmp_init_lock_hinted_'::kmp_init_lock_hinted
+!dec$ attributes alias:'_kmp_init_nest_lock_hinted_'::kmp_init_nest_lock_hinted
+
+!dec$ endif
+
+      end module omp_lib
+
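+!***
+!*** Illustration of the alias blocks above: for omp_set_num_threads the
+!*** platform link names are
+!***
+!***     Windows* OS Intel(R) 64:  OMP_SET_NUM_THREADS
+!***     Windows* OS IA-32:        _OMP_SET_NUM_THREADS
+!***     Linux* OS:                omp_set_num_threads_
+!***     OS X*:                    _omp_set_num_threads_
+!***
+!*** A C caller on the Linux* OS could therefore declare the by-reference
+!*** entry point directly; a sketch, assuming default integer matches
+!*** C int:
+!***
+!***     extern void omp_set_num_threads_(int *nthreads);
+!***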
diff --git a/final/runtime/src/include/40/omp_lib.f90.var b/final/runtime/src/include/40/omp_lib.f90.var
new file mode 100644
index 0000000..5a8221c
--- /dev/null
+++ b/final/runtime/src/include/40/omp_lib.f90.var
@@ -0,0 +1,468 @@
+! include/40/omp_lib.f90.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+      module omp_lib_kinds
+
+        use, intrinsic :: iso_c_binding
+
+        integer, parameter :: omp_integer_kind       = c_int
+        integer, parameter :: omp_logical_kind       = 4
+        integer, parameter :: omp_real_kind          = c_float
+        integer, parameter :: kmp_double_kind        = c_double
+        integer, parameter :: omp_lock_kind          = c_intptr_t
+        integer, parameter :: omp_nest_lock_kind     = c_intptr_t
+        integer, parameter :: omp_sched_kind         = omp_integer_kind
+        integer, parameter :: omp_proc_bind_kind     = omp_integer_kind
+        integer, parameter :: kmp_pointer_kind       = c_intptr_t
+        integer, parameter :: kmp_size_t_kind        = c_size_t
+        integer, parameter :: kmp_affinity_mask_kind = c_intptr_t
+        integer, parameter :: kmp_cancel_kind        = omp_integer_kind
+        integer, parameter :: kmp_lock_hint_kind     = omp_integer_kind
+
+      end module omp_lib_kinds
+
+      module omp_lib
+
+        use omp_lib_kinds
+
+        integer (kind=omp_integer_kind), parameter :: openmp_version    = $OMP_VERSION
+        integer (kind=omp_integer_kind), parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_build = $KMP_VERSION_BUILD
+        character(*), parameter :: kmp_build_date    = '$KMP_BUILD_DATE'
+
+        integer(kind=omp_sched_kind), parameter :: omp_sched_static  = 1
+        integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
+        integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
+        integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
+
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4
+
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_parallel = 1
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_loop = 2
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_sections = 3
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_taskgroup = 4
+
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_none           = 0
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_uncontended    = 1
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_contended      = 2
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_nonspeculative = 3
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_speculative    = 4
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive       = 5
+
+        interface
+
+!         ***
+!         *** omp_* entry points
+!         ***
+
+          subroutine omp_set_num_threads(nthreads) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: nthreads
+          end subroutine omp_set_num_threads
+
+          subroutine omp_set_dynamic(enable) bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind), value :: enable
+          end subroutine omp_set_dynamic
+
+          subroutine omp_set_nested(enable) bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind), value :: enable
+          end subroutine omp_set_nested
+
+          function omp_get_num_threads() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_threads
+          end function omp_get_num_threads
+
+          function omp_get_max_threads() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_threads
+          end function omp_get_max_threads
+
+          function omp_get_thread_num() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_num
+          end function omp_get_thread_num
+
+          function omp_get_num_procs() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_procs
+          end function omp_get_num_procs
+
+          function omp_in_parallel() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_in_parallel
+          end function omp_in_parallel
+
+          function omp_in_final() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_in_final
+          end function omp_in_final
+
+          function omp_get_dynamic() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_dynamic
+          end function omp_get_dynamic
+
+          function omp_get_nested() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_nested
+          end function omp_get_nested
+
+          function omp_get_thread_limit() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_limit
+          end function omp_get_thread_limit
+
+          subroutine omp_set_max_active_levels(max_levels) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: max_levels
+          end subroutine omp_set_max_active_levels
+
+          function omp_get_max_active_levels() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_active_levels
+          end function omp_get_max_active_levels
+
+          function omp_get_level() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_level
+          end function omp_get_level
+
+          function omp_get_active_level() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_active_level
+          end function omp_get_active_level
+
+          function omp_get_ancestor_thread_num(level) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+            integer (kind=omp_integer_kind), value :: level
+          end function omp_get_ancestor_thread_num
+
+          function omp_get_team_size(level) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_team_size
+            integer (kind=omp_integer_kind), value :: level
+          end function omp_get_team_size
+
+          subroutine omp_set_schedule(kind, modifier) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind), value :: kind
+            integer (kind=omp_integer_kind), value :: modifier
+          end subroutine omp_set_schedule
+
+          subroutine omp_get_schedule(kind, modifier) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind) kind
+            integer (kind=omp_integer_kind) modifier
+          end subroutine omp_get_schedule
+
+          function omp_get_proc_bind() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_proc_bind_kind) omp_get_proc_bind
+          end function omp_get_proc_bind
+
+          function omp_get_wtime() bind(c)
+            use omp_lib_kinds
+            real (kind=kmp_double_kind) omp_get_wtime
+          end function omp_get_wtime
+
+          function omp_get_wtick() bind(c)
+            use omp_lib_kinds
+            real (kind=kmp_double_kind) omp_get_wtick
+          end function omp_get_wtick
+
+          function omp_get_default_device() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_default_device
+          end function omp_get_default_device
+
+          subroutine omp_set_default_device(dflt_device) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: dflt_device
+          end subroutine omp_set_default_device
+
+          function omp_get_num_devices() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_devices
+          end function omp_get_num_devices
+
+          function omp_get_num_teams() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_teams
+          end function omp_get_num_teams
+
+          function omp_get_team_num() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_team_num
+          end function omp_get_team_num
+
+          function omp_get_cancellation() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_cancellation
+          end function omp_get_cancellation
+
+          function omp_is_initial_device() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_is_initial_device
+          end function omp_is_initial_device
+
+          subroutine omp_init_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_init_lock
+
+          subroutine omp_destroy_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_destroy_lock
+
+          subroutine omp_set_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_set_lock
+
+          subroutine omp_unset_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_unset_lock
+
+          function omp_test_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_test_lock
+            integer (kind=omp_lock_kind) lockvar
+          end function omp_test_lock
+
+          subroutine omp_init_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_init_nest_lock
+
+          subroutine omp_destroy_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_destroy_nest_lock
+
+          subroutine omp_set_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_set_nest_lock
+
+          subroutine omp_unset_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_unset_nest_lock
+
+          function omp_test_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_test_nest_lock
+            integer (kind=omp_nest_lock_kind) lockvar
+          end function omp_test_nest_lock
+
+!         ***
+!         *** kmp_* entry points
+!         ***
+
+          subroutine kmp_set_stacksize(size) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: size
+          end subroutine kmp_set_stacksize
+
+          subroutine kmp_set_stacksize_s(size) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind), value :: size
+          end subroutine kmp_set_stacksize_s
+
+          subroutine kmp_set_blocktime(msec) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: msec
+          end subroutine kmp_set_blocktime
+
+          subroutine kmp_set_library_serial() bind(c)
+          end subroutine kmp_set_library_serial
+
+          subroutine kmp_set_library_turnaround() bind(c)
+          end subroutine kmp_set_library_turnaround
+
+          subroutine kmp_set_library_throughput() bind(c)
+          end subroutine kmp_set_library_throughput
+
+          subroutine kmp_set_library(libnum) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: libnum
+          end subroutine kmp_set_library
+
+          subroutine kmp_set_defaults(string) bind(c)
+            use, intrinsic :: iso_c_binding
+            character (kind=c_char) :: string(*)
+          end subroutine kmp_set_defaults
+
+          function kmp_get_stacksize() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_stacksize
+          end function kmp_get_stacksize
+
+          function kmp_get_stacksize_s() bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind) kmp_get_stacksize_s
+          end function kmp_get_stacksize_s
+
+          function kmp_get_blocktime() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_blocktime
+          end function kmp_get_blocktime
+
+          function kmp_get_library() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_library
+          end function kmp_get_library
+
+          function kmp_set_affinity(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity
+
+          function kmp_get_affinity(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity
+
+          function kmp_get_affinity_max_proc() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_max_proc
+          end function kmp_get_affinity_max_proc
+
+          subroutine kmp_create_affinity_mask(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_create_affinity_mask
+
+          subroutine kmp_destroy_affinity_mask(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_destroy_affinity_mask
+
+          function kmp_set_affinity_mask_proc(proc, mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc
+            integer (kind=omp_integer_kind), value :: proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity_mask_proc
+
+          function kmp_unset_affinity_mask_proc(proc, mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc
+            integer (kind=omp_integer_kind), value :: proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_unset_affinity_mask_proc
+
+          function kmp_get_affinity_mask_proc(proc, mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc
+            integer (kind=omp_integer_kind), value :: proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity_mask_proc
+
+          function kmp_malloc(size) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_malloc
+            integer (kind=kmp_size_t_kind), value :: size
+          end function kmp_malloc
+
+          function kmp_calloc(nelem, elsize) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_calloc
+            integer (kind=kmp_size_t_kind), value :: nelem
+            integer (kind=kmp_size_t_kind), value :: elsize
+          end function kmp_calloc
+
+          function kmp_realloc(ptr, size) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_realloc
+            integer (kind=kmp_pointer_kind), value :: ptr
+            integer (kind=kmp_size_t_kind), value :: size
+          end function kmp_realloc
+
+          subroutine kmp_free(ptr) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind), value :: ptr
+          end subroutine kmp_free
+
+          subroutine kmp_set_warnings_on() bind(c)
+          end subroutine kmp_set_warnings_on
+
+          subroutine kmp_set_warnings_off() bind(c)
+          end subroutine kmp_set_warnings_off
+
+          function kmp_get_cancellation_status(cancelkind) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_cancel_kind), value :: cancelkind
+            logical (kind=omp_logical_kind) kmp_get_cancellation_status
+          end function kmp_get_cancellation_status
+
+          subroutine kmp_init_lock_hinted(lockvar, lockhint) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+            integer (kind=kmp_lock_hint_kind), value :: lockhint
+          end subroutine kmp_init_lock_hinted
+
+          subroutine kmp_init_nest_lock_hinted(lockvar, lockhint) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+            integer (kind=kmp_lock_hint_kind), value :: lockhint
+          end subroutine kmp_init_nest_lock_hinted
+
+        end interface
+
+      end module omp_lib
diff --git a/final/runtime/src/include/40/omp_lib.h.var b/final/runtime/src/include/40/omp_lib.h.var
new file mode 100644
index 0000000..dd3e42b
--- /dev/null
+++ b/final/runtime/src/include/40/omp_lib.h.var
@@ -0,0 +1,582 @@
+! include/40/omp_lib.h.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+!***
+!*** Some of the directives for the following routines extend past column 72,
+!*** so process this file in 132-column mode.
+!***
+
+!DIR$ fixedformlinesize:132
+
+      integer, parameter :: omp_integer_kind       = 4
+      integer, parameter :: omp_logical_kind       = 4
+      integer, parameter :: omp_real_kind          = 4
+      integer, parameter :: omp_lock_kind          = int_ptr_kind()
+      integer, parameter :: omp_nest_lock_kind     = int_ptr_kind()
+      integer, parameter :: omp_sched_kind         = omp_integer_kind
+      integer, parameter :: omp_proc_bind_kind     = omp_integer_kind
+      integer, parameter :: kmp_pointer_kind       = int_ptr_kind()
+      integer, parameter :: kmp_size_t_kind        = int_ptr_kind()
+      integer, parameter :: kmp_affinity_mask_kind = int_ptr_kind()
+      integer, parameter :: kmp_lock_hint_kind     = omp_integer_kind
+
+      integer (kind=omp_integer_kind), parameter :: openmp_version    = $OMP_VERSION
+      integer (kind=omp_integer_kind), parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+      integer (kind=omp_integer_kind), parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+      integer (kind=omp_integer_kind), parameter :: kmp_version_build = $KMP_VERSION_BUILD
+      character(*)               kmp_build_date
+      parameter( kmp_build_date = '$KMP_BUILD_DATE' )
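+
+!*** For reference (non-normative): in this OpenMP 4.0 header the build
+!*** substitutes $OMP_VERSION with 201307; the kmp_* values track the
+!*** runtime's own version numbering.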
+
+      integer(kind=omp_sched_kind), parameter :: omp_sched_static  = 1
+      integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
+      integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
+      integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
+
+      integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
+      integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
+      integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2
+      integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3
+      integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4
+
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_none           = 0
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_uncontended    = 1
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_contended      = 2
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_nonspeculative = 3
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_speculative    = 4
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive       = 5
+
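+!***
+!*** Example (non-normative): acquiring a speculative lock through the
+!*** kmp_* extension entry points declared below; plain omp_init_lock
+!*** works the same way without the hint argument.
+!***
+!***   integer (kind=omp_lock_kind) lck
+!***   call kmp_init_lock_hinted(lck, kmp_lock_hint_speculative)
+!***   call omp_set_lock(lck)
+!***   ! ... critical section ...
+!***   call omp_unset_lock(lck)
+!***   call omp_destroy_lock(lck)
+!***
+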
+      interface
+
+!       ***
+!       *** omp_* entry points
+!       ***
+
+        subroutine omp_set_num_threads(nthreads) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: nthreads
+        end subroutine omp_set_num_threads
+
+        subroutine omp_set_dynamic(enable) bind(c)
+          import
+          logical (kind=omp_logical_kind), value :: enable
+        end subroutine omp_set_dynamic
+
+        subroutine omp_set_nested(enable) bind(c)
+          import
+          logical (kind=omp_logical_kind), value :: enable
+        end subroutine omp_set_nested
+
+        function omp_get_num_threads() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_num_threads
+        end function omp_get_num_threads
+
+        function omp_get_max_threads() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_max_threads
+        end function omp_get_max_threads
+
+        function omp_get_thread_num() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_thread_num
+        end function omp_get_thread_num
+
+        function omp_get_num_procs() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_num_procs
+        end function omp_get_num_procs
+
+        function omp_in_parallel() bind(c)
+          import
+          logical (kind=omp_logical_kind) omp_in_parallel
+        end function omp_in_parallel
+
+        function omp_in_final() bind(c)
+          import
+          logical (kind=omp_logical_kind) omp_in_final
+        end function omp_in_final
+
+        function omp_get_dynamic() bind(c)
+          import
+          logical (kind=omp_logical_kind) omp_get_dynamic
+        end function omp_get_dynamic
+
+        function omp_get_nested() bind(c)
+          import
+          logical (kind=omp_logical_kind) omp_get_nested
+        end function omp_get_nested
+
+        function omp_get_thread_limit() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_thread_limit
+        end function omp_get_thread_limit
+
+        subroutine omp_set_max_active_levels(max_levels) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: max_levels
+        end subroutine omp_set_max_active_levels
+
+        function omp_get_max_active_levels() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_max_active_levels
+        end function omp_get_max_active_levels
+
+        function omp_get_level() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_level
+        end function omp_get_level
+
+        function omp_get_active_level() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_active_level
+        end function omp_get_active_level
+
+        function omp_get_ancestor_thread_num(level) bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+          integer (kind=omp_integer_kind), value :: level
+        end function omp_get_ancestor_thread_num
+
+        function omp_get_team_size(level) bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_team_size
+          integer (kind=omp_integer_kind), value :: level
+        end function omp_get_team_size
+
+        subroutine omp_set_schedule(kind, modifier) bind(c)
+          import
+          integer (kind=omp_sched_kind), value :: kind
+          integer (kind=omp_integer_kind), value :: modifier
+        end subroutine omp_set_schedule
+
+        subroutine omp_get_schedule(kind, modifier) bind(c)
+          import
+          integer (kind=omp_sched_kind) kind
+          integer (kind=omp_integer_kind) modifier
+        end subroutine omp_get_schedule
+
+        function omp_get_proc_bind() bind(c)
+          import
+          integer (kind=omp_proc_bind_kind) omp_get_proc_bind
+        end function omp_get_proc_bind
+
+        function omp_get_wtime() bind(c)
+          double precision omp_get_wtime
+        end function omp_get_wtime
+
+        function omp_get_wtick() bind(c)
+          double precision omp_get_wtick
+        end function omp_get_wtick
+
+        function omp_get_default_device() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_default_device
+        end function omp_get_default_device
+
+        subroutine omp_set_default_device(dflt_device) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: dflt_device
+        end subroutine omp_set_default_device
+
+        function omp_get_num_devices() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_num_devices
+        end function omp_get_num_devices
+
+        function omp_get_num_teams() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_num_teams
+        end function omp_get_num_teams
+
+        function omp_get_team_num() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_team_num
+        end function omp_get_team_num
+
+        function omp_is_initial_device() bind(c)
+          import
+          logical (kind=omp_logical_kind) omp_is_initial_device
+        end function omp_is_initial_device
+
+        subroutine omp_init_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_init_lock
+
+        subroutine omp_destroy_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_destroy_lock
+
+        subroutine omp_set_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_set_lock
+
+        subroutine omp_unset_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_unset_lock
+
+        function omp_test_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_lock
+!DIR$ ENDIF
+          import
+          logical (kind=omp_logical_kind) omp_test_lock
+          integer (kind=omp_lock_kind) lockvar
+        end function omp_test_lock
+
+        subroutine omp_init_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_nest_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_init_nest_lock
+
+        subroutine omp_destroy_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_destroy_nest_lock
+
+        subroutine omp_set_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_nest_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_set_nest_lock
+
+        subroutine omp_unset_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_nest_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_unset_nest_lock
+
+        function omp_test_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_nest_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_integer_kind) omp_test_nest_lock
+          integer (kind=omp_nest_lock_kind) lockvar
+        end function omp_test_nest_lock
+
+!       ***
+!       *** kmp_* entry points
+!       ***
+
+        subroutine kmp_set_stacksize(size) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: size
+        end subroutine kmp_set_stacksize
+
+        subroutine kmp_set_stacksize_s(size) bind(c)
+          import
+          integer (kind=kmp_size_t_kind), value :: size
+        end subroutine kmp_set_stacksize_s
+
+        subroutine kmp_set_blocktime(msec) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: msec
+        end subroutine kmp_set_blocktime
+
+        subroutine kmp_set_library_serial() bind(c)
+        end subroutine kmp_set_library_serial
+
+        subroutine kmp_set_library_turnaround() bind(c)
+        end subroutine kmp_set_library_turnaround
+
+        subroutine kmp_set_library_throughput() bind(c)
+        end subroutine kmp_set_library_throughput
+
+        subroutine kmp_set_library(libnum) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: libnum
+        end subroutine kmp_set_library
+
+        subroutine kmp_set_defaults(string) bind(c)
+          character string(*)
+        end subroutine kmp_set_defaults
+
+        function kmp_get_stacksize() bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_stacksize
+        end function kmp_get_stacksize
+
+        function kmp_get_stacksize_s() bind(c)
+          import
+          integer (kind=kmp_size_t_kind) kmp_get_stacksize_s
+        end function kmp_get_stacksize_s
+
+        function kmp_get_blocktime() bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_blocktime
+        end function kmp_get_blocktime
+
+        function kmp_get_library() bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_library
+        end function kmp_get_library
+
+        function kmp_set_affinity(mask) bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_set_affinity
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_set_affinity
+
+        function kmp_get_affinity(mask) bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_affinity
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_get_affinity
+
+        function kmp_get_affinity_max_proc() bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_affinity_max_proc
+        end function kmp_get_affinity_max_proc
+
+        subroutine kmp_create_affinity_mask(mask) bind(c)
+          import
+          integer (kind=kmp_affinity_mask_kind) mask
+        end subroutine kmp_create_affinity_mask
+
+        subroutine kmp_destroy_affinity_mask(mask) bind(c)
+          import
+          integer (kind=kmp_affinity_mask_kind) mask
+        end subroutine kmp_destroy_affinity_mask
+
+        function kmp_set_affinity_mask_proc(proc, mask) bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc
+          integer (kind=omp_integer_kind), value :: proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_set_affinity_mask_proc
+
+        function kmp_unset_affinity_mask_proc(proc, mask) bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc
+          integer (kind=omp_integer_kind), value :: proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_unset_affinity_mask_proc
+
+        function kmp_get_affinity_mask_proc(proc, mask) bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc
+          integer (kind=omp_integer_kind), value :: proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_get_affinity_mask_proc
+
+        function kmp_malloc(size) bind(c)
+          import
+          integer (kind=kmp_pointer_kind) kmp_malloc
+          integer (kind=kmp_size_t_kind), value :: size
+        end function kmp_malloc
+
+        function kmp_calloc(nelem, elsize) bind(c)
+          import
+          integer (kind=kmp_pointer_kind) kmp_calloc
+          integer (kind=kmp_size_t_kind), value :: nelem
+          integer (kind=kmp_size_t_kind), value :: elsize
+        end function kmp_calloc
+
+        function kmp_realloc(ptr, size) bind(c)
+          import
+          integer (kind=kmp_pointer_kind) kmp_realloc
+          integer (kind=kmp_pointer_kind), value :: ptr
+          integer (kind=kmp_size_t_kind), value :: size
+        end function kmp_realloc
+
+        subroutine kmp_free(ptr) bind(c)
+          import
+          integer (kind=kmp_pointer_kind), value :: ptr
+        end subroutine kmp_free
+
+        subroutine kmp_set_warnings_on() bind(c)
+        end subroutine kmp_set_warnings_on
+
+        subroutine kmp_set_warnings_off() bind(c)
+        end subroutine kmp_set_warnings_off
+
+        subroutine kmp_init_lock_hinted(lockvar, lockhint) bind(c)
+          import
+          integer (kind=omp_lock_kind) lockvar
+          integer (kind=kmp_lock_hint_kind), value :: lockhint
+        end subroutine kmp_init_lock_hinted
+
+        subroutine kmp_init_nest_lock_hinted(lockvar, lockhint) bind(c)
+          import
+          integer (kind=omp_lock_kind) lockvar
+          integer (kind=kmp_lock_hint_kind), value :: lockhint
+        end subroutine kmp_init_nest_lock_hinted
+
+      end interface
+
+!DIR$ IF DEFINED (__INTEL_OFFLOAD)
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_num_threads
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_dynamic
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_nested
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_threads
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_max_threads
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_thread_num
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_procs
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_in_parallel
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_in_final
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_dynamic
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_nested
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_thread_limit
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_max_active_levels
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_max_active_levels
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_level
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_active_level
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_ancestor_thread_num
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_team_size
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_schedule
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_schedule
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_proc_bind
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_wtime
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_wtick
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_default_device
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_default_device
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_is_initial_device
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_devices
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_teams
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_team_num
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_init_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_destroy_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_unset_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_test_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_init_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_destroy_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_unset_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_test_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_stacksize
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_stacksize_s
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_blocktime
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library_serial
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library_turnaround
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library_throughput
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_defaults
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_stacksize
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_stacksize_s
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_blocktime
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_library
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_affinity
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_affinity
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_affinity_max_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_create_affinity_mask
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_destroy_affinity_mask
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_affinity_mask_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_unset_affinity_mask_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_affinity_mask_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_malloc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_calloc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_realloc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_free
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_warnings_on
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_warnings_off
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_init_lock_hinted
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_init_nest_lock_hinted
+
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!$omp declare target(omp_set_num_threads )
+!$omp declare target(omp_set_dynamic )
+!$omp declare target(omp_set_nested )
+!$omp declare target(omp_get_num_threads )
+!$omp declare target(omp_get_max_threads )
+!$omp declare target(omp_get_thread_num )
+!$omp declare target(omp_get_num_procs )
+!$omp declare target(omp_in_parallel )
+!$omp declare target(omp_in_final )
+!$omp declare target(omp_get_dynamic )
+!$omp declare target(omp_get_nested )
+!$omp declare target(omp_get_thread_limit )
+!$omp declare target(omp_set_max_active_levels )
+!$omp declare target(omp_get_max_active_levels )
+!$omp declare target(omp_get_level )
+!$omp declare target(omp_get_active_level )
+!$omp declare target(omp_get_ancestor_thread_num )
+!$omp declare target(omp_get_team_size )
+!$omp declare target(omp_set_schedule )
+!$omp declare target(omp_get_schedule )
+!$omp declare target(omp_get_proc_bind )
+!$omp declare target(omp_get_wtime )
+!$omp declare target(omp_get_wtick )
+!$omp declare target(omp_get_default_device )
+!$omp declare target(omp_set_default_device )
+!$omp declare target(omp_is_initial_device )
+!$omp declare target(omp_get_num_devices )
+!$omp declare target(omp_get_num_teams )
+!$omp declare target(omp_get_team_num )
+!$omp declare target(omp_init_lock )
+!$omp declare target(omp_destroy_lock )
+!$omp declare target(omp_set_lock )
+!$omp declare target(omp_unset_lock )
+!$omp declare target(omp_test_lock )
+!$omp declare target(omp_init_nest_lock )
+!$omp declare target(omp_destroy_nest_lock )
+!$omp declare target(omp_set_nest_lock )
+!$omp declare target(omp_unset_nest_lock )
+!$omp declare target(omp_test_nest_lock )
+!$omp declare target(kmp_set_stacksize )
+!$omp declare target(kmp_set_stacksize_s )
+!$omp declare target(kmp_set_blocktime )
+!$omp declare target(kmp_set_library_serial )
+!$omp declare target(kmp_set_library_turnaround )
+!$omp declare target(kmp_set_library_throughput )
+!$omp declare target(kmp_set_library )
+!$omp declare target(kmp_set_defaults )
+!$omp declare target(kmp_get_stacksize )
+!$omp declare target(kmp_get_stacksize_s )
+!$omp declare target(kmp_get_blocktime )
+!$omp declare target(kmp_get_library )
+!$omp declare target(kmp_set_affinity )
+!$omp declare target(kmp_get_affinity )
+!$omp declare target(kmp_get_affinity_max_proc )
+!$omp declare target(kmp_create_affinity_mask )
+!$omp declare target(kmp_destroy_affinity_mask )
+!$omp declare target(kmp_set_affinity_mask_proc )
+!$omp declare target(kmp_unset_affinity_mask_proc )
+!$omp declare target(kmp_get_affinity_mask_proc )
+!$omp declare target(kmp_malloc )
+!$omp declare target(kmp_calloc )
+!$omp declare target(kmp_realloc )
+!$omp declare target(kmp_free )
+!$omp declare target(kmp_set_warnings_on )
+!$omp declare target(kmp_set_warnings_off )
+!$omp declare target(kmp_init_lock_hinted )
+!$omp declare target(kmp_init_nest_lock_hinted )
+!DIR$ ENDIF
+!DIR$ ENDIF
+
diff --git a/final/runtime/src/include/40/ompt.h.var b/final/runtime/src/include/40/ompt.h.var
new file mode 100644
index 0000000..394aa6c
--- /dev/null
+++ b/final/runtime/src/include/40/ompt.h.var
@@ -0,0 +1,472 @@
+/*
+ * include/40/ompt.h.var
+ */
+
+#ifndef __OMPT__
+#define __OMPT__
+
+/*****************************************************************************
+ * system include files
+ *****************************************************************************/
+
+#include <stdint.h>
+
+
+
+/*****************************************************************************
+ * iteration macros
+ *****************************************************************************/
+
+#define FOREACH_OMPT_INQUIRY_FN(macro)  \
+    macro (ompt_enumerate_state)        \
+                                        \
+    macro (ompt_set_callback)           \
+    macro (ompt_get_callback)           \
+                                        \
+    macro (ompt_get_idle_frame)         \
+    macro (ompt_get_task_frame)         \
+                                        \
+    macro (ompt_get_state)              \
+                                        \
+    macro (ompt_get_parallel_id)        \
+    macro (ompt_get_parallel_team_size) \
+    macro (ompt_get_task_id)            \
+    macro (ompt_get_thread_id)
+
+#define FOREACH_OMPT_PLACEHOLDER_FN(macro)  \
+    macro (ompt_idle)                       \
+    macro (ompt_overhead)                   \
+    macro (ompt_barrier_wait)               \
+    macro (ompt_task_wait)                  \
+    macro (ompt_mutex_wait)
+
+#define FOREACH_OMPT_STATE(macro)                                                               \
+                                                                                                \
+    /* first */                                                                                 \
+    macro (ompt_state_first, 0x71)          /* initial enumeration state */                     \
+                                                                                                \
+    /* work states (0..15) */                                                                   \
+    macro (ompt_state_work_serial, 0x00)    /* working outside parallel */                      \
+    macro (ompt_state_work_parallel, 0x01)  /* working within parallel */                       \
+    macro (ompt_state_work_reduction, 0x02) /* performing a reduction */                        \
+                                                                                                \
+    /* idle (16..31) */                                                                         \
+    macro (ompt_state_idle, 0x10)            /* waiting for work */                             \
+                                                                                                \
+    /* overhead states (32..63) */                                                              \
+    macro (ompt_state_overhead, 0x20)        /* overhead excluding wait states */               \
+                                                                                                \
+    /* barrier wait states (64..79) */                                                          \
+    macro (ompt_state_wait_barrier, 0x40)    /* waiting at a barrier */                         \
+    macro (ompt_state_wait_barrier_implicit, 0x41)    /* implicit barrier */                    \
+    macro (ompt_state_wait_barrier_explicit, 0x42)    /* explicit barrier */                    \
+                                                                                                \
+    /* task wait states (80..95) */                                                             \
+    macro (ompt_state_wait_taskwait, 0x50)   /* waiting at a taskwait */                        \
+    macro (ompt_state_wait_taskgroup, 0x51)  /* waiting at a taskgroup */                       \
+                                                                                                \
+    /* mutex wait states (96..111) */                                                           \
+    macro (ompt_state_wait_lock, 0x60)       /* waiting for lock */                             \
+    macro (ompt_state_wait_nest_lock, 0x61)  /* waiting for nest lock */                        \
+    macro (ompt_state_wait_critical, 0x62)   /* waiting for critical */                         \
+    macro (ompt_state_wait_atomic, 0x63)     /* waiting for atomic */                           \
+    macro (ompt_state_wait_ordered, 0x64)    /* waiting for ordered */                          \
+    macro (ompt_state_wait_single, 0x6F)     /* waiting for single region (non-standard!) */    \
+                                                                                                \
+    /* misc (112..127) */                                                                       \
+    macro (ompt_state_undefined, 0x70)       /* undefined thread state */
+
+
+#define FOREACH_OMPT_EVENT(macro)                                                                               \
+                                                                                                                \
+    /*--- Mandatory Events ---*/                                                                                \
+    macro (ompt_event_parallel_begin,           ompt_new_parallel_callback_t,   1) /* parallel begin */         \
+    macro (ompt_event_parallel_end,             ompt_parallel_callback_t,       2) /* parallel end */           \
+                                                                                                                \
+    macro (ompt_event_task_begin,               ompt_new_task_callback_t,       3) /* task begin */             \
+    macro (ompt_event_task_end,                 ompt_task_callback_t,           4) /* task destroy */           \
+                                                                                                                \
+    macro (ompt_event_thread_begin,             ompt_thread_type_callback_t,    5) /* thread begin */           \
+    macro (ompt_event_thread_end,               ompt_thread_type_callback_t,    6) /* thread end */             \
+                                                                                                                \
+    macro (ompt_event_control,                  ompt_control_callback_t,        7) /* support control calls */  \
+                                                                                                                \
+    macro (ompt_event_runtime_shutdown,         ompt_callback_t,                8) /* runtime shutdown */       \
+                                                                                                                \
+    /*--- Optional Events (blame shifting, ompt_event_unimplemented) ---*/                                      \
+    macro (ompt_event_idle_begin,               ompt_thread_callback_t,         9) /* begin idle state */       \
+    macro (ompt_event_idle_end,                 ompt_thread_callback_t,        10) /* end idle state */         \
+                                                                                                                \
+    macro (ompt_event_wait_barrier_begin,       ompt_parallel_callback_t,      11) /* begin wait at barrier */  \
+    macro (ompt_event_wait_barrier_end,         ompt_parallel_callback_t,      12) /* end wait at barrier */    \
+                                                                                                                \
+    macro (ompt_event_wait_taskwait_begin,      ompt_parallel_callback_t,      13) /* begin wait at taskwait */ \
+    macro (ompt_event_wait_taskwait_end,        ompt_parallel_callback_t,      14) /* end wait at taskwait */   \
+                                                                                                                \
+    macro (ompt_event_wait_taskgroup_begin,     ompt_parallel_callback_t,      15) /* begin wait at taskgroup */\
+    macro (ompt_event_wait_taskgroup_end,       ompt_parallel_callback_t,      16) /* end wait at taskgroup */  \
+                                                                                                                \
+    macro (ompt_event_release_lock,             ompt_wait_callback_t,          17) /* lock release */           \
+    macro (ompt_event_release_nest_lock_last,   ompt_wait_callback_t,          18) /* last nest lock release */ \
+    macro (ompt_event_release_critical,         ompt_wait_callback_t,          19) /* critical release */       \
+                                                                                                                \
+    macro (ompt_event_release_atomic,           ompt_wait_callback_t,          20) /* atomic release */         \
+                                                                                                                \
+    macro (ompt_event_release_ordered,          ompt_wait_callback_t,          21) /* ordered release */        \
+                                                                                                                \
+    /*--- Optional Events (synchronous events, ompt_event_unimplemented) ---*/                                  \
+    macro (ompt_event_implicit_task_begin,      ompt_parallel_callback_t,      22) /* implicit task begin   */  \
+    macro (ompt_event_implicit_task_end,        ompt_parallel_callback_t,      23) /* implicit task end  */     \
+                                                                                                                \
+    macro (ompt_event_initial_task_begin,       ompt_parallel_callback_t,      24) /* initial task begin   */   \
+    macro (ompt_event_initial_task_end,         ompt_parallel_callback_t,      25) /* initial task end  */      \
+                                                                                                                \
+    macro (ompt_event_task_switch,              ompt_task_switch_callback_t,   26) /* task switch */            \
+                                                                                                                \
+    macro (ompt_event_loop_begin,               ompt_new_workshare_callback_t, 27) /* task at loop begin */     \
+    macro (ompt_event_loop_end,                 ompt_parallel_callback_t,      28) /* task at loop end */       \
+                                                                                                                \
+    macro (ompt_event_sections_begin,           ompt_new_workshare_callback_t, 29) /* task at sections begin  */\
+    macro (ompt_event_sections_end,             ompt_parallel_callback_t,      30) /* task at sections end */   \
+                                                                                                                \
+    macro (ompt_event_single_in_block_begin,    ompt_new_workshare_callback_t, 31) /* task at single begin*/    \
+    macro (ompt_event_single_in_block_end,      ompt_parallel_callback_t,      32) /* task at single end */     \
+                                                                                                                \
+    macro (ompt_event_single_others_begin,      ompt_parallel_callback_t,      33) /* task at single begin */   \
+    macro (ompt_event_single_others_end,        ompt_parallel_callback_t,      34) /* task at single end */     \
+                                                                                                                \
+    macro (ompt_event_workshare_begin,          ompt_new_workshare_callback_t, 35) /* task at workshare begin */\
+    macro (ompt_event_workshare_end,            ompt_parallel_callback_t,      36) /* task at workshare end */  \
+                                                                                                                \
+    macro (ompt_event_master_begin,             ompt_parallel_callback_t,      37) /* task at master begin */   \
+    macro (ompt_event_master_end,               ompt_parallel_callback_t,      38) /* task at master end */     \
+                                                                                                                \
+    macro (ompt_event_barrier_begin,            ompt_parallel_callback_t,      39) /* task at barrier begin  */ \
+    macro (ompt_event_barrier_end,              ompt_parallel_callback_t,      40) /* task at barrier end */    \
+                                                                                                                \
+    macro (ompt_event_taskwait_begin,           ompt_parallel_callback_t,      41) /* task at taskwait begin */ \
+    macro (ompt_event_taskwait_end,             ompt_parallel_callback_t,      42) /* task at taskwait end */   \
+                                                                                                                \
+    macro (ompt_event_taskgroup_begin,          ompt_parallel_callback_t,      43) /* task at taskgroup begin */\
+    macro (ompt_event_taskgroup_end,            ompt_parallel_callback_t,      44) /* task at taskgroup end */  \
+                                                                                                                \
+    macro (ompt_event_release_nest_lock_prev,   ompt_wait_callback_t,          45) /* prev nest lock release */ \
+                                                                                                                \
+    macro (ompt_event_wait_lock,                ompt_wait_callback_t,          46) /* lock wait */              \
+    macro (ompt_event_wait_nest_lock,           ompt_wait_callback_t,          47) /* nest lock wait */         \
+    macro (ompt_event_wait_critical,            ompt_wait_callback_t,          48) /* critical wait */          \
+    macro (ompt_event_wait_atomic,              ompt_wait_callback_t,          49) /* atomic wait */            \
+    macro (ompt_event_wait_ordered,             ompt_wait_callback_t,          50) /* ordered wait */           \
+                                                                                                                \
+    macro (ompt_event_acquired_lock,            ompt_wait_callback_t,          51) /* lock acquired */          \
+    macro (ompt_event_acquired_nest_lock_first, ompt_wait_callback_t,          52) /* 1st nest lock acquired */ \
+    macro (ompt_event_acquired_nest_lock_next,  ompt_wait_callback_t,          53) /* next nest lock acquired*/ \
+    macro (ompt_event_acquired_critical,        ompt_wait_callback_t,          54) /* critical acquired */      \
+    macro (ompt_event_acquired_atomic,          ompt_wait_callback_t,          55) /* atomic acquired */        \
+    macro (ompt_event_acquired_ordered,         ompt_wait_callback_t,          56) /* ordered acquired */       \
+                                                                                                                \
+    macro (ompt_event_init_lock,                ompt_wait_callback_t,          57) /* lock init */              \
+    macro (ompt_event_init_nest_lock,           ompt_wait_callback_t,          58) /* nest lock init */         \
+                                                                                                                \
+    macro (ompt_event_destroy_lock,             ompt_wait_callback_t,          59) /* lock destruction */       \
+    macro (ompt_event_destroy_nest_lock,        ompt_wait_callback_t,          60) /* nest lock destruction */  \
+                                                                                                                \
+    macro (ompt_event_flush,                    ompt_callback_t,               61) /* after executing flush */
+
+
+
+/*****************************************************************************
+ * data types
+ *****************************************************************************/
+
+/*---------------------
+ * identifiers
+ *---------------------*/
+
+typedef uint64_t ompt_thread_id_t;
+#define ompt_thread_id_none ((ompt_thread_id_t) 0)     /* non-standard */
+
+typedef uint64_t ompt_task_id_t;
+#define ompt_task_id_none ((ompt_task_id_t) 0)         /* non-standard */
+
+typedef uint64_t ompt_parallel_id_t;
+#define ompt_parallel_id_none ((ompt_parallel_id_t) 0) /* non-standard */
+
+typedef uint64_t ompt_wait_id_t;
+#define ompt_wait_id_none ((ompt_wait_id_t) 0)         /* non-standard */
+
+
+/*---------------------
+ * ompt_frame_t
+ *---------------------*/
+
+typedef struct ompt_frame_s {
+    void *exit_runtime_frame;    /* next frame is user code     */
+    void *reenter_runtime_frame; /* previous frame is user code */
+} ompt_frame_t;
+
+
+/*****************************************************************************
+ * enumerations for thread states and runtime events
+ *****************************************************************************/
+
+/*---------------------
+ * runtime states
+ *---------------------*/
+
+typedef enum {
+#define ompt_state_macro(state, code) state = code,
+    FOREACH_OMPT_STATE(ompt_state_macro)
+#undef ompt_state_macro
+} ompt_state_t;
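+
+/* Example (non-normative): instantiated with ompt_state_macro above, the
+ * FOREACH_OMPT_STATE list expands to enumerators such as
+ *
+ *     ompt_state_first = 0x71,
+ *     ompt_state_work_serial = 0x00,
+ *     ...
+ *
+ * so each state and its code are defined in exactly one place. */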
+
+
+/*---------------------
+ * runtime events
+ *---------------------*/
+
+typedef enum {
+#define ompt_event_macro(event, callback, eventid) event = eventid,
+    FOREACH_OMPT_EVENT(ompt_event_macro)
+#undef ompt_event_macro
+} ompt_event_t;
+
+
+/*---------------------
+ * set callback results
+ *---------------------*/
+typedef enum {
+    ompt_set_result_registration_error              = 0,
+    ompt_set_result_event_may_occur_no_callback     = 1,
+    ompt_set_result_event_never_occurs              = 2,
+    ompt_set_result_event_may_occur_callback_some   = 3,
+    ompt_set_result_event_may_occur_callback_always = 4,
+} ompt_set_result_t;
+
+
+
+/*****************************************************************************
+ * callback signatures
+ *****************************************************************************/
+
+/* initialization */
+typedef void (*ompt_interface_fn_t)(void);
+
+typedef ompt_interface_fn_t (*ompt_function_lookup_t)(
+    const char *                      /* entry point to look up       */
+);
+
+/* threads */
+typedef void (*ompt_thread_callback_t) (
+    ompt_thread_id_t thread_id        /* ID of thread                 */
+);
+
+typedef enum {
+    ompt_thread_initial = 1, // start the enumeration at 1
+    ompt_thread_worker  = 2,
+    ompt_thread_other   = 3
+} ompt_thread_type_t;
+
+typedef void (*ompt_thread_type_callback_t) (
+    ompt_thread_type_t thread_type,   /* type of thread               */
+    ompt_thread_id_t thread_id        /* ID of thread                 */
+);
+
+typedef void (*ompt_wait_callback_t) (
+    ompt_wait_id_t wait_id            /* wait id                      */
+);
+
+/* parallel and workshares */
+typedef void (*ompt_parallel_callback_t) (
+    ompt_parallel_id_t parallel_id,    /* id of parallel region       */
+    ompt_task_id_t task_id             /* id of task                  */
+);
+
+typedef void (*ompt_new_workshare_callback_t) (
+    ompt_parallel_id_t parallel_id,   /* id of parallel region        */
+    ompt_task_id_t parent_task_id,    /* id of parent task            */
+    void *workshare_function          /* pointer to outlined function */
+);
+
+typedef void (*ompt_new_parallel_callback_t) (
+    ompt_task_id_t parent_task_id,    /* id of parent task            */
+    ompt_frame_t *parent_task_frame,  /* frame data of parent task    */
+    ompt_parallel_id_t parallel_id,   /* id of parallel region        */
+    uint32_t requested_team_size,     /* number of threads in team    */
+    void *parallel_function           /* pointer to outlined function */
+);
+
+/* tasks */
+typedef void (*ompt_task_callback_t) (
+    ompt_task_id_t task_id            /* id of task                   */
+);
+
+typedef void (*ompt_task_switch_callback_t) (
+    ompt_task_id_t suspended_task_id, /* tool data for suspended task */
+    ompt_task_id_t resumed_task_id    /* tool data for resumed task   */
+);
+
+typedef void (*ompt_new_task_callback_t) (
+    ompt_task_id_t parent_task_id,    /* id of parent task            */
+    ompt_frame_t *parent_task_frame,  /* frame data for parent task   */
+    ompt_task_id_t  new_task_id,      /* id of created task           */
+    void *task_function               /* pointer to outlined function */
+);
+
+/* program */
+typedef void (*ompt_control_callback_t) (
+    uint64_t command,                 /* command of control call      */
+    uint64_t modifier                 /* modifier of control call     */
+);
+
+typedef void (*ompt_callback_t)(void);
+
+
+/****************************************************************************
+ * ompt API
+ ***************************************************************************/
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define OMPT_API_FNTYPE(fn) fn##_t
+
+#define OMPT_API_FUNCTION(return_type, fn, args)  \
+    typedef return_type (*OMPT_API_FNTYPE(fn)) args
+
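+/* Example (non-normative): OMPT_API_FUNCTION(ompt_state_t, ompt_get_state,
+ * (ompt_wait_id_t *)) expands to
+ *
+ *     typedef ompt_state_t (*ompt_get_state_t)(ompt_wait_id_t *);
+ *
+ * giving every entry point a function-pointer type that tools can obtain
+ * via ompt_function_lookup_t. */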
+
+
+/****************************************************************************
+ * INQUIRY FUNCTIONS
+ ***************************************************************************/
+
+/* state */
+OMPT_API_FUNCTION(ompt_state_t, ompt_get_state, (
+    ompt_wait_id_t *ompt_wait_id
+));
+
+/* thread */
+OMPT_API_FUNCTION(ompt_thread_id_t, ompt_get_thread_id, (void));
+
+OMPT_API_FUNCTION(void *, ompt_get_idle_frame, (void));
+
+/* parallel region */
+OMPT_API_FUNCTION(ompt_parallel_id_t, ompt_get_parallel_id, (
+    int ancestor_level
+));
+
+OMPT_API_FUNCTION(int, ompt_get_parallel_team_size, (
+    int ancestor_level
+));
+
+/* task */
+OMPT_API_FUNCTION(ompt_task_id_t, ompt_get_task_id, (
+    int depth
+));
+
+OMPT_API_FUNCTION(ompt_frame_t *, ompt_get_task_frame, (
+    int depth
+));
+
+
+
+/****************************************************************************
+ * PLACEHOLDERS FOR PERFORMANCE REPORTING
+ ***************************************************************************/
+
+/* idle */
+OMPT_API_FUNCTION(void, ompt_idle, (
+    void
+));
+
+/* overhead */
+OMPT_API_FUNCTION(void, ompt_overhead, (
+    void
+));
+
+/* barrier wait */
+OMPT_API_FUNCTION(void, ompt_barrier_wait, (
+    void
+));
+
+/* task wait */
+OMPT_API_FUNCTION(void, ompt_task_wait, (
+    void
+));
+
+/* mutex wait */
+OMPT_API_FUNCTION(void, ompt_mutex_wait, (
+    void
+));
+
+
+
+/****************************************************************************
+ * INITIALIZATION FUNCTIONS
+ ***************************************************************************/
+
+/* initialization interface to be defined by tool */
+int ompt_initialize(
+    ompt_function_lookup_t ompt_fn_lookup,
+    const char *runtime_version,
+    unsigned int ompt_version
+);
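+
+/* Example (non-normative): a minimal tool-side definition, assuming the
+ * lookup function resolves entry points by name and that a nonzero return
+ * enables OMPT support; my_parallel_begin is a hypothetical callback with
+ * the ompt_new_parallel_callback_t signature:
+ *
+ *     int ompt_initialize(ompt_function_lookup_t lookup,
+ *                         const char *runtime_version,
+ *                         unsigned int ompt_version)
+ *     {
+ *         ompt_set_callback_t set_cb =
+ *             (ompt_set_callback_t) lookup("ompt_set_callback");
+ *         set_cb(ompt_event_parallel_begin,
+ *                (ompt_callback_t) my_parallel_begin);
+ *         return 1;
+ *     }
+ */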
+
+typedef enum ompt_init_mode_e {
+    ompt_init_mode_never  = 0,
+    ompt_init_mode_false  = 1,
+    ompt_init_mode_true   = 2,
+    ompt_init_mode_always = 3
+} ompt_init_mode_t;
+
+OMPT_API_FUNCTION(int, ompt_set_callback, (
+    ompt_event_t event,
+    ompt_callback_t callback
+));
+
+typedef enum ompt_set_callback_rc_e {  /* non-standard */
+    ompt_set_callback_error      = 0,
+    ompt_has_event_no_callback   = 1,
+    ompt_no_event_no_callback    = 2,
+    ompt_has_event_may_callback  = 3,
+    ompt_has_event_must_callback = 4,
+} ompt_set_callback_rc_t;
+
+
+OMPT_API_FUNCTION(int, ompt_get_callback, (
+    ompt_event_t event,
+    ompt_callback_t *callback
+));
+
+
+
+/****************************************************************************
+ * MISCELLANEOUS FUNCTIONS
+ ***************************************************************************/
+
+/* control */
+#if defined(_OPENMP) && (_OPENMP >= 201307)
+#pragma omp declare target
+#endif
+void ompt_control(
+    uint64_t command,
+    uint64_t modifier
+);
+#if defined(_OPENMP) && (_OPENMP >= 201307)
+#pragma omp end declare target
+#endif
+
+/* state enumeration */
+OMPT_API_FUNCTION(int, ompt_enumerate_state, (
+    int current_state,
+    int *next_state,
+    const char **next_state_name
+));
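+
+/* Example (non-normative): enumerating all thread states, assuming the
+ * function returns nonzero while further states remain:
+ *
+ *     int state = ompt_state_first, next;
+ *     const char *name;
+ *     while (ompt_enumerate_state(state, &next, &name)) {
+ *         printf("%s = 0x%x\n", name, next);
+ *         state = next;
+ *     }
+ */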
+
+#ifdef  __cplusplus
+};
+#endif
+
+#endif
+
diff --git a/final/runtime/src/include/41/iomp.h.var b/final/runtime/src/include/41/iomp.h.var
new file mode 100644
index 0000000..28ebeca
--- /dev/null
+++ b/final/runtime/src/include/41/iomp.h.var
@@ -0,0 +1,106 @@
+/*
+ * include/41/iomp.h.var
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef __IOMP_H
+#   define __IOMP_H
+
+#   define KMP_VERSION_MAJOR    $KMP_VERSION_MAJOR
+#   define KMP_VERSION_MINOR    $KMP_VERSION_MINOR
+#   define KMP_VERSION_BUILD    $KMP_VERSION_BUILD
+#   define KMP_BUILD_DATE       "$KMP_BUILD_DATE"
+
+#   ifdef __cplusplus
+        extern "C" {
+#   endif
+
+#       define kmp_set_stacksize            kmpc_set_stacksize
+#       define kmp_set_stacksize_s          kmpc_set_stacksize_s
+#       define kmp_set_blocktime            kmpc_set_blocktime
+#       define kmp_set_library              kmpc_set_library
+#       define kmp_set_defaults             kmpc_set_defaults
+#       define kmp_set_affinity_mask_proc   kmpc_set_affinity_mask_proc
+#       define kmp_unset_affinity_mask_proc kmpc_unset_affinity_mask_proc
+#       define kmp_get_affinity_mask_proc   kmpc_get_affinity_mask_proc
+
+#       define kmp_malloc                   kmpc_malloc
+#       define kmp_calloc                   kmpc_calloc
+#       define kmp_realloc                  kmpc_realloc
+#       define kmp_free                     kmpc_free
+
+#   if defined(_WIN32)
+#       define __KAI_KMPC_CONVENTION __cdecl
+#   else
+#       define __KAI_KMPC_CONVENTION
+#   endif
+
+#   include <stdlib.h>
+    /* kmp API functions */
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_stacksize          (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize          (int);
+    extern size_t __KAI_KMPC_CONVENTION  kmp_get_stacksize_s        (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize_s        (size_t);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_blocktime          (void);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_library            (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_blocktime          (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library            (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_serial     (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_turnaround (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_throughput (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_defaults           (char const *);
+
+    /* affinity API functions */
+    typedef void * kmp_affinity_mask_t;
+
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_max_proc    (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_create_affinity_mask     (kmp_affinity_mask_t *);
+    extern void   __KAI_KMPC_CONVENTION  kmp_destroy_affinity_mask    (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_unset_affinity_mask_proc (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+
+    extern void * __KAI_KMPC_CONVENTION  kmp_malloc  (size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_calloc  (size_t, size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_realloc (void *, size_t);
+    extern void   __KAI_KMPC_CONVENTION  kmp_free    (void *);
+
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_on(void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_off(void);
+
+    /* cancellation kind constants */
+    typedef enum kmp_cancel_kind_t {
+        kmp_cancel_parallel  = 1,
+        kmp_cancel_loop      = 2,
+        kmp_cancel_sections  = 3,
+        kmp_cancel_taskgroup = 4
+    } kmp_cancel_kind_t;
+
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_cancellation_status(kmp_cancel_kind_t);
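
Usage sketch (annotation): querying whether cancellation is active for a given
construct kind. Treating the int result as a boolean, and enabling cancellation
via the OMP_CANCELLATION environment variable, are assumptions made here.

    #include <stdio.h>

    void report_cancellation(void) {
        if (kmp_get_cancellation_status(kmp_cancel_parallel))
            printf("cancellation of parallel regions is enabled\n");
        else
            printf("cancellation of parallel regions is disabled\n");
    }
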
+
+#   undef __KAI_KMPC_CONVENTION
+
+    /* Warning:
+       The following typedefs are non-standard, deprecated, and will be removed in a future release.
+    */
+    typedef int     omp_int_t;
+    typedef double  omp_wtime_t;
+
+#   ifdef __cplusplus
+        }
+#   endif
+
+#endif /* __IOMP_H */
+
diff --git a/final/runtime/src/include/41/iomp_lib.h.var b/final/runtime/src/include/41/iomp_lib.h.var
new file mode 100644
index 0000000..dc02596
--- /dev/null
+++ b/final/runtime/src/include/41/iomp_lib.h.var
@@ -0,0 +1,81 @@
+! include/41/iomp_lib.h.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+!***
+!*** omp_integer_kind and omp_logical_kind appear to be predefined by gcc and
+!*** gfortran (definitions do not appear in omp.h / omp_lib.h / omp_lib.f).
+!*** omp_real_kind is not predefined, however.
+!***
+
+        integer, parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+        integer, parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+        integer, parameter :: kmp_version_build = $KMP_VERSION_BUILD
+        character(*)          kmp_build_date
+        parameter( kmp_build_date = '$KMP_BUILD_DATE' )
+
+        integer, parameter :: omp_real_kind = 4
+
+!***
+!*** kmp_* type extensions
+!***
+
+        integer, parameter :: kmp_pointer_kind       = $KMP_INT_PTR_KIND
+        integer, parameter :: kmp_size_t_kind        = $KMP_INT_PTR_KIND
+        integer, parameter :: kmp_affinity_mask_kind = $KMP_INT_PTR_KIND
+
+!***
+!*** kmp_* entry points
+!***
+
+        external kmp_set_stacksize
+        external kmp_set_stacksize_s
+        external kmp_set_blocktime
+        external kmp_set_library_serial
+        external kmp_set_library_turnaround
+        external kmp_set_library_throughput
+        external kmp_set_library
+        external kmp_set_defaults
+        external kmp_get_stacksize
+        integer kmp_get_stacksize
+        external kmp_get_stacksize_s
+        integer (kind = kmp_size_t_kind) kmp_get_stacksize_s
+        external kmp_get_blocktime
+        integer kmp_get_blocktime
+        external kmp_get_library
+        integer kmp_get_library
+        external kmp_set_affinity
+        integer kmp_set_affinity
+        external kmp_get_affinity
+        integer kmp_get_affinity
+        external kmp_get_affinity_max_proc
+        integer kmp_get_affinity_max_proc
+        external kmp_create_affinity_mask
+        external kmp_destroy_affinity_mask
+        external kmp_set_affinity_mask_proc
+        integer kmp_set_affinity_mask_proc
+        external kmp_unset_affinity_mask_proc
+        integer kmp_unset_affinity_mask_proc
+        external kmp_get_affinity_mask_proc
+        integer kmp_get_affinity_mask_proc
+        external kmp_malloc
+        integer (kind = kmp_pointer_kind) kmp_malloc
+        external kmp_calloc
+        integer (kind = kmp_pointer_kind) kmp_calloc
+        external kmp_realloc
+        integer (kind = kmp_pointer_kind) kmp_realloc
+        external kmp_free
+
+        external kmp_set_warnings_on
+        external kmp_set_warnings_off
+
+
diff --git a/final/runtime/src/include/41/omp.h.var b/final/runtime/src/include/41/omp.h.var
new file mode 100644
index 0000000..86f019e
--- /dev/null
+++ b/final/runtime/src/include/41/omp.h.var
@@ -0,0 +1,174 @@
+/*
+ * include/41/omp.h.var
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef __OMP_H
+#   define __OMP_H
+
+#   define KMP_VERSION_MAJOR    $KMP_VERSION_MAJOR
+#   define KMP_VERSION_MINOR    $KMP_VERSION_MINOR
+#   define KMP_VERSION_BUILD    $KMP_VERSION_BUILD
+#   define KMP_BUILD_DATE       "$KMP_BUILD_DATE"
+
+#   ifdef __cplusplus
+    extern "C" {
+#   endif
+
+#   if defined(_WIN32)
+#       define __KAI_KMPC_CONVENTION __cdecl
+#   else
+#       define __KAI_KMPC_CONVENTION
+#   endif
+
+    /* schedule kind constants */
+    typedef enum omp_sched_t {
+        omp_sched_static  = 1,
+        omp_sched_dynamic = 2,
+        omp_sched_guided  = 3,
+        omp_sched_auto    = 4
+    } omp_sched_t;
+
+    /* set API functions */
+    extern void   __KAI_KMPC_CONVENTION  omp_set_num_threads (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_dynamic     (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_nested      (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_max_active_levels (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_schedule          (omp_sched_t, int);
+
+    /* query API functions */
+    extern int    __KAI_KMPC_CONVENTION  omp_get_num_threads  (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_dynamic      (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_nested       (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_threads  (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_thread_num   (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_num_procs    (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_in_parallel      (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_in_final         (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_active_level        (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_level               (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_ancestor_thread_num (int);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_team_size           (int);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_thread_limit        (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_active_levels   (void);
+    extern void   __KAI_KMPC_CONVENTION  omp_get_schedule            (omp_sched_t *, int *);
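
A small sketch of the schedule API round trip: select a runtime schedule, then
read the setting back. The chunk size of 4 is arbitrary.

    void pick_dynamic_schedule(void) {
        omp_sched_t kind;
        int chunk;
        omp_set_schedule(omp_sched_dynamic, 4);  /* affects schedule(runtime) loops */
        omp_get_schedule(&kind, &chunk);         /* kind == omp_sched_dynamic, chunk == 4 */
    }
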
+
+    /* lock API functions */
+    typedef struct omp_lock_t {
+        void * _lk;
+    } omp_lock_t;
+
+    extern void   __KAI_KMPC_CONVENTION  omp_init_lock    (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_lock     (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_unset_lock   (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_destroy_lock (omp_lock_t *);
+    extern int    __KAI_KMPC_CONVENTION  omp_test_lock    (omp_lock_t *);
+
+    /* nested lock API functions */
+    typedef struct omp_nest_lock_t {
+        void * _lk;
+    } omp_nest_lock_t;
+
+    extern void   __KAI_KMPC_CONVENTION  omp_init_nest_lock    (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_nest_lock     (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_unset_nest_lock   (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_destroy_nest_lock (omp_nest_lock_t *);
+    extern int    __KAI_KMPC_CONVENTION  omp_test_nest_lock    (omp_nest_lock_t *);
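
Sketch of the basic lock protocol declared above: initialize once, bracket the
critical update with set/unset, and destroy the lock when done.

    static omp_lock_t counter_lock;
    static int counter;

    void counter_init(void)    { omp_init_lock(&counter_lock); }
    void counter_destroy(void) { omp_destroy_lock(&counter_lock); }

    void counter_increment(void) {
        omp_set_lock(&counter_lock);
        ++counter;
        omp_unset_lock(&counter_lock);
    }
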
+
+    /* lock hint type for dynamic user lock */
+    typedef enum kmp_lock_hint_t {
+        kmp_lock_hint_none = 0,
+        kmp_lock_hint_contended,
+        kmp_lock_hint_uncontended,
+        kmp_lock_hint_nonspeculative,
+        kmp_lock_hint_speculative,
+        kmp_lock_hint_adaptive
+    } kmp_lock_hint_t;
+
+    /* hinted lock initializers */
+    extern void __KAI_KMPC_CONVENTION kmp_init_lock_hinted(omp_lock_t *, kmp_lock_hint_t);
+    extern void __KAI_KMPC_CONVENTION kmp_init_nest_lock_hinted(omp_nest_lock_t *, kmp_lock_hint_t);
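
Sketch: the hinted initializers stand in for omp_init_lock / omp_init_nest_lock
when the caller wants to suggest an implementation strategy; afterwards the
lock is used exactly like an ordinary lock. Whether the hint has any effect
depends on the hardware and the runtime.

    static omp_lock_t spec_lock;

    void spec_lock_init(void) {
        /* request a speculative (e.g. hardware-TM-backed) lock where available */
        kmp_init_lock_hinted(&spec_lock, kmp_lock_hint_speculative);
    }
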
+
+    /* time API functions */
+    extern double __KAI_KMPC_CONVENTION  omp_get_wtime (void);
+    extern double __KAI_KMPC_CONVENTION  omp_get_wtick (void);
+
+    /* OpenMP 4.0 */
+    extern int  __KAI_KMPC_CONVENTION  omp_get_default_device (void);
+    extern void __KAI_KMPC_CONVENTION  omp_set_default_device (int);
+    extern int  __KAI_KMPC_CONVENTION  omp_is_initial_device (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_num_devices (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_num_teams (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_team_num (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_cancellation (void);
+
+#   include <stdlib.h>
+    /* kmp API functions */
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_stacksize          (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize          (int);
+    extern size_t __KAI_KMPC_CONVENTION  kmp_get_stacksize_s        (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize_s        (size_t);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_blocktime          (void);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_library            (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_blocktime          (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library            (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_serial     (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_turnaround (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_throughput (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_defaults           (char const *);
+
+    /* Intel affinity API */
+    typedef void * kmp_affinity_mask_t;
+
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_max_proc    (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_create_affinity_mask     (kmp_affinity_mask_t *);
+    extern void   __KAI_KMPC_CONVENTION  kmp_destroy_affinity_mask    (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_unset_affinity_mask_proc (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_mask_proc   (int, kmp_affinity_mask_t *);
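
Usage sketch for the mask-based affinity extension: build a mask, add a logical
processor, then install the mask for the calling thread. The convention that
the int results signal success or failure is an assumption here; consult the
runtime documentation before relying on specific return values.

    void pin_to_proc0(void) {
        kmp_affinity_mask_t mask;
        kmp_create_affinity_mask(&mask);
        kmp_set_affinity_mask_proc(0, &mask);  /* add logical CPU 0 to the mask */
        kmp_set_affinity(&mask);               /* bind the calling thread to the mask */
        kmp_destroy_affinity_mask(&mask);
    }
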
+
+    /* OpenMP 4.0 affinity API */
+    typedef enum omp_proc_bind_t {
+        omp_proc_bind_false = 0,
+        omp_proc_bind_true = 1,
+        omp_proc_bind_master = 2,
+        omp_proc_bind_close = 3,
+        omp_proc_bind_spread = 4
+    } omp_proc_bind_t;
+
+    extern omp_proc_bind_t __KAI_KMPC_CONVENTION omp_get_proc_bind (void);
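
Sketch: mapping the omp_proc_bind_t result to a printable name, e.g. for
diagnostics.

    const char *proc_bind_name(void) {
        switch (omp_get_proc_bind()) {
        case omp_proc_bind_false:  return "false";
        case omp_proc_bind_true:   return "true";
        case omp_proc_bind_master: return "master";
        case omp_proc_bind_close:  return "close";
        case omp_proc_bind_spread: return "spread";
        }
        return "unknown";
    }
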
+
+    extern void * __KAI_KMPC_CONVENTION  kmp_malloc  (size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_calloc  (size_t, size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_realloc (void *, size_t);
+    extern void   __KAI_KMPC_CONVENTION  kmp_free    (void *);
+
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_on(void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_off(void);
+
+#   undef __KAI_KMPC_CONVENTION
+
+    /* Warning:
+       The following typedefs are non-standard, deprecated, and will be removed in a future release.
+    */
+    typedef int     omp_int_t;
+    typedef double  omp_wtime_t;
+
+#   ifdef __cplusplus
+    }
+#   endif
+
+#endif /* __OMP_H */
+
diff --git a/final/runtime/src/include/41/omp_lib.f.var b/final/runtime/src/include/41/omp_lib.f.var
new file mode 100644
index 0000000..c28b1ec
--- /dev/null
+++ b/final/runtime/src/include/41/omp_lib.f.var
@@ -0,0 +1,790 @@
+! include/41/omp_lib.f.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+!***
+!*** Some of the directives for the following routine extend past column 72,
+!*** so process this file in 132-column mode.
+!***
+
+!dec$ fixedformlinesize:132
+
+      module omp_lib_kinds
+
+        integer, parameter :: omp_integer_kind       = 4
+        integer, parameter :: omp_logical_kind       = 4
+        integer, parameter :: omp_real_kind          = 4
+        integer, parameter :: omp_lock_kind          = int_ptr_kind()
+        integer, parameter :: omp_nest_lock_kind     = int_ptr_kind()
+        integer, parameter :: omp_sched_kind         = omp_integer_kind
+        integer, parameter :: omp_proc_bind_kind     = omp_integer_kind
+        integer, parameter :: kmp_pointer_kind       = int_ptr_kind()
+        integer, parameter :: kmp_size_t_kind        = int_ptr_kind()
+        integer, parameter :: kmp_affinity_mask_kind = int_ptr_kind()
+        integer, parameter :: kmp_cancel_kind        = omp_integer_kind
+        integer, parameter :: kmp_lock_hint_kind     = omp_integer_kind
+
+      end module omp_lib_kinds
+
+      module omp_lib
+
+        use omp_lib_kinds
+
+        integer (kind=omp_integer_kind), parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_build = $KMP_VERSION_BUILD
+        character(*), parameter :: kmp_build_date    = '$KMP_BUILD_DATE'
+        integer (kind=omp_integer_kind), parameter :: openmp_version    = $OMP_VERSION
+
+        integer(kind=omp_sched_kind), parameter :: omp_sched_static  = 1
+        integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
+        integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
+        integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
+
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4
+
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_parallel = 1
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_loop = 2
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_sections = 3
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_taskgroup = 4
+
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_none           = 0
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_uncontended    = 1
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_contended      = 2
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_nonspeculative = 3
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_speculative    = 4
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive       = 5
+
+        interface
+
+!         ***
+!         *** omp_* entry points
+!         ***
+
+          subroutine omp_set_num_threads(nthreads)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) nthreads
+          end subroutine omp_set_num_threads
+
+          subroutine omp_set_dynamic(enable)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) enable
+          end subroutine omp_set_dynamic
+
+          subroutine omp_set_nested(enable)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) enable
+          end subroutine omp_set_nested
+
+          function omp_get_num_threads()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_threads
+          end function omp_get_num_threads
+
+          function omp_get_max_threads()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_threads
+          end function omp_get_max_threads
+
+          function omp_get_thread_num()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_num
+          end function omp_get_thread_num
+
+          function omp_get_num_procs()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_procs
+          end function omp_get_num_procs
+
+          function omp_in_parallel()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_in_parallel
+          end function omp_in_parallel
+
+          function omp_get_dynamic()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_dynamic
+          end function omp_get_dynamic
+
+          function omp_get_nested()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_nested
+          end function omp_get_nested
+
+          function omp_get_thread_limit()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_limit
+          end function omp_get_thread_limit
+
+          subroutine omp_set_max_active_levels(max_levels)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) max_levels
+          end subroutine omp_set_max_active_levels
+
+          function omp_get_max_active_levels()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_active_levels
+          end function omp_get_max_active_levels
+
+          function omp_get_level()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_level
+          end function omp_get_level
+
+          function omp_get_active_level()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_active_level
+          end function omp_get_active_level
+
+          function omp_get_ancestor_thread_num(level)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) level
+            integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+          end function omp_get_ancestor_thread_num
+
+          function omp_get_team_size(level)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) level
+            integer (kind=omp_integer_kind) omp_get_team_size
+          end function omp_get_team_size
+
+          subroutine omp_set_schedule(kind, modifier)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind) kind
+            integer (kind=omp_integer_kind) modifier
+          end subroutine omp_set_schedule
+
+          subroutine omp_get_schedule(kind, modifier)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind) kind
+            integer (kind=omp_integer_kind) modifier
+          end subroutine omp_get_schedule
+
+          function omp_get_proc_bind()
+            use omp_lib_kinds
+            integer (kind=omp_proc_bind_kind) omp_get_proc_bind
+          end function omp_get_proc_bind
+
+          function omp_get_wtime()
+            double precision omp_get_wtime
+          end function omp_get_wtime
+
+          function omp_get_wtick ()
+            double precision omp_get_wtick
+          end function omp_get_wtick
+
+          function omp_get_default_device()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_default_device
+          end function omp_get_default_device
+
+          subroutine omp_set_default_device(dflt_device)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) dflt_device
+          end subroutine omp_set_default_device
+
+          function omp_get_num_devices()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_devices
+          end function omp_get_num_devices
+
+          function omp_get_num_teams()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_teams
+          end function omp_get_num_teams
+
+          function omp_get_team_num()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_team_num
+          end function omp_get_team_num
+
+          function omp_get_cancellation()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_cancellation
+          end function omp_get_cancellation
+
+          function omp_is_initial_device()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_is_initial_device
+          end function omp_is_initial_device
+
+          subroutine omp_init_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_init_lock
+
+          subroutine omp_destroy_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_destroy_lock
+
+          subroutine omp_set_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_set_lock
+
+          subroutine omp_unset_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_unset_lock
+
+          function omp_test_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_test_lock
+            integer (kind=omp_lock_kind) lockvar
+          end function omp_test_lock
+
+          subroutine omp_init_nest_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_init_nest_lock
+
+          subroutine omp_destroy_nest_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_destroy_nest_lock
+
+          subroutine omp_set_nest_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_set_nest_lock
+
+          subroutine omp_unset_nest_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_unset_nest_lock
+
+          function omp_test_nest_lock(lockvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_test_nest_lock
+            integer (kind=omp_nest_lock_kind) lockvar
+          end function omp_test_nest_lock
+
+!         ***
+!         *** kmp_* entry points
+!         ***
+
+          subroutine kmp_set_stacksize(size)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) size
+          end subroutine kmp_set_stacksize
+
+          subroutine kmp_set_stacksize_s(size)
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind) size
+          end subroutine kmp_set_stacksize_s
+
+          subroutine kmp_set_blocktime(msec)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) msec
+          end subroutine kmp_set_blocktime
+
+          subroutine kmp_set_library_serial()
+          end subroutine kmp_set_library_serial
+
+          subroutine kmp_set_library_turnaround()
+          end subroutine kmp_set_library_turnaround
+
+          subroutine kmp_set_library_throughput()
+          end subroutine kmp_set_library_throughput
+
+          subroutine kmp_set_library(libnum)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) libnum
+          end subroutine kmp_set_library
+
+          subroutine kmp_set_defaults(string)
+            character*(*) string
+          end subroutine kmp_set_defaults
+
+          function kmp_get_stacksize()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_stacksize
+          end function kmp_get_stacksize
+
+          function kmp_get_stacksize_s()
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind) kmp_get_stacksize_s
+          end function kmp_get_stacksize_s
+
+          function kmp_get_blocktime()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_blocktime
+          end function kmp_get_blocktime
+
+          function kmp_get_library()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_library
+          end function kmp_get_library
+
+          function kmp_set_affinity(mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity
+
+          function kmp_get_affinity(mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity
+
+          function kmp_get_affinity_max_proc()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_max_proc
+          end function kmp_get_affinity_max_proc
+
+          subroutine kmp_create_affinity_mask(mask)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_create_affinity_mask
+
+          subroutine kmp_destroy_affinity_mask(mask)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_destroy_affinity_mask
+
+          function kmp_set_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity_mask_proc
+
+          function kmp_unset_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_unset_affinity_mask_proc
+
+          function kmp_get_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity_mask_proc
+
+          function kmp_malloc(size)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_malloc
+            integer (kind=kmp_size_t_kind) size
+          end function kmp_malloc
+
+          function kmp_calloc(nelem, elsize)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_calloc
+            integer (kind=kmp_size_t_kind) nelem
+            integer (kind=kmp_size_t_kind) elsize
+          end function kmp_calloc
+
+          function kmp_realloc(ptr, size)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_realloc
+            integer (kind=kmp_pointer_kind) ptr
+            integer (kind=kmp_size_t_kind) size
+          end function kmp_realloc
+
+          subroutine kmp_free(ptr)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) ptr
+          end subroutine kmp_free
+
+          subroutine kmp_set_warnings_on()
+          end subroutine kmp_set_warnings_on
+
+          subroutine kmp_set_warnings_off()
+          end subroutine kmp_set_warnings_off
+
+          function kmp_get_cancellation_status(cancelkind)
+            use omp_lib_kinds
+            integer (kind=kmp_cancel_kind) cancelkind
+            logical (kind=omp_logical_kind) kmp_get_cancellation_status
+          end function kmp_get_cancellation_status
+
+          subroutine kmp_init_lock_hinted(lockvar, lockhint)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+            integer (kind=kmp_lock_hint_kind) lockhint
+          end subroutine kmp_init_lock_hinted
+
+          subroutine kmp_init_nest_lock_hinted(lockvar, lockhint)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+            integer (kind=kmp_lock_hint_kind) lockhint
+          end subroutine kmp_init_nest_lock_hinted
+
+        end interface
+
+!dec$ if defined(_WIN32)
+!dec$   if defined(_WIN64) .or. defined(_M_AMD64)
+
+!***
+!*** The Fortran entry points must be in uppercase, even if the /Qlowercase
+!*** option is specified.  The alias attribute ensures that the specified
+!*** string is used as the entry point.
+!***
+!*** On the Windows* OS IA-32 architecture, the Fortran entry points have an
+!*** underscore prepended.  On the Windows* OS Intel(R) 64
+!*** architecture, no underscore is prepended.
+!***
+
+!dec$ attributes alias:'OMP_SET_NUM_THREADS' :: omp_set_num_threads
+!dec$ attributes alias:'OMP_SET_DYNAMIC' :: omp_set_dynamic
+!dec$ attributes alias:'OMP_SET_NESTED' :: omp_set_nested
+!dec$ attributes alias:'OMP_GET_NUM_THREADS' :: omp_get_num_threads
+!dec$ attributes alias:'OMP_GET_MAX_THREADS' :: omp_get_max_threads
+!dec$ attributes alias:'OMP_GET_THREAD_NUM' :: omp_get_thread_num
+!dec$ attributes alias:'OMP_GET_NUM_PROCS' :: omp_get_num_procs
+!dec$ attributes alias:'OMP_IN_PARALLEL' :: omp_in_parallel
+!dec$ attributes alias:'OMP_GET_DYNAMIC' :: omp_get_dynamic
+!dec$ attributes alias:'OMP_GET_NESTED' :: omp_get_nested
+!dec$ attributes alias:'OMP_GET_THREAD_LIMIT' :: omp_get_thread_limit
+!dec$ attributes alias:'OMP_SET_MAX_ACTIVE_LEVELS' :: omp_set_max_active_levels
+!dec$ attributes alias:'OMP_GET_MAX_ACTIVE_LEVELS' :: omp_get_max_active_levels
+!dec$ attributes alias:'OMP_GET_LEVEL' :: omp_get_level
+!dec$ attributes alias:'OMP_GET_ACTIVE_LEVEL' :: omp_get_active_level
+!dec$ attributes alias:'OMP_GET_ANCESTOR_THREAD_NUM' :: omp_get_ancestor_thread_num
+!dec$ attributes alias:'OMP_GET_TEAM_SIZE' :: omp_get_team_size
+!dec$ attributes alias:'OMP_SET_SCHEDULE' :: omp_set_schedule
+!dec$ attributes alias:'OMP_GET_SCHEDULE' :: omp_get_schedule
+!dec$ attributes alias:'OMP_GET_PROC_BIND' :: omp_get_proc_bind
+!dec$ attributes alias:'OMP_GET_WTIME' :: omp_get_wtime
+!dec$ attributes alias:'OMP_GET_WTICK' :: omp_get_wtick
+!dec$ attributes alias:'OMP_GET_DEFAULT_DEVICE' :: omp_get_default_device
+!dec$ attributes alias:'OMP_SET_DEFAULT_DEVICE' :: omp_set_default_device
+!dec$ attributes alias:'OMP_GET_NUM_DEVICES' :: omp_get_num_devices
+!dec$ attributes alias:'OMP_GET_NUM_TEAMS' :: omp_get_num_teams
+!dec$ attributes alias:'OMP_GET_TEAM_NUM' :: omp_get_team_num
+!dec$ attributes alias:'OMP_GET_CANCELLATION' :: omp_get_cancellation
+!dec$ attributes alias:'OMP_IS_INITIAL_DEVICE' :: omp_is_initial_device
+
+!dec$ attributes alias:'omp_init_lock' :: omp_init_lock
+!dec$ attributes alias:'omp_destroy_lock' :: omp_destroy_lock
+!dec$ attributes alias:'omp_set_lock' :: omp_set_lock
+!dec$ attributes alias:'omp_unset_lock' :: omp_unset_lock
+!dec$ attributes alias:'omp_test_lock' :: omp_test_lock
+!dec$ attributes alias:'omp_init_nest_lock' :: omp_init_nest_lock
+!dec$ attributes alias:'omp_destroy_nest_lock' :: omp_destroy_nest_lock
+!dec$ attributes alias:'omp_set_nest_lock' :: omp_set_nest_lock
+!dec$ attributes alias:'omp_unset_nest_lock' :: omp_unset_nest_lock
+!dec$ attributes alias:'omp_test_nest_lock' :: omp_test_nest_lock
+
+!dec$ attributes alias:'KMP_SET_STACKSIZE'::kmp_set_stacksize
+!dec$ attributes alias:'KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s
+!dec$ attributes alias:'KMP_SET_BLOCKTIME'::kmp_set_blocktime
+!dec$ attributes alias:'KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial
+!dec$ attributes alias:'KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround
+!dec$ attributes alias:'KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput
+!dec$ attributes alias:'KMP_SET_LIBRARY'::kmp_set_library
+!dec$ attributes alias:'KMP_GET_STACKSIZE'::kmp_get_stacksize
+!dec$ attributes alias:'KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s
+!dec$ attributes alias:'KMP_GET_BLOCKTIME'::kmp_get_blocktime
+!dec$ attributes alias:'KMP_GET_LIBRARY'::kmp_get_library
+!dec$ attributes alias:'KMP_SET_AFFINITY'::kmp_set_affinity
+!dec$ attributes alias:'KMP_GET_AFFINITY'::kmp_get_affinity
+!dec$ attributes alias:'KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask
+!dec$ attributes alias:'KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'KMP_MALLOC'::kmp_malloc
+!dec$ attributes alias:'KMP_CALLOC'::kmp_calloc
+!dec$ attributes alias:'KMP_REALLOC'::kmp_realloc
+!dec$ attributes alias:'KMP_FREE'::kmp_free
+
+!dec$ attributes alias:'KMP_SET_WARNINGS_ON'::kmp_set_warnings_on
+!dec$ attributes alias:'KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off
+
+!dec$ attributes alias:'KMP_GET_CANCELLATION_STATUS' :: kmp_get_cancellation_status
+
+!dec$ attributes alias:'KMP_INIT_LOCK_HINTED'::kmp_init_lock_hinted
+!dec$ attributes alias:'KMP_INIT_NEST_LOCK_HINTED'::kmp_init_nest_lock_hinted
+
+!dec$   else
+
+!***
+!*** On Windows* OS IA-32 architecture, the Fortran entry points have an underscore prepended.
+!***
+
+!dec$ attributes alias:'_OMP_SET_NUM_THREADS' :: omp_set_num_threads
+!dec$ attributes alias:'_OMP_SET_DYNAMIC' :: omp_set_dynamic
+!dec$ attributes alias:'_OMP_SET_NESTED' :: omp_set_nested
+!dec$ attributes alias:'_OMP_GET_NUM_THREADS' :: omp_get_num_threads
+!dec$ attributes alias:'_OMP_GET_MAX_THREADS' :: omp_get_max_threads
+!dec$ attributes alias:'_OMP_GET_THREAD_NUM' :: omp_get_thread_num
+!dec$ attributes alias:'_OMP_GET_NUM_PROCS' :: omp_get_num_procs
+!dec$ attributes alias:'_OMP_IN_PARALLEL' :: omp_in_parallel
+!dec$ attributes alias:'_OMP_GET_DYNAMIC' :: omp_get_dynamic
+!dec$ attributes alias:'_OMP_GET_NESTED' :: omp_get_nested
+!dec$ attributes alias:'_OMP_GET_THREAD_LIMIT' :: omp_get_thread_limit
+!dec$ attributes alias:'_OMP_SET_MAX_ACTIVE_LEVELS' :: omp_set_max_active_levels
+!dec$ attributes alias:'_OMP_GET_MAX_ACTIVE_LEVELS' :: omp_get_max_active_levels
+!dec$ attributes alias:'_OMP_GET_LEVEL' :: omp_get_level
+!dec$ attributes alias:'_OMP_GET_ACTIVE_LEVEL' :: omp_get_active_level
+!dec$ attributes alias:'_OMP_GET_ANCESTOR_THREAD_NUM' :: omp_get_ancestor_thread_num
+!dec$ attributes alias:'_OMP_GET_TEAM_SIZE' :: omp_get_team_size
+!dec$ attributes alias:'_OMP_SET_SCHEDULE' :: omp_set_schedule
+!dec$ attributes alias:'_OMP_GET_SCHEDULE' :: omp_get_schedule
+!dec$ attributes alias:'_OMP_GET_PROC_BIND' :: omp_get_proc_bind
+!dec$ attributes alias:'_OMP_GET_WTIME' :: omp_get_wtime
+!dec$ attributes alias:'_OMP_GET_WTICK' :: omp_get_wtick
+!dec$ attributes alias:'_OMP_GET_DEFAULT_DEVICE' :: omp_get_default_device
+!dec$ attributes alias:'_OMP_SET_DEFAULT_DEVICE' :: omp_set_default_device
+!dec$ attributes alias:'_OMP_GET_NUM_DEVICES' :: omp_get_num_devices
+!dec$ attributes alias:'_OMP_GET_NUM_TEAMS' :: omp_get_num_teams
+!dec$ attributes alias:'_OMP_GET_TEAM_NUM' :: omp_get_team_num
+!dec$ attributes alias:'_OMP_GET_CANCELLATION' :: omp_get_cancellation
+!dec$ attributes alias:'_OMP_IS_INITIAL_DEVICE' :: omp_is_initial_device
+
+!dec$ attributes alias:'_omp_init_lock' :: omp_init_lock
+!dec$ attributes alias:'_omp_destroy_lock' :: omp_destroy_lock
+!dec$ attributes alias:'_omp_set_lock' :: omp_set_lock
+!dec$ attributes alias:'_omp_unset_lock' :: omp_unset_lock
+!dec$ attributes alias:'_omp_test_lock' :: omp_test_lock
+!dec$ attributes alias:'_omp_init_nest_lock' :: omp_init_nest_lock
+!dec$ attributes alias:'_omp_destroy_nest_lock' :: omp_destroy_nest_lock
+!dec$ attributes alias:'_omp_set_nest_lock' :: omp_set_nest_lock
+!dec$ attributes alias:'_omp_unset_nest_lock' :: omp_unset_nest_lock
+!dec$ attributes alias:'_omp_test_nest_lock' :: omp_test_nest_lock
+
+!dec$ attributes alias:'_KMP_SET_STACKSIZE'::kmp_set_stacksize
+!dec$ attributes alias:'_KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s
+!dec$ attributes alias:'_KMP_SET_BLOCKTIME'::kmp_set_blocktime
+!dec$ attributes alias:'_KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial
+!dec$ attributes alias:'_KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround
+!dec$ attributes alias:'_KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput
+!dec$ attributes alias:'_KMP_SET_LIBRARY'::kmp_set_library
+!dec$ attributes alias:'_KMP_GET_STACKSIZE'::kmp_get_stacksize
+!dec$ attributes alias:'_KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s
+!dec$ attributes alias:'_KMP_GET_BLOCKTIME'::kmp_get_blocktime
+!dec$ attributes alias:'_KMP_GET_LIBRARY'::kmp_get_library
+!dec$ attributes alias:'_KMP_SET_AFFINITY'::kmp_set_affinity
+!dec$ attributes alias:'_KMP_GET_AFFINITY'::kmp_get_affinity
+!dec$ attributes alias:'_KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'_KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask
+!dec$ attributes alias:'_KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'_KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'_KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'_KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'_KMP_MALLOC'::kmp_malloc
+!dec$ attributes alias:'_KMP_CALLOC'::kmp_calloc
+!dec$ attributes alias:'_KMP_REALLOC'::kmp_realloc
+!dec$ attributes alias:'_KMP_FREE'::kmp_free
+
+!dec$ attributes alias:'_KMP_SET_WARNINGS_ON'::kmp_set_warnings_on
+!dec$ attributes alias:'_KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off
+
+!dec$ attributes alias:'_KMP_GET_CANCELLATION_STATUS' :: kmp_get_cancellation_status
+
+!dec$ attributes alias:'_KMP_INIT_LOCK_HINTED'::kmp_init_lock_hinted
+!dec$ attributes alias:'_KMP_INIT_NEST_LOCK_HINTED'::kmp_init_nest_lock_hinted
+
+!dec$   endif
+!dec$ endif
+
+!dec$ if defined(__linux)
+
+!***
+!*** The Linux* OS entry points are in lowercase, with an underscore appended.
+!***
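
To make this convention concrete, here is a hedged C-side sketch of what it
implies: a runtime implemented in C can satisfy a Linux Fortran reference by
exporting a lowercase symbol with a trailing underscore. The wrapper below is
illustrative only and is not how this runtime actually exports its symbols.

    /* C sketch: a Fortran call to omp_get_thread_num resolves to the
       symbol omp_get_thread_num_ on Linux, which forwards to the C API */
    int omp_get_thread_num(void);
    int omp_get_thread_num_(void) { return omp_get_thread_num(); }
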
+
+!dec$ attributes alias:'omp_set_num_threads_'::omp_set_num_threads
+!dec$ attributes alias:'omp_set_dynamic_'::omp_set_dynamic
+!dec$ attributes alias:'omp_set_nested_'::omp_set_nested
+!dec$ attributes alias:'omp_get_num_threads_'::omp_get_num_threads
+!dec$ attributes alias:'omp_get_max_threads_'::omp_get_max_threads
+!dec$ attributes alias:'omp_get_thread_num_'::omp_get_thread_num
+!dec$ attributes alias:'omp_get_num_procs_'::omp_get_num_procs
+!dec$ attributes alias:'omp_in_parallel_'::omp_in_parallel
+!dec$ attributes alias:'omp_get_dynamic_'::omp_get_dynamic
+!dec$ attributes alias:'omp_get_nested_'::omp_get_nested
+!dec$ attributes alias:'omp_get_thread_limit_'::omp_get_thread_limit
+!dec$ attributes alias:'omp_set_max_active_levels_'::omp_set_max_active_levels
+!dec$ attributes alias:'omp_get_max_active_levels_'::omp_get_max_active_levels
+!dec$ attributes alias:'omp_get_level_'::omp_get_level
+!dec$ attributes alias:'omp_get_active_level_'::omp_get_active_level
+!dec$ attributes alias:'omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'omp_get_team_size_'::omp_get_team_size
+!dec$ attributes alias:'omp_set_schedule_'::omp_set_schedule
+!dec$ attributes alias:'omp_get_schedule_'::omp_get_schedule
+!dec$ attributes alias:'omp_get_proc_bind_' :: omp_get_proc_bind
+!dec$ attributes alias:'omp_get_wtime_'::omp_get_wtime
+!dec$ attributes alias:'omp_get_wtick_'::omp_get_wtick
+!dec$ attributes alias:'omp_get_default_device_'::omp_get_default_device
+!dec$ attributes alias:'omp_set_default_device_'::omp_set_default_device
+!dec$ attributes alias:'omp_get_num_devices_'::omp_get_num_devices
+!dec$ attributes alias:'omp_get_num_teams_'::omp_get_num_teams
+!dec$ attributes alias:'omp_get_team_num_'::omp_get_team_num
+!dec$ attributes alias:'omp_get_cancellation_'::omp_get_cancellation
+!dec$ attributes alias:'omp_is_initial_device_'::omp_is_initial_device
+
+!dec$ attributes alias:'omp_init_lock_'::omp_init_lock
+!dec$ attributes alias:'omp_destroy_lock_'::omp_destroy_lock
+!dec$ attributes alias:'omp_set_lock_'::omp_set_lock
+!dec$ attributes alias:'omp_unset_lock_'::omp_unset_lock
+!dec$ attributes alias:'omp_test_lock_'::omp_test_lock
+!dec$ attributes alias:'omp_init_nest_lock_'::omp_init_nest_lock
+!dec$ attributes alias:'omp_destroy_nest_lock_'::omp_destroy_nest_lock
+!dec$ attributes alias:'omp_set_nest_lock_'::omp_set_nest_lock
+!dec$ attributes alias:'omp_unset_nest_lock_'::omp_unset_nest_lock
+!dec$ attributes alias:'omp_test_nest_lock_'::omp_test_nest_lock
+
+!dec$ attributes alias:'kmp_set_stacksize_'::kmp_set_stacksize
+!dec$ attributes alias:'kmp_set_stacksize_s_'::kmp_set_stacksize_s
+!dec$ attributes alias:'kmp_set_blocktime_'::kmp_set_blocktime
+!dec$ attributes alias:'kmp_set_library_serial_'::kmp_set_library_serial
+!dec$ attributes alias:'kmp_set_library_turnaround_'::kmp_set_library_turnaround
+!dec$ attributes alias:'kmp_set_library_throughput_'::kmp_set_library_throughput
+!dec$ attributes alias:'kmp_set_library_'::kmp_set_library
+!dec$ attributes alias:'kmp_get_stacksize_'::kmp_get_stacksize
+!dec$ attributes alias:'kmp_get_stacksize_s_'::kmp_get_stacksize_s
+!dec$ attributes alias:'kmp_get_blocktime_'::kmp_get_blocktime
+!dec$ attributes alias:'kmp_get_library_'::kmp_get_library
+!dec$ attributes alias:'kmp_set_affinity_'::kmp_set_affinity
+!dec$ attributes alias:'kmp_get_affinity_'::kmp_get_affinity
+!dec$ attributes alias:'kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'kmp_create_affinity_mask_'::kmp_create_affinity_mask
+!dec$ attributes alias:'kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'kmp_malloc_'::kmp_malloc
+!dec$ attributes alias:'kmp_calloc_'::kmp_calloc
+!dec$ attributes alias:'kmp_realloc_'::kmp_realloc
+!dec$ attributes alias:'kmp_free_'::kmp_free
+
+!dec$ attributes alias:'kmp_set_warnings_on_'::kmp_set_warnings_on
+!dec$ attributes alias:'kmp_set_warnings_off_'::kmp_set_warnings_off
+!dec$ attributes alias:'kmp_get_cancellation_status_'::kmp_get_cancellation_status
+
+!dec$ attributes alias:'kmp_init_lock_hinted_'::kmp_init_lock_hinted
+!dec$ attributes alias:'kmp_init_nest_lock_hinted_'::kmp_init_nest_lock_hinted
+
+!dec$ endif
+
+!dec$ if defined(__APPLE__)
+
+!***
+!*** The Mac entry points are in lowercase, with both an underscore
+!*** appended and an underscore prepended.
+!***
+
+!dec$ attributes alias:'_omp_set_num_threads_'::omp_set_num_threads
+!dec$ attributes alias:'_omp_set_dynamic_'::omp_set_dynamic
+!dec$ attributes alias:'_omp_set_nested_'::omp_set_nested
+!dec$ attributes alias:'_omp_get_num_threads_'::omp_get_num_threads
+!dec$ attributes alias:'_omp_get_max_threads_'::omp_get_max_threads
+!dec$ attributes alias:'_omp_get_thread_num_'::omp_get_thread_num
+!dec$ attributes alias:'_omp_get_num_procs_'::omp_get_num_procs
+!dec$ attributes alias:'_omp_in_parallel_'::omp_in_parallel
+!dec$ attributes alias:'_omp_get_dynamic_'::omp_get_dynamic
+!dec$ attributes alias:'_omp_get_nested_'::omp_get_nested
+!dec$ attributes alias:'_omp_get_thread_limit_'::omp_get_thread_limit
+!dec$ attributes alias:'_omp_set_max_active_levels_'::omp_set_max_active_levels
+!dec$ attributes alias:'_omp_get_max_active_levels_'::omp_get_max_active_levels
+!dec$ attributes alias:'_omp_get_level_'::omp_get_level
+!dec$ attributes alias:'_omp_get_active_level_'::omp_get_active_level
+!dec$ attributes alias:'_omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'_omp_get_team_size_'::omp_get_team_size
+!dec$ attributes alias:'_omp_set_schedule_'::omp_set_schedule
+!dec$ attributes alias:'_omp_get_schedule_'::omp_get_schedule
+!dec$ attributes alias:'_omp_get_proc_bind_' :: omp_get_proc_bind
+!dec$ attributes alias:'_omp_get_wtime_'::omp_get_wtime
+!dec$ attributes alias:'_omp_get_wtick_'::omp_get_wtick
+!dec$ attributes alias:'_omp_get_num_teams_'::omp_get_num_teams
+!dec$ attributes alias:'_omp_get_team_num_'::omp_get_team_num
+!dec$ attributes alias:'_omp_get_cancellation_'::omp_get_cancellation
+!dec$ attributes alias:'_omp_is_initial_device_'::omp_is_initial_device
+
+!dec$ attributes alias:'_omp_init_lock_'::omp_init_lock
+!dec$ attributes alias:'_omp_destroy_lock_'::omp_destroy_lock
+!dec$ attributes alias:'_omp_set_lock_'::omp_set_lock
+!dec$ attributes alias:'_omp_unset_lock_'::omp_unset_lock
+!dec$ attributes alias:'_omp_test_lock_'::omp_test_lock
+!dec$ attributes alias:'_omp_init_nest_lock_'::omp_init_nest_lock
+!dec$ attributes alias:'_omp_destroy_nest_lock_'::omp_destroy_nest_lock
+!dec$ attributes alias:'_omp_set_nest_lock_'::omp_set_nest_lock
+!dec$ attributes alias:'_omp_unset_nest_lock_'::omp_unset_nest_lock
+!dec$ attributes alias:'_omp_test_nest_lock_'::omp_test_nest_lock
+
+!dec$ attributes alias:'_kmp_set_stacksize_'::kmp_set_stacksize
+!dec$ attributes alias:'_kmp_set_stacksize_s_'::kmp_set_stacksize_s
+!dec$ attributes alias:'_kmp_set_blocktime_'::kmp_set_blocktime
+!dec$ attributes alias:'_kmp_set_library_serial_'::kmp_set_library_serial
+!dec$ attributes alias:'_kmp_set_library_turnaround_'::kmp_set_library_turnaround
+!dec$ attributes alias:'_kmp_set_library_throughput_'::kmp_set_library_throughput
+!dec$ attributes alias:'_kmp_set_library_'::kmp_set_library
+!dec$ attributes alias:'_kmp_get_stacksize_'::kmp_get_stacksize
+!dec$ attributes alias:'_kmp_get_stacksize_s_'::kmp_get_stacksize_s
+!dec$ attributes alias:'_kmp_get_blocktime_'::kmp_get_blocktime
+!dec$ attributes alias:'_kmp_get_library_'::kmp_get_library
+!dec$ attributes alias:'_kmp_set_affinity_'::kmp_set_affinity
+!dec$ attributes alias:'_kmp_get_affinity_'::kmp_get_affinity
+!dec$ attributes alias:'_kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'_kmp_create_affinity_mask_'::kmp_create_affinity_mask
+!dec$ attributes alias:'_kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'_kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'_kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'_kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'_kmp_malloc_'::kmp_malloc
+!dec$ attributes alias:'_kmp_calloc_'::kmp_calloc
+!dec$ attributes alias:'_kmp_realloc_'::kmp_realloc
+!dec$ attributes alias:'_kmp_free_'::kmp_free
+
+!dec$ attributes alias:'_kmp_set_warnings_on_'::kmp_set_warnings_on
+!dec$ attributes alias:'_kmp_set_warnings_off_'::kmp_set_warnings_off
+
+!dec$ attributes alias:'_kmp_get_cancellation_status_'::kmp_get_cancellation_status
+
+!dec$ attributes alias:'_kmp_init_lock_hinted_'::kmp_init_lock_hinted
+!dec$ attributes alias:'_kmp_init_nest_lock_hinted_'::kmp_init_nest_lock_hinted
+
+!dec$ endif
+
+      end module omp_lib
+
diff --git a/final/runtime/src/include/41/omp_lib.f90.var b/final/runtime/src/include/41/omp_lib.f90.var
new file mode 100644
index 0000000..10274ca
--- /dev/null
+++ b/final/runtime/src/include/41/omp_lib.f90.var
@@ -0,0 +1,468 @@
+! include/41/omp_lib.f90.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+      module omp_lib_kinds
+
+        use, intrinsic :: iso_c_binding
+
+        integer, parameter :: omp_integer_kind       = c_int
+        integer, parameter :: omp_logical_kind       = 4
+        integer, parameter :: omp_real_kind          = c_float
+        integer, parameter :: kmp_double_kind        = c_double
+        integer, parameter :: omp_lock_kind          = c_intptr_t
+        integer, parameter :: omp_nest_lock_kind     = c_intptr_t
+        integer, parameter :: omp_sched_kind         = omp_integer_kind
+        integer, parameter :: omp_proc_bind_kind     = omp_integer_kind
+        integer, parameter :: kmp_pointer_kind       = c_intptr_t
+        integer, parameter :: kmp_size_t_kind        = c_size_t
+        integer, parameter :: kmp_affinity_mask_kind = c_intptr_t
+        integer, parameter :: kmp_cancel_kind        = omp_integer_kind
+        integer, parameter :: kmp_lock_hint_kind     = omp_integer_kind
+
+      end module omp_lib_kinds
+
+      module omp_lib
+
+        use omp_lib_kinds
+
+        integer (kind=omp_integer_kind), parameter :: openmp_version    = $OMP_VERSION
+        integer (kind=omp_integer_kind), parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+        integer (kind=omp_integer_kind), parameter :: kmp_version_build = $KMP_VERSION_BUILD
+        character(*)               kmp_build_date
+        parameter( kmp_build_date = '$KMP_BUILD_DATE' )
+
+        integer(kind=omp_sched_kind), parameter :: omp_sched_static  = 1
+        integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
+        integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
+        integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
+
+
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4
+
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_parallel = 1
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_loop = 2
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_sections = 3
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_taskgroup = 4
+
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_none           = 0
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_uncontended    = 1
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_contended      = 2
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_nonspeculative = 3
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_speculative    = 4
+        integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive       = 5
+
+        interface
+
+!         ***
+!         *** omp_* entry points
+!         ***
+
+          subroutine omp_set_num_threads(nthreads) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: nthreads
+          end subroutine omp_set_num_threads
+
+          subroutine omp_set_dynamic(enable) bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind), value :: enable
+          end subroutine omp_set_dynamic
+
+          subroutine omp_set_nested(enable) bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind), value :: enable
+          end subroutine omp_set_nested
+
+          function omp_get_num_threads() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_threads
+          end function omp_get_num_threads
+
+          function omp_get_max_threads() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_threads
+          end function omp_get_max_threads
+
+          function omp_get_thread_num() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_num
+          end function omp_get_thread_num
+
+          function omp_get_num_procs() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_procs
+          end function omp_get_num_procs
+
+          function omp_in_parallel() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_in_parallel
+          end function omp_in_parallel
+
+          function omp_in_final() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_in_final
+          end function omp_in_final
+
+          function omp_get_dynamic() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_dynamic
+          end function omp_get_dynamic
+
+          function omp_get_nested() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_nested
+          end function omp_get_nested
+
+          function omp_get_thread_limit() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_limit
+          end function omp_get_thread_limit
+
+          subroutine omp_set_max_active_levels(max_levels) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: max_levels
+          end subroutine omp_set_max_active_levels
+
+          function omp_get_max_active_levels() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_active_levels
+          end function omp_get_max_active_levels
+
+          function omp_get_level() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_level
+          end function omp_get_level
+
+          function omp_get_active_level() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_active_level
+          end function omp_get_active_level
+
+          function omp_get_ancestor_thread_num(level) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+            integer (kind=omp_integer_kind), value :: level
+          end function omp_get_ancestor_thread_num
+
+          function omp_get_team_size(level) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_team_size
+            integer (kind=omp_integer_kind), value :: level
+          end function omp_get_team_size
+
+          subroutine omp_set_schedule(kind, modifier) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind), value :: kind
+            integer (kind=omp_integer_kind), value :: modifier
+          end subroutine omp_set_schedule
+
+          subroutine omp_get_schedule(kind, modifier) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind) kind
+            integer (kind=omp_integer_kind) modifier
+          end subroutine omp_get_schedule
+
+          function omp_get_proc_bind() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_proc_bind_kind) omp_get_proc_bind
+          end function omp_get_proc_bind
+
+          function omp_get_wtime() bind(c)
+            use omp_lib_kinds
+            real (kind=kmp_double_kind) omp_get_wtime
+          end function omp_get_wtime
+
+          function omp_get_wtick() bind(c)
+            use omp_lib_kinds
+            real (kind=kmp_double_kind) omp_get_wtick
+          end function omp_get_wtick
+
+          function omp_get_default_device() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_default_device
+          end function omp_get_default_device
+
+          subroutine omp_set_default_device(dflt_device) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: dflt_device
+          end subroutine omp_set_default_device
+
+          function omp_get_num_devices() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_devices
+          end function omp_get_num_devices
+
+          function omp_get_num_teams() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_teams
+          end function omp_get_num_teams
+
+          function omp_get_team_num() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_team_num
+          end function omp_get_team_num
+
+          function omp_get_cancellation() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_cancellation
+          end function omp_get_cancellation
+
+          function omp_is_initial_device() bind(c)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_is_initial_device
+          end function omp_is_initial_device
+
+          subroutine omp_init_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_init_lock
+
+          subroutine omp_destroy_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_destroy_lock
+
+          subroutine omp_set_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_set_lock
+
+          subroutine omp_unset_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+          end subroutine omp_unset_lock
+
+          function omp_test_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_test_lock
+            integer (kind=omp_lock_kind) lockvar
+          end function omp_test_lock
+
+          subroutine omp_init_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_init_nest_lock
+
+          subroutine omp_destroy_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_destroy_nest_lock
+
+          subroutine omp_set_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_set_nest_lock
+
+          subroutine omp_unset_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+          end subroutine omp_unset_nest_lock
+
+          function omp_test_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_test_nest_lock
+            integer (kind=omp_nest_lock_kind) lockvar
+          end function omp_test_nest_lock
+
+!         ***
+!         *** kmp_* entry points
+!         ***
+
+          subroutine kmp_set_stacksize(size) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: size
+          end subroutine kmp_set_stacksize
+
+          subroutine kmp_set_stacksize_s(size) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind), value :: size
+          end subroutine kmp_set_stacksize_s
+
+          subroutine kmp_set_blocktime(msec) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: msec
+          end subroutine kmp_set_blocktime
+
+          subroutine kmp_set_library_serial() bind(c)
+          end subroutine kmp_set_library_serial
+
+          subroutine kmp_set_library_turnaround() bind(c)
+          end subroutine kmp_set_library_turnaround
+
+          subroutine kmp_set_library_throughput() bind(c)
+          end subroutine kmp_set_library_throughput
+
+          subroutine kmp_set_library(libnum) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind), value :: libnum
+          end subroutine kmp_set_library
+
+          subroutine kmp_set_defaults(string) bind(c)
+            use, intrinsic :: iso_c_binding
+            character (kind=c_char) :: string(*)
+          end subroutine kmp_set_defaults
+
+          function kmp_get_stacksize() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_stacksize
+          end function kmp_get_stacksize
+
+          function kmp_get_stacksize_s() bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind) kmp_get_stacksize_s
+          end function kmp_get_stacksize_s
+
+          function kmp_get_blocktime() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_blocktime
+          end function kmp_get_blocktime
+
+          function kmp_get_library() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_library
+          end function kmp_get_library
+
+          function kmp_set_affinity(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity
+
+          function kmp_get_affinity(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity
+
+          function kmp_get_affinity_max_proc() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_max_proc
+          end function kmp_get_affinity_max_proc
+
+          subroutine kmp_create_affinity_mask(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_create_affinity_mask
+
+          subroutine kmp_destroy_affinity_mask(mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_destroy_affinity_mask
+
+          function kmp_set_affinity_mask_proc(proc, mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc
+            integer (kind=omp_integer_kind), value :: proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity_mask_proc
+
+          function kmp_unset_affinity_mask_proc(proc, mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc
+            integer (kind=omp_integer_kind), value :: proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_unset_affinity_mask_proc
+
+          function kmp_get_affinity_mask_proc(proc, mask) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc
+            integer (kind=omp_integer_kind), value :: proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity_mask_proc
+
+          function kmp_malloc(size) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_malloc
+            integer (kind=kmp_size_t_kind), value :: size
+          end function kmp_malloc
+
+          function kmp_calloc(nelem, elsize) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_calloc
+            integer (kind=kmp_size_t_kind), value :: nelem
+            integer (kind=kmp_size_t_kind), value :: elsize
+          end function kmp_calloc
+
+          function kmp_realloc(ptr, size) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_realloc
+            integer (kind=kmp_pointer_kind), value :: ptr
+            integer (kind=kmp_size_t_kind), value :: size
+          end function kmp_realloc
+
+          subroutine kmp_free(ptr) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind), value :: ptr
+          end subroutine kmp_free
+
+          subroutine kmp_set_warnings_on() bind(c)
+          end subroutine kmp_set_warnings_on
+
+          subroutine kmp_set_warnings_off() bind(c)
+          end subroutine kmp_set_warnings_off
+
+          function kmp_get_cancellation_status(cancelkind) bind(c)
+            use omp_lib_kinds
+            integer (kind=kmp_cancel_kind), value :: cancelkind
+            logical (kind=omp_logical_kind) kmp_get_cancellation_status
+          end function kmp_get_cancellation_status
+
+          subroutine kmp_init_lock_hinted(lockvar, lockhint) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) lockvar
+            integer (kind=kmp_lock_hint_kind), value :: lockhint
+          end subroutine kmp_init_lock_hinted
+
+          subroutine kmp_init_nest_lock_hinted(lockvar, lockhint) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) lockvar
+            integer (kind=kmp_lock_hint_kind), value :: lockhint
+          end subroutine kmp_init_nest_lock_hinted
+
+        end interface
+
+      end module omp_lib
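[editor's note] The interface block above declares Fortran bind(c) wrappers over
the runtime's C entry points. A minimal sketch of exercising the same entry
points from C (assumptions: the program is built with an OpenMP-enabled
compiler, e.g. clang -fopenmp, and includes this runtime's omp.h):

    #include <stdio.h>
    #include <omp.h>

    int main(void) {
        omp_set_num_threads(4);    /* same entry point the module binds */
        #pragma omp parallel
        {
            /* mirrors the integer/logical inquiry functions declared above */
            printf("thread %d of %d (in parallel: %d)\n",
                   omp_get_thread_num(), omp_get_num_threads(),
                   (int)omp_in_parallel());
        }
        return 0;
    }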
diff --git a/final/runtime/src/include/41/omp_lib.h.var b/final/runtime/src/include/41/omp_lib.h.var
new file mode 100644
index 0000000..3f1a21c
--- /dev/null
+++ b/final/runtime/src/include/41/omp_lib.h.var
@@ -0,0 +1,582 @@
+! include/41/omp_lib.h.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!//                     The LLVM Compiler Infrastructure
+!//
+!// This file is dual licensed under the MIT and the University of Illinois Open
+!// Source Licenses. See LICENSE.txt for details.
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+!***
+!*** Some of the directives for the following routine extend past column 72,
+!*** so process this file in 132-column mode.
+!***
+
+!DIR$ fixedformlinesize:132
+
+      integer, parameter :: omp_integer_kind       = 4
+      integer, parameter :: omp_logical_kind       = 4
+      integer, parameter :: omp_real_kind          = 4
+      integer, parameter :: omp_lock_kind          = int_ptr_kind()
+      integer, parameter :: omp_nest_lock_kind     = int_ptr_kind()
+      integer, parameter :: omp_sched_kind         = omp_integer_kind
+      integer, parameter :: omp_proc_bind_kind     = omp_integer_kind
+      integer, parameter :: kmp_pointer_kind       = int_ptr_kind()
+      integer, parameter :: kmp_size_t_kind        = int_ptr_kind()
+      integer, parameter :: kmp_affinity_mask_kind = int_ptr_kind()
+      integer, parameter :: kmp_lock_hint_kind     = omp_integer_kind
+
+      integer (kind=omp_integer_kind), parameter :: openmp_version    = $OMP_VERSION
+      integer (kind=omp_integer_kind), parameter :: kmp_version_major = $KMP_VERSION_MAJOR
+      integer (kind=omp_integer_kind), parameter :: kmp_version_minor = $KMP_VERSION_MINOR
+      integer (kind=omp_integer_kind), parameter :: kmp_version_build = $KMP_VERSION_BUILD
+      character(*)               kmp_build_date
+      parameter( kmp_build_date = '$KMP_BUILD_DATE' )
+
+      integer(kind=omp_sched_kind), parameter :: omp_sched_static  = 1
+      integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
+      integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
+      integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
+
+      integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
+      integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
+      integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2
+      integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3
+      integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4
+
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_none           = 0
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_uncontended    = 1
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_contended      = 2
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_nonspeculative = 3
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_speculative    = 4
+      integer (kind=kmp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive       = 5
+
+      interface
+
+!       ***
+!       *** omp_* entry points
+!       ***
+
+        subroutine omp_set_num_threads(nthreads) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: nthreads
+        end subroutine omp_set_num_threads
+
+        subroutine omp_set_dynamic(enable) bind(c)
+          import
+          logical (kind=omp_logical_kind), value :: enable
+        end subroutine omp_set_dynamic
+
+        subroutine omp_set_nested(enable) bind(c)
+          import
+          logical (kind=omp_logical_kind), value :: enable
+        end subroutine omp_set_nested
+
+        function omp_get_num_threads() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_num_threads
+        end function omp_get_num_threads
+
+        function omp_get_max_threads() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_max_threads
+        end function omp_get_max_threads
+
+        function omp_get_thread_num() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_thread_num
+        end function omp_get_thread_num
+
+        function omp_get_num_procs() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_num_procs
+        end function omp_get_num_procs
+
+        function omp_in_parallel() bind(c)
+          import
+          logical (kind=omp_logical_kind) omp_in_parallel
+        end function omp_in_parallel
+
+        function omp_in_final() bind(c)
+          import
+          logical (kind=omp_logical_kind) omp_in_final
+        end function omp_in_final
+
+        function omp_get_dynamic() bind(c)
+          import
+          logical (kind=omp_logical_kind) omp_get_dynamic
+        end function omp_get_dynamic
+
+        function omp_get_nested() bind(c)
+          import
+          logical (kind=omp_logical_kind) omp_get_nested
+        end function omp_get_nested
+
+        function omp_get_thread_limit() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_thread_limit
+        end function omp_get_thread_limit
+
+        subroutine omp_set_max_active_levels(max_levels) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: max_levels
+        end subroutine omp_set_max_active_levels
+
+        function omp_get_max_active_levels() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_max_active_levels
+        end function omp_get_max_active_levels
+
+        function omp_get_level() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_level
+        end function omp_get_level
+
+        function omp_get_active_level() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_active_level
+        end function omp_get_active_level
+
+        function omp_get_ancestor_thread_num(level) bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+          integer (kind=omp_integer_kind), value :: level
+        end function omp_get_ancestor_thread_num
+
+        function omp_get_team_size(level) bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_team_size
+          integer (kind=omp_integer_kind), value :: level
+        end function omp_get_team_size
+
+        subroutine omp_set_schedule(kind, modifier) bind(c)
+          import
+          integer (kind=omp_sched_kind), value :: kind
+          integer (kind=omp_integer_kind), value :: modifier
+        end subroutine omp_set_schedule
+
+        subroutine omp_get_schedule(kind, modifier) bind(c)
+          import
+          integer (kind=omp_sched_kind) kind
+          integer (kind=omp_integer_kind) modifier
+        end subroutine omp_get_schedule
+
+        function omp_get_proc_bind() bind(c)
+          import
+          integer (kind=omp_proc_bind_kind) omp_get_proc_bind
+        end function omp_get_proc_bind
+
+        function omp_get_wtime() bind(c)
+          double precision omp_get_wtime
+        end function omp_get_wtime
+
+        function omp_get_wtick() bind(c)
+          double precision omp_get_wtick
+        end function omp_get_wtick
+
+        function omp_get_default_device() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_default_device
+        end function omp_get_default_device
+
+        subroutine omp_set_default_device(dflt_device) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: dflt_device
+        end subroutine omp_set_default_device
+
+        function omp_get_num_devices() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_num_devices
+        end function omp_get_num_devices
+
+        function omp_get_num_teams() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_num_teams
+        end function omp_get_num_teams
+
+        function omp_get_team_num() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_team_num
+        end function omp_get_team_num
+
+        function omp_is_initial_device() bind(c)
+          import
+          logical (kind=omp_logical_kind) omp_is_initial_device
+        end function omp_is_initial_device
+
+        subroutine omp_init_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_init_lock
+
+        subroutine omp_destroy_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_destroy_lock
+
+        subroutine omp_set_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_set_lock
+
+        subroutine omp_unset_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_lock_kind) lockvar
+        end subroutine omp_unset_lock
+
+        function omp_test_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_lock
+!DIR$ ENDIF
+          import
+          logical (kind=omp_logical_kind) omp_test_lock
+          integer (kind=omp_lock_kind) lockvar
+        end function omp_test_lock
+
+        subroutine omp_init_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_nest_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_init_nest_lock
+
+        subroutine omp_destroy_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_destroy_nest_lock
+
+        subroutine omp_set_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_nest_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_set_nest_lock
+
+        subroutine omp_unset_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_nest_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+        end subroutine omp_unset_nest_lock
+
+        function omp_test_nest_lock(lockvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_nest_lock
+!DIR$ ENDIF
+          import
+          integer (kind=omp_integer_kind) omp_test_nest_lock
+          integer (kind=omp_nest_lock_kind) lockvar
+        end function omp_test_nest_lock
+
+!       ***
+!       *** kmp_* entry points
+!       ***
+
+        subroutine kmp_set_stacksize(size) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: size
+        end subroutine kmp_set_stacksize
+
+        subroutine kmp_set_stacksize_s(size) bind(c)
+          import
+          integer (kind=kmp_size_t_kind), value :: size
+        end subroutine kmp_set_stacksize_s
+
+        subroutine kmp_set_blocktime(msec) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: msec
+        end subroutine kmp_set_blocktime
+
+        subroutine kmp_set_library_serial() bind(c)
+        end subroutine kmp_set_library_serial
+
+        subroutine kmp_set_library_turnaround() bind(c)
+        end subroutine kmp_set_library_turnaround
+
+        subroutine kmp_set_library_throughput() bind(c)
+        end subroutine kmp_set_library_throughput
+
+        subroutine kmp_set_library(libnum) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: libnum
+        end subroutine kmp_set_library
+
+        subroutine kmp_set_defaults(string) bind(c)
+          use, intrinsic :: iso_c_binding
+          character (kind=c_char) :: string(*)
+        end subroutine kmp_set_defaults
+
+        function kmp_get_stacksize() bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_stacksize
+        end function kmp_get_stacksize
+
+        function kmp_get_stacksize_s() bind(c)
+          import
+          integer (kind=kmp_size_t_kind) kmp_get_stacksize_s
+        end function kmp_get_stacksize_s
+
+        function kmp_get_blocktime() bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_blocktime
+        end function kmp_get_blocktime
+
+        function kmp_get_library() bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_library
+        end function kmp_get_library
+
+        function kmp_set_affinity(mask) bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_set_affinity
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_set_affinity
+
+        function kmp_get_affinity(mask) bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_affinity
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_get_affinity
+
+        function kmp_get_affinity_max_proc() bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_affinity_max_proc
+        end function kmp_get_affinity_max_proc
+
+        subroutine kmp_create_affinity_mask(mask) bind(c)
+          import
+          integer (kind=kmp_affinity_mask_kind) mask
+        end subroutine kmp_create_affinity_mask
+
+        subroutine kmp_destroy_affinity_mask(mask) bind(c)
+          import
+          integer (kind=kmp_affinity_mask_kind) mask
+        end subroutine kmp_destroy_affinity_mask
+
+        function kmp_set_affinity_mask_proc(proc, mask) bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc
+          integer (kind=omp_integer_kind), value :: proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_set_affinity_mask_proc
+
+        function kmp_unset_affinity_mask_proc(proc, mask) bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc
+          integer (kind=omp_integer_kind), value :: proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_unset_affinity_mask_proc
+
+        function kmp_get_affinity_mask_proc(proc, mask) bind(c)
+          import
+          integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc
+          integer (kind=omp_integer_kind), value :: proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_get_affinity_mask_proc
+
+        function kmp_malloc(size) bind(c)
+          import
+          integer (kind=kmp_pointer_kind) kmp_malloc
+          integer (kind=kmp_size_t_kind), value :: size
+        end function kmp_malloc
+
+        function kmp_calloc(nelem, elsize) bind(c)
+          import
+          integer (kind=kmp_pointer_kind) kmp_calloc
+          integer (kind=kmp_size_t_kind), value :: nelem
+          integer (kind=kmp_size_t_kind), value :: elsize
+        end function kmp_calloc
+
+        function kmp_realloc(ptr, size) bind(c)
+          import
+          integer (kind=kmp_pointer_kind) kmp_realloc
+          integer (kind=kmp_pointer_kind), value :: ptr
+          integer (kind=kmp_size_t_kind), value :: size
+        end function kmp_realloc
+
+        subroutine kmp_free(ptr) bind(c)
+          import
+          integer (kind=kmp_pointer_kind), value :: ptr
+        end subroutine kmp_free
+
+        subroutine kmp_set_warnings_on() bind(c)
+        end subroutine kmp_set_warnings_on
+
+        subroutine kmp_set_warnings_off() bind(c)
+        end subroutine kmp_set_warnings_off
+
+        subroutine kmp_init_lock_hinted(lockvar, lockhint) bind(c)
+          import
+          integer (kind=omp_lock_kind) lockvar
+          integer (kind=kmp_lock_hint_kind), value :: lockhint
+        end subroutine kmp_init_lock_hinted
+
+        subroutine kmp_init_nest_lock_hinted(lockvar, lockhint) bind(c)
+          import
+          integer (kind=omp_nest_lock_kind) lockvar
+          integer (kind=kmp_lock_hint_kind), value :: lockhint
+        end subroutine kmp_init_nest_lock_hinted
+
+      end interface
+
+!DIR$ IF DEFINED (__INTEL_OFFLOAD)
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_num_threads
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_dynamic
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_nested
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_threads
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_max_threads
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_thread_num
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_procs
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_in_parallel
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_in_final
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_dynamic
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_nested
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_thread_limit
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_max_active_levels
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_max_active_levels
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_level
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_active_level
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_ancestor_thread_num
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_team_size
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_schedule
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_schedule
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_proc_bind
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_wtime
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_wtick
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_default_device
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_default_device
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_is_initial_device
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_devices
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_teams
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_team_num
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_init_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_destroy_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_unset_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_test_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_init_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_destroy_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_unset_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_test_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_stacksize
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_stacksize_s
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_blocktime
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library_serial
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library_turnaround
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library_throughput
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_defaults
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_stacksize
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_stacksize_s
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_blocktime
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_library
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_affinity
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_affinity
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_affinity_max_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_create_affinity_mask
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_destroy_affinity_mask
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_affinity_mask_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_unset_affinity_mask_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_affinity_mask_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_malloc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_calloc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_realloc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_free
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_warnings_on
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_warnings_off
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_init_lock_hinted
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_init_nest_lock_hinted
+
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!$omp declare target(omp_set_num_threads )
+!$omp declare target(omp_set_dynamic )
+!$omp declare target(omp_set_nested )
+!$omp declare target(omp_get_num_threads )
+!$omp declare target(omp_get_max_threads )
+!$omp declare target(omp_get_thread_num )
+!$omp declare target(omp_get_num_procs )
+!$omp declare target(omp_in_parallel )
+!$omp declare target(omp_in_final )
+!$omp declare target(omp_get_dynamic )
+!$omp declare target(omp_get_nested )
+!$omp declare target(omp_get_thread_limit )
+!$omp declare target(omp_set_max_active_levels )
+!$omp declare target(omp_get_max_active_levels )
+!$omp declare target(omp_get_level )
+!$omp declare target(omp_get_active_level )
+!$omp declare target(omp_get_ancestor_thread_num )
+!$omp declare target(omp_get_team_size )
+!$omp declare target(omp_set_schedule )
+!$omp declare target(omp_get_schedule )
+!$omp declare target(omp_get_proc_bind )
+!$omp declare target(omp_get_wtime )
+!$omp declare target(omp_get_wtick )
+!$omp declare target(omp_get_default_device )
+!$omp declare target(omp_set_default_device )
+!$omp declare target(omp_is_initial_device )
+!$omp declare target(omp_get_num_devices )
+!$omp declare target(omp_get_num_teams )
+!$omp declare target(omp_get_team_num )
+!$omp declare target(omp_init_lock )
+!$omp declare target(omp_destroy_lock )
+!$omp declare target(omp_set_lock )
+!$omp declare target(omp_unset_lock )
+!$omp declare target(omp_test_lock )
+!$omp declare target(omp_init_nest_lock )
+!$omp declare target(omp_destroy_nest_lock )
+!$omp declare target(omp_set_nest_lock )
+!$omp declare target(omp_unset_nest_lock )
+!$omp declare target(omp_test_nest_lock )
+!$omp declare target(kmp_set_stacksize )
+!$omp declare target(kmp_set_stacksize_s )
+!$omp declare target(kmp_set_blocktime )
+!$omp declare target(kmp_set_library_serial )
+!$omp declare target(kmp_set_library_turnaround )
+!$omp declare target(kmp_set_library_throughput )
+!$omp declare target(kmp_set_library )
+!$omp declare target(kmp_set_defaults )
+!$omp declare target(kmp_get_stacksize )
+!$omp declare target(kmp_get_stacksize_s )
+!$omp declare target(kmp_get_blocktime )
+!$omp declare target(kmp_get_library )
+!$omp declare target(kmp_set_affinity )
+!$omp declare target(kmp_get_affinity )
+!$omp declare target(kmp_get_affinity_max_proc )
+!$omp declare target(kmp_create_affinity_mask )
+!$omp declare target(kmp_destroy_affinity_mask )
+!$omp declare target(kmp_set_affinity_mask_proc )
+!$omp declare target(kmp_unset_affinity_mask_proc )
+!$omp declare target(kmp_get_affinity_mask_proc )
+!$omp declare target(kmp_malloc )
+!$omp declare target(kmp_calloc )
+!$omp declare target(kmp_realloc )
+!$omp declare target(kmp_free )
+!$omp declare target(kmp_set_warnings_on )
+!$omp declare target(kmp_set_warnings_off )
+!$omp declare target(kmp_init_lock_hinted )
+!$omp declare target(kmp_init_nest_lock_hinted )
+!DIR$ ENDIF
+!DIR$ ENDIF
+
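[editor's note] The block above is the fixed-form include-file twin of the
module: the same kmp_lock_hint_* constants and kmp_* extensions, declared with
import and double precision instead of omp_lib_kinds. A hedged C sketch of the
hinted-lock and allocator extensions follows (assumption: this release's
companion omp.h declares kmp_lock_hint_t, kmp_init_lock_hinted, kmp_malloc and
kmp_free with the prototypes used below; these are runtime-specific
extensions, not portable OpenMP):

    #include <omp.h>

    int main(void) {
        omp_lock_t lk;
        /* hint constant 4 (kmp_lock_hint_speculative) in the table above */
        kmp_init_lock_hinted(&lk, kmp_lock_hint_speculative);
        omp_set_lock(&lk);
        omp_unset_lock(&lk);
        omp_destroy_lock(&lk);

        void *p = kmp_malloc(1024);   /* runtime's fast allocator, size_t arg */
        kmp_free(p);
        return 0;
    }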
diff --git a/final/runtime/src/include/41/ompt.h.var b/final/runtime/src/include/41/ompt.h.var
new file mode 100644
index 0000000..e241daa
--- /dev/null
+++ b/final/runtime/src/include/41/ompt.h.var
@@ -0,0 +1,472 @@
+/*
+ * include/41/ompt.h.var
+ */
+
+#ifndef __OMPT__
+#define __OMPT__
+
+/*****************************************************************************
+ * system include files
+ *****************************************************************************/
+
+#include <stdint.h>
+
+
+
+/*****************************************************************************
+ * iteration macros
+ *****************************************************************************/
+
+#define FOREACH_OMPT_INQUIRY_FN(macro)  \
+    macro (ompt_enumerate_state)        \
+                                        \
+    macro (ompt_set_callback)           \
+    macro (ompt_get_callback)           \
+                                        \
+    macro (ompt_get_idle_frame)         \
+    macro (ompt_get_task_frame)         \
+                                        \
+    macro (ompt_get_state)              \
+                                        \
+    macro (ompt_get_parallel_id)        \
+    macro (ompt_get_parallel_team_size) \
+    macro (ompt_get_task_id)            \
+    macro (ompt_get_thread_id)
+
+#define FOREACH_OMPT_PLACEHOLDER_FN(macro)  \
+    macro (ompt_idle)                       \
+    macro (ompt_overhead)                   \
+    macro (ompt_barrier_wait)               \
+    macro (ompt_task_wait)                  \
+    macro (ompt_mutex_wait)
+
+#define FOREACH_OMPT_STATE(macro)                                                               \
+                                                                                                \
+    /* first */                                                                                 \
+    macro (ompt_state_first, 0x71)          /* initial enumeration state */                     \
+                                                                                                \
+    /* work states (0..15) */                                                                   \
+    macro (ompt_state_work_serial, 0x00)    /* working outside parallel */                      \
+    macro (ompt_state_work_parallel, 0x01)  /* working within parallel */                       \
+    macro (ompt_state_work_reduction, 0x02) /* performing a reduction */                        \
+                                                                                                \
+    /* idle (16..31) */                                                                         \
+    macro (ompt_state_idle, 0x10)            /* waiting for work */                             \
+                                                                                                \
+    /* overhead states (32..63) */                                                              \
+    macro (ompt_state_overhead, 0x20)        /* overhead excluding wait states */               \
+                                                                                                \
+    /* barrier wait states (64..79) */                                                          \
+    macro (ompt_state_wait_barrier, 0x40)    /* waiting at a barrier */                         \
+    macro (ompt_state_wait_barrier_implicit, 0x41)    /* implicit barrier */                    \
+    macro (ompt_state_wait_barrier_explicit, 0x42)    /* explicit barrier */                    \
+                                                                                                \
+    /* task wait states (80..95) */                                                             \
+    macro (ompt_state_wait_taskwait, 0x50)   /* waiting at a taskwait */                        \
+    macro (ompt_state_wait_taskgroup, 0x51)  /* waiting at a taskgroup */                       \
+                                                                                                \
+    /* mutex wait states (96..111) */                                                           \
+    macro (ompt_state_wait_lock, 0x60)       /* waiting for lock */                             \
+    macro (ompt_state_wait_nest_lock, 0x61)  /* waiting for nest lock */                        \
+    macro (ompt_state_wait_critical, 0x62)   /* waiting for critical */                         \
+    macro (ompt_state_wait_atomic, 0x63)     /* waiting for atomic */                           \
+    macro (ompt_state_wait_ordered, 0x64)    /* waiting for ordered */                          \
+    macro (ompt_state_wait_single, 0x6F)     /* waiting for single region (non-standard!) */    \
+                                                                                                \
+    /* misc (112..127) */                                                                       \
+    macro (ompt_state_undefined, 0x70)       /* undefined thread state */
+
+
+#define FOREACH_OMPT_EVENT(macro)                                                                               \
+                                                                                                                \
+    /*--- Mandatory Events ---*/                                                                                \
+    macro (ompt_event_parallel_begin,           ompt_new_parallel_callback_t,   1) /* parallel begin */         \
+    macro (ompt_event_parallel_end,             ompt_parallel_callback_t,       2) /* parallel end */           \
+                                                                                                                \
+    macro (ompt_event_task_begin,               ompt_new_task_callback_t,       3) /* task begin */             \
+    macro (ompt_event_task_end,                 ompt_task_callback_t,           4) /* task destroy */           \
+                                                                                                                \
+    macro (ompt_event_thread_begin,             ompt_thread_type_callback_t,    5) /* thread begin */           \
+    macro (ompt_event_thread_end,               ompt_thread_type_callback_t,    6) /* thread end */             \
+                                                                                                                \
+    macro (ompt_event_control,                  ompt_control_callback_t,        7) /* support control calls */  \
+                                                                                                                \
+    macro (ompt_event_runtime_shutdown,         ompt_callback_t,                8) /* runtime shutdown */       \
+                                                                                                                \
+    /*--- Optional Events (blame shifting, ompt_event_unimplemented) ---*/                                      \
+    macro (ompt_event_idle_begin,               ompt_thread_callback_t,         9) /* begin idle state */       \
+    macro (ompt_event_idle_end,                 ompt_thread_callback_t,        10) /* end idle state */         \
+                                                                                                                \
+    macro (ompt_event_wait_barrier_begin,       ompt_parallel_callback_t,      11) /* begin wait at barrier */  \
+    macro (ompt_event_wait_barrier_end,         ompt_parallel_callback_t,      12) /* end wait at barrier */    \
+                                                                                                                \
+    macro (ompt_event_wait_taskwait_begin,      ompt_parallel_callback_t,      13) /* begin wait at taskwait */ \
+    macro (ompt_event_wait_taskwait_end,        ompt_parallel_callback_t,      14) /* end wait at taskwait */   \
+                                                                                                                \
+    macro (ompt_event_wait_taskgroup_begin,     ompt_parallel_callback_t,      15) /* begin wait at taskgroup */\
+    macro (ompt_event_wait_taskgroup_end,       ompt_parallel_callback_t,      16) /* end wait at taskgroup */  \
+                                                                                                                \
+    macro (ompt_event_release_lock,             ompt_wait_callback_t,          17) /* lock release */           \
+    macro (ompt_event_release_nest_lock_last,   ompt_wait_callback_t,          18) /* last nest lock release */ \
+    macro (ompt_event_release_critical,         ompt_wait_callback_t,          19) /* critical release */       \
+                                                                                                                \
+    macro (ompt_event_release_atomic,           ompt_wait_callback_t,          20) /* atomic release */         \
+                                                                                                                \
+    macro (ompt_event_release_ordered,          ompt_wait_callback_t,          21) /* ordered release */        \
+                                                                                                                \
+    /*--- Optional Events (synchronous events, ompt_event_unimplemented) --- */                                 \
+    macro (ompt_event_implicit_task_begin,      ompt_parallel_callback_t,      22) /* implicit task begin   */  \
+    macro (ompt_event_implicit_task_end,        ompt_parallel_callback_t,      23) /* implicit task end  */     \
+                                                                                                                \
+    macro (ompt_event_initial_task_begin,       ompt_parallel_callback_t,      24) /* initial task begin   */   \
+    macro (ompt_event_initial_task_end,         ompt_parallel_callback_t,      25) /* initial task end  */      \
+                                                                                                                \
+    macro (ompt_event_task_switch,              ompt_task_switch_callback_t,   26) /* task switch */            \
+                                                                                                                \
+    macro (ompt_event_loop_begin,               ompt_new_workshare_callback_t, 27) /* task at loop begin */     \
+    macro (ompt_event_loop_end,                 ompt_parallel_callback_t,      28) /* task at loop end */       \
+                                                                                                                \
+    macro (ompt_event_sections_begin,           ompt_new_workshare_callback_t, 29) /* task at sections begin  */\
+    macro (ompt_event_sections_end,             ompt_parallel_callback_t,      30) /* task at sections end */   \
+                                                                                                                \
+    macro (ompt_event_single_in_block_begin,    ompt_new_workshare_callback_t, 31) /* task at single begin */   \
+    macro (ompt_event_single_in_block_end,      ompt_parallel_callback_t,      32) /* task at single end */     \
+                                                                                                                \
+    macro (ompt_event_single_others_begin,      ompt_parallel_callback_t,      33) /* task at single begin */   \
+    macro (ompt_event_single_others_end,        ompt_parallel_callback_t,      34) /* task at single end */     \
+                                                                                                                \
+    macro (ompt_event_workshare_begin,          ompt_new_workshare_callback_t, 35) /* task at workshare begin */\
+    macro (ompt_event_workshare_end,            ompt_parallel_callback_t,      36) /* task at workshare end */  \
+                                                                                                                \
+    macro (ompt_event_master_begin,             ompt_parallel_callback_t,      37) /* task at master begin */   \
+    macro (ompt_event_master_end,               ompt_parallel_callback_t,      38) /* task at master end */     \
+                                                                                                                \
+    macro (ompt_event_barrier_begin,            ompt_parallel_callback_t,      39) /* task at barrier begin  */ \
+    macro (ompt_event_barrier_end,              ompt_parallel_callback_t,      40) /* task at barrier end */    \
+                                                                                                                \
+    macro (ompt_event_taskwait_begin,           ompt_parallel_callback_t,      41) /* task at taskwait begin */ \
+    macro (ompt_event_taskwait_end,             ompt_parallel_callback_t,      42) /* task at taskwait end */   \
+                                                                                                                \
+    macro (ompt_event_taskgroup_begin,          ompt_parallel_callback_t,      43) /* task at taskgroup begin */\
+    macro (ompt_event_taskgroup_end,            ompt_parallel_callback_t,      44) /* task at taskgroup end */  \
+                                                                                                                \
+    macro (ompt_event_release_nest_lock_prev,   ompt_wait_callback_t,          45) /* prev nest lock release */ \
+                                                                                                                \
+    macro (ompt_event_wait_lock,                ompt_wait_callback_t,          46) /* lock wait */              \
+    macro (ompt_event_wait_nest_lock,           ompt_wait_callback_t,          47) /* nest lock wait */         \
+    macro (ompt_event_wait_critical,            ompt_wait_callback_t,          48) /* critical wait */          \
+    macro (ompt_event_wait_atomic,              ompt_wait_callback_t,          49) /* atomic wait */            \
+    macro (ompt_event_wait_ordered,             ompt_wait_callback_t,          50) /* ordered wait */           \
+                                                                                                                \
+    macro (ompt_event_acquired_lock,            ompt_wait_callback_t,          51) /* lock acquired */          \
+    macro (ompt_event_acquired_nest_lock_first, ompt_wait_callback_t,          52) /* 1st nest lock acquired */ \
+    macro (ompt_event_acquired_nest_lock_next,  ompt_wait_callback_t,          53) /* next nest lock acquired*/ \
+    macro (ompt_event_acquired_critical,        ompt_wait_callback_t,          54) /* critical acquired */      \
+    macro (ompt_event_acquired_atomic,          ompt_wait_callback_t,          55) /* atomic acquired */        \
+    macro (ompt_event_acquired_ordered,         ompt_wait_callback_t,          56) /* ordered acquired */       \
+                                                                                                                \
+    macro (ompt_event_init_lock,                ompt_wait_callback_t,          57) /* lock init */              \
+    macro (ompt_event_init_nest_lock,           ompt_wait_callback_t,          58) /* nest lock init */         \
+                                                                                                                \
+    macro (ompt_event_destroy_lock,             ompt_wait_callback_t,          59) /* lock destruction */       \
+    macro (ompt_event_destroy_nest_lock,        ompt_wait_callback_t,          60) /* nest lock destruction */  \
+                                                                                                                \
+    macro (ompt_event_flush,                    ompt_callback_t,               61) /* after executing flush */
+
+
+
+/*****************************************************************************
+ * data types
+ *****************************************************************************/
+
+/*---------------------
+ * identifiers
+ *---------------------*/
+
+typedef uint64_t ompt_thread_id_t;
+#define ompt_thread_id_none ((ompt_thread_id_t) 0)     /* non-standard */
+
+typedef uint64_t ompt_task_id_t;
+#define ompt_task_id_none ((ompt_task_id_t) 0)         /* non-standard */
+
+typedef uint64_t ompt_parallel_id_t;
+#define ompt_parallel_id_none ((ompt_parallel_id_t) 0) /* non-standard */
+
+typedef uint64_t ompt_wait_id_t;
+#define ompt_wait_id_none ((ompt_wait_id_t) 0)         /* non-standard */
+
+
+/*---------------------
+ * ompt_frame_t
+ *---------------------*/
+
+typedef struct ompt_frame_s {
+    void *exit_runtime_frame;    /* next frame is user code     */
+    void *reenter_runtime_frame; /* previous frame is user code */
+} ompt_frame_t;
+
+
+/*****************************************************************************
+ * enumerations for thread states and runtime events
+ *****************************************************************************/
+
+/*---------------------
+ * runtime states
+ *---------------------*/
+
+typedef enum {
+#define ompt_state_macro(state, code) state = code,
+    FOREACH_OMPT_STATE(ompt_state_macro)
+#undef ompt_state_macro
+} ompt_state_t;
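+/* note (editorial, not in the upstream header): via the X-macro above this
+ * enum expands to, e.g.,
+ *     ompt_state_first = 0x71, ompt_state_work_serial = 0x00, ...
+ * so each state and its code are defined in exactly one place. */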
+
+
+/*---------------------
+ * runtime events
+ *---------------------*/
+
+typedef enum {
+#define ompt_event_macro(event, callback, eventid) event = eventid,
+    FOREACH_OMPT_EVENT(ompt_event_macro)
+#undef ompt_event_macro
+} ompt_event_t;
+
+
+/*---------------------
+ * set callback results
+ *---------------------*/
+typedef enum {
+    ompt_set_result_registration_error              = 0,
+    ompt_set_result_event_may_occur_no_callback     = 1,
+    ompt_set_result_event_never_occurs              = 2,
+    ompt_set_result_event_may_occur_callback_some   = 3,
+    ompt_set_result_event_may_occur_callback_always = 4,
+} ompt_set_result_t;
+
+
+
+/*****************************************************************************
+ * callback signatures
+ *****************************************************************************/
+
+/* initialization */
+typedef void (*ompt_interface_fn_t)(void);
+
+typedef ompt_interface_fn_t (*ompt_function_lookup_t)(
+    const char *                      /* entry point to look up       */
+);
+
+/* threads */
+typedef void (*ompt_thread_callback_t) (
+    ompt_thread_id_t thread_id        /* ID of thread                 */
+);
+
+typedef enum {
+    ompt_thread_initial = 1,  /* start the enumeration at 1 */
+    ompt_thread_worker  = 2,
+    ompt_thread_other   = 3
+} ompt_thread_type_t;
+
+typedef void (*ompt_thread_type_callback_t) (
+    ompt_thread_type_t thread_type,   /* type of thread               */
+    ompt_thread_id_t thread_id        /* ID of thread                 */
+);
+
+typedef void (*ompt_wait_callback_t) (
+    ompt_wait_id_t wait_id            /* wait id                      */
+);
+
+/* parallel and workshares */
+typedef void (*ompt_parallel_callback_t) (
+    ompt_parallel_id_t parallel_id,    /* id of parallel region       */
+    ompt_task_id_t task_id             /* id of task                  */
+);
+
+typedef void (*ompt_new_workshare_callback_t) (
+    ompt_parallel_id_t parallel_id,   /* id of parallel region        */
+    ompt_task_id_t parent_task_id,    /* id of parent task            */
+    void *workshare_function          /* pointer to outlined function */
+);
+
+typedef void (*ompt_new_parallel_callback_t) (
+    ompt_task_id_t parent_task_id,    /* id of parent task            */
+    ompt_frame_t *parent_task_frame,  /* frame data of parent task    */
+    ompt_parallel_id_t parallel_id,   /* id of parallel region        */
+    uint32_t requested_team_size,     /* number of threads in team    */
+    void *parallel_function           /* pointer to outlined function */
+);
+
+/* tasks */
+typedef void (*ompt_task_callback_t) (
+    ompt_task_id_t task_id            /* id of task                   */
+);
+
+typedef void (*ompt_task_switch_callback_t) (
+    ompt_task_id_t suspended_task_id, /* tool data for suspended task */
+    ompt_task_id_t resumed_task_id    /* tool data for resumed task   */
+);
+
+typedef void (*ompt_new_task_callback_t) (
+    ompt_task_id_t parent_task_id,    /* id of parent task            */
+    ompt_frame_t *parent_task_frame,  /* frame data for parent task   */
+    ompt_task_id_t  new_task_id,      /* id of created task           */
+    void *task_function               /* pointer to outlined function */
+);
+
+/* program */
+typedef void (*ompt_control_callback_t) (
+    uint64_t command,                 /* command of control call      */
+    uint64_t modifier                 /* modifier of control call     */
+);
+
+typedef void (*ompt_callback_t)(void);
+
+
+/****************************************************************************
+ * ompt API
+ ***************************************************************************/
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define OMPT_API_FNTYPE(fn) fn##_t
+
+#define OMPT_API_FUNCTION(return_type, fn, args)  \
+    typedef return_type (*OMPT_API_FNTYPE(fn)) args
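+/* note (editorial): OMPT_API_FUNCTION(int, ompt_set_callback, (...)) expands
+ * to "typedef int (*ompt_set_callback_t)(...);" -- each API entry below gets
+ * a matching function-pointer type for use with the lookup mechanism. */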
+
+
+
+/****************************************************************************
+ * INQUIRY FUNCTIONS
+ ***************************************************************************/
+
+/* state */
+OMPT_API_FUNCTION(ompt_state_t, ompt_get_state, (
+    ompt_wait_id_t *ompt_wait_id
+));
+
+/* thread */
+OMPT_API_FUNCTION(ompt_thread_id_t, ompt_get_thread_id, (void));
+
+OMPT_API_FUNCTION(void *, ompt_get_idle_frame, (void));
+
+/* parallel region */
+OMPT_API_FUNCTION(ompt_parallel_id_t, ompt_get_parallel_id, (
+    int ancestor_level
+));
+
+OMPT_API_FUNCTION(int, ompt_get_parallel_team_size, (
+    int ancestor_level
+));
+
+/* task */
+OMPT_API_FUNCTION(ompt_task_id_t, ompt_get_task_id, (
+    int depth
+));
+
+OMPT_API_FUNCTION(ompt_frame_t *, ompt_get_task_frame, (
+    int depth
+));
+
+
+
+/****************************************************************************
+ * PLACEHOLDERS FOR PERFORMANCE REPORTING
+ ***************************************************************************/
+
+/* idle */
+OMPT_API_FUNCTION(void, ompt_idle, (
+    void
+));
+
+/* overhead */
+OMPT_API_FUNCTION(void, ompt_overhead, (
+    void
+));
+
+/* barrier wait */
+OMPT_API_FUNCTION(void, ompt_barrier_wait, (
+    void
+));
+
+/* task wait */
+OMPT_API_FUNCTION(void, ompt_task_wait, (
+    void
+));
+
+/* mutex wait */
+OMPT_API_FUNCTION(void, ompt_mutex_wait, (
+    void
+));
+
+
+
+/****************************************************************************
+ * INITIALIZATION FUNCTIONS
+ ***************************************************************************/
+
+/* initialization interface to be defined by tool */
+int ompt_initialize(
+    ompt_function_lookup_t ompt_fn_lookup,
+    const char *runtime_version,
+    unsigned int ompt_version
+);
+
+typedef enum ompt_init_mode_e {
+    ompt_init_mode_never  = 0,
+    ompt_init_mode_false  = 1,
+    ompt_init_mode_true   = 2,
+    ompt_init_mode_always = 3
+} ompt_init_mode_t;
+
+OMPT_API_FUNCTION(int, ompt_set_callback, (
+    ompt_event_t event,
+    ompt_callback_t callback
+));
+
+typedef enum ompt_set_callback_rc_e {  /* non-standard */
+    ompt_set_callback_error      = 0,
+    ompt_has_event_no_callback   = 1,
+    ompt_no_event_no_callback    = 2,
+    ompt_has_event_may_callback  = 3,
+    ompt_has_event_must_callback = 4,
+} ompt_set_callback_rc_t;
+
+
+OMPT_API_FUNCTION(int, ompt_get_callback, (
+    ompt_event_t event,
+    ompt_callback_t *callback
+));
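+/*
+ * Illustrative sketch (not normative): a tool defines ompt_initialize(),
+ * uses the lookup function to obtain runtime entry points, and registers
+ * callbacks.  The event name ompt_event_thread_begin and its signature are
+ * assumptions here, taken from the OMPT event list.
+ *
+ *     static void my_thread_begin(ompt_thread_type_t type,
+ *                                 ompt_thread_id_t thread_id) {
+ *         // per-thread tool bookkeeping goes here
+ *     }
+ *
+ *     int ompt_initialize(ompt_function_lookup_t lookup,
+ *                         const char *runtime_version,
+ *                         unsigned int ompt_version) {
+ *         ompt_set_callback_t set_callback =
+ *             (ompt_set_callback_t) lookup("ompt_set_callback");
+ *         set_callback(ompt_event_thread_begin,
+ *                      (ompt_callback_t) my_thread_begin);
+ *         return 1;   // nonzero: a tool is present
+ *     }
+ */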
+
+
+
+/****************************************************************************
+ * MISCELLANEOUS FUNCTIONS
+ ***************************************************************************/
+
+/* control */
+#if defined(_OPENMP) && (_OPENMP >= 201307)
+#pragma omp declare target
+#endif
+void ompt_control(
+    uint64_t command,
+    uint64_t modifier
+);
+#if defined(_OPENMP) && (_OPENMP >= 201307)
+#pragma omp end declare target
+#endif
+
+/* state enumeration */
+OMPT_API_FUNCTION(int, ompt_enumerate_state, (
+    int current_state,
+    int *next_state,
+    const char **next_state_name
+));
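+/*
+ * Illustrative sketch (assumes ompt_state_first from the runtime's state
+ * list, and the lookup function passed to ompt_initialize()): walking every
+ * state the runtime can report, together with its printable name:
+ *
+ *     ompt_enumerate_state_t enumerate =
+ *         (ompt_enumerate_state_t) lookup("ompt_enumerate_state");
+ *     int state = ompt_state_first;
+ *     int next;
+ *     const char *name;
+ *     while (enumerate(state, &next, &name)) {
+ *         // record (next, name); stop when enumerate() returns 0
+ *         state = next;
+ *     }
+ */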
+
+#ifdef  __cplusplus
+};
+#endif
+
+#endif
+
diff --git a/final/runtime/src/kmp.h b/final/runtime/src/kmp.h
new file mode 100644
index 0000000..9f09290
--- /dev/null
+++ b/final/runtime/src/kmp.h
@@ -0,0 +1,3464 @@
+/*! \file */
+/*
+ * kmp.h -- KPTS runtime header file.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_H
+#define KMP_H
+
+/* #define BUILD_PARALLEL_ORDERED 1 */
+
+/* This fix replaces gettimeofday with clock_gettime for better scalability on
+   the Altix.  Requires user code to be linked with -lrt.
+*/
+//#define FIX_SGI_CLOCK
+
+/* Defines for OpenMP 3.0 tasking and auto scheduling */
+
+# ifndef KMP_STATIC_STEAL_ENABLED
+#  define KMP_STATIC_STEAL_ENABLED 1
+# endif
+
+#define TASK_CURRENT_NOT_QUEUED  0
+#define TASK_CURRENT_QUEUED      1
+
+#define TASK_DEQUE_BITS          8  // Used solely to define TASK_DEQUE_SIZE and TASK_DEQUE_MASK.
+#define TASK_DEQUE_SIZE          ( 1 << TASK_DEQUE_BITS )
+#define TASK_DEQUE_MASK          ( TASK_DEQUE_SIZE - 1 )
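+// For example, with TASK_DEQUE_BITS == 8 the deque holds 256 entries and an
+// index advances with a mask instead of a modulo (illustrative sketch):
+//     tail = ( tail + 1 ) & TASK_DEQUE_MASK;   // 255 wraps back to 0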
+
+#ifdef BUILD_TIED_TASK_STACK
+#define TASK_STACK_EMPTY         0  // number of entries when the stack is empty
+
+#define TASK_STACK_BLOCK_BITS    5  // Used to define TASK_STACK_SIZE and TASK_STACK_MASK
+#define TASK_STACK_BLOCK_SIZE    ( 1 << TASK_STACK_BLOCK_BITS ) // Number of entries in each task stack array
+#define TASK_STACK_INDEX_MASK    ( TASK_STACK_BLOCK_SIZE - 1 )  // Mask for determining index into stack block
+#endif // BUILD_TIED_TASK_STACK
+
+#define TASK_NOT_PUSHED          1
+#define TASK_SUCCESSFULLY_PUSHED 0
+#define TASK_TIED                1
+#define TASK_UNTIED              0
+#define TASK_EXPLICIT            1
+#define TASK_IMPLICIT            0
+#define TASK_PROXY               1
+#define TASK_FULL                0
+
+#define KMP_CANCEL_THREADS
+#define KMP_THREAD_ATTR
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <string.h>
+#include <signal.h>
+/*  <ctype.h> is deliberately not included: it causes problems with /MD on Windows* OS due to a defective Microsoft library  */
+/*  some macros are provided below to replace the functions it would otherwise supply  */
+#ifndef __ABSOFT_WIN
+#include <sys/types.h>
+#endif
+#include <limits.h>
+#include <time.h>
+
+#include <errno.h>
+
+#include "kmp_os.h"
+
+#if KMP_STATS_ENABLED
+class kmp_stats_list;
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#include <xmmintrin.h>
+#endif
+
+#include "kmp_version.h"
+#include "kmp_debug.h"
+#include "kmp_lock.h"
+#if USE_DEBUGGER
+#include "kmp_debugger.h"
+#endif
+#include "kmp_i18n.h"
+
+#define KMP_HANDLE_SIGNALS (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN)
+
+#ifdef KMP_SETVERSION
+/*  from factory/Include, to get VERSION_STRING embedded for 'what'  */
+#include "kaiconfig.h"
+#include "eye.h"
+#include "own.h"
+#include "setversion.h"
+#endif
+
+#include "kmp_wrapper_malloc.h"
+#if KMP_OS_UNIX
+# include <unistd.h>
+# if !defined NSIG && defined _NSIG
+#  define NSIG _NSIG
+# endif
+#endif
+
+#if KMP_OS_LINUX
+# pragma weak clock_gettime
+#endif
+
+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#endif
+
+/* Select data placement in NUMA memory */
+#define NO_FIRST_TOUCH 0
+#define FIRST_TOUCH 1       /* Exploit SGI's first touch page placement algo */
+
+/* If not specified on compile command line, assume no first touch */
+#ifndef BUILD_MEMORY
+#define BUILD_MEMORY NO_FIRST_TOUCH
+#endif
+
+// 0 - no fast memory allocation, alignment: 8-byte on x86, 16-byte on x64.
+// 3 - fast allocation using sync, non-sync free lists of any size, non-self free lists of limited size.
+#ifndef USE_FAST_MEMORY
+#define USE_FAST_MEMORY 3
+#endif
+
+#ifndef KMP_NESTED_HOT_TEAMS
+# define KMP_NESTED_HOT_TEAMS 0
+# define USE_NESTED_HOT_ARG(x)
+#else
+# if KMP_NESTED_HOT_TEAMS
+#  if OMP_40_ENABLED
+#   define USE_NESTED_HOT_ARG(x) ,x
+#  else
+// Nested hot teams feature depends on omp 4.0, disable it for earlier versions
+#   undef KMP_NESTED_HOT_TEAMS
+#   define KMP_NESTED_HOT_TEAMS 0
+#   define USE_NESTED_HOT_ARG(x)
+#  endif
+# else
+#  define USE_NESTED_HOT_ARG(x)
+# endif
+#endif
+
+// By default, assume BGET uses a compare_exchange instruction instead of a lock.
+#ifndef USE_CMP_XCHG_FOR_BGET
+#define USE_CMP_XCHG_FOR_BGET 1
+#endif
+
+// Test to see if queuing lock is better than bootstrap lock for bget
+// #ifndef USE_QUEUING_LOCK_FOR_BGET
+// #define USE_QUEUING_LOCK_FOR_BGET
+// #endif
+
+#define KMP_NSEC_PER_SEC 1000000000L
+#define KMP_USEC_PER_SEC 1000000L
+
+/*!
+@ingroup BASIC_TYPES
+@{
+*/
+
+// FIXME DOXYGEN... need to group these flags somehow (Making them an anonymous enum would do it...)
+/*!
+Values for bit flags used in the ident_t to describe the fields.
+*/
+/*! Use trampoline for internal microtasks */
+#define KMP_IDENT_IMB             0x01
+/*! Use c-style ident structure */
+#define KMP_IDENT_KMPC            0x02
+/* 0x04 is no longer used */
+/*! Entry point generated by auto-parallelization */
+#define KMP_IDENT_AUTOPAR         0x08
+/*! Compiler generates atomic reduction option for kmpc_reduce* */
+#define KMP_IDENT_ATOMIC_REDUCE   0x10
+/*! To mark a 'barrier' directive in user code */
+#define KMP_IDENT_BARRIER_EXPL    0x20
+/*! To mark implicit barriers. */
+#define KMP_IDENT_BARRIER_IMPL           0x0040
+#define KMP_IDENT_BARRIER_IMPL_MASK      0x01C0
+#define KMP_IDENT_BARRIER_IMPL_FOR       0x0040
+#define KMP_IDENT_BARRIER_IMPL_SECTIONS  0x00C0
+
+#define KMP_IDENT_BARRIER_IMPL_SINGLE    0x0140
+#define KMP_IDENT_BARRIER_IMPL_WORKSHARE 0x01C0
+
+/*!
+ * The ident structure that describes a source location.
+ */
+typedef struct ident {
+    kmp_int32 reserved_1;   /**<  might be used in Fortran; see above  */
+    kmp_int32 flags;        /**<  also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC identifies this union member  */
+    kmp_int32 reserved_2;   /**<  not really used in Fortran any more; see above */
+#if USE_ITT_BUILD
+                            /*  but currently used for storing region-specific ITT */
+                            /*  contextual information. */
+#endif /* USE_ITT_BUILD */
+    kmp_int32 reserved_3;   /**< source[4] in Fortran, do not use for C++  */
+    char const *psource;    /**< String describing the source location.
+                            The string is composed of semi-colon separated fields which describe the source file,
+                            the function and a pair of line numbers that delimit the construct.
+                             */
+} ident_t;
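+/* Illustrative example (the exact field layout is an assumption based on the
+   description above): a psource string such as
+       ";file.c;my_func;12;18;;"
+   names the source file, the enclosing function, and the pair of line
+   numbers delimiting the construct. */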
+/*!
+@}
+*/
+
+// Some forward declarations.
+
+typedef union  kmp_team      kmp_team_t;
+typedef struct kmp_taskdata  kmp_taskdata_t;
+typedef union  kmp_task_team kmp_task_team_t;
+typedef union  kmp_team      kmp_team_p;
+typedef union  kmp_info      kmp_info_p;
+typedef union  kmp_root      kmp_root_p;
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* Pack two 32-bit signed integers into a 64-bit signed integer */
+/* ToDo: Fix word ordering for big-endian machines. */
+#define KMP_PACK_64(HIGH_32,LOW_32) \
+    ( (kmp_int64) ((((kmp_uint64)(HIGH_32))<<32) | (kmp_uint64)(LOW_32)) )
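+/* Illustrative sketch: packing two halves and recovering them again:
+       kmp_int64 packed = KMP_PACK_64( hi, lo );
+       kmp_int32 hi_out = (kmp_int32)( (kmp_uint64)packed >> 32 );
+       kmp_int32 lo_out = (kmp_int32)( packed & 0xFFFFFFFF );
+*/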
+
+
+/*
+ * Generic string manipulation macros.
+ * Assume that _x is of type char *
+ */
+#define SKIP_WS(_x)     { while (*(_x) == ' ' || *(_x) == '\t') (_x)++; }
+#define SKIP_DIGITS(_x) { while (*(_x) >= '0' && *(_x) <= '9') (_x)++; }
+#define SKIP_TO(_x,_c)  { while (*(_x) != '\0' && *(_x) != (_c)) (_x)++; }
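+/* Illustrative sketch (buf is a placeholder): scanning a value such as
+   "  42,rest" with these macros:
+       char *p = buf;
+       SKIP_WS(p);             // p now points at '4'
+       char *digits = p;
+       SKIP_DIGITS(p);         // p now points at ','
+       SKIP_TO(p, ';');        // stops at ';' or at the terminating '\0'
+*/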
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#define KMP_MAX( x, y ) ( (x) > (y) ? (x) : (y) )
+#define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+
+/* Enumeration types */
+
+enum kmp_state_timer {
+    ts_stop,
+    ts_start,
+    ts_pause,
+
+    ts_last_state
+};
+
+enum dynamic_mode {
+    dynamic_default,
+#ifdef USE_LOAD_BALANCE
+    dynamic_load_balance,
+#endif /* USE_LOAD_BALANCE */
+    dynamic_random,
+    dynamic_thread_limit,
+    dynamic_max
+};
+
+/* external schedule constants; duplicates enum omp_sched in omp.h so that omp.h need not be included here */
+#ifndef KMP_SCHED_TYPE_DEFINED
+#define KMP_SCHED_TYPE_DEFINED
+typedef enum kmp_sched {
+    kmp_sched_lower             = 0,     // lower and upper bounds are for routine parameter check
+    // Note: need to adjust __kmp_sch_map global array in case this enum is changed
+    kmp_sched_static            = 1,     // mapped to kmp_sch_static_chunked           (33)
+    kmp_sched_dynamic           = 2,     // mapped to kmp_sch_dynamic_chunked          (35)
+    kmp_sched_guided            = 3,     // mapped to kmp_sch_guided_chunked           (36)
+    kmp_sched_auto              = 4,     // mapped to kmp_sch_auto                     (38)
+    kmp_sched_upper_std         = 5,     // upper bound for standard schedules
+    kmp_sched_lower_ext         = 100,   // lower bound of Intel extension schedules
+    kmp_sched_trapezoidal       = 101,   // mapped to kmp_sch_trapezoidal              (39)
+//  kmp_sched_static_steal      = 102,   // mapped to kmp_sch_static_steal             (44)
+    kmp_sched_upper             = 102,
+    kmp_sched_default = kmp_sched_static // default scheduling
+} kmp_sched_t;
+#endif
+
+/*!
+ @ingroup WORK_SHARING
+ * Describes the loop schedule to be used for a parallel for loop.
+ */
+enum sched_type {
+    kmp_sch_lower                     = 32,   /**< lower bound for unordered values */
+    kmp_sch_static_chunked            = 33,
+    kmp_sch_static                    = 34,   /**< static unspecialized */
+    kmp_sch_dynamic_chunked           = 35,
+    kmp_sch_guided_chunked            = 36,   /**< guided unspecialized */
+    kmp_sch_runtime                   = 37,
+    kmp_sch_auto                      = 38,   /**< auto */
+    kmp_sch_trapezoidal               = 39,
+
+    /* accessible only through KMP_SCHEDULE environment variable */
+    kmp_sch_static_greedy             = 40,
+    kmp_sch_static_balanced           = 41,
+    /* accessible only through KMP_SCHEDULE environment variable */
+    kmp_sch_guided_iterative_chunked  = 42,
+    kmp_sch_guided_analytical_chunked = 43,
+
+    kmp_sch_static_steal              = 44,   /**< accessible only through KMP_SCHEDULE environment variable */
+
+    /* accessible only through KMP_SCHEDULE environment variable */
+    kmp_sch_upper                     = 45,   /**< upper bound for unordered values */
+
+    kmp_ord_lower                     = 64,   /**< lower bound for ordered values, must be power of 2 */
+    kmp_ord_static_chunked            = 65,
+    kmp_ord_static                    = 66,   /**< ordered static unspecialized */
+    kmp_ord_dynamic_chunked           = 67,
+    kmp_ord_guided_chunked            = 68,
+    kmp_ord_runtime                   = 69,
+    kmp_ord_auto                      = 70,   /**< ordered auto */
+    kmp_ord_trapezoidal               = 71,
+    kmp_ord_upper                     = 72,   /**< upper bound for ordered values */
+
+#if OMP_40_ENABLED
+    /* Schedules for Distribute construct */
+    kmp_distribute_static_chunked     = 91,   /**< distribute static chunked */
+    kmp_distribute_static             = 92,   /**< distribute static unspecialized */
+#endif
+
+    /*
+     * For the "nomerge" versions, kmp_dispatch_next*() will always return
+     * a single iteration/chunk, even if the loop is serialized.  For the
+     * schedule types listed above, the entire iteration vector is returned
+     * if the loop is serialized.  This doesn't work for gcc/gcomp sections.
+     */
+    kmp_nm_lower                      = 160,  /**< lower bound for nomerge values */
+
+    kmp_nm_static_chunked             = (kmp_sch_static_chunked - kmp_sch_lower + kmp_nm_lower),
+    kmp_nm_static                     = 162,  /**< static unspecialized */
+    kmp_nm_dynamic_chunked            = 163,
+    kmp_nm_guided_chunked             = 164,  /**< guided unspecialized */
+    kmp_nm_runtime                    = 165,
+    kmp_nm_auto                       = 166,  /**< auto */
+    kmp_nm_trapezoidal                = 167,
+
+    /* accessible only through KMP_SCHEDULE environment variable */
+    kmp_nm_static_greedy              = 168,
+    kmp_nm_static_balanced            = 169,
+    /* accessible only through KMP_SCHEDULE environment variable */
+    kmp_nm_guided_iterative_chunked   = 170,
+    kmp_nm_guided_analytical_chunked  = 171,
+    kmp_nm_static_steal               = 172,  /* accessible only through KMP_SCHEDULE environment variable */
+
+    kmp_nm_ord_static_chunked         = 193,
+    kmp_nm_ord_static                 = 194,  /**< ordered static unspecialized */
+    kmp_nm_ord_dynamic_chunked        = 195,
+    kmp_nm_ord_guided_chunked         = 196,
+    kmp_nm_ord_runtime                = 197,
+    kmp_nm_ord_auto                   = 198,  /**< auto */
+    kmp_nm_ord_trapezoidal            = 199,
+    kmp_nm_upper                      = 200,  /**< upper bound for nomerge values */
+
+    kmp_sch_default = kmp_sch_static  /**< default scheduling algorithm */
+};
+
+/* Type to keep runtime schedule set via OMP_SCHEDULE or omp_set_schedule() */
+typedef struct kmp_r_sched {
+    enum sched_type r_sched_type;
+    int             chunk;
+} kmp_r_sched_t;
+
+extern enum sched_type __kmp_sch_map[]; // map OMP 3.0 schedule types with our internal schedule types
+
+enum library_type {
+    library_none,
+    library_serial,
+    library_turnaround,
+    library_throughput
+};
+
+#if KMP_OS_LINUX
+enum clock_function_type {
+    clock_function_gettimeofday,
+    clock_function_clock_gettime
+};
+#endif /* KMP_OS_LINUX */
+
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+enum mic_type {
+    non_mic,
+    mic1,
+    mic2,
+    mic3,
+    dummy
+};
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* -- fast reduction stuff ------------------------------------------------ */
+
+#undef KMP_FAST_REDUCTION_BARRIER
+#define KMP_FAST_REDUCTION_BARRIER 1
+
+#undef KMP_FAST_REDUCTION_CORE_DUO
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    #define KMP_FAST_REDUCTION_CORE_DUO 1
+#endif
+
+enum _reduction_method {
+    reduction_method_not_defined = 0,
+    critical_reduce_block        = ( 1 << 8 ),
+    atomic_reduce_block          = ( 2 << 8 ),
+    tree_reduce_block            = ( 3 << 8 ),
+    empty_reduce_block           = ( 4 << 8 )
+};
+
+// description of the packed_reduction_method variable:
+// the packed_reduction_method variable consists of two enum type values packed together into the 0-th byte and the 1-st byte:
+// byte 0: ( packed_reduction_method & 0x000000FF ) is an 'enum barrier_type' value: the barrier that will be used in fast reduction, bs_plain_barrier or bs_reduction_barrier
+// byte 1: ( packed_reduction_method & 0x0000FF00 ) is the reduction method that will be used in fast reduction;
+// the reduction method is of 'enum _reduction_method' type, and its values are defined so that the bits of the 0-th byte are empty,
+// so no shift instruction needs to be executed while packing/unpacking
+
+#if KMP_FAST_REDUCTION_BARRIER
+    #define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method,barrier_type) \
+            ( ( reduction_method ) | ( barrier_type ) )
+
+    #define UNPACK_REDUCTION_METHOD(packed_reduction_method) \
+            ( ( enum _reduction_method )( ( packed_reduction_method ) & ( 0x0000FF00 ) ) )
+
+    #define UNPACK_REDUCTION_BARRIER(packed_reduction_method) \
+            ( ( enum barrier_type )(      ( packed_reduction_method ) & ( 0x000000FF ) ) )
+#else
+    #define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method,barrier_type) \
+            ( reduction_method )
+
+    #define UNPACK_REDUCTION_METHOD(packed_reduction_method) \
+            ( packed_reduction_method )
+
+    #define UNPACK_REDUCTION_BARRIER(packed_reduction_method) \
+            ( bs_plain_barrier )
+#endif
+
+#define TEST_REDUCTION_METHOD(packed_reduction_method,which_reduction_block) \
+            ( ( UNPACK_REDUCTION_METHOD( packed_reduction_method ) ) == ( which_reduction_block ) )
+
+#if KMP_FAST_REDUCTION_BARRIER
+    #define TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER \
+            ( PACK_REDUCTION_METHOD_AND_BARRIER( tree_reduce_block, bs_reduction_barrier ) )
+
+    #define TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER \
+            ( PACK_REDUCTION_METHOD_AND_BARRIER( tree_reduce_block, bs_plain_barrier ) )
+#endif
+
+typedef int PACKED_REDUCTION_METHOD_T;
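+/* Illustrative sketch: packing a reduction method with its barrier and
+   testing it later; bs_reduction_barrier is assumed from 'enum barrier_type':
+       PACKED_REDUCTION_METHOD_T prm =
+           PACK_REDUCTION_METHOD_AND_BARRIER( tree_reduce_block,
+                                              bs_reduction_barrier );
+       if ( TEST_REDUCTION_METHOD( prm, tree_reduce_block ) ) {
+           // byte 1 holds the method, byte 0 holds the barrier type
+       }
+*/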
+
+/* -- end of fast reduction stuff ----------------------------------------- */
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#if KMP_OS_WINDOWS
+# define USE_CBLKDATA
+# pragma warning( push )
+# pragma warning( disable: 271 310 )
+# include <windows.h>
+# pragma warning( pop )
+#endif
+
+#if KMP_OS_UNIX
+# include <pthread.h>
+# include <dlfcn.h>
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/*
+ * Only Linux* OS and Windows* OS support thread affinity.
+ */
+#if KMP_AFFINITY_SUPPORTED
+
+extern size_t __kmp_affin_mask_size;
+# define KMP_AFFINITY_CAPABLE() (__kmp_affin_mask_size > 0)
+# define KMP_AFFINITY_DISABLE() (__kmp_affin_mask_size = 0)
+# define KMP_AFFINITY_ENABLE(mask_size) (__kmp_affin_mask_size = mask_size)
+# define KMP_CPU_SETSIZE        (__kmp_affin_mask_size * CHAR_BIT)
+
+# if KMP_OS_LINUX
+//
+// On Linux* OS, the mask is actually a vector of length __kmp_affin_mask_size
+// (in bytes).  It should be allocated on a word boundary.
+//
+// WARNING!!!  We have made the base type of the affinity mask unsigned char,
+// in order to eliminate a lot of checks that the true system mask size is
+// really a multiple of 4 bytes (on Linux* OS).
+//
+// THESE MACROS WON'T WORK PROPERLY ON BIG ENDIAN MACHINES!!!
+//
+
+typedef unsigned char kmp_affin_mask_t;
+
+#  define _KMP_CPU_SET(i,mask)   (mask[i/CHAR_BIT] |= (((kmp_affin_mask_t)1) << (i % CHAR_BIT)))
+#  define KMP_CPU_SET(i,mask)    _KMP_CPU_SET((i), ((kmp_affin_mask_t *)(mask)))
+#  define _KMP_CPU_ISSET(i,mask) (!!(mask[i/CHAR_BIT] & (((kmp_affin_mask_t)1) << (i % CHAR_BIT))))
+#  define KMP_CPU_ISSET(i,mask)  _KMP_CPU_ISSET((i), ((kmp_affin_mask_t *)(mask)))
+#  define _KMP_CPU_CLR(i,mask)   (mask[i/CHAR_BIT] &= ~(((kmp_affin_mask_t)1) << (i % CHAR_BIT)))
+#  define KMP_CPU_CLR(i,mask)    _KMP_CPU_CLR((i), ((kmp_affin_mask_t *)(mask)))
+
+#  define KMP_CPU_ZERO(mask) \
+        {                                                                    \
+            size_t __i;                                                      \
+            for (__i = 0; __i < __kmp_affin_mask_size; __i++) {              \
+                ((kmp_affin_mask_t *)(mask))[__i] = 0;                       \
+            }                                                                \
+        }
+
+#  define KMP_CPU_COPY(dest, src) \
+        {                                                                    \
+            size_t __i;                                                      \
+            for (__i = 0; __i < __kmp_affin_mask_size; __i++) {              \
+                ((kmp_affin_mask_t *)(dest))[__i]                            \
+                  = ((kmp_affin_mask_t *)(src))[__i];                        \
+            }                                                                \
+        }
+
+#  define KMP_CPU_COMPLEMENT(mask) \
+        {                                                                    \
+            size_t __i;                                                      \
+            for (__i = 0; __i < __kmp_affin_mask_size; __i++) {              \
+                ((kmp_affin_mask_t *)(mask))[__i]                            \
+                  = ~((kmp_affin_mask_t *)(mask))[__i];                      \
+            }                                                                \
+        }
+
+#  define KMP_CPU_UNION(dest, src) \
+        {                                                                    \
+            size_t __i;                                                      \
+            for (__i = 0; __i < __kmp_affin_mask_size; __i++) {              \
+                ((kmp_affin_mask_t *)(dest))[__i]                            \
+                  |= ((kmp_affin_mask_t *)(src))[__i];                       \
+            }                                                                \
+        }
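+/* Illustrative sketch: with the unsigned char base type, processor 10 maps
+   to byte 10/CHAR_BIT == 1, bit 10%CHAR_BIT == 2 of the vector:
+       KMP_CPU_SET(10, mask);                 // mask[1] |= 0x04
+       if (KMP_CPU_ISSET(10, mask)) { ... }   // test the same bit
+       KMP_CPU_CLR(10, mask);                 // mask[1] &= ~0x04
+*/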
+
+# endif /* KMP_OS_LINUX */
+
+# if KMP_OS_WINDOWS
+//
+// On Windows* OS, the mask size is 4 bytes for IA-32 architecture, and on
+// Intel(R) 64 it is 8 bytes times the number of processor groups.
+//
+
+#  if KMP_GROUP_AFFINITY
+
+// GROUP_AFFINITY is already defined for _MSC_VER>=1600 (VS2010 and later).
+#   if _MSC_VER < 1600
+typedef struct GROUP_AFFINITY {
+    KAFFINITY Mask;
+    WORD Group;
+    WORD Reserved[3];
+} GROUP_AFFINITY;
+#   endif
+
+typedef DWORD_PTR kmp_affin_mask_t;
+
+extern int __kmp_num_proc_groups;
+
+#   define _KMP_CPU_SET(i,mask) \
+        (mask[i/(CHAR_BIT * sizeof(kmp_affin_mask_t))] |=                    \
+        (((kmp_affin_mask_t)1) << (i % (CHAR_BIT * sizeof(kmp_affin_mask_t)))))
+
+#   define KMP_CPU_SET(i,mask) \
+        _KMP_CPU_SET((i), ((kmp_affin_mask_t *)(mask)))
+
+#   define _KMP_CPU_ISSET(i,mask) \
+        (!!(mask[i/(CHAR_BIT * sizeof(kmp_affin_mask_t))] &                  \
+        (((kmp_affin_mask_t)1) << (i % (CHAR_BIT * sizeof(kmp_affin_mask_t))))))
+
+#   define KMP_CPU_ISSET(i,mask) \
+        _KMP_CPU_ISSET((i), ((kmp_affin_mask_t *)(mask)))
+
+#   define _KMP_CPU_CLR(i,mask) \
+        (mask[i/(CHAR_BIT * sizeof(kmp_affin_mask_t))] &=                    \
+        ~(((kmp_affin_mask_t)1) << (i % (CHAR_BIT * sizeof(kmp_affin_mask_t)))))
+
+#   define KMP_CPU_CLR(i,mask) \
+        _KMP_CPU_CLR((i), ((kmp_affin_mask_t *)(mask)))
+
+#   define KMP_CPU_ZERO(mask) \
+        {                                                                    \
+            int __i;                                                         \
+            for (__i = 0; __i < __kmp_num_proc_groups; __i++) {              \
+                ((kmp_affin_mask_t *)(mask))[__i] = 0;                       \
+            }                                                                \
+        }
+
+#   define KMP_CPU_COPY(dest, src) \
+        {                                                                    \
+            int __i;                                                         \
+            for (__i = 0; __i < __kmp_num_proc_groups; __i++) {              \
+                ((kmp_affin_mask_t *)(dest))[__i]                            \
+                  = ((kmp_affin_mask_t *)(src))[__i];                        \
+            }                                                                \
+        }
+
+#   define KMP_CPU_COMPLEMENT(mask) \
+        {                                                                    \
+            int __i;                                                         \
+            for (__i = 0; __i < __kmp_num_proc_groups; __i++) {              \
+                ((kmp_affin_mask_t *)(mask))[__i]                            \
+                  = ~((kmp_affin_mask_t *)(mask))[__i];                      \
+            }                                                                \
+        }
+
+#   define KMP_CPU_UNION(dest, src) \
+        {                                                                    \
+            int __i;                                                         \
+            for (__i = 0; __i < __kmp_num_proc_groups; __i++) {              \
+                ((kmp_affin_mask_t *)(dest))[__i]                            \
+                  |= ((kmp_affin_mask_t *)(src))[__i];                       \
+            }                                                                \
+        }
+
+typedef DWORD (*kmp_GetActiveProcessorCount_t)(WORD);
+extern kmp_GetActiveProcessorCount_t __kmp_GetActiveProcessorCount;
+
+typedef WORD (*kmp_GetActiveProcessorGroupCount_t)(void);
+extern kmp_GetActiveProcessorGroupCount_t __kmp_GetActiveProcessorGroupCount;
+
+typedef BOOL (*kmp_GetThreadGroupAffinity_t)(HANDLE, GROUP_AFFINITY *);
+extern kmp_GetThreadGroupAffinity_t __kmp_GetThreadGroupAffinity;
+
+typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE, const GROUP_AFFINITY *, GROUP_AFFINITY *);
+extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
+
+extern int __kmp_get_proc_group(kmp_affin_mask_t const *mask);
+
+#  else
+
+typedef DWORD kmp_affin_mask_t; /* for compatibility with older winbase.h */
+
+#   define KMP_CPU_SET(i,mask)      (*(mask) |= (((kmp_affin_mask_t)1) << (i)))
+#   define KMP_CPU_ISSET(i,mask)    (!!(*(mask) & (((kmp_affin_mask_t)1) << (i))))
+#   define KMP_CPU_CLR(i,mask)      (*(mask) &= ~(((kmp_affin_mask_t)1) << (i)))
+#   define KMP_CPU_ZERO(mask)       (*(mask) = 0)
+#   define KMP_CPU_COPY(dest, src)  (*(dest) = *(src))
+#   define KMP_CPU_COMPLEMENT(mask) (*(mask) = ~*(mask))
+#   define KMP_CPU_UNION(dest, src) (*(dest) |= *(src))
+
+#  endif /* KMP_GROUP_AFFINITY */
+
+# endif /* KMP_OS_WINDOWS */
+
+//
+// __kmp_allocate() will return memory allocated on a 4-byte boundary, after
+// zeroing it; this takes care of the assumptions stated above.
+//
+# define KMP_CPU_ALLOC(ptr) \
+        (ptr = ((kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size)))
+# define KMP_CPU_FREE(ptr) __kmp_free(ptr)
+
+//
+// The following macro should be used to index an array of masks.
+// The array should be declared as "kmp_affin_mask_t *" and allocated with
+// size "__kmp_affin_mask_size * len".  The macro takes care of the fact
+// that on Windows* OS, sizeof(kmp_affin_mask_t) is really the size of the
+// mask, but on Linux* OS, sizeof(kmp_affin_mask_t) is 1.
+//
+# define KMP_CPU_INDEX(array,i) \
+        ((kmp_affin_mask_t *)(((char *)(array)) + (i) * __kmp_affin_mask_size))
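+/* Illustrative sketch: visiting each mask in a block of
+   __kmp_affinity_num_masks masks allocated contiguously:
+       unsigned i;
+       for (i = 0; i < __kmp_affinity_num_masks; i++) {
+           kmp_affin_mask_t *m = KMP_CPU_INDEX(__kmp_affinity_masks, i);
+           // each m starts __kmp_affin_mask_size bytes after the previous one
+       }
+*/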
+
+//
+// Declare local char buffers with this size for printing debug and info
+// messages, using __kmp_affinity_print_mask().
+//
+#define KMP_AFFIN_MASK_PRINT_LEN        1024
+
+enum affinity_type {
+    affinity_none = 0,
+    affinity_physical,
+    affinity_logical,
+    affinity_compact,
+    affinity_scatter,
+    affinity_explicit,
+    affinity_balanced,
+    affinity_disabled,  // not used outside the env var parser
+    affinity_default
+};
+
+enum affinity_gran {
+    affinity_gran_fine = 0,
+    affinity_gran_thread,
+    affinity_gran_core,
+    affinity_gran_package,
+    affinity_gran_node,
+#if KMP_GROUP_AFFINITY
+    //
+    // The "group" granularity isn't necesssarily coarser than all of the
+    // other levels, but we put it last in the enum.
+    //
+    affinity_gran_group,
+#endif /* KMP_GROUP_AFFINITY */
+    affinity_gran_default
+};
+
+enum affinity_top_method {
+    affinity_top_method_all = 0, // try all (supported) methods, in order
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    affinity_top_method_apicid,
+    affinity_top_method_x2apicid,
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+    affinity_top_method_cpuinfo, // KMP_CPUINFO_FILE is usable on Windows* OS, too
+#if KMP_GROUP_AFFINITY
+    affinity_top_method_group,
+#endif /* KMP_GROUP_AFFINITY */
+    affinity_top_method_flat,
+    affinity_top_method_default
+};
+
+#define affinity_respect_mask_default   (-1)
+
+extern enum affinity_type __kmp_affinity_type; /* Affinity type */
+extern enum affinity_gran __kmp_affinity_gran; /* Affinity granularity */
+extern int __kmp_affinity_gran_levels; /* corresponding int value */
+extern int __kmp_affinity_dups; /* Affinity duplicate masks */
+extern enum affinity_top_method __kmp_affinity_top_method;
+extern int __kmp_affinity_compact; /* Affinity 'compact' value */
+extern int __kmp_affinity_offset; /* Affinity offset value  */
+extern int __kmp_affinity_verbose; /* Was verbose specified for KMP_AFFINITY? */
+extern int __kmp_affinity_warnings; /* KMP_AFFINITY warnings enabled ? */
+extern int __kmp_affinity_respect_mask; /* Respect process' initial affinity mask? */
+extern char * __kmp_affinity_proclist; /* proc ID list */
+extern kmp_affin_mask_t *__kmp_affinity_masks;
+extern unsigned __kmp_affinity_num_masks;
+extern int __kmp_get_system_affinity(kmp_affin_mask_t *mask, int abort_on_error);
+extern int __kmp_set_system_affinity(kmp_affin_mask_t const *mask, int abort_on_error);
+extern void __kmp_affinity_bind_thread(int which);
+
+# if KMP_OS_LINUX
+extern kmp_affin_mask_t *__kmp_affinity_get_fullMask();
+# endif /* KMP_OS_LINUX */
+extern char const * __kmp_cpuinfo_file;
+
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+#if OMP_40_ENABLED
+
+//
+// This needs to be kept in sync with the values in omp.h !!!
+//
+typedef enum kmp_proc_bind_t {
+    proc_bind_false = 0,
+    proc_bind_true,
+    proc_bind_master,
+    proc_bind_close,
+    proc_bind_spread,
+    proc_bind_intel,    // use KMP_AFFINITY interface
+    proc_bind_default
+} kmp_proc_bind_t;
+
+typedef struct kmp_nested_proc_bind_t {
+    kmp_proc_bind_t *bind_types;
+    int size;
+    int used;
+} kmp_nested_proc_bind_t;
+
+extern kmp_nested_proc_bind_t __kmp_nested_proc_bind;
+
+#endif /* OMP_40_ENABLED */
+
+# if KMP_AFFINITY_SUPPORTED
+#  define KMP_PLACE_ALL       (-1)
+#  define KMP_PLACE_UNDEFINED (-2)
+# endif /* KMP_AFFINITY_SUPPORTED */
+
+extern int __kmp_affinity_num_places;
+
+
+#if OMP_40_ENABLED
+typedef enum kmp_cancel_kind_t {
+    cancel_noreq = 0,
+    cancel_parallel = 1,
+    cancel_loop = 2,
+    cancel_sections = 3,
+    cancel_taskgroup = 4
+} kmp_cancel_kind_t;
+#endif // OMP_40_ENABLED
+
+extern int __kmp_place_num_cores;
+extern int __kmp_place_num_threads_per_core;
+extern int __kmp_place_core_offset;
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#define KMP_PAD(type, sz)     (sizeof(type) + (sz - ((sizeof(type) - 1) % (sz)) - 1))
+
+//
+// We need to avoid using -1 as a GTID as +1 is added to the gtid
+// when storing it in a lock, and the value 0 is reserved.
+//
+#define KMP_GTID_DNE            (-2)    /* Does not exist */
+#define KMP_GTID_SHUTDOWN       (-3)    /* Library is shutting down */
+#define KMP_GTID_MONITOR        (-4)    /* Monitor thread ID */
+#define KMP_GTID_UNKNOWN        (-5)    /* Is not known */
+#define KMP_GTID_MIN            (-6)    /* Minimal gtid for low bound check in DEBUG */
+
+#define __kmp_get_gtid()               __kmp_get_global_thread_id()
+#define __kmp_entry_gtid()             __kmp_get_global_thread_id_reg()
+
+#define __kmp_tid_from_gtid(gtid)     ( KMP_DEBUG_ASSERT( (gtid) >= 0 ), \
+                                        __kmp_threads[ (gtid) ]->th.th_info.ds.ds_tid )
+
+#define __kmp_get_tid()               ( __kmp_tid_from_gtid( __kmp_get_gtid() ) )
+#define __kmp_gtid_from_tid(tid,team) ( KMP_DEBUG_ASSERT( (tid) >= 0 && (team) != NULL ), \
+                                        team -> t.t_threads[ (tid) ] -> th.th_info.ds.ds_gtid )
+
+#define __kmp_get_team()              ( __kmp_threads[ (__kmp_get_gtid()) ]-> th.th_team )
+#define __kmp_team_from_gtid(gtid)    ( KMP_DEBUG_ASSERT( (gtid) >= 0 ), \
+                                        __kmp_threads[ (gtid) ]-> th.th_team )
+
+#define __kmp_thread_from_gtid(gtid)  ( KMP_DEBUG_ASSERT( (gtid) >= 0 ), __kmp_threads[ (gtid) ] )
+#define __kmp_get_thread()            ( __kmp_thread_from_gtid( __kmp_get_gtid() ) )
+
+    // Returns the gtid of the given thread (kmp_info_t *). In contrast to __kmp_get_gtid(), it works
+    // with registered and not-yet-registered threads.
+#define __kmp_gtid_from_thread(thr)   ( KMP_DEBUG_ASSERT( (thr) != NULL ), \
+                                        (thr)->th.th_info.ds.ds_gtid )
+
+// AT: Which way is correct?
+// AT: 1. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team -> t.t_nproc;
+// AT: 2. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team_nproc;
+#define __kmp_get_team_num_threads(gtid) ( __kmp_threads[ ( gtid ) ] -> th.th_team -> t.t_nproc )
+
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#define KMP_UINT64_MAX         (~((kmp_uint64)1<<((sizeof(kmp_uint64)*(1<<3))-1)))
+
+#define KMP_MIN_NTH           1
+
+#ifndef KMP_MAX_NTH
+#  ifdef PTHREAD_THREADS_MAX
+#    define KMP_MAX_NTH          PTHREAD_THREADS_MAX
+#  else
+#    define KMP_MAX_NTH          (32 * 1024)
+#  endif
+#endif /* KMP_MAX_NTH */
+
+#ifdef PTHREAD_STACK_MIN
+# define KMP_MIN_STKSIZE         PTHREAD_STACK_MIN
+#else
+# define KMP_MIN_STKSIZE         ((size_t)(32 * 1024))
+#endif
+
+#define KMP_MAX_STKSIZE          (~((size_t)1<<((sizeof(size_t)*(1<<3))-1)))
+
+#if KMP_ARCH_X86
+# define KMP_DEFAULT_STKSIZE     ((size_t)(2 * 1024 * 1024))
+#elif KMP_ARCH_X86_64
+# define KMP_DEFAULT_STKSIZE     ((size_t)(4 * 1024 * 1024))
+# define KMP_BACKUP_STKSIZE      ((size_t)(2 * 1024 * 1024))
+#else
+# define KMP_DEFAULT_STKSIZE     ((size_t)(1024 * 1024))
+#endif
+
+#define KMP_DEFAULT_MONITOR_STKSIZE     ((size_t)(64 * 1024))
+
+#define KMP_DEFAULT_MALLOC_POOL_INCR    ((size_t) (1024 * 1024))
+#define KMP_MIN_MALLOC_POOL_INCR        ((size_t) (4 * 1024))
+#define KMP_MAX_MALLOC_POOL_INCR        (~((size_t)1<<((sizeof(size_t)*(1<<3))-1)))
+
+#define KMP_MIN_STKOFFSET       (0)
+#define KMP_MAX_STKOFFSET       KMP_MAX_STKSIZE
+#if KMP_OS_DARWIN
+# define KMP_DEFAULT_STKOFFSET  KMP_MIN_STKOFFSET
+#else
+# define KMP_DEFAULT_STKOFFSET  CACHE_LINE
+#endif
+
+#define KMP_MIN_STKPADDING      (0)
+#define KMP_MAX_STKPADDING      (2 * 1024 * 1024)
+
+#define KMP_MIN_MONITOR_WAKEUPS      (1)       /* min number of times monitor wakes up per second */
+#define KMP_MAX_MONITOR_WAKEUPS      (1000)    /* maximum number of times monitor can wake up per second */
+#define KMP_BLOCKTIME_MULTIPLIER     (1000)    /* number of blocktime units per second */
+#define KMP_MIN_BLOCKTIME            (0)
+#define KMP_MAX_BLOCKTIME            (INT_MAX) /* Must be this for the "infinite" setting to work */
+#define KMP_DEFAULT_BLOCKTIME        (200)     /*  __kmp_blocktime is in milliseconds  */
+/* Calculate new number of monitor wakeups for a specific block time based on previous monitor_wakeups */
+/* Only allow increasing number of wakeups */
+#define KMP_WAKEUPS_FROM_BLOCKTIME(blocktime, monitor_wakeups) \
+                                 ( ((blocktime) == KMP_MAX_BLOCKTIME) ? (monitor_wakeups) : \
+                                   ((blocktime) == KMP_MIN_BLOCKTIME) ? KMP_MAX_MONITOR_WAKEUPS : \
+                                   ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime))) ? (monitor_wakeups) : \
+                                       (KMP_BLOCKTIME_MULTIPLIER) / (blocktime) )
+
+/* Calculate number of intervals for a specific block time based on monitor_wakeups */
+#define KMP_INTERVALS_FROM_BLOCKTIME(blocktime, monitor_wakeups)  \
+                                 ( ( (blocktime) + (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)) - 1 ) /  \
+                                   (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)) )
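+/* Worked example (sketch): with the default blocktime of 200 ms and one
+   monitor wakeup per second, KMP_WAKEUPS_FROM_BLOCKTIME(200, 1) raises the
+   rate to 1000/200 == 5 wakeups per second, and then
+   KMP_INTERVALS_FROM_BLOCKTIME(200, 5) gives (200 + 199)/200 == 1 interval
+   of spinning before a thread is put to sleep. */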
+
+#define KMP_MIN_STATSCOLS       40
+#define KMP_MAX_STATSCOLS       4096
+#define KMP_DEFAULT_STATSCOLS   80
+
+#define KMP_MIN_INTERVAL        0
+#define KMP_MAX_INTERVAL        (INT_MAX-1)
+#define KMP_DEFAULT_INTERVAL    0
+
+#define KMP_MIN_CHUNK           1
+#define KMP_MAX_CHUNK           (INT_MAX-1)
+#define KMP_DEFAULT_CHUNK       1
+
+#define KMP_MIN_INIT_WAIT       1
+#define KMP_MAX_INIT_WAIT       (INT_MAX/2)
+#define KMP_DEFAULT_INIT_WAIT   2048U
+
+#define KMP_MIN_NEXT_WAIT       1
+#define KMP_MAX_NEXT_WAIT       (INT_MAX/2)
+#define KMP_DEFAULT_NEXT_WAIT   1024U
+
+// max possible dynamic loops in concurrent execution per team
+#define KMP_MAX_DISP_BUF        7
+#define KMP_MAX_ORDERED         8
+
+#define KMP_MAX_FIELDS          32
+
+#define KMP_MAX_BRANCH_BITS     31
+
+#define KMP_MAX_ACTIVE_LEVELS_LIMIT INT_MAX
+
+/* Minimum number of threads before switch to TLS gtid (experimentally determined) */
+/* josh TODO: what about OS X* tuning? */
+#if   KMP_ARCH_X86 || KMP_ARCH_X86_64
+# define KMP_TLS_GTID_MIN     5
+#else
+# define KMP_TLS_GTID_MIN     INT_MAX
+#endif
+
+#define KMP_MASTER_TID(tid)      ( (tid) == 0 )
+#define KMP_WORKER_TID(tid)      ( (tid) != 0 )
+
+#define KMP_MASTER_GTID(gtid)    ( __kmp_tid_from_gtid((gtid)) == 0 )
+#define KMP_WORKER_GTID(gtid)    ( __kmp_tid_from_gtid((gtid)) != 0 )
+#define KMP_UBER_GTID(gtid)                                           \
+    (                                                                 \
+        KMP_DEBUG_ASSERT( (gtid) >= KMP_GTID_MIN ),                   \
+        KMP_DEBUG_ASSERT( (gtid) < __kmp_threads_capacity ),          \
+        (gtid) >= 0 && __kmp_root[(gtid)] && __kmp_threads[(gtid)] && \
+        (__kmp_threads[(gtid)] == __kmp_root[(gtid)]->r.r_uber_thread)\
+    )
+#define KMP_INITIAL_GTID(gtid)   ( (gtid) == 0 )
+
+#ifndef TRUE
+#define FALSE   0
+#define TRUE    (! FALSE)
+#endif
+
+/* NOTE: all of the following constants must be even */
+
+#if KMP_OS_WINDOWS
+#  define KMP_INIT_WAIT    64U          /* initial number of spin-tests   */
+#  define KMP_NEXT_WAIT    32U          /* subsequent number of spin-tests */
+#elif KMP_OS_CNK
+#  define KMP_INIT_WAIT    16U          /* initial number of spin-tests   */
+#  define KMP_NEXT_WAIT     8U          /* subsequent number of spin-tests */
+#elif KMP_OS_LINUX
+#  define KMP_INIT_WAIT  1024U          /* initial number of spin-tests   */
+#  define KMP_NEXT_WAIT   512U          /* subsequent number of spin-tests */
+#elif KMP_OS_DARWIN
+/* TODO: tune for KMP_OS_DARWIN */
+#  define KMP_INIT_WAIT  1024U          /* initial number of spin-tests   */
+#  define KMP_NEXT_WAIT   512U          /* subsequent number of spin-tests */
+#elif KMP_OS_FREEBSD
+/* TODO: tune for KMP_OS_FREEBSD */
+#  define KMP_INIT_WAIT  1024U          /* initial number of spin-tests   */
+#  define KMP_NEXT_WAIT   512U          /* subsequent number of spin-tests */
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+typedef struct kmp_cpuid {
+    kmp_uint32  eax;
+    kmp_uint32  ebx;
+    kmp_uint32  ecx;
+    kmp_uint32  edx;
+} kmp_cpuid_t;
+extern void __kmp_x86_cpuid( int mode, int mode2, struct kmp_cpuid *p );
+# if KMP_ARCH_X86
+  extern void __kmp_x86_pause( void );
+# elif KMP_MIC
+  static void __kmp_x86_pause( void ) { _mm_delay_32( 100 ); }
+# else
+  static void __kmp_x86_pause( void ) { _mm_pause(); }
+# endif
+# define KMP_CPU_PAUSE() __kmp_x86_pause()
+#elif KMP_ARCH_PPC64
+# define KMP_PPC64_PRI_LOW() __asm__ volatile ("or 1, 1, 1")
+# define KMP_PPC64_PRI_MED() __asm__ volatile ("or 2, 2, 2")
+# define KMP_PPC64_PRI_LOC_MB() __asm__ volatile ("" : : : "memory")
+# define KMP_CPU_PAUSE() do { KMP_PPC64_PRI_LOW(); KMP_PPC64_PRI_MED(); KMP_PPC64_PRI_LOC_MB(); } while (0)
+#else
+# define KMP_CPU_PAUSE()        /* nothing to do */
+#endif
+
+#define KMP_INIT_YIELD(count)           { (count) = __kmp_yield_init; }
+
+#define KMP_YIELD(cond)                 { KMP_CPU_PAUSE(); __kmp_yield( (cond) ); }
+
+// Note the decrement of 2 in the following macros.  With KMP_LIBRARY=turnaround,
+// there should be no yielding, since the starting value from KMP_INIT_YIELD() is odd.
+
+#define KMP_YIELD_WHEN(cond,count)      { KMP_CPU_PAUSE(); (count) -= 2; \
+                                                if (!(count)) { KMP_YIELD(cond); (count) = __kmp_yield_next; } }
+#define KMP_YIELD_SPIN(count)           { KMP_CPU_PAUSE(); (count) -=2; \
+                                                if (!(count)) { KMP_YIELD(1); (count) = __kmp_yield_next; } }
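+/* Illustrative sketch: a spin loop built from these macros; with an even
+   starting count the loop yields every __kmp_yield_init/2 iterations, while
+   an odd count (KMP_LIBRARY=turnaround) never reaches zero and never yields:
+       kmp_uint32 spins;
+       KMP_INIT_YIELD(spins);
+       while (!done) {
+           KMP_YIELD_SPIN(spins);
+       }
+*/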
+
+/* ------------------------------------------------------------------------ */
+/* Support datatypes for the orphaned construct nesting checks.             */
+/* ------------------------------------------------------------------------ */
+
+enum cons_type {
+    ct_none,
+    ct_parallel,
+    ct_pdo,
+    ct_pdo_ordered,
+    ct_psections,
+    ct_psingle,
+
+    /* the following must be left in order and not split up */
+    ct_taskq,
+    ct_task,                    /* really task inside non-ordered taskq, considered a worksharing type */
+    ct_task_ordered,            /* really task inside ordered taskq, considered a worksharing type */
+    /* the preceding must be left in order and not split up */
+
+    ct_critical,
+    ct_ordered_in_parallel,
+    ct_ordered_in_pdo,
+    ct_ordered_in_taskq,
+    ct_master,
+    ct_reduce,
+    ct_barrier
+};
+
+/* test to see if we are in a taskq construct */
+# define IS_CONS_TYPE_TASKQ( ct )       ( ((int)(ct)) >= ((int)ct_taskq) && ((int)(ct)) <= ((int)ct_task_ordered) )
+# define IS_CONS_TYPE_ORDERED( ct )     ((ct) == ct_pdo_ordered || (ct) == ct_task_ordered)
+
+struct cons_data {
+    ident_t const     *ident;
+    enum cons_type     type;
+    int                prev;
+    kmp_user_lock_p    name;    /* address exclusively for critical section name comparison */
+};
+
+struct cons_header {
+    int                 p_top, w_top, s_top;
+    int                 stack_size, stack_top;
+    struct cons_data   *stack_data;
+};
+
+struct kmp_region_info {
+    char                *text;
+    int                 offset[KMP_MAX_FIELDS];
+    int                 length[KMP_MAX_FIELDS];
+};
+
+
+/* ---------------------------------------------------------------------- */
+/* ---------------------------------------------------------------------- */
+
+#if KMP_OS_WINDOWS
+    typedef HANDLE              kmp_thread_t;
+    typedef DWORD               kmp_key_t;
+#endif /* KMP_OS_WINDOWS */
+
+#if KMP_OS_UNIX
+    typedef pthread_t           kmp_thread_t;
+    typedef pthread_key_t       kmp_key_t;
+#endif
+
+extern kmp_key_t  __kmp_gtid_threadprivate_key;
+
+typedef struct kmp_sys_info {
+    long maxrss;          /* the maximum resident set size utilized (in kilobytes)     */
+    long minflt;          /* the number of page faults serviced without any I/O        */
+    long majflt;          /* the number of page faults serviced that required I/O      */
+    long nswap;           /* the number of times a process was "swapped" out of memory */
+    long inblock;         /* the number of times the file system had to perform input  */
+    long oublock;         /* the number of times the file system had to perform output */
+    long nvcsw;           /* the number of times a context switch was performed voluntarily */
+    long nivcsw;          /* the number of times a context switch was forced            */
+} kmp_sys_info_t;
+
+typedef struct kmp_cpuinfo {
+    int        initialized;  // If 0, other fields are not initialized.
+    int        signature;    // CPUID(1).EAX
+    int        family;       // CPUID(1).EAX[27:20] + CPUID(1).EAX[11:8] ( Extended Family + Family )
+    int        model;        // ( CPUID(1).EAX[19:16] << 4 ) + CPUID(1).EAX[7:4] ( ( Extended Model << 4 ) + Model)
+    int        stepping;     // CPUID(1).EAX[3:0] ( Stepping )
+    int        sse2;         // 0 if SSE2 instructions are not supported, 1 otherwise.
+    int        rtm;          // 0 if RTM instructions are not supported, 1 otherwise.
+    int        cpu_stackoffset;
+    int        apic_id;
+    int        physical_id;
+    int        logical_id;
+    kmp_uint64 frequency;    // Nominal CPU frequency in Hz.
+} kmp_cpuinfo_t;
+
+
+#ifdef BUILD_TV
+
+struct tv_threadprivate {
+    /* Record type #1 */
+    void        *global_addr;
+    void        *thread_addr;
+};
+
+struct tv_data {
+    struct tv_data      *next;
+    void                *type;
+    union tv_union {
+        struct tv_threadprivate tp;
+    } u;
+};
+
+extern kmp_key_t __kmp_tv_key;
+
+#endif /* BUILD_TV */
+
+/* ------------------------------------------------------------------------ */
+
+#if USE_ITT_BUILD
+// We cannot include "kmp_itt.h" due to circular dependency. Declare the only required type here.
+// Later we will check the type meets requirements.
+typedef int kmp_itt_mark_t;
+#define KMP_ITT_DEBUG 0
+#endif /* USE_ITT_BUILD */
+
+/* ------------------------------------------------------------------------ */
+
+/*
+ * Taskq data structures
+ */
+
+#define HIGH_WATER_MARK(nslots)         (((nslots) * 3) / 4)
+#define __KMP_TASKQ_THUNKS_PER_TH        1      /* num thunks that each thread can simultaneously execute from a task queue */
+
+/*  flags for taskq_global_flags, kmp_task_queue_t tq_flags, kmpc_thunk_t th_flags  */
+
+#define TQF_IS_ORDERED          0x0001  /*  __kmpc_taskq interface, taskq ordered  */
+#define TQF_IS_LASTPRIVATE      0x0002  /*  __kmpc_taskq interface, taskq with lastprivate list  */
+#define TQF_IS_NOWAIT           0x0004  /*  __kmpc_taskq interface, end taskq nowait  */
+#define TQF_HEURISTICS          0x0008  /*  __kmpc_taskq interface, use heuristics to decide task queue size  */
+#define TQF_INTERFACE_RESERVED1 0x0010  /*  __kmpc_taskq interface, reserved for future use  */
+#define TQF_INTERFACE_RESERVED2 0x0020  /*  __kmpc_taskq interface, reserved for future use  */
+#define TQF_INTERFACE_RESERVED3 0x0040  /*  __kmpc_taskq interface, reserved for future use  */
+#define TQF_INTERFACE_RESERVED4 0x0080  /*  __kmpc_taskq interface, reserved for future use  */
+
+#define TQF_INTERFACE_FLAGS     0x00ff  /*  all the __kmpc_taskq interface flags  */
+
+#define TQF_IS_LAST_TASK        0x0100  /*  internal/read by instrumentation; only used with TQF_IS_LASTPRIVATE  */
+#define TQF_TASKQ_TASK          0x0200  /*  internal use only; this thunk->th_task is the taskq_task  */
+#define TQF_RELEASE_WORKERS     0x0400  /*  internal use only; must release worker threads once ANY queued task exists (global) */
+#define TQF_ALL_TASKS_QUEUED    0x0800  /*  internal use only; notify workers that master has finished enqueuing tasks */
+#define TQF_PARALLEL_CONTEXT    0x1000  /*  internal use only: this queue encountered in a parallel context: not serialized */
+#define TQF_DEALLOCATED         0x2000  /*  internal use only; this queue is on the freelist and not in use */
+
+#define TQF_INTERNAL_FLAGS      0x3f00  /*  all the internal use only flags  */
+
+typedef struct KMP_ALIGN_CACHE kmpc_aligned_int32_t {
+    kmp_int32                      ai_data;
+} kmpc_aligned_int32_t;
+
+typedef struct KMP_ALIGN_CACHE kmpc_aligned_queue_slot_t {
+    struct kmpc_thunk_t   *qs_thunk;
+} kmpc_aligned_queue_slot_t;
+
+typedef struct kmpc_task_queue_t {
+        /* task queue linkage fields for n-ary tree of queues (locked with global taskq_tree_lck) */
+    kmp_lock_t                    tq_link_lck;          /*  lock for child link, child next/prev links and child ref counts */
+    union {
+        struct kmpc_task_queue_t *tq_parent;            /*  pointer to parent taskq, not locked */
+        struct kmpc_task_queue_t *tq_next_free;         /*  for taskq internal freelists, locked with global taskq_freelist_lck */
+    } tq;
+    volatile struct kmpc_task_queue_t *tq_first_child;  /*  pointer to linked-list of children, locked by tq's tq_link_lck */
+    struct kmpc_task_queue_t     *tq_next_child;        /*  next child in linked-list, locked by parent tq's tq_link_lck */
+    struct kmpc_task_queue_t     *tq_prev_child;        /*  previous child in linked-list, locked by parent tq's tq_link_lck */
+    volatile kmp_int32            tq_ref_count;         /*  reference count of threads with access to this task queue */
+                                                        /*  (other than the thread executing the kmpc_end_taskq call) */
+                                                        /*  locked by parent tq's tq_link_lck */
+
+        /* shared data for task queue */
+    struct kmpc_aligned_shared_vars_t    *tq_shareds;   /*  per-thread array of pointers to shared variable structures */
+                                                        /*  only one array element exists for all but outermost taskq */
+
+        /* bookkeeping for ordered task queue */
+    kmp_uint32                    tq_tasknum_queuing;   /*  ordered task number assigned while queuing tasks */
+    volatile kmp_uint32           tq_tasknum_serving;   /*  ordered number of next task to be served (executed) */
+
+        /* thunk storage management for task queue */
+    kmp_lock_t                    tq_free_thunks_lck;   /*  lock for thunk freelist manipulation */
+    struct kmpc_thunk_t          *tq_free_thunks;       /*  thunk freelist, chained via th.th_next_free  */
+    struct kmpc_thunk_t          *tq_thunk_space;       /*  space allocated for thunks for this task queue  */
+
+        /* data fields for queue itself */
+    kmp_lock_t                    tq_queue_lck;         /*  lock for [de]enqueue operations: tq_queue, tq_head, tq_tail, tq_nfull */
+    kmpc_aligned_queue_slot_t    *tq_queue;             /*  array of queue slots to hold thunks for tasks */
+    volatile struct kmpc_thunk_t *tq_taskq_slot;        /*  special slot for taskq task thunk, occupied if not NULL  */
+    kmp_int32                     tq_nslots;            /*  # of tq_thunk_space thunks alloc'd (not incl. tq_taskq_slot space)  */
+    kmp_int32                     tq_head;              /*  enqueue puts next item in here (index into tq_queue array) */
+    kmp_int32                     tq_tail;              /*  dequeue takes next item out of here (index into tq_queue array) */
+    volatile kmp_int32            tq_nfull;             /*  # of occupied entries in task queue right now  */
+    kmp_int32                     tq_hiwat;             /*  high-water mark for tq_nfull and queue scheduling  */
+    volatile kmp_int32            tq_flags;             /*  TQF_xxx  */
+
+        /* bookkeeping for outstanding thunks */
+    struct kmpc_aligned_int32_t  *tq_th_thunks;         /*  per-thread array for # of regular thunks currently being executed */
+    kmp_int32                     tq_nproc;             /*  number of thunks in the th_thunks array */
+
+        /* statistics library bookkeeping */
+    ident_t                       *tq_loc;              /*  source location information for taskq directive */
+} kmpc_task_queue_t;
+
+typedef void (*kmpc_task_t) (kmp_int32 global_tid, struct kmpc_thunk_t *thunk);
+
+/*  sizeof_shareds passed as arg to __kmpc_taskq call  */
+typedef struct kmpc_shared_vars_t {             /*  aligned during dynamic allocation */
+    kmpc_task_queue_t         *sv_queue;
+    /*  (pointers to) shared vars  */
+} kmpc_shared_vars_t;
+
+typedef struct KMP_ALIGN_CACHE kmpc_aligned_shared_vars_t {
+    volatile struct kmpc_shared_vars_t     *ai_data;
+} kmpc_aligned_shared_vars_t;
+
+/*  sizeof_thunk passed as arg to kmpc_taskq call  */
+typedef struct kmpc_thunk_t {                   /*  aligned during dynamic allocation */
+    union {                                     /*  field used for internal freelists too  */
+        kmpc_shared_vars_t  *th_shareds;
+        struct kmpc_thunk_t *th_next_free;      /*  freelist of individual thunks within queue, head at tq_free_thunks  */
+    } th;
+    kmpc_task_t th_task;                        /*  taskq_task if flags & TQF_TASKQ_TASK  */
+    struct kmpc_thunk_t *th_encl_thunk;         /*  pointer to dynamically enclosing thunk on this thread's call stack */
+    kmp_int32 th_flags;                         /*  TQF_xxx (tq_flags interface plus possible internal flags)  */
+    kmp_int32 th_status;
+    kmp_uint32 th_tasknum;                      /*  task number assigned in order of queuing, used for ordered sections */
+    /*  private vars  */
+} kmpc_thunk_t;
+
+typedef struct KMP_ALIGN_CACHE kmp_taskq {
+    int                 tq_curr_thunk_capacity;
+
+    kmpc_task_queue_t  *tq_root;
+    kmp_int32           tq_global_flags;
+
+    kmp_lock_t          tq_freelist_lck;
+    kmpc_task_queue_t  *tq_freelist;
+
+    kmpc_thunk_t      **tq_curr_thunk;
+} kmp_taskq_t;
+
+/* END Taskq data structures */
+/* --------------------------------------------------------------------------- */
+
+typedef kmp_int32 kmp_critical_name[8];
+
+/*!
+@ingroup PARALLEL
+The type for a microtask which gets passed to @ref __kmpc_fork_call().
+The arguments to the outlined function are
+@param global_tid the global thread identity of the thread executing the function.
+@param bound_tid  the local identity of the thread executing the function.
+@param ... pointers to shared variables accessed by the function.
+*/
+typedef void (*kmpc_micro)              ( kmp_int32 * global_tid, kmp_int32 * bound_tid, ... );
+typedef void (*kmpc_micro_bound)        ( kmp_int32 * bound_tid, kmp_int32 * bound_nth, ... );
+
+/*!
+@ingroup THREADPRIVATE
+@{
+*/
+/* --------------------------------------------------------------------------- */
+/* Threadprivate initialization/finalization function declarations */
+
+/*  for non-array objects:  __kmpc_threadprivate_register()  */
+
+/*!
+ Pointer to the constructor function.
+ The first argument is the <tt>this</tt> pointer
+*/
+typedef void *(*kmpc_ctor)    (void *);
+
+/*!
+ Pointer to the destructor function.
+ The first argument is the <tt>this</tt> pointer
+*/
+typedef void (*kmpc_dtor)     (void * /*, size_t */); /* 2nd arg: magic number for KCC; unused by Intel compiler */
+/*!
+ Pointer to an alternate constructor.
+ The first argument is the <tt>this</tt> pointer.
+*/
+typedef void *(*kmpc_cctor)   (void *, void *);
+
+/*  for array objects: __kmpc_threadprivate_register_vec()  */
+                                /* First arg: "this" pointer */
+                                /* Last arg: number of array elements */
+/*!
+ Pointer to an array constructor.
+ The first argument is the <tt>this</tt> pointer.
+ The second argument is the number of array elements.
+*/
+typedef void *(*kmpc_ctor_vec)  (void *, size_t);
+/*!
+ Pointer to the array destructor function.
+ The first argument is the <tt>this</tt> pointer.
+ The second argument is the number of array elements.
+*/
+typedef void (*kmpc_dtor_vec)   (void *, size_t);
+/*!
+ Pointer to an alternate array constructor.
+ The first argument is the <tt>this</tt> pointer;
+ the third argument is the number of array elements.
+*/
+typedef void *(*kmpc_cctor_vec) (void *, void *, size_t); /* function unused by compiler */
+
+/*!
+@}
+*/
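+
+/*
+   Illustrative sketch with hypothetical names: for a threadprivate C++ object
+   of type T, the compiler might register constructor/destructor callbacks of
+   the above types roughly as follows (placement forms shown for brevity):
+
+       void *T_ctor ( void *thisp )             { return new (thisp) T();          }
+       void *T_cctor( void *thisp, void *src )  { return new (thisp) T(*(T *)src); }
+       void  T_dtor ( void *thisp )             { ((T *)thisp)->~T();              }
+
+       __kmpc_threadprivate_register( &loc, &tp_obj, T_ctor, T_cctor, T_dtor );
+
+   The _vec variants receive the element count as their trailing size_t.
+*/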
+
+
+/* ------------------------------------------------------------------------ */
+
+/* keeps track of threadprivate cache allocations for cleanup later */
+typedef struct kmp_cached_addr {
+    void                      **addr;           /* address of allocated cache */
+    struct kmp_cached_addr     *next;           /* pointer to next cached address */
+} kmp_cached_addr_t;
+
+struct private_data {
+    struct private_data *next;          /* The next descriptor in the list      */
+    void                *data;          /* The data buffer for this descriptor  */
+    int                  more;          /* The repeat count for this descriptor */
+    size_t               size;          /* The data size for this descriptor    */
+};
+
+struct private_common {
+    struct private_common     *next;
+    struct private_common     *link;
+    void                      *gbl_addr;
+    void                      *par_addr;        /* par_addr == gbl_addr for MASTER thread */
+    size_t                     cmn_size;
+};
+
+struct shared_common
+{
+    struct shared_common      *next;
+    struct private_data       *pod_init;
+    void                      *obj_init;
+    void                      *gbl_addr;
+    union {
+        kmpc_ctor              ctor;
+        kmpc_ctor_vec          ctorv;
+    } ct;
+    union {
+        kmpc_cctor             cctor;
+        kmpc_cctor_vec         cctorv;
+    } cct;
+    union {
+        kmpc_dtor              dtor;
+        kmpc_dtor_vec          dtorv;
+    } dt;
+    size_t                     vec_len;
+    int                        is_vec;
+    size_t                     cmn_size;
+};
+
+#define KMP_HASH_TABLE_LOG2     9                               /* log2 of the hash table size */
+#define KMP_HASH_TABLE_SIZE     (1 << KMP_HASH_TABLE_LOG2)      /* size of the hash table */
+#define KMP_HASH_SHIFT          3                               /* throw away this many low bits from the address */
+#define KMP_HASH(x)             ((((kmp_uintptr_t) x) >> KMP_HASH_SHIFT) & (KMP_HASH_TABLE_SIZE-1))
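+
+/*
+   Worked example: with KMP_HASH_SHIFT == 3 and KMP_HASH_TABLE_SIZE == 512,
+   the low 3 address bits are discarded (most objects are at least 8-byte
+   aligned, so they carry no information) and the next 9 bits select a bucket:
+
+       KMP_HASH( 0x1000 ) == (0x1000 >> 3) & 511 == 0    // bucket 0
+       KMP_HASH( 0x1008 ) == (0x1008 >> 3) & 511 == 1    // bucket 1
+
+   so addresses 8 bytes apart land in adjacent buckets.
+*/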
+
+struct common_table {
+    struct  private_common      *data[ KMP_HASH_TABLE_SIZE ];
+};
+
+struct shared_table {
+    struct  shared_common       *data[ KMP_HASH_TABLE_SIZE ];
+};
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#ifdef KMP_STATIC_STEAL_ENABLED
+typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
+    kmp_int32 count;
+    kmp_int32 ub;
+    /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
+    kmp_int32 lb;
+    kmp_int32 st;
+    kmp_int32 tc;
+    kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put after ub */
+
+    // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+    //    a) parm3 is properly aligned and
+    //    b) all parm1-4 are in the same cache line.
+    // Because parm1-4 are used together, performance seems to be better
+    // if they are in the same line (not measured though).
+
+    struct KMP_ALIGN( 32 ) { // AC: changed 16 to 32 in order to simplify template
+        kmp_int32 parm1;     //     structures in kmp_dispatch.cpp. This should
+        kmp_int32 parm2;     //     make no real change at least while padding is off.
+        kmp_int32 parm3;
+        kmp_int32 parm4;
+    };
+
+    kmp_uint32 ordered_lower;
+    kmp_uint32 ordered_upper;
+#if KMP_OS_WINDOWS
+    // This var can be placed in the hole between 'tc' and 'parm1', instead of 'static_steal_counter'.
+    // It would be nice to measure execution times.
+    // The conditional #if/#endif could be removed entirely.
+    kmp_int32 last_upper;
+#endif /* KMP_OS_WINDOWS */
+} dispatch_private_info32_t;
+
+typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
+    kmp_int64 count;   /* current chunk number for static and static-steal scheduling*/
+    kmp_int64 ub;      /* upper-bound */
+    /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
+    kmp_int64 lb;      /* lower-bound */
+    kmp_int64 st;      /* stride */
+    kmp_int64 tc;      /* trip count (number of iterations) */
+    kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put after ub */
+
+    /* parm[1-4] are used in different ways by different scheduling algorithms */
+
+    // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+    //    a) parm3 is properly aligned and
+    //    b) all parm1-4 are in the same cache line.
+    // Because parm1-4 are used together, performance seems to be better
+    // if they are in the same line (not measured though).
+
+    struct KMP_ALIGN( 32 ) {
+        kmp_int64 parm1;
+        kmp_int64 parm2;
+        kmp_int64 parm3;
+        kmp_int64 parm4;
+    };
+
+    kmp_uint64 ordered_lower;
+    kmp_uint64 ordered_upper;
+#if KMP_OS_WINDOWS
+    // This var can be placed in the hole between 'tc' and 'parm1', instead of 'static_steal_counter'.
+    // It would be nice to measure execution times.
+    // The conditional #if/#endif could be removed entirely.
+    kmp_int64 last_upper;
+#endif /* KMP_OS_WINDOWS */
+} dispatch_private_info64_t;
+#else /* KMP_STATIC_STEAL_ENABLED */
+typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
+    kmp_int32 lb;
+    kmp_int32 ub;
+    kmp_int32 st;
+    kmp_int32 tc;
+
+    kmp_int32 parm1;
+    kmp_int32 parm2;
+    kmp_int32 parm3;
+    kmp_int32 parm4;
+
+    kmp_int32 count;
+
+    kmp_uint32 ordered_lower;
+    kmp_uint32 ordered_upper;
+#if KMP_OS_WINDOWS
+    kmp_int32 last_upper;
+#endif /* KMP_OS_WINDOWS */
+} dispatch_private_info32_t;
+
+typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
+    kmp_int64 lb;      /* lower-bound */
+    kmp_int64 ub;      /* upper-bound */
+    kmp_int64 st;      /* stride */
+    kmp_int64 tc;      /* trip count (number of iterations) */
+
+    /* parm[1-4] are used in different ways by different scheduling algorithms */
+    kmp_int64 parm1;
+    kmp_int64 parm2;
+    kmp_int64 parm3;
+    kmp_int64 parm4;
+
+    kmp_int64 count;   /* current chunk number for static scheduling */
+
+    kmp_uint64 ordered_lower;
+    kmp_uint64 ordered_upper;
+#if KMP_OS_WINDOWS
+    kmp_int64 last_upper;
+#endif /* KMP_OS_WINDOWS */
+} dispatch_private_info64_t;
+#endif /* KMP_STATIC_STEAL_ENABLED */
+
+typedef struct KMP_ALIGN_CACHE dispatch_private_info {
+    union private_info {
+        dispatch_private_info32_t  p32;
+        dispatch_private_info64_t  p64;
+    } u;
+    enum sched_type schedule;  /* scheduling algorithm */
+    kmp_int32       ordered;   /* ordered clause specified */
+    kmp_int32       ordered_bumped;
+    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
+    struct dispatch_private_info * next; /* stack of buffers for nest of serial regions */
+    kmp_int32       nomerge;   /* don't merge iters if serialized */
+    kmp_int32       type_size; /* the size of types in private_info */
+    enum cons_type  pushed_ws;
+} dispatch_private_info_t;
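+
+/*
+   Selection sketch: the compiler's loop-bound width decides which union
+   member is live, and type_size records it so that generic runtime code can
+   dispatch, e.g. (pr being a hypothetical dispatch_private_info_t pointer):
+
+       if ( pr->type_size == 4 )  { ... pr->u.p32 ... }
+       else                       { ... pr->u.p64 ... }
+*/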
+
+typedef struct dispatch_shared_info32 {
+    /* chunk index under dynamic, number of idle threads under static-steal;
+       iteration index otherwise */
+    volatile kmp_uint32      iteration;
+    volatile kmp_uint32      num_done;
+    volatile kmp_uint32      ordered_iteration;
+    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
+} dispatch_shared_info32_t;
+
+typedef struct dispatch_shared_info64 {
+    /* chunk index under dynamic, number of idle threads under static-steal;
+       iteration index otherwise */
+    volatile kmp_uint64      iteration;
+    volatile kmp_uint64      num_done;
+    volatile kmp_uint64      ordered_iteration;
+    kmp_int64   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
+} dispatch_shared_info64_t;
+
+typedef struct dispatch_shared_info {
+    union shared_info {
+        dispatch_shared_info32_t  s32;
+        dispatch_shared_info64_t  s64;
+    } u;
+/*    volatile kmp_int32      dispatch_abort;  deprecated */
+    volatile kmp_uint32     buffer_index;
+} dispatch_shared_info_t;
+
+typedef struct kmp_disp {
+    /* Vector for ORDERED SECTION */
+    void (*th_deo_fcn)( int * gtid, int * cid, ident_t *);
+    /* Vector for END ORDERED SECTION */
+    void (*th_dxo_fcn)( int * gtid, int * cid, ident_t *);
+
+    dispatch_shared_info_t  *th_dispatch_sh_current;
+    dispatch_private_info_t *th_dispatch_pr_current;
+
+    dispatch_private_info_t *th_disp_buffer;
+    kmp_int32                th_disp_index;
+    void* dummy_padding[2]; // make it 64 bytes on Intel(R) 64
+#if KMP_USE_INTERNODE_ALIGNMENT
+    char more_padding[INTERNODE_CACHE_LINE];
+#endif
+} kmp_disp_t;
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* Barrier stuff */
+
+/* constants for barrier state update */
+#define KMP_INIT_BARRIER_STATE  0       /* should probably start from zero */
+#define KMP_BARRIER_SLEEP_BIT   0       /* bit used for suspend/sleep part of state */
+#define KMP_BARRIER_UNUSED_BIT  1       /* bit that must never be set for valid state */
+#define KMP_BARRIER_BUMP_BIT    2       /* lsb used for bump of go/arrived state */
+
+#define KMP_BARRIER_SLEEP_STATE         ((kmp_uint) (1 << KMP_BARRIER_SLEEP_BIT))
+#define KMP_BARRIER_UNUSED_STATE        ((kmp_uint) (1 << KMP_BARRIER_UNUSED_BIT))
+#define KMP_BARRIER_STATE_BUMP          ((kmp_uint) (1 << KMP_BARRIER_BUMP_BIT))
+
+#if (KMP_BARRIER_SLEEP_BIT >= KMP_BARRIER_BUMP_BIT)
+# error "Barrier sleep bit must be smaller than barrier bump bit"
+#endif
+#if (KMP_BARRIER_UNUSED_BIT >= KMP_BARRIER_BUMP_BIT)
+# error "Barrier unused bit must be smaller than barrier bump bit"
+#endif
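+
+/*
+   Bit-layout sketch: with the values above, a barrier state word advances in
+   steps of KMP_BARRIER_STATE_BUMP (4), so bits 2 and up hold the go/arrived
+   generation count, bit 0 (KMP_BARRIER_SLEEP_STATE) marks a sleeping waiter,
+   and bit 1 must stay clear in any valid state:
+
+       state 0x0:  generation 0, awake
+       state 0x4:  generation 1, awake
+       state 0x5:  generation 1, sleeping
+*/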
+
+// Constants for release barrier wait state: currently, hierarchical only
+#define KMP_BARRIER_NOT_WAITING        0  // Normal state; worker not in wait_sleep
+#define KMP_BARRIER_OWN_FLAG           1  // Normal state; worker waiting on own b_go flag in release
+#define KMP_BARRIER_PARENT_FLAG        2  // Special state; worker waiting on parent's b_go flag in release
+#define KMP_BARRIER_SWITCH_TO_OWN_FLAG 3  // Special state; tells worker to shift from parent to own b_go
+#define KMP_BARRIER_SWITCHING          4  // Special state; worker resets appropriate flag on wake-up
+
+enum barrier_type {
+    bs_plain_barrier = 0,       /* 0, All non-fork/join barriers (except reduction barriers if enabled) */
+    bs_forkjoin_barrier,        /* 1, All fork/join (parallel region) barriers */
+    #if KMP_FAST_REDUCTION_BARRIER
+        bs_reduction_barrier,   /* 2, All barriers that are used in reduction */
+    #endif // KMP_FAST_REDUCTION_BARRIER
+    bs_last_barrier             /* Just a placeholder to mark the end */
+};
+
+// to work with reduction barriers just like with plain barriers
+#if !KMP_FAST_REDUCTION_BARRIER
+    #define bs_reduction_barrier bs_plain_barrier
+#endif // KMP_FAST_REDUCTION_BARRIER
+
+typedef enum kmp_bar_pat {      /* Barrier communication patterns */
+    bp_linear_bar = 0,          /* Single level (degenerate) tree */
+    bp_tree_bar = 1,            /* Balanced tree with branching factor 2^n */
+    bp_hyper_bar = 2,           /* Hypercube-embedded tree with min branching factor 2^n */
+    bp_hierarchical_bar = 3,    /* Machine hierarchy tree */
+    bp_last_bar = 4             /* Placeholder to mark the end */
+} kmp_bar_pat_e;
+
+# define KMP_BARRIER_ICV_PUSH   1
+
+/* Record for holding the values of the internal controls stack records */
+typedef struct kmp_internal_control {
+    int           serial_nesting_level;  /* corresponds to the value of the th_team_serialized field */
+    kmp_int8      nested;                /* internal control for nested parallelism (per thread) */
+    kmp_int8      dynamic;               /* internal control for dynamic adjustment of threads (per thread) */
+    kmp_int8      bt_set;                /* internal control for whether blocktime is explicitly set */
+    int           blocktime;             /* internal control for blocktime */
+    int           bt_intervals;          /* internal control for blocktime intervals */
+    int           nproc;                 /* internal control for #threads for next parallel region (per thread) */
+    int           max_active_levels;     /* internal control for max_active_levels */
+    kmp_r_sched_t sched;                 /* internal control for runtime schedule {sched,chunk} pair */
+#if OMP_40_ENABLED
+    kmp_proc_bind_t proc_bind;           /* internal control for affinity  */
+#endif // OMP_40_ENABLED
+    struct kmp_internal_control *next;
+} kmp_internal_control_t;
+
+static inline void
+copy_icvs( kmp_internal_control_t *dst, kmp_internal_control_t *src ) {
+    *dst = *src;
+}
+
+/* Thread barrier needs volatile barrier fields */
+typedef struct KMP_ALIGN_CACHE kmp_bstate {
+    // th_fixed_icvs is aligned by virtue of kmp_bstate being aligned (and all uses of it).
+    // It is not explicitly aligned below, because we *don't* want it to be padded -- instead,
+    // we fit b_go into the same cache line with th_fixed_icvs, enabling NGO cache-line
+    // stores in the hierarchical barrier.
+    kmp_internal_control_t th_fixed_icvs;          // Initial ICVs for the thread
+    // Tuck b_go into end of th_fixed_icvs cache line, so it can be stored with same NGO store
+    volatile kmp_uint64 b_go;                      // STATE => task should proceed (hierarchical)
+    KMP_ALIGN_CACHE volatile kmp_uint64 b_arrived; // STATE => task reached synch point.
+    kmp_uint32 *skip_per_level;
+    kmp_uint32 my_level;
+    kmp_int32 parent_tid;
+    kmp_int32 old_tid;
+    kmp_uint32 depth;
+    struct kmp_bstate *parent_bar;
+    kmp_team_t *team;
+    kmp_uint64 leaf_state;
+    kmp_uint32 nproc;
+    kmp_uint8 base_leaf_kids;
+    kmp_uint8 leaf_kids;
+    kmp_uint8 offset;
+    kmp_uint8 wait_flag;
+    kmp_uint8 use_oncore_barrier;
+#if USE_DEBUGGER
+    // The following field is intended solely for the debugger. Only the worker thread itself accesses
+    // this field: the worker increments it by 1 when it arrives at a barrier.
+    KMP_ALIGN_CACHE kmp_uint b_worker_arrived;
+#endif /* USE_DEBUGGER */
+} kmp_bstate_t;
+
+union KMP_ALIGN_CACHE kmp_barrier_union {
+    double       b_align;        /* use worst case alignment */
+    char         b_pad[ KMP_PAD(kmp_bstate_t, CACHE_LINE) ];
+    kmp_bstate_t bb;
+};
+
+typedef union kmp_barrier_union kmp_balign_t;
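+
+/*
+   Padding idiom note: unions of this shape pair the payload with a double
+   (worst-case scalar alignment) and a char pad sized via KMP_PAD so that
+   sizeof(union) rounds up to a whole number of cache lines; arrays of these
+   unions therefore keep every element cache-line aligned.
+*/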
+
+/* Team barrier needs only non-volatile arrived counter */
+union KMP_ALIGN_CACHE kmp_barrier_team_union {
+    double       b_align;        /* use worst case alignment */
+    char         b_pad[ CACHE_LINE ];
+    struct {
+        kmp_uint     b_arrived;       /* STATE => task reached synch point. */
+#if USE_DEBUGGER
+        // The following two fields are intended solely for the debugger. Only the master of the team
+        // accesses these fields: the first is incremented by 1 when the master arrives at a barrier,
+        // the second when all the threads have arrived.
+        kmp_uint     b_master_arrived;
+        kmp_uint     b_team_arrived;
+#endif
+    };
+};
+
+typedef union kmp_barrier_team_union kmp_balign_team_t;
+
+/*
+ * Padding for Linux* OS pthreads condition variables and mutexes used to signal
+ * threads when a condition changes.  This works around an NPTL bug in which
+ * padding was added to pthread_cond_t, causing the initialization routine to
+ * write outside of the structure when built against pre-NPTL headers.
+ */
+
+#if KMP_OS_WINDOWS
+typedef struct kmp_win32_mutex
+{
+    /* The Lock */
+    CRITICAL_SECTION cs;
+} kmp_win32_mutex_t;
+
+typedef struct kmp_win32_cond
+{
+    /* Count of the number of waiters. */
+    int waiters_count_;
+
+    /* Serialize access to <waiters_count_> */
+    kmp_win32_mutex_t waiters_count_lock_;
+
+    /* Number of threads to release via a <cond_broadcast> or a */
+    /* <cond_signal> */
+    int release_count_;
+
+    /* Keeps track of the current "generation" so that we don't allow */
+    /* one thread to steal all the "releases" from the broadcast. */
+    int wait_generation_count_;
+
+    /* A manual-reset event that's used to block and release waiting */
+    /* threads. */
+    HANDLE event_;
+} kmp_win32_cond_t;
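+
+/*
+   Protocol sketch (a summary, not the code; the implementation lives in the
+   Windows-specific runtime sources): a waiter increments waiters_count_
+   under waiters_count_lock_, records wait_generation_count_, and blocks on
+   event_. A broadcast sets release_count_ to the waiter count and bumps the
+   generation; a woken thread consumes a release only if its recorded
+   generation is older than the current one, preventing one fast thread from
+   looping around and stealing releases meant for the original waiters.
+*/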
+#endif
+
+#if KMP_OS_UNIX
+
+union KMP_ALIGN_CACHE kmp_cond_union {
+    double              c_align;
+    char                c_pad[ CACHE_LINE ];
+    pthread_cond_t      c_cond;
+};
+
+typedef union kmp_cond_union kmp_cond_align_t;
+
+union KMP_ALIGN_CACHE kmp_mutex_union {
+    double              m_align;
+    char                m_pad[ CACHE_LINE ];
+    pthread_mutex_t     m_mutex;
+};
+
+typedef union kmp_mutex_union kmp_mutex_align_t;
+
+#endif /* KMP_OS_UNIX */
+
+typedef struct kmp_desc_base {
+    void    *ds_stackbase;
+    size_t            ds_stacksize;
+    int               ds_stackgrow;
+    kmp_thread_t      ds_thread;
+    volatile int      ds_tid;
+    int               ds_gtid;
+#if KMP_OS_WINDOWS
+    volatile int      ds_alive;
+    DWORD             ds_thread_id;
+        /*
+            ds_thread keeps the thread handle on Windows* OS. It is enough for RTL purposes. However,
+            debugger support (libomp_db) cannot work with handles, because they are not comparable. For
+            example, the debugger requests info about a thread with handle h. h is valid within the
+            debugger process, but meaningless within the debuggee process. Even if h is duplicated via
+            DuplicateHandle() so that the result h' is valid within the debuggee process, h' is a *new*
+            handle which does *not* compare equal to any other handle in the debuggee... The only way to
+            compare handles is to convert them to system-wide ids. The GetThreadId() function is
+            available only in Longhorn and Server 2003. :-( In contrast, GetCurrentThreadId() is
+            available on all Windows* OS flavours (including Windows* 95). Thus, we have to get the
+            thread id by calling GetCurrentThreadId() from within the thread and save it to let
+            libomp_db identify threads.
+        */
+#endif /* KMP_OS_WINDOWS */
+} kmp_desc_base_t;
+
+typedef union KMP_ALIGN_CACHE kmp_desc {
+    double           ds_align;        /* use worst case alignment */
+    char             ds_pad[ KMP_PAD(kmp_desc_base_t, CACHE_LINE) ];
+    kmp_desc_base_t  ds;
+} kmp_desc_t;
+
+
+typedef struct kmp_local {
+    volatile int           this_construct; /* count of single's encountered by thread */
+    void                  *reduce_data;
+#if KMP_USE_BGET
+    void                  *bget_data;
+    void                  *bget_list;
+#if ! USE_CMP_XCHG_FOR_BGET
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+    kmp_lock_t             bget_lock;      /* Lock for accessing bget free list */
+#else
+    kmp_bootstrap_lock_t   bget_lock;      /* Lock for accessing bget free list */
+                                           /* Must be bootstrap lock so we can use it at library shutdown */
+#endif /* USE_QUEUING_LOCK_FOR_BGET */
+#endif /* ! USE_CMP_XCHG_FOR_BGET */
+#endif /* KMP_USE_BGET */
+
+#ifdef BUILD_TV
+    struct tv_data        *tv_data;
+#endif
+
+    PACKED_REDUCTION_METHOD_T packed_reduction_method; /* stored by __kmpc_reduce*(), used by __kmpc_end_reduce*() */
+
+} kmp_local_t;
+
+#define get__blocktime( xteam, xtid )     ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime)
+#define get__bt_set( xteam, xtid )        ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set)
+#define get__bt_intervals( xteam, xtid )  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals)
+
+#define get__nested_2(xteam,xtid)         ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nested)
+#define get__dynamic_2(xteam,xtid)        ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.dynamic)
+#define get__nproc_2(xteam,xtid)          ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nproc)
+#define get__sched_2(xteam,xtid)          ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.sched)
+
+#define set__blocktime_team( xteam, xtid, xval ) \
+        ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime )    = (xval) )
+
+#define set__bt_intervals_team( xteam, xtid, xval ) \
+        ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals ) = (xval) )
+
+#define set__bt_set_team( xteam, xtid, xval ) \
+        ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set )       = (xval) )
+
+
+#define set__nested( xthread, xval )                            \
+        ( ( (xthread)->th.th_current_task->td_icvs.nested ) = (xval) )
+#define get__nested( xthread ) \
+        ( ( (xthread)->th.th_current_task->td_icvs.nested ) ? (FTN_TRUE) : (FTN_FALSE) )
+
+#define set__dynamic( xthread, xval )                            \
+        ( ( (xthread)->th.th_current_task->td_icvs.dynamic ) = (xval) )
+#define get__dynamic( xthread ) \
+        ( ( (xthread)->th.th_current_task->td_icvs.dynamic ) ? (FTN_TRUE) : (FTN_FALSE) )
+
+#define set__nproc( xthread, xval )                            \
+        ( ( (xthread)->th.th_current_task->td_icvs.nproc ) = (xval) )
+
+#define set__max_active_levels( xthread, xval )                            \
+        ( ( (xthread)->th.th_current_task->td_icvs.max_active_levels ) = (xval) )
+
+#define set__sched( xthread, xval )                            \
+        ( ( (xthread)->th.th_current_task->td_icvs.sched ) = (xval) )
+
+#if OMP_40_ENABLED
+
+#define set__proc_bind( xthread, xval )                          \
+        ( ( (xthread)->th.th_current_task->td_icvs.proc_bind ) = (xval) )
+#define get__proc_bind( xthread ) \
+        ( (xthread)->th.th_current_task->td_icvs.proc_bind )
+
+#endif /* OMP_40_ENABLED */
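+
+/*
+   Usage sketch (this_thr, team, and tid are hypothetical): all of these
+   accessors resolve through the current task's ICV block, so for the same
+   thread
+
+       set__nproc( this_thr, 4 );
+       n = get__nproc_2( team, tid );   // team->t.t_threads[tid] == this_thr
+
+   write and read the same td_icvs.nproc field.
+*/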
+
+
+/* ------------------------------------------------------------------------ */
+// OpenMP tasking data structures
+//
+
+typedef enum kmp_tasking_mode {
+    tskm_immediate_exec = 0,
+    tskm_extra_barrier = 1,
+    tskm_task_teams = 2,
+    tskm_max = 2
+} kmp_tasking_mode_t;
+
+extern kmp_tasking_mode_t __kmp_tasking_mode;         /* determines how/when to execute tasks */
+extern kmp_int32 __kmp_task_stealing_constraint;
+
+/* NOTE: kmp_taskdata_t and kmp_task_t structures allocated in single block with taskdata first */
+#define KMP_TASK_TO_TASKDATA(task)     (((kmp_taskdata_t *) task) - 1)
+#define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *) (taskdata + 1)
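+
+/*
+   Layout sketch: a task is allocated as one block with the kmp_taskdata_t
+   first, immediately followed by the kmp_task_t that the compiler sees, then
+   the private payload:
+
+       [ kmp_taskdata_t | kmp_task_t | private vars ... ]
+       ^ taskdata         ^ task == (kmp_task_t *)(taskdata + 1)
+
+   hence the +/-1 pointer arithmetic in the two macros above.
+*/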
+
+// The tt_found_tasks flag is a signal to all threads in the team that tasks were spawned and
+// queued since the previous barrier release.
+#define KMP_TASKING_ENABLED(task_team) \
+    (TCR_SYNC_4((task_team)->tt.tt_found_tasks) == TRUE)
+/*!
+@ingroup BASIC_TYPES
+@{
+*/
+
+/*!
+ */
+typedef kmp_int32 (* kmp_routine_entry_t)( kmp_int32, void * );
+
+/*  sizeof_kmp_task_t passed as arg to kmpc_omp_task call  */
+/*!
+ */
+typedef struct kmp_task {                   /* GEH: Shouldn't this be aligned somehow? */
+    void *              shareds;            /**< pointer to block of pointers to shared vars   */
+    kmp_routine_entry_t routine;            /**< pointer to routine to call for executing task */
+    kmp_int32           part_id;            /**< part id for the task                          */
+#if OMP_40_ENABLED
+    kmp_routine_entry_t destructors;        /* pointer to function to invoke deconstructors of firstprivate C++ objects */
+#endif // OMP_40_ENABLED
+    /*  private vars  */
+} kmp_task_t;
+
+/*!
+@}
+*/
+
+#if OMP_40_ENABLED
+typedef struct kmp_taskgroup {
+    kmp_uint32            count;   // number of allocated and not yet complete tasks
+    kmp_int32             cancel_request; // request for cancellation of this taskgroup
+    struct kmp_taskgroup *parent;  // parent taskgroup
+} kmp_taskgroup_t;
+
+
+// forward declarations
+typedef union kmp_depnode       kmp_depnode_t;
+typedef struct kmp_depnode_list  kmp_depnode_list_t;
+typedef struct kmp_dephash_entry kmp_dephash_entry_t;
+
+typedef struct kmp_depend_info {
+     kmp_intptr_t               base_addr;
+     size_t                     len;
+     struct {
+         bool                   in:1;
+         bool                   out:1;
+     } flags;
+} kmp_depend_info_t;
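+
+/*
+   Encoding sketch: for a task dependence on a variable x, the compiler fills
+   one kmp_depend_info_t with base_addr == (kmp_intptr_t)&x and
+   len == sizeof(x); an "in" dependence sets flags.in, and an "inout"
+   dependence sets both flags.in and flags.out (names x et al. hypothetical).
+*/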
+
+struct kmp_depnode_list {
+   kmp_depnode_t *              node;
+   kmp_depnode_list_t *         next;
+};
+
+typedef struct kmp_base_depnode {
+    kmp_depnode_list_t        * successors;
+    kmp_task_t                * task;
+
+    kmp_lock_t                  lock;
+
+#if KMP_SUPPORT_GRAPH_OUTPUT
+    kmp_uint32                  id;
+#endif
+
+    volatile kmp_int32          npredecessors;
+    volatile kmp_int32          nrefs;
+} kmp_base_depnode_t;
+
+union KMP_ALIGN_CACHE kmp_depnode {
+    double          dn_align;        /* use worst case alignment */
+    char            dn_pad[ KMP_PAD(kmp_base_depnode_t, CACHE_LINE) ];
+    kmp_base_depnode_t dn;
+};
+
+struct kmp_dephash_entry {
+    kmp_intptr_t               addr;
+    kmp_depnode_t            * last_out;
+    kmp_depnode_list_t       * last_ins;
+    kmp_dephash_entry_t      * next_in_bucket;
+};
+
+typedef struct kmp_dephash {
+   kmp_dephash_entry_t     ** buckets;
+#ifdef KMP_DEBUG
+   kmp_uint32                 nelements;
+   kmp_uint32                 nconflicts;
+#endif
+} kmp_dephash_t;
+
+#endif
+
+#ifdef BUILD_TIED_TASK_STACK
+
+/* Tied Task stack definitions */
+typedef struct kmp_stack_block {
+    kmp_taskdata_t *          sb_block[ TASK_STACK_BLOCK_SIZE ];
+    struct kmp_stack_block *  sb_next;
+    struct kmp_stack_block *  sb_prev;
+} kmp_stack_block_t;
+
+typedef struct kmp_task_stack {
+    kmp_stack_block_t         ts_first_block;  // first block of stack entries
+    kmp_taskdata_t **         ts_top;          // pointer to the top of stack
+    kmp_int32                 ts_entries;      // number of entries on the stack
+} kmp_task_stack_t;
+
+#endif // BUILD_TIED_TASK_STACK
+
+typedef struct kmp_tasking_flags {          /* Total struct must be exactly 32 bits */
+    /* Compiler flags */                    /* Total compiler flags must be 16 bits */
+    unsigned tiedness    : 1;               /* task is either tied (1) or untied (0) */
+    unsigned final       : 1;               /* task is final(1) so execute immediately */
+    unsigned merged_if0  : 1;               /* no __kmpc_task_{begin/complete}_if0 calls in if0 code path */
+#if OMP_40_ENABLED
+    unsigned destructors_thunk : 1;         /* set if the compiler creates a thunk to invoke destructors from the runtime */
+#if OMP_41_ENABLED
+    unsigned proxy       : 1;               /* task is a proxy task (it will be executed outside the context of the RTL) */
+    unsigned reserved    : 11;              /* reserved for compiler use */
+#else
+    unsigned reserved    : 12;              /* reserved for compiler use */
+#endif
+#else // OMP_40_ENABLED
+    unsigned reserved    : 13;              /* reserved for compiler use */
+#endif // OMP_40_ENABLED
+
+    /* Library flags */                     /* Total library flags must be 16 bits */
+    unsigned tasktype    : 1;               /* task is either explicit(1) or implicit (0) */
+    unsigned task_serial : 1;               /* this task is executed immediately (1) or deferred (0) */
+    unsigned tasking_ser : 1;               /* all tasks in team are either executed immediately (1) or may be deferred (0) */
+    unsigned team_serial : 1;               /* entire team is serial (1) [1 thread] or parallel (0) [>= 2 threads] */
+                                            /* If either team_serial or tasking_ser is set, task team may be NULL */
+    /* Task State Flags: */
+    unsigned started     : 1;               /* 1==started, 0==not started     */
+    unsigned executing   : 1;               /* 1==executing, 0==not executing */
+    unsigned complete    : 1;               /* 1==complete, 0==not complete   */
+    unsigned freed       : 1;               /* 1==freed, 0==allocated         */
+    unsigned native      : 1;               /* 1==gcc-compiled task, 0==intel */
+    unsigned reserved31  : 7;               /* reserved for library use */
+
+} kmp_tasking_flags_t;
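+
+/*
+   Sanity sketch: in the OMP_41_ENABLED configuration the compiler half packs
+   1+1+1+1+1+11 = 16 bits and the library half 4+5+7 = 16 bits, so the whole
+   struct fits one 32-bit word; a build-time check could be expressed as
+
+       KMP_BUILD_ASSERT( sizeof(kmp_tasking_flags_t) == 4 );
+*/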
+
+
+struct kmp_taskdata {                                 /* aligned during dynamic allocation       */
+    kmp_int32               td_task_id;               /* id, assigned by debugger                */
+    kmp_tasking_flags_t     td_flags;                 /* task flags                              */
+    kmp_team_t *            td_team;                  /* team for this task                      */
+    kmp_info_p *            td_alloc_thread;          /* thread that allocated data structures   */
+                                                      /* Currently not used except for perhaps IDB */
+    kmp_taskdata_t *        td_parent;                /* parent task                             */
+    kmp_int32               td_level;                 /* task nesting level                      */
+    ident_t *               td_ident;                 /* task identifier                         */
+                            // Taskwait data.
+    ident_t *               td_taskwait_ident;
+    kmp_uint32              td_taskwait_counter;
+    kmp_int32               td_taskwait_thread;       /* gtid + 1 of thread encountered taskwait */
+    KMP_ALIGN_CACHE kmp_internal_control_t  td_icvs;  /* Internal control variables for the task */
+    volatile kmp_uint32     td_allocated_child_tasks;  /* Child tasks (+ current task) not yet deallocated */
+    volatile kmp_uint32     td_incomplete_child_tasks; /* Child tasks not yet complete */
+#if OMP_40_ENABLED
+    kmp_taskgroup_t *       td_taskgroup;         // Each task keeps pointer to its current taskgroup
+    kmp_dephash_t *         td_dephash;           // Dependencies for children tasks are tracked from here
+    kmp_depnode_t *         td_depnode;           // Pointer to graph node if this task has dependencies
+#endif
+#if OMPT_SUPPORT
+    ompt_task_info_t        ompt_task_info;
+#endif
+#if KMP_HAVE_QUAD
+    _Quad                   td_dummy;             // Align structure 16-byte size since allocated just before kmp_task_t
+#else
+    kmp_uint32              td_dummy[2];
+#endif
+}; // struct kmp_taskdata
+
+// Make sure padding above worked
+KMP_BUILD_ASSERT( sizeof(kmp_taskdata_t) % sizeof(void *) == 0 );
+
+// Data for task team but per thread
+typedef struct kmp_base_thread_data {
+    kmp_info_p *            td_thr;                // Pointer back to thread info
+                                                   // Used only in __kmp_execute_tasks_template, maybe not avail until task is queued?
+    kmp_bootstrap_lock_t    td_deque_lock;         // Lock for accessing deque
+    kmp_taskdata_t **       td_deque;              // Deque of tasks encountered by td_thr, dynamically allocated
+    kmp_uint32              td_deque_head;         // Head of deque (will wrap)
+    kmp_uint32              td_deque_tail;         // Tail of deque (will wrap)
+    kmp_int32               td_deque_ntasks;       // Number of tasks in deque
+                                                   // GEH: shouldn't this be volatile since used in while-spin?
+    kmp_int32               td_deque_last_stolen;  // Thread number of last successful steal
+#ifdef BUILD_TIED_TASK_STACK
+    kmp_task_stack_t        td_susp_tied_tasks;    // Stack of suspended tied tasks for task scheduling constraint
+#endif // BUILD_TIED_TASK_STACK
+} kmp_base_thread_data_t;
+
+typedef union KMP_ALIGN_CACHE kmp_thread_data {
+    kmp_base_thread_data_t  td;
+    double                  td_align;       /* use worst case alignment */
+    char                    td_pad[ KMP_PAD(kmp_base_thread_data_t, CACHE_LINE) ];
+} kmp_thread_data_t;
+
+
+// Data for task teams which are used when tasking is enabled for the team
+typedef struct kmp_base_task_team {
+    kmp_bootstrap_lock_t    tt_threads_lock;       /* Lock used to allocate per-thread part of task team */
+                                                   /* must be bootstrap lock since used at library shutdown*/
+    kmp_task_team_t *       tt_next;               /* For linking the task team free list */
+    kmp_thread_data_t *     tt_threads_data;       /* Array of per-thread structures for task team */
+                                                   /* Data survives task team deallocation */
+    kmp_int32               tt_found_tasks;        /* Have we found tasks and queued them while executing this team? */
+                                                   /* TRUE means tt_threads_data is set up and initialized */
+    kmp_int32               tt_nproc;              /* #threads in team           */
+    kmp_int32               tt_max_threads;        /* number of entries allocated for threads_data array */
+#if OMP_41_ENABLED
+    kmp_int32               tt_found_proxy_tasks;  /* Have we found proxy tasks since last barrier */
+#endif
+
+    KMP_ALIGN_CACHE
+    volatile kmp_uint32     tt_unfinished_threads; /* #threads still active      */
+
+    KMP_ALIGN_CACHE
+    volatile kmp_uint32     tt_active;             /* is the team still actively executing tasks */
+
+    KMP_ALIGN_CACHE
+#if KMP_USE_INTERNODE_ALIGNMENT
+    kmp_int32               tt_padme[INTERNODE_CACHE_LINE/sizeof(kmp_int32)];
+#endif
+
+    volatile kmp_uint32     tt_ref_ct;             /* #threads accessing struct  */
+                                                   /* (not incl. master)         */
+} kmp_base_task_team_t;
+
+union KMP_ALIGN_CACHE kmp_task_team {
+    kmp_base_task_team_t tt;
+    double               tt_align;       /* use worst case alignment */
+    char                 tt_pad[ KMP_PAD(kmp_base_task_team_t, CACHE_LINE) ];
+};
+
+#if ( USE_FAST_MEMORY == 3 ) || ( USE_FAST_MEMORY == 5 )
+// Free lists keep same-size free memory slots for fast memory allocation routines
+typedef struct kmp_free_list {
+    void             *th_free_list_self;   // Self-allocated tasks free list
+    void             *th_free_list_sync;   // Self-allocated tasks stolen/returned by other threads
+    void             *th_free_list_other;  // Non-self free list (to be returned to owner's sync list)
+} kmp_free_list_t;
+#endif
+#if KMP_NESTED_HOT_TEAMS
+// Hot teams array keeps hot teams and their sizes for given thread.
+// Hot teams are not put in the teams pool, and they don't put threads in the threads pool.
+typedef struct kmp_hot_team_ptr {
+    kmp_team_p *hot_team;      // pointer to hot_team of given nesting level
+    kmp_int32   hot_team_nth;  // number of threads allocated for the hot_team
+} kmp_hot_team_ptr_t;
+#endif
+#if OMP_40_ENABLED
+typedef struct kmp_teams_size {
+    kmp_int32   nteams;        // number of teams in a league
+    kmp_int32   nth;           // number of threads in each team of the league
+} kmp_teams_size_t;
+#endif
+
+/* ------------------------------------------------------------------------ */
+// OpenMP thread data structures
+//
+
+typedef struct KMP_ALIGN_CACHE kmp_base_info {
+/*
+ * Start with the read-only data, which is cache aligned and padded.
+ * It is written by the master before the thread starts working.
+ * (uber masters may update themselves later)
+ * (usage does not consider serialized regions)
+ */
+    kmp_desc_t        th_info;
+    kmp_team_p       *th_team;       /* team we belong to */
+    kmp_root_p       *th_root;       /* pointer to root of task hierarchy */
+    kmp_info_p       *th_next_pool;  /* next available thread in the pool */
+    kmp_disp_t       *th_dispatch;   /* thread's dispatch data */
+    int               th_in_pool;    /* in thread pool (32 bits for TCR/TCW) */
+
+    /* The following are cached from the team info structure */
+    /* TODO use these in more places as determined to be needed via profiling */
+    int               th_team_nproc;      /* number of threads in a team */
+    kmp_info_p       *th_team_master;     /* the team's master thread */
+    int               th_team_serialized; /* team is serialized */
+#if OMP_40_ENABLED
+    microtask_t       th_teams_microtask; /* save entry address for teams construct */
+    int               th_teams_level;     /* save initial level of teams construct */
+                                          /* it is 0 on device but may be any on host */
+#endif
+
+    /* The blocktime info is copied from the team struct to the thread struct */
+    /* at the start of a barrier, and the values stored in the team are used */
+    /* at points in the code where the team struct is no longer guaranteed   */
+    /* to exist (from the POV of worker threads).                            */
+    int               th_team_bt_intervals;
+    int               th_team_bt_set;
+
+
+#if KMP_AFFINITY_SUPPORTED
+    kmp_affin_mask_t  *th_affin_mask; /* thread's current affinity mask */
+#endif
+
+/*
+ * The data set by the master at reinit, then R/W by the worker
+ */
+    KMP_ALIGN_CACHE int     th_set_nproc;  /* if > 0, then only use this request for the next fork */
+#if KMP_NESTED_HOT_TEAMS
+    kmp_hot_team_ptr_t     *th_hot_teams;     /* array of hot teams */
+#endif
+#if OMP_40_ENABLED
+    kmp_proc_bind_t         th_set_proc_bind; /* if != proc_bind_default, use request for next fork */
+    kmp_teams_size_t        th_teams_size;    /* number of teams/threads in teams construct */
+# if KMP_AFFINITY_SUPPORTED
+    int                     th_current_place; /* place currently bound to */
+    int                     th_new_place;     /* place to bind to in par reg */
+    int                     th_first_place;   /* first place in partition */
+    int                     th_last_place;    /* last place in partition */
+# endif
+#endif
+#if USE_ITT_BUILD
+    kmp_uint64              th_bar_arrive_time;           /* arrival to barrier timestamp */
+    kmp_uint64              th_bar_min_time;              /* minimum arrival time at the barrier */
+    kmp_uint64              th_frame_time;                /* frame timestamp */
+    kmp_uint64              th_frame_time_serialized;     /* frame timestamp in serialized parallel */
+#endif /* USE_ITT_BUILD */
+    kmp_local_t             th_local;
+    struct private_common  *th_pri_head;
+
+/*
+ * Now the data only used by the worker (after initial allocation)
+ */
+    /* TODO the first serial team should actually be stored in the info_t
+     * structure.  this will help reduce initial allocation overhead */
+    KMP_ALIGN_CACHE kmp_team_p *th_serial_team; /*serialized team held in reserve*/
+
+#if OMPT_SUPPORT
+    ompt_thread_info_t      ompt_thread_info;
+#endif
+
+/* The following are also read by the master during reinit */
+    struct common_table    *th_pri_common;
+
+    volatile kmp_uint32     th_spin_here;   /* thread-local location for spinning */
+                                            /* while awaiting queuing lock acquire */
+
+    volatile void          *th_sleep_loc;   // this points at a kmp_flag<T>
+
+    ident_t          *th_ident;
+    unsigned         th_x;                     // Random number generator data
+    unsigned         th_a;                     // Random number generator data
+
+/*
+ * Tasking-related data for the thread
+ */
+    kmp_task_team_t    * th_task_team;           // Task team struct
+    kmp_taskdata_t     * th_current_task;        // Innermost Task being executed
+    kmp_uint8            th_task_state;          // alternating 0/1 for task team identification
+    kmp_uint8          * th_task_state_memo_stack;  // Stack holding memos of th_task_state at nested levels
+    kmp_uint32           th_task_state_top;         // Top element of th_task_state_memo_stack
+    kmp_uint32           th_task_state_stack_sz;    // Size of th_task_state_memo_stack
+
+    /*
+     * More stuff for keeping track of active/sleeping threads
+     * (this part is written by the worker thread)
+     */
+    kmp_uint8            th_active_in_pool;      // included in count of
+                                                 // #active threads in pool
+    int                  th_active;              // ! sleeping
+                                                 // 32 bits for TCR/TCW
+
+
+    struct cons_header * th_cons; // used for consistency check
+
+/*
+ * Add the synchronizing data which is cache aligned and padded.
+ */
+    KMP_ALIGN_CACHE kmp_balign_t      th_bar[ bs_last_barrier ];
+
+    KMP_ALIGN_CACHE volatile     kmp_int32    th_next_waiting;  /* gtid+1 of next thread on lock wait queue, 0 if none */
+
+#if ( USE_FAST_MEMORY == 3 ) || ( USE_FAST_MEMORY == 5 )
+    #define NUM_LISTS 4
+    kmp_free_list_t   th_free_lists[NUM_LISTS];   // Free lists for fast memory allocation routines
+#endif
+
+#if KMP_OS_WINDOWS
+    kmp_win32_cond_t  th_suspend_cv;
+    kmp_win32_mutex_t th_suspend_mx;
+    int               th_suspend_init;
+#endif
+#if KMP_OS_UNIX
+    kmp_cond_align_t  th_suspend_cv;
+    kmp_mutex_align_t th_suspend_mx;
+    int               th_suspend_init_count;
+#endif
+
+#if USE_ITT_BUILD
+    kmp_itt_mark_t        th_itt_mark_single;
+    // alignment ???
+#endif /* USE_ITT_BUILD */
+#if KMP_STATS_ENABLED
+    kmp_stats_list* th_stats;
+#endif
+} kmp_base_info_t;
+
+typedef union KMP_ALIGN_CACHE kmp_info {
+    double          th_align;        /* use worst case alignment */
+    char            th_pad[ KMP_PAD(kmp_base_info_t, CACHE_LINE) ];
+    kmp_base_info_t th;
+} kmp_info_t;
+
+/* ------------------------------------------------------------------------ */
+// OpenMP thread team data structures
+//
+typedef struct kmp_base_data {
+    volatile kmp_uint32 t_value;
+} kmp_base_data_t;
+
+typedef union KMP_ALIGN_CACHE kmp_sleep_team {
+    double              dt_align;        /* use worst case alignment */
+    char                dt_pad[ KMP_PAD(kmp_base_data_t, CACHE_LINE) ];
+    kmp_base_data_t     dt;
+} kmp_sleep_team_t;
+
+typedef union KMP_ALIGN_CACHE kmp_ordered_team {
+    double              dt_align;        /* use worst case alignment */
+    char                dt_pad[ KMP_PAD(kmp_base_data_t, CACHE_LINE) ];
+    kmp_base_data_t     dt;
+} kmp_ordered_team_t;
+
+typedef int     (*launch_t)( int gtid );
+
+/* Minimum number of ARGV entries to malloc if necessary */
+#define KMP_MIN_MALLOC_ARGV_ENTRIES     100
+
+// Set up how many argv pointers will fit in cache lines containing t_inline_argv. Historically, we
+// have supported at least 96 bytes. Using a larger value to leave more space between the master-write/
+// worker-read section and the read/write-by-all section seems to buy more performance on EPCC PARALLEL.
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+# define KMP_INLINE_ARGV_BYTES         ( 4 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) + 2 * sizeof(kmp_int8) + sizeof(kmp_int16) + sizeof(kmp_uint32) ) % CACHE_LINE ) )
+#else
+# define KMP_INLINE_ARGV_BYTES         ( 2 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) ) % CACHE_LINE ) )
+#endif
+#define KMP_INLINE_ARGV_ENTRIES        (int)( KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP )
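+
+/*
+   Worked example (assuming CACHE_LINE == 64 and KMP_PTR_SKIP == 8 on
+   Intel(R) 64): the non-argv fields counted above occupy
+   3*8 + 2*4 + 2*1 + 2 + 4 = 40 bytes, so
+   KMP_INLINE_ARGV_BYTES   == 4*64 - (40 % 64) == 216 and
+   KMP_INLINE_ARGV_ENTRIES == 216 / 8 == 27 inline argument slots.
+*/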
+
+typedef struct KMP_ALIGN_CACHE kmp_base_team {
+    // Synchronization Data ---------------------------------------------------------------------------------
+    KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered;
+    kmp_balign_team_t        t_bar[ bs_last_barrier ];
+    volatile int             t_construct;    // count of single directive encountered by team
+    kmp_lock_t               t_single_lock;  // team specific lock
+
+    // Master only -----------------------------------------------------------------------------------------
+    KMP_ALIGN_CACHE int      t_master_tid;   // tid of master in parent team
+    int                      t_master_this_cons; // "this_construct" single counter of master in parent team
+    ident_t                 *t_ident;        // if volatile, have to change too much other crud to volatile too
+    kmp_team_p              *t_parent;       // parent team
+    kmp_team_p              *t_next_pool;    // next free team in the team pool
+    kmp_disp_t              *t_dispatch;     // thread's dispatch data
+    kmp_task_team_t         *t_task_team[2]; // Task team struct; switch between 2
+#if OMP_40_ENABLED
+    kmp_proc_bind_t          t_proc_bind;    // bind type for par region
+#endif // OMP_40_ENABLED
+#if USE_ITT_BUILD
+    kmp_uint64               t_region_time;  // region begin timestamp
+#endif /* USE_ITT_BUILD */
+
+    // Master write, workers read --------------------------------------------------------------------------
+    KMP_ALIGN_CACHE void   **t_argv;
+    int                      t_argc;
+    int                      t_nproc;        // number of threads in team
+    microtask_t              t_pkfn;
+    launch_t                 t_invoke;       // procedure to launch the microtask
+
+#if OMPT_SUPPORT
+    ompt_team_info_t         ompt_team_info;
+    ompt_lw_taskteam_t      *ompt_serialized_team_info;
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    kmp_int8                 t_fp_control_saved;
+    kmp_int8                 t_pad2b;
+    kmp_int16                t_x87_fpu_control_word; // FP control regs
+    kmp_uint32               t_mxcsr;
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+    void                    *t_inline_argv[ KMP_INLINE_ARGV_ENTRIES ];
+
+    KMP_ALIGN_CACHE kmp_info_t **t_threads;
+    int                      t_max_argc;
+    int                      t_max_nproc;    // maximum threads this team can handle (dynamically expandable)
+    int                      t_serialized;   // levels deep of serialized teams
+    dispatch_shared_info_t  *t_disp_buffer;  // buffers for dispatch system
+    int                      t_id;           // team's id, assigned by debugger.
+    int                      t_level;        // nested parallel level
+    int                      t_active_level; // nested active parallel level
+    kmp_r_sched_t            t_sched;        // run-time schedule for the team
+#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
+    int                      t_first_place;  // first & last place in parent thread's partition.
+    int                      t_last_place;   // Restore these values to master after par region.
+#endif // OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
+    int t_size_changed; // team size was changed: 0 = no, 1 = yes, -1 = changed via omp_set_num_threads() call
+
+    // Read/write by workers as well -----------------------------------------------------------------------
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    // Using CACHE_LINE=64 reduces memory footprint, but causes a big perf regression of epcc 'parallel'
+    // and 'barrier' on fxe256lin01. This extra padding serves to fix the performance of epcc 'parallel'
+    // and 'barrier' when CACHE_LINE=64. TODO: investigate more and get rid of this padding.
+    char dummy_padding[1024];
+#endif
+    KMP_ALIGN_CACHE kmp_taskdata_t *t_implicit_task_taskdata;  // Taskdata for the thread's implicit task
+    kmp_internal_control_t  *t_control_stack_top;  // internal control stack for additional nested teams.
+                                                   // for SERIALIZED teams nested 2 or more levels deep
+#if OMP_40_ENABLED
+    kmp_int32                t_cancel_request; // typed flag to store request state of cancellation
+#endif
+    int                      t_master_active;  // save on fork, restore on join
+    kmp_taskq_t              t_taskq;          // this team's task queue
+    void                    *t_copypriv_data;  // team specific pointer to copyprivate data array
+    kmp_uint32               t_copyin_counter;
+#if USE_ITT_BUILD
+    void                    *t_stack_id;       // team specific stack stitching id (for ittnotify)
+#endif /* USE_ITT_BUILD */
+} kmp_base_team_t;
+
+union KMP_ALIGN_CACHE kmp_team {
+    kmp_base_team_t     t;
+    double              t_align;       /* use worst case alignment */
+    char                t_pad[ KMP_PAD(kmp_base_team_t, CACHE_LINE) ];
+};
+
+
+typedef union KMP_ALIGN_CACHE kmp_time_global {
+    double              dt_align;        /* use worst case alignment */
+    char                dt_pad[ KMP_PAD(kmp_base_data_t, CACHE_LINE) ];
+    kmp_base_data_t     dt;
+} kmp_time_global_t;
+
+typedef struct kmp_base_global {
+    /* cache-aligned */
+    kmp_time_global_t   g_time;
+
+    /* non cache-aligned */
+    volatile int        g_abort;
+    volatile int        g_done;
+
+    int                 g_dynamic;
+    enum dynamic_mode   g_dynamic_mode;
+
+} kmp_base_global_t;
+
+typedef union KMP_ALIGN_CACHE kmp_global {
+    kmp_base_global_t   g;
+    double              g_align;        /* use worst case alignment */
+    char                g_pad[ KMP_PAD(kmp_base_global_t, CACHE_LINE) ];
+} kmp_global_t;
+
+
+typedef struct kmp_base_root {
+    // TODO: GEH - combine r_active with r_in_parallel then r_active == (r_in_parallel>= 0)
+    // TODO: GEH - then replace r_active with t_active_levels if we can to reduce the synch
+    //             overhead of keeping r_active
+
+    volatile int        r_active;       /* TRUE if some region in a nest has > 1 thread */
+                                        // GEH: This is misnamed, should be r_in_parallel
+    volatile int        r_nested;       // TODO: GEH - This is unused, just remove it entirely.
+    int                 r_in_parallel;  /* keeps a count of active parallel regions per root */
+                                        // GEH: This is misnamed, should be r_active_levels
+    kmp_team_t         *r_root_team;
+    kmp_team_t         *r_hot_team;
+    kmp_info_t         *r_uber_thread;
+    kmp_lock_t          r_begin_lock;
+    volatile int        r_begin;
+    int                 r_blocktime; /* blocktime for this root and descendants */
+} kmp_base_root_t;
+
+typedef union KMP_ALIGN_CACHE kmp_root {
+    kmp_base_root_t     r;
+    double              r_align;        /* use worst case alignment */
+    char                r_pad[ KMP_PAD(kmp_base_root_t, CACHE_LINE) ];
+} kmp_root_t;
+
+struct fortran_inx_info {
+    kmp_int32   data;
+};
+
+/* ------------------------------------------------------------------------ */
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+extern int      __kmp_settings;
+extern int      __kmp_duplicate_library_ok;
+#if USE_ITT_BUILD
+extern int      __kmp_forkjoin_frames;
+extern int      __kmp_forkjoin_frames_mode;
+#endif
+extern PACKED_REDUCTION_METHOD_T __kmp_force_reduction_method;
+extern int      __kmp_determ_red;
+
+#ifdef KMP_DEBUG
+extern int      kmp_a_debug;
+extern int      kmp_b_debug;
+extern int      kmp_c_debug;
+extern int      kmp_d_debug;
+extern int      kmp_e_debug;
+extern int      kmp_f_debug;
+#endif /* KMP_DEBUG */
+
+/* For debug information logging using rotating buffer */
+#define KMP_DEBUG_BUF_LINES_INIT        512
+#define KMP_DEBUG_BUF_LINES_MIN         1
+
+#define KMP_DEBUG_BUF_CHARS_INIT        128
+#define KMP_DEBUG_BUF_CHARS_MIN         2
+
+extern int     __kmp_debug_buf;            /* TRUE means use buffer, FALSE means print to stderr */
+extern int     __kmp_debug_buf_lines;      /* How many lines of debug stored in buffer */
+extern int     __kmp_debug_buf_chars;      /* How many characters allowed per line in buffer */
+extern int     __kmp_debug_buf_atomic;     /* TRUE means use atomic update of buffer entry pointer */
+
+extern char   *__kmp_debug_buffer;         /* Debug buffer itself */
+extern int     __kmp_debug_count;          /* Counter for number of lines printed in buffer so far */
+extern int     __kmp_debug_buf_warn_chars; /* Keep track of char increase recommended in warnings */
+/* end rotating debug buffer */
+
+#ifdef KMP_DEBUG
+extern int      __kmp_par_range;           /* +1 => only go par for constructs in range */
+
+#define KMP_PAR_RANGE_ROUTINE_LEN       1024
+extern char     __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN];
+#define KMP_PAR_RANGE_FILENAME_LEN      1024
+extern char     __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN];
+extern int      __kmp_par_range_lb;
+extern int      __kmp_par_range_ub;
+#endif
+
+/* For printing out dynamic storage map for threads and teams */
+extern int      __kmp_storage_map;         /* True means print storage map for threads and teams */
+extern int      __kmp_storage_map_verbose; /* True means storage map includes placement info */
+extern int      __kmp_storage_map_verbose_specified;
+
+extern kmp_cpuinfo_t    __kmp_cpuinfo;
+
+extern volatile int __kmp_init_serial;
+extern volatile int __kmp_init_gtid;
+extern volatile int __kmp_init_common;
+extern volatile int __kmp_init_middle;
+extern volatile int __kmp_init_parallel;
+extern volatile int __kmp_init_monitor;
+extern volatile int __kmp_init_user_locks;
+extern int __kmp_init_counter;
+extern int __kmp_root_counter;
+extern int __kmp_version;
+
+/* list of address of allocated caches for commons */
+extern kmp_cached_addr_t *__kmp_threadpriv_cache_list;
+
+/* Barrier algorithm types and options */
+extern kmp_uint32    __kmp_barrier_gather_bb_dflt;
+extern kmp_uint32    __kmp_barrier_release_bb_dflt;
+extern kmp_bar_pat_e __kmp_barrier_gather_pat_dflt;
+extern kmp_bar_pat_e __kmp_barrier_release_pat_dflt;
+extern kmp_uint32    __kmp_barrier_gather_branch_bits  [ bs_last_barrier ];
+extern kmp_uint32    __kmp_barrier_release_branch_bits [ bs_last_barrier ];
+extern kmp_bar_pat_e __kmp_barrier_gather_pattern      [ bs_last_barrier ];
+extern kmp_bar_pat_e __kmp_barrier_release_pattern     [ bs_last_barrier ];
+extern char const   *__kmp_barrier_branch_bit_env_name [ bs_last_barrier ];
+extern char const   *__kmp_barrier_pattern_env_name    [ bs_last_barrier ];
+extern char const   *__kmp_barrier_type_name           [ bs_last_barrier ];
+extern char const   *__kmp_barrier_pattern_name        [ bp_last_bar ];
+
+/* Global Locks */
+extern kmp_bootstrap_lock_t __kmp_initz_lock;     /* control initialization */
+extern kmp_bootstrap_lock_t __kmp_forkjoin_lock;  /* control fork/join access */
+extern kmp_bootstrap_lock_t __kmp_exit_lock;      /* exit() is not always thread-safe */
+extern kmp_bootstrap_lock_t __kmp_monitor_lock;   /* control monitor thread creation */
+extern kmp_bootstrap_lock_t __kmp_tp_cached_lock; /* used for the hack to allow threadprivate cache and __kmp_threads expansion to co-exist */
+
+extern kmp_lock_t __kmp_global_lock;    /* control OS/global access  */
+extern kmp_queuing_lock_t __kmp_dispatch_lock;  /* control dispatch access  */
+extern kmp_lock_t __kmp_debug_lock;     /* control I/O access for KMP_DEBUG */
+
+/* used for yielding spin-waits */
+extern unsigned int __kmp_init_wait;    /* initial number of spin-tests   */
+extern unsigned int __kmp_next_wait;    /* subsequent number of spin-tests */
+
+extern enum library_type __kmp_library;
+
+extern enum sched_type  __kmp_sched;    /* default runtime scheduling */
+extern enum sched_type  __kmp_static;   /* default static scheduling method */
+extern enum sched_type  __kmp_guided;   /* default guided scheduling method */
+extern enum sched_type  __kmp_auto;     /* default auto scheduling method */
+extern int              __kmp_chunk;    /* default runtime chunk size */
+
+extern size_t     __kmp_stksize;        /* stack size per thread         */
+extern size_t     __kmp_monitor_stksize;/* stack size for monitor thread */
+extern size_t     __kmp_stkoffset;      /* stack offset per thread       */
+extern int        __kmp_stkpadding;     /* Should we pad root thread(s) stack */
+
+extern size_t     __kmp_malloc_pool_incr; /* incremental size of pool for kmp_malloc() */
+extern int        __kmp_env_chunk;      /* was KMP_CHUNK specified?     */
+extern int        __kmp_env_stksize;    /* was KMP_STACKSIZE specified? */
+extern int        __kmp_env_omp_stksize;/* was OMP_STACKSIZE specified? */
+extern int        __kmp_env_all_threads;    /* was KMP_ALL_THREADS or KMP_MAX_THREADS specified? */
+extern int        __kmp_env_omp_all_threads;/* was OMP_THREAD_LIMIT specified? */
+extern int        __kmp_env_blocktime;  /* was KMP_BLOCKTIME specified? */
+extern int        __kmp_env_checks;     /* was KMP_CHECKS specified?    */
+extern int        __kmp_env_consistency_check;     /* was KMP_CONSISTENCY_CHECK specified?    */
+extern int        __kmp_generate_warnings; /* should we issue warnings? */
+extern int        __kmp_reserve_warn;   /* have we issued reserve_threads warning? */
+
+#ifdef DEBUG_SUSPEND
+extern int        __kmp_suspend_count;  /* count inside __kmp_suspend_template() */
+#endif
+
+extern kmp_uint32 __kmp_yield_init;
+extern kmp_uint32 __kmp_yield_next;
+extern kmp_uint32 __kmp_yielding_on;
+extern kmp_uint32 __kmp_yield_cycle;
+extern kmp_int32  __kmp_yield_on_count;
+extern kmp_int32  __kmp_yield_off_count;
+
+
+/* ------------------------------------------------------------------------- */
+extern int        __kmp_allThreadsSpecified;
+
+extern size_t     __kmp_align_alloc;
+/* following data protected by initialization routines */
+extern int        __kmp_xproc;          /* number of processors in the system */
+extern int        __kmp_avail_proc;      /* number of processors available to the process */
+extern size_t     __kmp_sys_min_stksize; /* system-defined minimum stack size */
+extern int        __kmp_sys_max_nth;    /* system-imposed maximum number of threads */
+extern int        __kmp_max_nth;        /* maximum total number of concurrently-existing threads */
+extern int        __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and __kmp_root */
+extern int        __kmp_dflt_team_nth;  /* default number of threads in a parallel region a la OMP_NUM_THREADS */
+extern int        __kmp_dflt_team_nth_ub; /* upper bound on __kmp_dflt_team_nth, determined at serial initialization */
+extern int        __kmp_tp_capacity;    /* capacity of __kmp_threads if threadprivate is used (fixed) */
+extern int        __kmp_tp_cached;      /* whether threadprivate cache has been created (__kmpc_threadprivate_cached()) */
+extern int        __kmp_dflt_nested;    /* nested parallelism enabled by default a la OMP_NESTED */
+extern int        __kmp_dflt_blocktime; /* number of milliseconds to wait before blocking (env setting) */
+extern int        __kmp_monitor_wakeups;/* number of times monitor wakes up per second */
+extern int        __kmp_bt_intervals;   /* number of monitor timestamp intervals before blocking */
+#ifdef KMP_ADJUST_BLOCKTIME
+extern int        __kmp_zero_bt;        /* whether blocktime has been forced to zero */
+#endif /* KMP_ADJUST_BLOCKTIME */
+#ifdef KMP_DFLT_NTH_CORES
+extern int        __kmp_ncores;         /* Total number of cores for threads placement */
+#endif
+extern int        __kmp_abort_delay;    /* Number of millisecs to delay on abort for VTune */
+
+extern int        __kmp_need_register_atfork_specified;
+extern int        __kmp_need_register_atfork;/* At initialization, call pthread_atfork to install fork handler */
+extern int        __kmp_gtid_mode;      /* Method of getting gtid, values:
+                                           0 - not set, will be set at runtime
+                                           1 - using stack search
+                                           2 - dynamic TLS (pthread_getspecific(Linux* OS/OS X*) or TlsGetValue(Windows* OS))
+                                           3 - static TLS (__declspec(thread) __kmp_gtid), Linux* OS .so only.
+                                         */
+extern int        __kmp_adjust_gtid_mode; /* If true, adjust method based on #threads */
+#ifdef KMP_TDATA_GTID
+#if KMP_OS_WINDOWS
+extern __declspec(thread) int __kmp_gtid; /* This thread's gtid, if __kmp_gtid_mode == 3 */
+#else
+extern __thread int __kmp_gtid;
+#endif /* KMP_OS_WINDOWS - workaround because Intel(R) Many Integrated Core compiler 20110316 doesn't accept __declspec */
+#endif
+extern int        __kmp_tls_gtid_min;   /* #threads below which use sp search for gtid */
+extern int        __kmp_foreign_tp;     /* If true, separate TP var for each foreign thread */
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+extern int        __kmp_inherit_fp_control; /* copy fp creg(s) parent->workers at fork */
+extern kmp_int16  __kmp_init_x87_fpu_control_word; /* init thread's FP control reg */
+extern kmp_uint32 __kmp_init_mxcsr;      /* init thread's mxcsr */
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+extern int        __kmp_dflt_max_active_levels; /* max_active_levels for nested parallelism enabled by default a la OMP_MAX_ACTIVE_LEVELS */
+#if KMP_NESTED_HOT_TEAMS
+extern int        __kmp_hot_teams_mode;
+extern int        __kmp_hot_teams_max_level;
+#endif
+
+# if KMP_OS_LINUX
+extern enum clock_function_type __kmp_clock_function;
+extern int __kmp_clock_function_param;
+# endif /* KMP_OS_LINUX */
+
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+extern enum mic_type __kmp_mic_type;
+#endif
+
+# ifdef USE_LOAD_BALANCE
+extern double      __kmp_load_balance_interval;   /* Interval for the load balance algorithm */
+# endif /* USE_LOAD_BALANCE */
+
+// OpenMP 3.1 - Nested num threads array
+typedef struct kmp_nested_nthreads_t {
+    int * nth;
+    int   size;
+    int   used;
+} kmp_nested_nthreads_t;
+
+extern kmp_nested_nthreads_t __kmp_nested_nth;
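+/* Worked example (illustrative): a list such as OMP_NUM_THREADS="4,2" is
+   parsed into this structure roughly as nth = {4, 2}, used = 2, size >= used,
+   so the outermost parallel region defaults to 4 threads and the next
+   nesting level to 2. */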
+
+#if KMP_USE_ADAPTIVE_LOCKS
+
+// Parameters for the speculative lock backoff system.
+struct kmp_adaptive_backoff_params_t {
+    // Number of soft retries before it counts as a hard retry.
+    kmp_uint32 max_soft_retries;
+    // Badness is a bit mask: 0,1,3,7,15,...; on each hard failure we advance one step to the right in this sequence.
+    kmp_uint32 max_badness;
+};
+
+extern kmp_adaptive_backoff_params_t __kmp_adaptive_backoff_params;
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+extern char * __kmp_speculative_statsfile;
+#endif
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+#if OMP_40_ENABLED
+extern int __kmp_display_env;           /* TRUE or FALSE */
+extern int __kmp_display_env_verbose;   /* TRUE if OMP_DISPLAY_ENV=VERBOSE */
+extern int __kmp_omp_cancellation;      /* TRUE or FALSE */
+#endif
+
+/* ------------------------------------------------------------------------- */
+
+/* --------------------------------------------------------------------------- */
+/* the following are protected by the fork/join lock */
+/* write: lock  read: anytime */
+extern          kmp_info_t **__kmp_threads;      /* Descriptors for the threads */
+/* read/write: lock */
+extern volatile kmp_team_t  *     __kmp_team_pool;
+extern volatile kmp_info_t  *     __kmp_thread_pool;
+
+/* total number of threads reachable from some root thread including all root threads*/
+extern volatile int __kmp_nth;
+/* total number of threads reachable from some root thread including all root threads,
+   and those in the thread pool */
+extern volatile int __kmp_all_nth;
+extern int __kmp_thread_pool_nth;
+extern volatile int __kmp_thread_pool_active_nth;
+
+extern kmp_root_t **__kmp_root;         /* root of thread hierarchy */
+/* end data protected by fork/join lock */
+/* --------------------------------------------------------------------------- */
+
+extern kmp_global_t  __kmp_global;         /* global status */
+
+extern kmp_info_t __kmp_monitor;
+extern volatile kmp_uint32 __kmp_team_counter;      // Used by Debugging Support Library.
+extern volatile kmp_uint32 __kmp_task_counter;      // Used by Debugging Support Library.
+
+#if USE_DEBUGGER
+
+#define _KMP_GEN_ID( counter )                                         \
+    (                                                                  \
+        __kmp_debugging                                                \
+        ?                                                              \
+        KMP_TEST_THEN_INC32( (volatile kmp_int32 *) & counter ) + 1    \
+        :                                                              \
+        ~ 0                                                            \
+    )
+#else
+#define _KMP_GEN_ID( counter )                                         \
+    (                                                                  \
+        ~ 0                                                            \
+    )
+#endif /* USE_DEBUGGER */
+
+#define KMP_GEN_TASK_ID()    _KMP_GEN_ID( __kmp_task_counter )
+#define KMP_GEN_TEAM_ID()    _KMP_GEN_ID( __kmp_team_counter )
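+/* Usage sketch (illustrative; the exact consumers live in the runtime
+   sources): an id is drawn at team or task creation, e.g.
+       team->t.t_id = KMP_GEN_TEAM_ID();
+   yielding increasing positive ids when __kmp_debugging is set and ~0
+   otherwise. */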
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+extern void __kmp_print_storage_map_gtid( int gtid, void *p1, void* p2, size_t size, char const *format, ... );
+
+extern void __kmp_serial_initialize( void );
+extern void __kmp_middle_initialize( void );
+extern void __kmp_parallel_initialize( void );
+
+extern void __kmp_internal_begin( void );
+extern void __kmp_internal_end_library( int gtid );
+extern void __kmp_internal_end_thread( int gtid );
+extern void __kmp_internal_end_atexit( void );
+extern void __kmp_internal_end_fini( void );
+extern void __kmp_internal_end_dtor( void );
+extern void __kmp_internal_end_dest( void* );
+
+extern int  __kmp_register_root( int initial_thread );
+extern void __kmp_unregister_root( int gtid );
+
+extern int  __kmp_ignore_mppbeg( void );
+extern int  __kmp_ignore_mppend( void );
+
+extern int  __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws );
+extern void __kmp_exit_single( int gtid );
+
+extern void __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref );
+extern void __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref );
+
+
+#ifdef USE_LOAD_BALANCE
+extern int  __kmp_get_load_balance( int );
+#endif
+
+#ifdef BUILD_TV
+extern void __kmp_tv_threadprivate_store( kmp_info_t *th, void *global_addr, void *thread_addr );
+#endif
+
+extern int  __kmp_get_global_thread_id( void );
+extern int  __kmp_get_global_thread_id_reg( void );
+extern void __kmp_exit_thread( int exit_status );
+extern void __kmp_abort( char const * format, ... );
+extern void __kmp_abort_thread( void );
+extern void __kmp_abort_process( void );
+extern void __kmp_warn( char const * format, ... );
+
+extern void __kmp_set_num_threads( int new_nth, int gtid );
+
+// Returns current thread (pointer to kmp_info_t). Current thread *must* be registered.
+static inline kmp_info_t * __kmp_entry_thread()
+{
+      int gtid = __kmp_entry_gtid();
+
+      return __kmp_threads[gtid];
+}
+
+extern void __kmp_set_max_active_levels( int gtid, int new_max_active_levels );
+extern int  __kmp_get_max_active_levels( int gtid );
+extern int  __kmp_get_ancestor_thread_num( int gtid, int level );
+extern int  __kmp_get_team_size( int gtid, int level );
+extern void __kmp_set_schedule( int gtid, kmp_sched_t new_sched, int chunk );
+extern void __kmp_get_schedule( int gtid, kmp_sched_t * sched, int * chunk );
+
+extern unsigned short __kmp_get_random( kmp_info_t * thread );
+extern void __kmp_init_random( kmp_info_t * thread );
+
+extern kmp_r_sched_t __kmp_get_schedule_global( void );
+extern void __kmp_adjust_num_threads( int new_nproc );
+
+extern void * ___kmp_allocate( size_t size KMP_SRC_LOC_DECL );
+extern void * ___kmp_page_allocate( size_t size KMP_SRC_LOC_DECL );
+extern void   ___kmp_free( void * ptr KMP_SRC_LOC_DECL );
+#define __kmp_allocate( size )      ___kmp_allocate( (size) KMP_SRC_LOC_CURR )
+#define __kmp_page_allocate( size ) ___kmp_page_allocate( (size) KMP_SRC_LOC_CURR )
+#define __kmp_free( ptr )           ___kmp_free( (ptr) KMP_SRC_LOC_CURR )
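+/* Minimal sketch of the source-location plumbing (assuming the debug-build
+   definitions of KMP_SRC_LOC_DECL / KMP_SRC_LOC_CURR used elsewhere in the
+   runtime): in a debug build,
+       __kmp_allocate( len )
+   expands to roughly
+       ___kmp_allocate( (len), __FILE__, __LINE__ )
+   so the allocator can report the call site; release builds pass no extra
+   arguments. */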
+
+#if USE_FAST_MEMORY
+extern void * ___kmp_fast_allocate( kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL );
+extern void   ___kmp_fast_free( kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL );
+extern void   __kmp_free_fast_memory( kmp_info_t *this_thr );
+extern void   __kmp_initialize_fast_memory( kmp_info_t *this_thr );
+#define __kmp_fast_allocate( this_thr, size ) ___kmp_fast_allocate( (this_thr), (size) KMP_SRC_LOC_CURR )
+#define __kmp_fast_free( this_thr, ptr )      ___kmp_fast_free( (this_thr), (ptr) KMP_SRC_LOC_CURR )
+#endif
+
+extern void * ___kmp_thread_malloc( kmp_info_t *th, size_t size KMP_SRC_LOC_DECL );
+extern void * ___kmp_thread_calloc( kmp_info_t *th, size_t nelem, size_t elsize KMP_SRC_LOC_DECL );
+extern void * ___kmp_thread_realloc( kmp_info_t *th, void *ptr, size_t size KMP_SRC_LOC_DECL );
+extern void   ___kmp_thread_free( kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL );
+#define __kmp_thread_malloc(  th, size )          ___kmp_thread_malloc(  (th), (size)            KMP_SRC_LOC_CURR )
+#define __kmp_thread_calloc(  th, nelem, elsize ) ___kmp_thread_calloc(  (th), (nelem), (elsize) KMP_SRC_LOC_CURR )
+#define __kmp_thread_realloc( th, ptr, size )     ___kmp_thread_realloc( (th), (ptr), (size)     KMP_SRC_LOC_CURR )
+#define __kmp_thread_free(    th, ptr )           ___kmp_thread_free(    (th), (ptr)             KMP_SRC_LOC_CURR )
+
+#define KMP_INTERNAL_MALLOC(sz)    malloc(sz)
+#define KMP_INTERNAL_FREE(p)       free(p)
+#define KMP_INTERNAL_REALLOC(p,sz) realloc((p),(sz))
+#define KMP_INTERNAL_CALLOC(n,sz)  calloc((n),(sz))
+
+extern void __kmp_push_num_threads( ident_t *loc, int gtid, int num_threads );
+
+#if OMP_40_ENABLED
+extern void __kmp_push_proc_bind( ident_t *loc, int gtid, kmp_proc_bind_t proc_bind );
+extern void __kmp_push_num_teams( ident_t *loc, int gtid, int num_teams, int num_threads );
+#endif
+
+extern void __kmp_yield( int cond );
+
+extern void __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid,
+    enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
+    kmp_int32 chunk );
+extern void __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid,
+    enum sched_type schedule, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
+    kmp_int32 chunk );
+extern void __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid,
+    enum sched_type schedule, kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
+    kmp_int64 chunk );
+extern void __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid,
+    enum sched_type schedule, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
+    kmp_int64 chunk );
+
+extern int __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid,
+    kmp_int32 *p_last, kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st );
+extern int __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid,
+    kmp_int32 *p_last, kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st );
+extern int __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid,
+    kmp_int32 *p_last, kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st );
+extern int __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid,
+    kmp_int32 *p_last, kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st );
+
+extern void __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid );
+extern void __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid );
+extern void __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid );
+extern void __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid );
+
+
+#ifdef KMP_GOMP_COMPAT
+
+extern void __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid,
+    enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
+    kmp_int32 chunk, int push_ws );
+extern void __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid,
+    enum sched_type schedule, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
+    kmp_int32 chunk, int push_ws );
+extern void __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid,
+    enum sched_type schedule, kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
+    kmp_int64 chunk, int push_ws );
+extern void __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid,
+    enum sched_type schedule, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
+    kmp_int64 chunk, int push_ws );
+extern void __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid );
+extern void __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid );
+extern void __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid );
+extern void __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid );
+
+#endif /* KMP_GOMP_COMPAT */
+
+
+extern kmp_uint32 __kmp_eq_4(  kmp_uint32 value, kmp_uint32 checker );
+extern kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker );
+extern kmp_uint32 __kmp_lt_4(  kmp_uint32 value, kmp_uint32 checker );
+extern kmp_uint32 __kmp_ge_4(  kmp_uint32 value, kmp_uint32 checker );
+extern kmp_uint32 __kmp_le_4(  kmp_uint32 value, kmp_uint32 checker );
+
+extern kmp_uint32 __kmp_eq_8(  kmp_uint64 value, kmp_uint64 checker );
+extern kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker );
+extern kmp_uint32 __kmp_lt_8(  kmp_uint64 value, kmp_uint64 checker );
+extern kmp_uint32 __kmp_ge_8(  kmp_uint64 value, kmp_uint64 checker );
+extern kmp_uint32 __kmp_le_8(  kmp_uint64 value, kmp_uint64 checker );
+
+extern kmp_uint32 __kmp_wait_yield_4( kmp_uint32 volatile * spinner, kmp_uint32 checker, kmp_uint32 (*pred) (kmp_uint32, kmp_uint32), void * obj );
+extern kmp_uint64 __kmp_wait_yield_8( kmp_uint64 volatile * spinner, kmp_uint64 checker, kmp_uint32 (*pred) (kmp_uint64, kmp_uint64), void * obj );
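+/* Usage sketch (illustrative; 'spin_var' is a hypothetical flag variable):
+       __kmp_wait_yield_4( &spin_var, 1, __kmp_eq_4, NULL );
+   spins, yielding per the library mode, until spin_var == 1; the predicate
+   is one of the __kmp_{eq,neq,lt,ge,le}_{4,8} helpers declared above. */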
+
+class kmp_flag_32;
+class kmp_flag_64;
+class kmp_flag_oncore;
+extern void __kmp_wait_32(kmp_info_t *this_thr, kmp_flag_32 *flag, int final_spin
+#if USE_ITT_BUILD
+                   , void * itt_sync_obj
+#endif
+                   );
+extern void __kmp_release_32(kmp_flag_32 *flag);
+extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, int final_spin
+#if USE_ITT_BUILD
+                   , void * itt_sync_obj
+#endif
+                   );
+extern void __kmp_release_64(kmp_flag_64 *flag);
+extern void __kmp_wait_oncore(kmp_info_t *this_thr, kmp_flag_oncore *flag, int final_spin
+#if USE_ITT_BUILD
+                   , void * itt_sync_obj
+#endif
+                   );
+extern void __kmp_release_oncore(kmp_flag_oncore *flag);
+
+extern void __kmp_infinite_loop( void );
+
+extern void __kmp_cleanup( void );
+
+#if KMP_HANDLE_SIGNALS
+    extern int  __kmp_handle_signals;
+    extern void __kmp_install_signals( int parallel_init );
+    extern void __kmp_remove_signals( void );
+#endif
+
+extern void __kmp_clear_system_time( void );
+extern void __kmp_read_system_time( double *delta );
+
+extern void __kmp_check_stack_overlap( kmp_info_t *thr );
+
+extern void __kmp_expand_host_name( char *buffer, size_t size );
+extern void __kmp_expand_file_name( char *result, size_t rlen, char *pattern );
+
+#if KMP_OS_WINDOWS
+extern void __kmp_initialize_system_tick( void );  /* Initialize timer tick value */
+#endif
+
+extern void __kmp_runtime_initialize( void );  /* machine specific initialization */
+extern void __kmp_runtime_destroy( void );
+
+#if KMP_AFFINITY_SUPPORTED
+extern char *__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask);
+extern void __kmp_affinity_initialize(void);
+extern void __kmp_affinity_uninitialize(void);
+extern void __kmp_affinity_set_init_mask(int gtid, int isa_root); /* set affinity according to KMP_AFFINITY */
+#if OMP_40_ENABLED
+extern void __kmp_affinity_set_place(int gtid);
+#endif
+extern void __kmp_affinity_determine_capable( const char *env_var );
+extern int __kmp_aux_set_affinity(void **mask);
+extern int __kmp_aux_get_affinity(void **mask);
+extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
+extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
+extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
+extern void __kmp_balanced_affinity( int tid, int team_size );
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+extern void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar);
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+
+extern int __kmp_futex_determine_capable( void );
+
+#endif // KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+
+extern void __kmp_gtid_set_specific( int gtid );
+extern int  __kmp_gtid_get_specific( void );
+
+extern double __kmp_read_cpu_time( void );
+
+extern int  __kmp_read_system_info( struct kmp_sys_info *info );
+
+extern void __kmp_create_monitor( kmp_info_t *th );
+
+extern void *__kmp_launch_thread( kmp_info_t *thr );
+
+extern void __kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size );
+
+#if KMP_OS_WINDOWS
+extern int  __kmp_still_running(kmp_info_t *th);
+extern int  __kmp_is_thread_alive( kmp_info_t * th, DWORD *exit_val );
+extern void __kmp_free_handle( kmp_thread_t tHandle );
+#endif
+
+extern void __kmp_reap_monitor( kmp_info_t *th );
+extern void __kmp_reap_worker( kmp_info_t *th );
+extern void __kmp_terminate_thread( int gtid );
+
+extern void __kmp_suspend_32( int th_gtid, kmp_flag_32 *flag );
+extern void __kmp_suspend_64( int th_gtid, kmp_flag_64 *flag );
+extern void __kmp_suspend_oncore( int th_gtid, kmp_flag_oncore *flag );
+extern void __kmp_resume_32( int target_gtid, kmp_flag_32 *flag );
+extern void __kmp_resume_64( int target_gtid, kmp_flag_64 *flag );
+extern void __kmp_resume_oncore( int target_gtid, kmp_flag_oncore *flag );
+
+extern void __kmp_elapsed( double * );
+extern void __kmp_elapsed_tick( double * );
+
+extern void __kmp_enable( int old_state );
+extern void __kmp_disable( int *old_state );
+
+extern void __kmp_thread_sleep( int millis );
+
+extern void __kmp_common_initialize( void );
+extern void __kmp_common_destroy( void );
+extern void __kmp_common_destroy_gtid( int gtid );
+
+#if KMP_OS_UNIX
+extern void __kmp_register_atfork( void );
+#endif
+extern void __kmp_suspend_initialize( void );
+extern void __kmp_suspend_uninitialize_thread( kmp_info_t *th );
+
+extern kmp_info_t * __kmp_allocate_thread( kmp_root_t *root,
+                                           kmp_team_t *team, int tid);
+#if OMP_40_ENABLED
+extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
+#if OMPT_SUPPORT
+                                         ompt_parallel_id_t ompt_parallel_id,
+#endif
+                                         kmp_proc_bind_t proc_bind,
+                                         kmp_internal_control_t *new_icvs,
+                                         int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) );
+#else
+extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
+#if OMPT_SUPPORT
+                                         ompt_parallel_id_t ompt_parallel_id,
+#endif
+                                         kmp_internal_control_t *new_icvs,
+                                         int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) );
+#endif // OMP_40_ENABLED
+extern void __kmp_free_thread( kmp_info_t * );
+extern void __kmp_free_team( kmp_root_t *, kmp_team_t *  USE_NESTED_HOT_ARG(kmp_info_t *) );
+extern kmp_team_t * __kmp_reap_team( kmp_team_t * );
+
+/* ------------------------------------------------------------------------ */
+
+extern void __kmp_initialize_bget( kmp_info_t *th );
+extern void __kmp_finalize_bget( kmp_info_t *th );
+
+KMP_EXPORT void *kmpc_malloc( size_t size );
+KMP_EXPORT void *kmpc_calloc( size_t nelem, size_t elsize );
+KMP_EXPORT void *kmpc_realloc( void *ptr, size_t size );
+KMP_EXPORT void  kmpc_free( void *ptr );
+
+/* ------------------------------------------------------------------------ */
+/* declarations for internal use */
+
+extern int  __kmp_barrier( enum barrier_type bt, int gtid, int is_split,
+                           size_t reduce_size, void *reduce_data, void (*reduce)(void *, void *) );
+extern void __kmp_end_split_barrier ( enum barrier_type bt, int gtid );
+
+/*!
+ * Tell the fork call which compiler generated the fork call, and therefore how to deal with the call.
+ */
+enum fork_context_e
+{
+    fork_context_gnu,                           /**< Called from GNU generated code, so must not invoke the microtask internally. */
+    fork_context_intel,                         /**< Called from Intel generated code.  */
+    fork_context_last
+};
+extern int __kmp_fork_call( ident_t *loc, int gtid, enum fork_context_e fork_context,
+  kmp_int32 argc,
+#if OMPT_SUPPORT
+  void *unwrapped_task,
+#endif
+  microtask_t microtask, launch_t invoker,
+/* TODO: revert workaround for Intel(R) 64 tracker #96 */
+#if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+                             va_list *ap
+#else
+                             va_list ap
+#endif
+                             );
+
+extern void __kmp_join_call( ident_t *loc, int gtid
+#if OMP_40_ENABLED
+                           , int exit_teams = 0
+#endif
+                           );
+
+extern void __kmp_serialized_parallel(ident_t *id, kmp_int32 gtid);
+extern void __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team );
+extern void __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team );
+extern int __kmp_invoke_task_func( int gtid );
+extern void __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr, kmp_team_t *team );
+extern void __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr, kmp_team_t *team );
+
+// should never have been exported
+KMP_EXPORT int __kmpc_invoke_task_func( int gtid );
+#if OMP_40_ENABLED
+extern int  __kmp_invoke_teams_master( int gtid );
+extern void __kmp_teams_master( int gtid );
+#endif
+extern void __kmp_save_internal_controls( kmp_info_t * thread );
+extern void __kmp_user_set_library (enum library_type arg);
+extern void __kmp_aux_set_library (enum library_type arg);
+extern void __kmp_aux_set_stacksize( size_t arg);
+extern void __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid);
+extern void __kmp_aux_set_defaults( char const * str, int len );
+
+/* The functions below are placed here so they can be called from __kmp_aux_env_initialize() in kmp_settings.c */
+void kmpc_set_blocktime (int arg);
+void ompc_set_nested( int flag );
+void ompc_set_dynamic( int flag );
+void ompc_set_num_threads( int arg );
+
+extern void __kmp_push_current_task_to_thread( kmp_info_t *this_thr,
+                  kmp_team_t *team, int tid );
+extern void __kmp_pop_current_task_from_thread( kmp_info_t *this_thr );
+extern kmp_task_t* __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid,
+  kmp_tasking_flags_t *flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry );
+#if OMPT_SUPPORT
+extern void __kmp_task_init_ompt( kmp_taskdata_t * task, int tid );
+#endif
+extern void __kmp_init_implicit_task( ident_t *loc_ref, kmp_info_t *this_thr,
+                  kmp_team_t *team, int tid, int set_curr_task );
+
+int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
+                           int *thread_finished,
+#if USE_ITT_BUILD
+                           void * itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+                           kmp_int32 is_constrained);
+int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
+                           int *thread_finished,
+#if USE_ITT_BUILD
+                           void * itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+                           kmp_int32 is_constrained);
+int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
+                               int *thread_finished,
+#if USE_ITT_BUILD
+                               void * itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+                               kmp_int32 is_constrained);
+
+extern void __kmp_reap_task_teams( void );
+extern void __kmp_unref_task_team( kmp_task_team_t *task_team, kmp_info_t *thread );
+extern void __kmp_wait_to_unref_task_teams( void );
+extern void __kmp_task_team_setup ( kmp_info_t *this_thr, kmp_team_t *team, int both, int always );
+extern void __kmp_task_team_sync  ( kmp_info_t *this_thr, kmp_team_t *team );
+extern void __kmp_task_team_wait  ( kmp_info_t *this_thr, kmp_team_t *team
+#if USE_ITT_BUILD
+                                    , void * itt_sync_obj
+#endif /* USE_ITT_BUILD */
+);
+extern void __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid );
+
+extern int  __kmp_is_address_mapped( void *addr );
+extern kmp_uint64 __kmp_hardware_timestamp(void);
+
+#if KMP_OS_UNIX
+extern int  __kmp_read_from_file( char const *path, char const *format, ... );
+#endif
+
+/* ------------------------------------------------------------------------ */
+//
+// Assembly routines that have no compiler intrinsic replacement
+//
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+extern void       __kmp_query_cpuid( kmp_cpuinfo_t *p );
+
+#define __kmp_load_mxcsr(p) _mm_setcsr(*(p))
+static inline void __kmp_store_mxcsr( kmp_uint32 *p ) { *p = _mm_getcsr(); }
+
+extern void __kmp_load_x87_fpu_control_word( kmp_int16 *p );
+extern void __kmp_store_x87_fpu_control_word( kmp_int16 *p );
+extern void __kmp_clear_x87_fpu_status_word();
+# define KMP_X86_MXCSR_MASK      0xffffffc0   /* ignore status flags (6 lsb) */
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+extern int __kmp_invoke_microtask( microtask_t pkfn, int gtid, int npr, int argc, void *argv[]
+#if OMPT_SUPPORT
+                                   , void **exit_frame_ptr
+#endif
+);
+
+
+/* ------------------------------------------------------------------------ */
+
+KMP_EXPORT void   __kmpc_begin                ( ident_t *, kmp_int32 flags );
+KMP_EXPORT void   __kmpc_end                  ( ident_t * );
+
+KMP_EXPORT void   __kmpc_threadprivate_register_vec ( ident_t *, void * data, kmpc_ctor_vec ctor,
+                                                  kmpc_cctor_vec cctor, kmpc_dtor_vec dtor, size_t vector_length );
+KMP_EXPORT void   __kmpc_threadprivate_register     ( ident_t *, void * data, kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor );
+KMP_EXPORT void * __kmpc_threadprivate              ( ident_t *, kmp_int32 global_tid, void * data, size_t size );
+
+KMP_EXPORT kmp_int32  __kmpc_global_thread_num  ( ident_t * );
+KMP_EXPORT kmp_int32  __kmpc_global_num_threads ( ident_t * );
+KMP_EXPORT kmp_int32  __kmpc_bound_thread_num   ( ident_t * );
+KMP_EXPORT kmp_int32  __kmpc_bound_num_threads  ( ident_t * );
+
+KMP_EXPORT kmp_int32  __kmpc_ok_to_fork     ( ident_t * );
+KMP_EXPORT void   __kmpc_fork_call          ( ident_t *, kmp_int32 nargs, kmpc_micro microtask, ... );
+
+KMP_EXPORT void   __kmpc_serialized_parallel     ( ident_t *, kmp_int32 global_tid );
+KMP_EXPORT void   __kmpc_end_serialized_parallel ( ident_t *, kmp_int32 global_tid );
+
+KMP_EXPORT void   __kmpc_flush              ( ident_t *);
+KMP_EXPORT void   __kmpc_barrier            ( ident_t *, kmp_int32 global_tid );
+KMP_EXPORT kmp_int32  __kmpc_master         ( ident_t *, kmp_int32 global_tid );
+KMP_EXPORT void   __kmpc_end_master         ( ident_t *, kmp_int32 global_tid );
+KMP_EXPORT void   __kmpc_ordered            ( ident_t *, kmp_int32 global_tid );
+KMP_EXPORT void   __kmpc_end_ordered        ( ident_t *, kmp_int32 global_tid );
+KMP_EXPORT void   __kmpc_critical           ( ident_t *, kmp_int32 global_tid, kmp_critical_name * );
+KMP_EXPORT void   __kmpc_end_critical       ( ident_t *, kmp_int32 global_tid, kmp_critical_name * );
+
+KMP_EXPORT kmp_int32  __kmpc_barrier_master ( ident_t *, kmp_int32 global_tid );
+KMP_EXPORT void   __kmpc_end_barrier_master ( ident_t *, kmp_int32 global_tid );
+
+KMP_EXPORT kmp_int32  __kmpc_barrier_master_nowait ( ident_t *, kmp_int32 global_tid );
+
+KMP_EXPORT kmp_int32  __kmpc_single         ( ident_t *, kmp_int32 global_tid );
+KMP_EXPORT void   __kmpc_end_single         ( ident_t *, kmp_int32 global_tid );
+
+KMP_EXPORT void KMPC_FOR_STATIC_INIT    ( ident_t *loc, kmp_int32 global_tid, kmp_int32 schedtype, kmp_int32 *plastiter,
+                                          kmp_int *plower, kmp_int *pupper, kmp_int *pstride, kmp_int incr, kmp_int chunk );
+
+KMP_EXPORT void __kmpc_for_static_fini  ( ident_t *loc, kmp_int32 global_tid );
+
+KMP_EXPORT void __kmpc_copyprivate( ident_t *loc, kmp_int32 global_tid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit );
+
+extern void KMPC_SET_NUM_THREADS        ( int arg );
+extern void KMPC_SET_DYNAMIC            ( int flag );
+extern void KMPC_SET_NESTED             ( int flag );
+
+/* --------------------------------------------------------------------------- */
+
+/*
+ * Taskq interface routines
+ */
+
+KMP_EXPORT kmpc_thunk_t * __kmpc_taskq (ident_t *loc, kmp_int32 global_tid, kmpc_task_t taskq_task, size_t sizeof_thunk,
+                                        size_t sizeof_shareds, kmp_int32 flags, kmpc_shared_vars_t **shareds);
+KMP_EXPORT void __kmpc_end_taskq (ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk);
+KMP_EXPORT kmp_int32 __kmpc_task (ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk);
+KMP_EXPORT void __kmpc_taskq_task (ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, kmp_int32 status);
+KMP_EXPORT void __kmpc_end_taskq_task (ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk);
+KMP_EXPORT kmpc_thunk_t * __kmpc_task_buffer (ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk, kmpc_task_t task);
+
+/* ------------------------------------------------------------------------ */
+
+/*
+ * OMP 3.0 tasking interface routines
+ */
+
+KMP_EXPORT kmp_int32
+__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task );
+KMP_EXPORT kmp_task_t*
+__kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+                       size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                       kmp_routine_entry_t task_entry );
+KMP_EXPORT void
+__kmpc_omp_task_begin_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task );
+KMP_EXPORT void
+__kmpc_omp_task_complete_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task );
+KMP_EXPORT kmp_int32
+__kmpc_omp_task_parts( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task );
+KMP_EXPORT kmp_int32
+__kmpc_omp_taskwait( ident_t *loc_ref, kmp_int32 gtid );
+
+KMP_EXPORT kmp_int32
+__kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part );
+
+#if TASK_UNUSED
+void __kmpc_omp_task_begin( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task );
+void __kmpc_omp_task_complete( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task );
+#endif // TASK_UNUSED
+
+/* ------------------------------------------------------------------------ */
+
+#if OMP_40_ENABLED
+
+KMP_EXPORT void __kmpc_taskgroup( ident_t * loc, int gtid );
+KMP_EXPORT void __kmpc_end_taskgroup( ident_t * loc, int gtid );
+
+KMP_EXPORT kmp_int32 __kmpc_omp_task_with_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task,
+                                                 kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+                                                 kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list );
+KMP_EXPORT void __kmpc_omp_wait_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+                                          kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list );
+extern void __kmp_release_deps ( kmp_int32 gtid, kmp_taskdata_t *task );
+
+extern kmp_int32 __kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate );
+
+KMP_EXPORT kmp_int32 __kmpc_cancel(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind);
+KMP_EXPORT kmp_int32 __kmpc_cancellationpoint(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind);
+KMP_EXPORT kmp_int32 __kmpc_cancel_barrier(ident_t* loc_ref, kmp_int32 gtid);
+KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind);
+
+#if OMP_41_ENABLED
+
+KMP_EXPORT void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask );
+KMP_EXPORT void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
+
+#endif
+
+#endif
+
+
+/*
+ * Lock interface routines (fast versions with gtid passed in)
+ */
+KMP_EXPORT void __kmpc_init_lock( ident_t *loc, kmp_int32 gtid,  void **user_lock );
+KMP_EXPORT void __kmpc_init_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
+KMP_EXPORT void __kmpc_destroy_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
+KMP_EXPORT void __kmpc_destroy_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
+KMP_EXPORT void __kmpc_set_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
+KMP_EXPORT void __kmpc_set_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
+KMP_EXPORT void __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
+KMP_EXPORT void __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
+KMP_EXPORT int __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
+KMP_EXPORT int __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock );
+
+/* ------------------------------------------------------------------------ */
+
+/*
+ * Interface to fast scalable reduce methods routines
+ */
+
+KMP_EXPORT kmp_int32 __kmpc_reduce_nowait( ident_t *loc, kmp_int32 global_tid,
+                                           kmp_int32 num_vars, size_t reduce_size,
+                                           void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+                                           kmp_critical_name *lck );
+KMP_EXPORT void __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck );
+KMP_EXPORT kmp_int32 __kmpc_reduce( ident_t *loc, kmp_int32 global_tid,
+                                    kmp_int32 num_vars, size_t reduce_size,
+                                    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+                                    kmp_critical_name *lck );
+KMP_EXPORT void __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck );
+
+/*
+ * internal fast reduction routines
+ */
+
+extern PACKED_REDUCTION_METHOD_T
+__kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
+                                  kmp_int32 num_vars, size_t reduce_size,
+                                  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+                                  kmp_critical_name *lck );
+
+// This function is for testing the set/get/determine reduce method machinery.
+KMP_EXPORT kmp_int32 __kmp_get_reduce_method( void );
+
+KMP_EXPORT kmp_uint64 __kmpc_get_taskid();
+KMP_EXPORT kmp_uint64 __kmpc_get_parent_taskid();
+
+KMP_EXPORT void __kmpc_place_threads(int,int,int);
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+// C++ port
+// missing 'extern "C"' declarations
+
+KMP_EXPORT kmp_int32 __kmpc_in_parallel( ident_t *loc );
+KMP_EXPORT void __kmpc_pop_num_threads(  ident_t *loc, kmp_int32 global_tid );
+KMP_EXPORT void __kmpc_push_num_threads( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads );
+
+#if OMP_40_ENABLED
+KMP_EXPORT void __kmpc_push_proc_bind( ident_t *loc, kmp_int32 global_tid, int proc_bind );
+KMP_EXPORT void __kmpc_push_num_teams( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads );
+KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...);
+
+#endif
+
+KMP_EXPORT void*
+__kmpc_threadprivate_cached( ident_t * loc, kmp_int32 global_tid,
+                             void * data, size_t size, void *** cache );
+
+// Symbols for MS mutual detection.
+extern int _You_must_link_with_exactly_one_OpenMP_library;
+extern int _You_must_link_with_Intel_OpenMP_library;
+#if KMP_OS_WINDOWS && ( KMP_VERSION_MAJOR > 4 )
+    extern int _You_must_link_with_Microsoft_OpenMP_library;
+#endif
+
+
+// The routines below are not exported.
+// Consider making them 'static' in corresponding source files.
+void
+kmp_threadprivate_insert_private_data( int gtid, void *pc_addr, void *data_addr, size_t pc_size );
+struct private_common *
+kmp_threadprivate_insert( int gtid, void *pc_addr, void *data_addr, size_t pc_size );
+
+//
+// ompc_, kmpc_ entries moved from omp.h.
+//
+#if KMP_OS_WINDOWS
+#   define KMPC_CONVENTION __cdecl
+#else
+#   define KMPC_CONVENTION
+#endif
+
+#ifndef __OMP_H
+typedef enum omp_sched_t {
+    omp_sched_static  = 1,
+    omp_sched_dynamic = 2,
+    omp_sched_guided  = 3,
+    omp_sched_auto    = 4
+} omp_sched_t;
+typedef void * kmp_affinity_mask_t;
+#endif
+
+KMP_EXPORT void KMPC_CONVENTION ompc_set_max_active_levels(int);
+KMP_EXPORT void KMPC_CONVENTION ompc_set_schedule(omp_sched_t, int);
+KMP_EXPORT int  KMPC_CONVENTION ompc_get_ancestor_thread_num(int);
+KMP_EXPORT int  KMPC_CONVENTION ompc_get_team_size(int);
+KMP_EXPORT int  KMPC_CONVENTION kmpc_set_affinity_mask_proc(int, kmp_affinity_mask_t *);
+KMP_EXPORT int  KMPC_CONVENTION kmpc_unset_affinity_mask_proc(int, kmp_affinity_mask_t *);
+KMP_EXPORT int  KMPC_CONVENTION kmpc_get_affinity_mask_proc(int, kmp_affinity_mask_t *);
+
+KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize(int);
+KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize_s(size_t);
+KMP_EXPORT void KMPC_CONVENTION kmpc_set_library(int);
+KMP_EXPORT void KMPC_CONVENTION kmpc_set_defaults(char const *);
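+/* Usage sketch (illustrative): these entry points back the corresponding
+   omp_ and kmp_ user API for compilers that emit the ompc_/kmpc_ names, e.g.
+       ompc_set_schedule( omp_sched_dynamic, 16 );
+   is expected to behave like omp_set_schedule( omp_sched_dynamic, 16 ). */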
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* KMP_H */
+
diff --git a/final/runtime/src/kmp_affinity.cpp b/final/runtime/src/kmp_affinity.cpp
new file mode 100644
index 0000000..5fcee14
--- /dev/null
+++ b/final/runtime/src/kmp_affinity.cpp
@@ -0,0 +1,4750 @@
+/*
+ * kmp_affinity.cpp -- affinity management
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_str.h"
+#include "kmp_wrapper_getpid.h"
+
+#if KMP_AFFINITY_SUPPORTED
+
+//
+// Print the affinity mask to the character array in a pretty format.
+//
+char *
+__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
+{
+    KMP_ASSERT(buf_len >= 40);
+    char *scan = buf;
+    char *end = buf + buf_len - 1;
+
+    //
+    // Find first element / check for empty set.
+    //
+    size_t i;
+    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
+        if (KMP_CPU_ISSET(i, mask)) {
+            break;
+        }
+    }
+    if (i == KMP_CPU_SETSIZE) {
+        KMP_SNPRINTF(scan, buf_len, "{<empty>}");
+        while (*scan != '\0') scan++;
+        KMP_ASSERT(scan <= end);
+        return buf;
+    }
+
+    KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i);
+    while (*scan != '\0') scan++;
+    i++;
+    for (; i < KMP_CPU_SETSIZE; i++) {
+        if (! KMP_CPU_ISSET(i, mask)) {
+            continue;
+        }
+
+        //
+        // Check for buffer overflow.  A string of the form ",<n>" will have
+        // at most 10 characters, plus we want to leave room to print ",...}"
+        // if the set is too large to print for a total of 15 characters.
+        // We already left room for '\0' in setting end.
+        //
+        if (end - scan < 15) {
+            break;
+        }
+        KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i);
+        while (*scan != '\0') scan++;
+    }
+    if (i < KMP_CPU_SETSIZE) {
+        KMP_SNPRINTF(scan, end - scan + 1, ",...");
+        while (*scan != '\0') scan++;
+    }
+    KMP_SNPRINTF(scan, end - scan + 1, "}");
+    while (*scan != '\0') scan++;
+    KMP_ASSERT(scan <= end);
+    return buf;
+}
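+//
+// Usage sketch (illustrative): callers supply a buffer of at least 40 bytes,
+// e.g.
+//     char buf[1024];
+//     __kmp_affinity_print_mask(buf, sizeof(buf), mask);
+// which produces strings such as "{0,1,2,3}", "{<empty>}", or "{0,1,2,...}"
+// when the set is too large to print in full.
+//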
+
+
+void
+__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
+{
+    KMP_CPU_ZERO(mask);
+
+# if KMP_GROUP_AFFINITY
+
+    if (__kmp_num_proc_groups > 1) {
+        int group;
+        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
+        for (group = 0; group < __kmp_num_proc_groups; group++) {
+            int i;
+            int num = __kmp_GetActiveProcessorCount(group);
+            for (i = 0; i < num; i++) {
+                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
+            }
+        }
+    }
+    else
+
+# endif /* KMP_GROUP_AFFINITY */
+
+    {
+        int proc;
+        for (proc = 0; proc < __kmp_xproc; proc++) {
+            KMP_CPU_SET(proc, mask);
+        }
+    }
+}
+
+
+//
+// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
+// functions.
+//
+// The icc codegen emits sections with extremely long names, of the form
+// ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
+// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
+// some sort of memory corruption or table overflow that is triggered by
+// these long strings.  I checked the latest version of the linker -
+// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
+// fixed.
+//
+// Unfortunately, my attempts to reproduce it in a smaller example have
+// failed - I'm not sure what the prospects are of getting it fixed
+// properly, since that would require a reproducer smaller than all of libomp.
+//
+// Work around the problem by avoiding inline constructors in such builds.
+// We do this for all platforms, not just Linux* OS - non-inline functions are
+// more debuggable and provide better coverage than inline functions.
+// Use inline functions in shipping libs, for performance.
+//
+
+# if !defined(KMP_DEBUG) && !defined(COVER)
+
+class Address {
+public:
+    static const unsigned maxDepth = 32;
+    unsigned labels[maxDepth];
+    unsigned childNums[maxDepth];
+    unsigned depth;
+    unsigned leader;
+    Address(unsigned _depth)
+      : depth(_depth), leader(FALSE) {
+    }
+    Address &operator=(const Address &b) {
+        depth = b.depth;
+        for (unsigned i = 0; i < depth; i++) {
+            labels[i] = b.labels[i];
+            childNums[i] = b.childNums[i];
+        }
+        leader = FALSE;
+        return *this;
+    }
+    bool operator==(const Address &b) const {
+        if (depth != b.depth)
+            return false;
+        for (unsigned i = 0; i < depth; i++)
+            if(labels[i] != b.labels[i])
+                return false;
+        return true;
+    }
+    bool isClose(const Address &b, int level) const {
+        if (depth != b.depth)
+            return false;
+        if ((unsigned)level >= depth)
+            return true;
+        for (unsigned i = 0; i < (depth - level); i++)
+            if(labels[i] != b.labels[i])
+                return false;
+        return true;
+    }
+    bool operator!=(const Address &b) const {
+        return !operator==(b);
+    }
+};
+
+class AddrUnsPair {
+public:
+    Address first;
+    unsigned second;
+    AddrUnsPair(Address _first, unsigned _second)
+      : first(_first), second(_second) {
+    }
+    AddrUnsPair &operator=(const AddrUnsPair &b)
+    {
+        first = b.first;
+        second = b.second;
+        return *this;
+    }
+};
+
+# else
+
+class Address {
+public:
+    static const unsigned maxDepth = 32;
+    unsigned labels[maxDepth];
+    unsigned childNums[maxDepth];
+    unsigned depth;
+    unsigned leader;
+    Address(unsigned _depth);
+    Address &operator=(const Address &b);
+    bool operator==(const Address &b) const;
+    bool isClose(const Address &b, int level) const;
+    bool operator!=(const Address &b) const;
+};
+
+Address::Address(unsigned _depth)
+{
+    depth = _depth;
+    leader = FALSE;
+}
+
+Address &Address::operator=(const Address &b) {
+    depth = b.depth;
+    for (unsigned i = 0; i < depth; i++) {
+        labels[i] = b.labels[i];
+        childNums[i] = b.childNums[i];
+    }
+    leader = FALSE;
+    return *this;
+}
+
+bool Address::operator==(const Address &b) const {
+    if (depth != b.depth)
+        return false;
+    for (unsigned i = 0; i < depth; i++)
+        if(labels[i] != b.labels[i])
+            return false;
+    return true;
+}
+
+bool Address::isClose(const Address &b, int level) const {
+    if (depth != b.depth)
+        return false;
+    if ((unsigned)level >= depth)
+        return true;
+    for (unsigned i = 0; i < (depth - level); i++)
+        if(labels[i] != b.labels[i])
+            return false;
+    return true;
+}
+
+bool Address::operator!=(const Address &b) const {
+    return !operator==(b);
+}
+
+class AddrUnsPair {
+public:
+    Address first;
+    unsigned second;
+    AddrUnsPair(Address _first, unsigned _second);
+    AddrUnsPair &operator=(const AddrUnsPair &b);
+};
+
+AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
+  : first(_first), second(_second)
+{
+}
+
+AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
+{
+    first = b.first;
+    second = b.second;
+    return *this;
+}
+
+# endif /* !defined(KMP_DEBUG) && !defined(COVER) */
+
+
+static int
+__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
+{
+    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
+      ->first);
+    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
+      ->first);
+    unsigned depth = aa->depth;
+    unsigned i;
+    KMP_DEBUG_ASSERT(depth == bb->depth);
+    for (i = 0; i < depth; i++) {
+        if (aa->labels[i] < bb->labels[i]) return -1;
+        if (aa->labels[i] > bb->labels[i]) return 1;
+    }
+    return 0;
+}
+
+
+static int
+__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
+{
+    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
+      ->first);
+    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
+      ->first);
+    unsigned depth = aa->depth;
+    unsigned i;
+    KMP_DEBUG_ASSERT(depth == bb->depth);
+    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
+    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
+    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
+        int j = depth - i - 1;
+        if (aa->childNums[j] < bb->childNums[j]) return -1;
+        if (aa->childNums[j] > bb->childNums[j]) return 1;
+    }
+    for (; i < depth; i++) {
+        int j = i - __kmp_affinity_compact;
+        if (aa->childNums[j] < bb->childNums[j]) return -1;
+        if (aa->childNums[j] > bb->childNums[j]) return 1;
+    }
+    return 0;
+}
+
+/** A structure for holding machine-specific hierarchy info to be computed once at init.
+    This structure represents a mapping of threads to the actual machine hierarchy, or to
+    our best guess at what the hierarchy might be, for the purpose of performing an
+    efficient barrier.  In the worst case, when there is no machine hierarchy information,
+    it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */
+class hierarchy_info {
+public:
+    /** Number of levels in the hierarchy. Typical levels are threads/core, cores/package
+    or socket, packages/node, nodes/machine, etc.  We don't want to get specific with
+    nomenclature.  When the machine is oversubscribed we add levels to duplicate the
+    hierarchy, doubling the thread capacity of the hierarchy each time we add a level. */
+    kmp_uint32 maxLevels;
+
+    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
+        number of levels along the longest path from root to any leaf. It corresponds to the
+        number of entries in numPerLevel if we exclude all but one trailing 1. */
+    kmp_uint32 depth;
+    kmp_uint32 base_num_threads;
+    volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized, 2=initialization in progress
+    volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
+
+    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
+        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
+        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
+    kmp_uint32 *numPerLevel;
+    kmp_uint32 *skipPerLevel;
+
+    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
+        int hier_depth = adr2os[0].first.depth;
+        int level = 0;
+        for (int i=hier_depth-1; i>=0; --i) {
+            int max = -1;
+            for (int j=0; j<num_addrs; ++j) {
+                int next = adr2os[j].first.childNums[i];
+                if (next > max) max = next;
+            }
+            numPerLevel[level] = max+1;
+            ++level;
+        }
+    }
+
+    hierarchy_info() : maxLevels(7), depth(1), uninitialized(1), resizing(0) {}
+
+    // TO FIX: This destructor causes a segfault in the library at shutdown.
+    //~hierarchy_info() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }
+
+    void init(AddrUnsPair *adr2os, int num_addrs)
+    {
+        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
+        if (bool_result == 0) { // Wait for initialization
+            while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
+            return;
+        }
+        KMP_DEBUG_ASSERT(bool_result==1);
+
+        /* Explicitly initialize the data fields here to prevent use of stale
+           values observed when a static library is re-initialized multiple times (e.g. when
+           a non-OpenMP thread repeatedly launches/joins a thread that uses OpenMP). */
+        depth = 1;
+        resizing = 0;
+        maxLevels = 7;
+        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
+        skipPerLevel = &(numPerLevel[maxLevels]);
+        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
+            numPerLevel[i] = 1;
+            skipPerLevel[i] = 1;
+        }
+
+        // Sort table by physical ID
+        if (adr2os) {
+            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
+            deriveLevels(adr2os, num_addrs);
+        }
+        else {
+            numPerLevel[0] = 4;
+            numPerLevel[1] = num_addrs/4;
+            if (num_addrs%4) numPerLevel[1]++;
+        }
+
+        base_num_threads = num_addrs;
+        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
+            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
+                depth++;
+
+        kmp_uint32 branch = 4;
+        if (numPerLevel[0] == 1) branch = num_addrs/4;
+        if (branch<4) branch=4;
+        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
+            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
+                if (numPerLevel[d] & 1) numPerLevel[d]++;
+                numPerLevel[d] = numPerLevel[d] >> 1;
+                if (numPerLevel[d+1] == 1) depth++;
+                numPerLevel[d+1] = numPerLevel[d+1] << 1;
+            }
+            if(numPerLevel[0] == 1) {
+                branch = branch >> 1;
+                if (branch<4) branch = 4;
+            }
+        }
+
+        for (kmp_uint32 i=1; i<depth; ++i)
+            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
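+        // Worked example: for the {2, 4, 4, 1, 1} machine described above,
+        // this loop yields skipPerLevel = {1, 2, 8, 32}, i.e. the number of
+        // leaf threads spanned by one node at each level; the loop below
+        // doubles that (64, 128, ...) for the oversubscription levels.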
+        // Fill in hierarchy in the case of oversubscription
+        for (kmp_uint32 i=depth; i<maxLevels; ++i)
+            skipPerLevel[i] = 2*skipPerLevel[i-1];
+
+        uninitialized = 0; // One writer
+
+    }
+
+    void resize(kmp_uint32 nproc)
+    {
+        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
+        if (bool_result == 0) { // Someone else is resizing
+            while (TCR_1(resizing) != 0) KMP_CPU_PAUSE();
+            return;
+        }
+        KMP_DEBUG_ASSERT(bool_result!=0);
+        KMP_DEBUG_ASSERT(nproc > base_num_threads);
+
+        // Calculate new max_levels
+        kmp_uint32 old_sz = skipPerLevel[depth-1];
+        kmp_uint32 incs = 0, old_maxLevels = maxLevels;
+        while (nproc > old_sz) {
+            old_sz *= 2;
+            incs++;
+        }
+        maxLevels += incs;
+
+        // Resize arrays
+        kmp_uint32 *old_numPerLevel = numPerLevel;
+        kmp_uint32 *old_skipPerLevel = skipPerLevel;
+        numPerLevel = skipPerLevel = NULL;
+        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
+        skipPerLevel = &(numPerLevel[maxLevels]);
+
+        // Copy old elements from old arrays
+        for (kmp_uint32 i=0; i<old_maxLevels; ++i) { // copy old per-level values
+            numPerLevel[i] = old_numPerLevel[i];
+            skipPerLevel[i] = old_skipPerLevel[i];
+        }
+
+        // Init new elements in arrays to 1
+        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
+            numPerLevel[i] = 1;
+            skipPerLevel[i] = 1;
+        }
+
+        // Free old arrays
+        __kmp_free(old_numPerLevel);
+
+        // Fill in oversubscription levels of hierarchy
+        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
+            skipPerLevel[i] = 2*skipPerLevel[i-1];
+
+        base_num_threads = nproc;
+        resizing = 0; // One writer
+
+    }
+};
+
+static hierarchy_info machine_hierarchy;
+
+void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
+    kmp_uint32 depth;
+    // The test below is true if affinity is available but set to "none".  We still need to init the hierarchy on first use of the hierarchical barrier.
+    if (TCR_1(machine_hierarchy.uninitialized))
+        machine_hierarchy.init(NULL, nproc);
+    // Adjust the hierarchy in case the number of threads exceeds the original count
+    if (nproc > machine_hierarchy.base_num_threads)
+        machine_hierarchy.resize(nproc);
+
+    depth = machine_hierarchy.depth;
+    KMP_DEBUG_ASSERT(depth > 0);
+    // The loop below adjusts the depth in the case of a resize
+    while (nproc > machine_hierarchy.skipPerLevel[depth-1])
+        depth++;
+
+    thr_bar->depth = depth;
+    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
+    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
+}
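+
+// Note on the depth adjustment above (hypothetical values): with
+// skipPerLevel == {1, 3, 12, 24, ...} and an initial depth of 3, a request
+// for nproc == 20 advances depth to 4, since 20 > skipPerLevel[2] == 12 but
+// 20 <= skipPerLevel[3] == 24.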
+
+//
+// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
+// called to renumber the labels from [0..n] and place them into the child_num
+// vector of the address object.  This is done in case the labels used for
+// the children at one node of the hierarchy differ from those used for
+// another node at the same level.  Example:  suppose the machine has 2 nodes
+// with 2 packages each.  The first node contains packages 601 and 602, and
+// second node contains packages 603 and 604.  If we try to sort the table
+// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
+// because we are paying attention to the labels themselves, not the ordinal
+// child numbers.  By using the child numbers in the sort, the result is
+// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
+//
+static void
+__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
+  int numAddrs)
+{
+    KMP_DEBUG_ASSERT(numAddrs > 0);
+    int depth = address2os->first.depth;
+    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
+      * sizeof(unsigned));
+    int labCt;
+    for (labCt = 0; labCt < depth; labCt++) {
+        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
+        lastLabel[labCt] = address2os[0].first.labels[labCt];
+    }
+    int i;
+    for (i = 1; i < numAddrs; i++) {
+        for (labCt = 0; labCt < depth; labCt++) {
+            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
+                int labCt2;
+                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
+                    counts[labCt2] = 0;
+                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
+                }
+                counts[labCt]++;
+                lastLabel[labCt] = address2os[i].first.labels[labCt];
+                break;
+            }
+        }
+        for (labCt = 0; labCt < depth; labCt++) {
+            address2os[i].first.childNums[labCt] = counts[labCt];
+        }
+        for (; labCt < (int)Address::maxDepth; labCt++) {
+            address2os[i].first.childNums[labCt] = 0;
+        }
+    }
+    __kmp_free(lastLabel);
+    __kmp_free(counts);
+}
+
+
+//
+// All of the __kmp_affinity_create_*_map() routines should set
+// __kmp_affinity_masks to a vector of affinity mask objects of length
+// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
+// return the number of levels in the machine topology tree (zero if
+// __kmp_affinity_type == affinity_none).
+//
+// All of the __kmp_affinity_create_*_map() routines should set *fullMask
+// to the affinity mask for the initialization thread.  They need to save and
+// restore the mask, and it could be needed later, so saving it is just an
+// optimization to avoid calling __kmp_get_system_affinity() again.
+//
+static kmp_affin_mask_t *fullMask = NULL;
+
+kmp_affin_mask_t *
+__kmp_affinity_get_fullMask() { return fullMask; }
+
+
+static int nCoresPerPkg, nPackages;
+static int __kmp_nThreadsPerCore;
+#ifndef KMP_DFLT_NTH_CORES
+static int __kmp_ncores;
+#endif
+
+//
+// __kmp_affinity_uniform_topology() doesn't work when called from
+// places which support arbitrarily many levels in the machine topology
+// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
+// and __kmp_affinity_create_x2apicid_map().
+//
+inline static bool
+__kmp_affinity_uniform_topology()
+{
+    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
+}
+
+
+//
+// Print out the detailed machine topology map, i.e. the physical locations
+// of each OS proc.
+//
+static void
+__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
+  int pkgLevel, int coreLevel, int threadLevel)
+{
+    int proc;
+
+    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
+    for (proc = 0; proc < len; proc++) {
+        int level;
+        kmp_str_buf_t buf;
+        __kmp_str_buf_init(&buf);
+        for (level = 0; level < depth; level++) {
+            if (level == threadLevel) {
+                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
+            }
+            else if (level == coreLevel) {
+                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
+            }
+            else if (level == pkgLevel) {
+                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
+            }
+            else if (level > pkgLevel) {
+                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
+                  level - pkgLevel - 1);
+            }
+            else {
+                __kmp_str_buf_print(&buf, "L%d ", level);
+            }
+            __kmp_str_buf_print(&buf, "%d ",
+              address2os[proc].first.labels[level]);
+        }
+        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
+          buf.str);
+        __kmp_str_buf_free(&buf);
+    }
+}
+
+
+//
+// If we don't know how to retrieve the machine's processor topology, or
+// encounter an error in doing so, this routine is called to form a "flat"
+// mapping of os thread id's <-> processor id's.
+//
+static int
+__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
+  kmp_i18n_id_t *const msg_id)
+{
+    *address2os = NULL;
+    *msg_id = kmp_i18n_null;
+
+    //
+    // Even if __kmp_affinity_type == affinity_none, this routine might still
+    // be called to set __kmp_ncores, as well as
+    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+    //
+    if (! KMP_AFFINITY_CAPABLE()) {
+        KMP_ASSERT(__kmp_affinity_type == affinity_none);
+        __kmp_ncores = nPackages = __kmp_xproc;
+        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
+            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+            KMP_INFORM(Uniform, "KMP_AFFINITY");
+            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+              __kmp_nThreadsPerCore, __kmp_ncores);
+        }
+        return 0;
+    }
+
+    //
+    // When affinity is off, this routine will still be called to set
+    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
+    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
+    // correctly, and return now if affinity is not enabled.
+    //
+    __kmp_ncores = nPackages = __kmp_avail_proc;
+    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+    if (__kmp_affinity_verbose) {
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
+
+        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
+        if (__kmp_affinity_respect_mask) {
+            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+        } else {
+            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+        }
+        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+          __kmp_nThreadsPerCore, __kmp_ncores);
+    }
+    if (__kmp_affinity_type == affinity_none) {
+        return 0;
+    }
+
+    //
+    // Construct the data structure to be returned.
+    //
+    *address2os = (AddrUnsPair*)
+      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
+    int avail_ct = 0;
+    unsigned int i;
+    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+        //
+        // Skip this proc if it is not included in the machine model.
+        //
+        if (! KMP_CPU_ISSET(i, fullMask)) {
+            continue;
+        }
+
+        Address addr(1);
+        addr.labels[0] = i;
+        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
+    }
+    if (__kmp_affinity_verbose) {
+        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
+    }
+
+    if (__kmp_affinity_gran_levels < 0) {
+        //
+        // Only the package level is modeled in the machine topology map,
+        // so the #levels of granularity is either 0 or 1.
+        //
+        if (__kmp_affinity_gran > affinity_gran_package) {
+            __kmp_affinity_gran_levels = 1;
+        }
+        else {
+            __kmp_affinity_gran_levels = 0;
+        }
+    }
+    return 1;
+}
+
+
+# if KMP_GROUP_AFFINITY
+
+//
+// If multiple Windows* OS processor groups exist, we can create a 2-level
+// topology map with the groups at level 0 and the individual procs at
+// level 1.
+//
+// This facilitates letting the threads float among all procs in a group,
+// if granularity=group (the default when there are multiple groups).
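+//
+// For example (on 64-bit Windows, where CHAR_BIT * sizeof(DWORD_PTR) == 64),
+// OS proc 70 is assigned labels {1, 6}: processor group 1, proc 6 within its
+// group - matching the labels[0] / labels[1] computation below.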
+//
+static int
+__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
+  kmp_i18n_id_t *const msg_id)
+{
+    *address2os = NULL;
+    *msg_id = kmp_i18n_null;
+
+    //
+    // If we don't have multiple processor groups, return now.
+    // The flat mapping will be used.
+    //
+    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
+        // FIXME set *msg_id
+        return -1;
+    }
+
+    //
+    // Construct the data structure to be returned.
+    //
+    *address2os = (AddrUnsPair*)
+      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
+    int avail_ct = 0;
+    int i;
+    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+        //
+        // Skip this proc if it is not included in the machine model.
+        //
+        if (! KMP_CPU_ISSET(i, fullMask)) {
+            continue;
+        }
+
+        Address addr(2);
+        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
+        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
+        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
+
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
+              addr.labels[1]);
+        }
+    }
+
+    if (__kmp_affinity_gran_levels < 0) {
+        if (__kmp_affinity_gran == affinity_gran_group) {
+            __kmp_affinity_gran_levels = 1;
+        }
+        else if ((__kmp_affinity_gran == affinity_gran_fine)
+          || (__kmp_affinity_gran == affinity_gran_thread)) {
+            __kmp_affinity_gran_levels = 0;
+        }
+        else {
+            const char *gran_str = NULL;
+            if (__kmp_affinity_gran == affinity_gran_core) {
+                gran_str = "core";
+            }
+            else if (__kmp_affinity_gran == affinity_gran_package) {
+                gran_str = "package";
+            }
+            else if (__kmp_affinity_gran == affinity_gran_node) {
+                gran_str = "node";
+            }
+            else {
+                KMP_ASSERT(0);
+            }
+
+            // Warning: can't use affinity granularity "gran" with group topology method, using "thread"
+            __kmp_affinity_gran_levels = 0;
+        }
+    }
+    return 2;
+}
+
+# endif /* KMP_GROUP_AFFINITY */
+
+
+# if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+static int
+__kmp_cpuid_mask_width(int count) {
+    int r = 0;
+
+    while((1<<r) < count)
+        ++r;
+    return r;
+}
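+
+// For example, __kmp_cpuid_mask_width(6) returns 3, since 1 << 3 == 8 is the
+// smallest power of two >= 6; an id field that must distinguish up to 6
+// values therefore occupies 3 bits of the APIC id.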
+
+
+class apicThreadInfo {
+public:
+    unsigned osId;              // param to __kmp_affinity_bind_thread
+    unsigned apicId;            // from cpuid after binding
+    unsigned maxCoresPerPkg;    //      ""
+    unsigned maxThreadsPerPkg;  //      ""
+    unsigned pkgId;             // inferred from above values
+    unsigned coreId;            //      ""
+    unsigned threadId;          //      ""
+};
+
+
+static int
+__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
+{
+    const apicThreadInfo *aa = (const apicThreadInfo *)a;
+    const apicThreadInfo *bb = (const apicThreadInfo *)b;
+    if (aa->osId < bb->osId) return -1;
+    if (aa->osId > bb->osId) return 1;
+    return 0;
+}
+
+
+static int
+__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
+{
+    const apicThreadInfo *aa = (const apicThreadInfo *)a;
+    const apicThreadInfo *bb = (const apicThreadInfo *)b;
+    if (aa->pkgId < bb->pkgId) return -1;
+    if (aa->pkgId > bb->pkgId) return 1;
+    if (aa->coreId < bb->coreId) return -1;
+    if (aa->coreId > bb->coreId) return 1;
+    if (aa->threadId < bb->threadId) return -1;
+    if (aa->threadId > bb->threadId) return 1;
+    return 0;
+}
+
+
+//
+// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
+// an algorithm which cycles through the available os threads, setting
+// the current thread's affinity mask to that thread, and then retrieves
+// the Apic Id for each thread context using the cpuid instruction.
+//
+static int
+__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
+  kmp_i18n_id_t *const msg_id)
+{
+    kmp_cpuid buf;
+    int rc;
+    *address2os = NULL;
+    *msg_id = kmp_i18n_null;
+
+    //
+    // Check if cpuid leaf 4 is supported.
+    //
+    __kmp_x86_cpuid(0, 0, &buf);
+    if (buf.eax < 4) {
+        *msg_id = kmp_i18n_str_NoLeaf4Support;
+        return -1;
+    }
+
+    //
+    // The algorithm used starts by setting the affinity to each available
+    // thread and retrieving info from the cpuid instruction, so if we are
+    // not capable of calling __kmp_get_system_affinity() and
+    // __kmp_set_system_affinity(), then we need to do something else - use
+    // the defaults that we calculated from issuing cpuid without binding
+    // to each proc.
+    //
+    if (! KMP_AFFINITY_CAPABLE()) {
+        //
+        // Hack to try and infer the machine topology using only the data
+        // available from cpuid on the current thread, and __kmp_xproc.
+        //
+        KMP_ASSERT(__kmp_affinity_type == affinity_none);
+
+        //
+        // Get an upper bound on the number of threads per package using
+        // cpuid(1).
+        //
+        // On some OS/chip combinations where HT is supported by the chip
+        // but is disabled, this value will be 2 on a single core chip.
+        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
+        //
+        __kmp_x86_cpuid(1, 0, &buf);
+        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
+        if (maxThreadsPerPkg == 0) {
+            maxThreadsPerPkg = 1;
+        }
+
+        //
+        // The num cores per pkg comes from cpuid(4).
+        // 1 must be added to the encoded value.
+        //
+        // The author of cpu_count.cpp treated this as only an upper bound
+        // on the number of cores, but I haven't seen any cases where it
+        // was greater than the actual number of cores, so we will treat
+        // it as exact in this block of code.
+        //
+        // First, we need to check if cpuid(4) is supported on this chip.
+        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
+        // has the value n or greater.
+        //
+        __kmp_x86_cpuid(0, 0, &buf);
+        if (buf.eax >= 4) {
+            __kmp_x86_cpuid(4, 0, &buf);
+            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
+        }
+        else {
+            nCoresPerPkg = 1;
+        }
+
+        //
+        // There is no way to reliably tell if HT is enabled without issuing
+        // the cpuid instruction from every thread and correlating the cpuid
+        // info, so if the machine is not affinity capable, we assume that HT
+        // is off.  We have seen quite a few machines where maxThreadsPerPkg
+        // is 2, yet the machine does not support HT.
+        //
+        // - Older OSes are usually found on machines with older chips, which
+        //   do not support HT.
+        //
+        // - The performance penalty for mistakenly identifying a machine as
+        //   HT when it isn't (which results in blocktime being incorrectly set
+        //   to 0) is greater than the penalty for mistakenly identifying
+        //   a machine as being 1 thread/core when it is really HT enabled
+        //   (which results in blocktime being incorrectly set to a positive
+        //   value).
+        //
+        __kmp_ncores = __kmp_xproc;
+        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+        __kmp_nThreadsPerCore = 1;
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
+            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+            if (__kmp_affinity_uniform_topology()) {
+                KMP_INFORM(Uniform, "KMP_AFFINITY");
+            } else {
+                KMP_INFORM(NonUniform, "KMP_AFFINITY");
+            }
+            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+              __kmp_nThreadsPerCore, __kmp_ncores);
+        }
+        return 0;
+    }
+
+    //
+    // From here on, we can assume that it is safe to call
+    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
+    // even if __kmp_affinity_type = affinity_none.
+    //
+
+    //
+    // Save the affinity mask for the current thread.
+    //
+    kmp_affin_mask_t *oldMask;
+    KMP_CPU_ALLOC(oldMask);
+    KMP_ASSERT(oldMask != NULL);
+    __kmp_get_system_affinity(oldMask, TRUE);
+
+    //
+    // Run through each of the available contexts, binding the current thread
+    // to it, and obtaining the pertinent information using the cpuid instr.
+    //
+    // The relevant information is:
+    //
+    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
+    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
+    //
+    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
+    //    value of this field determines the width of the core# + thread#
+    //    fields in the Apic Id.  It is also an upper bound on the number
+    //    of threads per package, but it has been verified that situations
+    //    happen where it is not exact.  In particular, on certain OS/chip
+    //    combinations where Intel(R) Hyper-Threading Technology is supported
+    //    by the chip but has been disabled, the value of this field will be 2
+    //    (for a single core chip).  On other OS/chip combinations supporting
+    //    Intel(R) Hyper-Threading Technology, the value of this field will be
+    //    1 when Intel(R) Hyper-Threading Technology is disabled and 2 when it
+    //    is enabled.
+    //
+    // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
+    //    value of this field (+1) determines the width of the core# field in
+    //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
+    //    an upper bound, but the IA-32 architecture manual says that it is
+    //    exactly the number of cores per package, and I haven't seen any
+    //    case where it wasn't.
+    //
+    // From this information, deduce the package Id, core Id, and thread Id,
+    // and set the corresponding fields in the apicThreadInfo struct.
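+    //
+    // Worked example (hypothetical values): maxThreadsPerPkg == 16 gives
+    // widthCT == 4, and maxCoresPerPkg == 8 gives widthC == 3, hence
+    // widthT == 1.  An Apic Id of 0x1b then decodes as pkgId == 0x1b >> 4
+    // == 1, coreId == (0x1b >> 1) & 0x7 == 5, threadId == 0x1b & 0x1 == 1.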
+    //
+    unsigned i;
+    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
+      __kmp_avail_proc * sizeof(apicThreadInfo));
+    unsigned nApics = 0;
+    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+        //
+        // Skip this proc if it is not included in the machine model.
+        //
+        if (! KMP_CPU_ISSET(i, fullMask)) {
+            continue;
+        }
+        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
+
+        __kmp_affinity_bind_thread(i);
+        threadInfo[nApics].osId = i;
+
+        //
+        // The apic id and max threads per pkg come from cpuid(1).
+        //
+        __kmp_x86_cpuid(1, 0, &buf);
+        if (! ((buf.edx >> 9) & 1)) {
+            __kmp_set_system_affinity(oldMask, TRUE);
+            __kmp_free(threadInfo);
+            KMP_CPU_FREE(oldMask);
+            *msg_id = kmp_i18n_str_ApicNotPresent;
+            return -1;
+        }
+        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
+        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
+        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
+            threadInfo[nApics].maxThreadsPerPkg = 1;
+        }
+
+        //
+        // Max cores per pkg comes from cpuid(4).
+        // 1 must be added to the encoded value.
+        //
+        // First, we need to check if cpuid(4) is supported on this chip.
+        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
+        // has the value n or greater.
+        //
+        __kmp_x86_cpuid(0, 0, &buf);
+        if (buf.eax >= 4) {
+            __kmp_x86_cpuid(4, 0, &buf);
+            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
+        }
+        else {
+            threadInfo[nApics].maxCoresPerPkg = 1;
+        }
+
+        //
+        // Infer the pkgId / coreId / threadId using only the info
+        // obtained locally.
+        //
+        int widthCT = __kmp_cpuid_mask_width(
+          threadInfo[nApics].maxThreadsPerPkg);
+        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
+
+        int widthC = __kmp_cpuid_mask_width(
+          threadInfo[nApics].maxCoresPerPkg);
+        int widthT = widthCT - widthC;
+        if (widthT < 0) {
+            //
+            // I've never seen this one happen, but I suppose it could, if
+            // the cpuid instruction on a chip was really screwed up.
+            // Make sure to restore the affinity mask before the tail call.
+            //
+            __kmp_set_system_affinity(oldMask, TRUE);
+            __kmp_free(threadInfo);
+            KMP_CPU_FREE(oldMask);
+            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+            return -1;
+        }
+
+        int maskC = (1 << widthC) - 1;
+        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
+          &maskC;
+
+        int maskT = (1 << widthT) - 1;
+        threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
+
+        nApics++;
+    }
+
+    //
+    // We've collected all the info we need.
+    // Restore the old affinity mask for this thread.
+    //
+    __kmp_set_system_affinity(oldMask, TRUE);
+
+    //
+    // If there's only one thread context to bind to, form an Address object
+    // with depth 1 and return immediately (or, if affinity is off, set
+    // address2os to NULL and return).
+    //
+    // If it is configured to omit the package level when there is only a
+    // single package, the logic at the end of this routine won't work if
+    // there is only a single thread - it would try to form an Address
+    // object with depth 0.
+    //
+    KMP_ASSERT(nApics > 0);
+    if (nApics == 1) {
+        __kmp_ncores = nPackages = 1;
+        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+        if (__kmp_affinity_verbose) {
+            char buf[KMP_AFFIN_MASK_PRINT_LEN];
+            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
+            if (__kmp_affinity_respect_mask) {
+                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+            } else {
+                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+            }
+            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+            KMP_INFORM(Uniform, "KMP_AFFINITY");
+            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+              __kmp_nThreadsPerCore, __kmp_ncores);
+        }
+
+        if (__kmp_affinity_type == affinity_none) {
+            __kmp_free(threadInfo);
+            KMP_CPU_FREE(oldMask);
+            return 0;
+        }
+
+        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
+        Address addr(1);
+        addr.labels[0] = threadInfo[0].pkgId;
+        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
+
+        if (__kmp_affinity_gran_levels < 0) {
+            __kmp_affinity_gran_levels = 0;
+        }
+
+        if (__kmp_affinity_verbose) {
+            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
+        }
+
+        __kmp_free(threadInfo);
+        KMP_CPU_FREE(oldMask);
+        return 1;
+    }
+
+    //
+    // Sort the threadInfo table by physical Id.
+    //
+    qsort(threadInfo, nApics, sizeof(*threadInfo),
+      __kmp_affinity_cmp_apicThreadInfo_phys_id);
+
+    //
+    // The table is now sorted by pkgId / coreId / threadId, but we really
+    // don't know the radix of any of the fields.  pkgId's may be sparsely
+    // assigned among the chips on a system.  Although coreId's are usually
+    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
+    // [0..threadsPerCore-1], we don't want to make any such assumptions.
+    //
+    // For that matter, we don't know what coresPerPkg and threadsPerCore
+    // (or the total # packages) are at this point - we want to determine
+    // that now.  We only have an upper bound on the first two figures.
+    //
+    // We also perform a consistency check at this point: the values returned
+    // by the cpuid instruction for any thread bound to a given package had
+    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
+    //
+    nPackages = 1;
+    nCoresPerPkg = 1;
+    __kmp_nThreadsPerCore = 1;
+    unsigned nCores = 1;
+
+    unsigned pkgCt = 1;                         // to determine radii
+    unsigned lastPkgId = threadInfo[0].pkgId;
+    unsigned coreCt = 1;
+    unsigned lastCoreId = threadInfo[0].coreId;
+    unsigned threadCt = 1;
+    unsigned lastThreadId = threadInfo[0].threadId;
+
+                                                // intra-pkg consistency checks
+    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
+    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
+
+    for (i = 1; i < nApics; i++) {
+        if (threadInfo[i].pkgId != lastPkgId) {
+            nCores++;
+            pkgCt++;
+            lastPkgId = threadInfo[i].pkgId;
+            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
+            coreCt = 1;
+            lastCoreId = threadInfo[i].coreId;
+            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
+            threadCt = 1;
+            lastThreadId = threadInfo[i].threadId;
+
+            //
+            // This is a different package, so go on to the next iteration
+            // without doing any consistency checks.  Reset the consistency
+            // check vars, though.
+            //
+            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
+            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
+            continue;
+        }
+
+        if (threadInfo[i].coreId != lastCoreId) {
+            nCores++;
+            coreCt++;
+            lastCoreId = threadInfo[i].coreId;
+            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
+            threadCt = 1;
+            lastThreadId = threadInfo[i].threadId;
+        }
+        else if (threadInfo[i].threadId != lastThreadId) {
+            threadCt++;
+            lastThreadId = threadInfo[i].threadId;
+        }
+        else {
+            __kmp_free(threadInfo);
+            KMP_CPU_FREE(oldMask);
+            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
+            return -1;
+        }
+
+        //
+        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
+        // fields agree between all the threads bound to a given package.
+        //
+        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
+          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
+            __kmp_free(threadInfo);
+            KMP_CPU_FREE(oldMask);
+            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
+            return -1;
+        }
+    }
+    nPackages = pkgCt;
+    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
+    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
+
+    //
+    // When affinity is off, this routine will still be called to set
+    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
+    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
+    // correctly, and return now if affinity is not enabled.
+    //
+    __kmp_ncores = nCores;
+    if (__kmp_affinity_verbose) {
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
+        if (__kmp_affinity_respect_mask) {
+            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+        } else {
+            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+        }
+        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+        if (__kmp_affinity_uniform_topology()) {
+            KMP_INFORM(Uniform, "KMP_AFFINITY");
+        } else {
+            KMP_INFORM(NonUniform, "KMP_AFFINITY");
+        }
+        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+          __kmp_nThreadsPerCore, __kmp_ncores);
+
+    }
+
+    if (__kmp_affinity_type == affinity_none) {
+        __kmp_free(threadInfo);
+        KMP_CPU_FREE(oldMask);
+        return 0;
+    }
+
+    //
+    // Now that we've determined the number of packages, the number of cores
+    // per package, and the number of threads per core, we can construct the
+    // data structure that is to be returned.
+    //
+    int pkgLevel = 0;
+    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
+    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
+    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
+
+    KMP_ASSERT(depth > 0);
+    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
+
+    for (i = 0; i < nApics; ++i) {
+        Address addr(depth);
+        unsigned os = threadInfo[i].osId;
+        int d = 0;
+
+        if (pkgLevel >= 0) {
+            addr.labels[d++] = threadInfo[i].pkgId;
+        }
+        if (coreLevel >= 0) {
+            addr.labels[d++] = threadInfo[i].coreId;
+        }
+        if (threadLevel >= 0) {
+            addr.labels[d++] = threadInfo[i].threadId;
+        }
+        (*address2os)[i] = AddrUnsPair(addr, os);
+    }
+
+    if (__kmp_affinity_gran_levels < 0) {
+        //
+        // Set the granularity level based on what levels are modeled
+        // in the machine topology map.
+        //
+        __kmp_affinity_gran_levels = 0;
+        if ((threadLevel >= 0)
+          && (__kmp_affinity_gran > affinity_gran_thread)) {
+            __kmp_affinity_gran_levels++;
+        }
+        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+            __kmp_affinity_gran_levels++;
+        }
+        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
+            __kmp_affinity_gran_levels++;
+        }
+    }
+
+    if (__kmp_affinity_verbose) {
+        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
+          coreLevel, threadLevel);
+    }
+
+    __kmp_free(threadInfo);
+    KMP_CPU_FREE(oldMask);
+    return depth;
+}
+
+
+//
+// Intel(R) microarchitecture code name Nehalem, Dunnington and later
+// architectures support a newer interface for specifying the x2APIC Ids,
+// based on cpuid leaf 11.
+//
+static int
+__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
+  kmp_i18n_id_t *const msg_id)
+{
+    kmp_cpuid buf;
+
+    *address2os = NULL;
+    *msg_id = kmp_i18n_null;
+
+    //
+    // Check to see if cpuid leaf 11 is supported.
+    //
+    __kmp_x86_cpuid(0, 0, &buf);
+    if (buf.eax < 11) {
+        *msg_id = kmp_i18n_str_NoLeaf11Support;
+        return -1;
+    }
+    __kmp_x86_cpuid(11, 0, &buf);
+    if (buf.ebx == 0) {
+        *msg_id = kmp_i18n_str_NoLeaf11Support;
+        return -1;
+    }
+
+    //
+    // Find the number of levels in the machine topology.  While we're at it,
+    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
+    // try to get more accurate values later by explicitly counting them,
+    // but get reasonable defaults now, in case we return early.
+    //
+    int level;
+    int threadLevel = -1;
+    int coreLevel = -1;
+    int pkgLevel = -1;
+    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
+
+    for (level = 0;; level++) {
+        if (level > 31) {
+            //
+            // FIXME: Hack for DPD200163180
+            //
+            // If level is big then something went wrong -> exiting
+            //
+            // There could actually be 32 valid levels in the machine topology,
+            // but so far, the only machine we have seen which does not exit
+            // this loop before iteration 32 has fubar x2APIC settings.
+            //
+            // For now, just reject this case based upon loop trip count.
+            //
+            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+            return -1;
+        }
+        __kmp_x86_cpuid(11, level, &buf);
+        if (buf.ebx == 0) {
+            if (pkgLevel < 0) {
+                //
+                // Will infer nPackages from __kmp_xproc
+                //
+                pkgLevel = level;
+                level++;
+            }
+            break;
+        }
+        int kind = (buf.ecx >> 8) & 0xff;
+        if (kind == 1) {
+            //
+            // SMT level
+            //
+            threadLevel = level;
+            coreLevel = -1;
+            pkgLevel = -1;
+            __kmp_nThreadsPerCore = buf.ebx & 0xff;
+            if (__kmp_nThreadsPerCore == 0) {
+                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+                return -1;
+            }
+        }
+        else if (kind == 2) {
+            //
+            // core level
+            //
+            coreLevel = level;
+            pkgLevel = -1;
+            nCoresPerPkg = buf.ebx & 0xff;
+            if (nCoresPerPkg == 0) {
+                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+                return -1;
+            }
+        }
+        else {
+            if (level <= 0) {
+                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+                return -1;
+            }
+            if (pkgLevel >= 0) {
+                continue;
+            }
+            pkgLevel = level;
+            nPackages = buf.ebx & 0xff;
+            if (nPackages == 0) {
+                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+                return -1;
+            }
+        }
+    }
+    int depth = level;
+
+    //
+    // In the above loop, "level" was counted from the finest level (usually
+    // thread) to the coarsest.  The caller expects that we will place the
+    // labels in (*address2os)[].first.labels[] in the inverse order, so
+    // we need to invert the vars saying which level means what.
+    //
+    if (threadLevel >= 0) {
+        threadLevel = depth - threadLevel - 1;
+    }
+    if (coreLevel >= 0) {
+        coreLevel = depth - coreLevel - 1;
+    }
+    KMP_DEBUG_ASSERT(pkgLevel >= 0);
+    pkgLevel = depth - pkgLevel - 1;
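+    // For example (hypothetical leaf 11 results): with the SMT level found at
+    // level 0, the core level at level 1, and pkgLevel assigned 2, depth == 3
+    // and the inversion yields threadLevel == 2, coreLevel == 1, and
+    // pkgLevel == 0 - coarsest-to-finest label order.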
+
+    //
+    // The algorithm used starts by setting the affinity to each available
+    // thread and retrieving info from the cpuid instruction, so if we are
+    // not capable of calling __kmp_get_system_affinity() and
+    // __kmp_set_system_affinity(), then we need to do something else - use
+    // the defaults that we calculated from issuing cpuid without binding
+    // to each proc.
+    //
+    if (! KMP_AFFINITY_CAPABLE()) {
+        //
+        // Hack to try and infer the machine topology using only the data
+        // available from cpuid on the current thread, and __kmp_xproc.
+        //
+        KMP_ASSERT(__kmp_affinity_type == affinity_none);
+
+        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
+        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
+            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+            if (__kmp_affinity_uniform_topology()) {
+                KMP_INFORM(Uniform, "KMP_AFFINITY");
+            } else {
+                KMP_INFORM(NonUniform, "KMP_AFFINITY");
+            }
+            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+              __kmp_nThreadsPerCore, __kmp_ncores);
+        }
+        return 0;
+    }
+
+    //
+    // From here on, we can assume that it is safe to call
+    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
+    // even if __kmp_affinity_type = affinity_none.
+    //
+
+    //
+    // Save the affinity mask for the current thread.
+    //
+    kmp_affin_mask_t *oldMask;
+    KMP_CPU_ALLOC(oldMask);
+    __kmp_get_system_affinity(oldMask, TRUE);
+
+    //
+    // Allocate the data structure to be returned.
+    //
+    AddrUnsPair *retval = (AddrUnsPair *)
+      __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
+
+    //
+    // Run through each of the available contexts, binding the current thread
+    // to it, and obtaining the pertinent information using the cpuid instr.
+    //
+    unsigned int proc;
+    int nApics = 0;
+    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
+        //
+        // Skip this proc if it is not included in the machine model.
+        //
+        if (! KMP_CPU_ISSET(proc, fullMask)) {
+            continue;
+        }
+        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
+
+        __kmp_affinity_bind_thread(proc);
+
+        //
+        // Extract the labels for each level in the machine topology map
+        // from the Apic ID.
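+        //
+        // Worked example (hypothetical shift widths): with a 1-bit SMT field
+        // at level 0 (eax & 0x1f == 1) and shift == 4 reported at level 1, an
+        // x2APIC id of 0x35 decodes bottom-up as thread == 0x35 & 0x1 == 1,
+        // core == (0x35 & 0xf) >> 1 == 2, and pkg == 0x35 >> 4 == 3 when
+        // ebx == 0 terminates the loop at the last level.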
+        //
+        Address addr(depth);
+        int prev_shift = 0;
+
+        for (level = 0; level < depth; level++) {
+            __kmp_x86_cpuid(11, level, &buf);
+            unsigned apicId = buf.edx;
+            if (buf.ebx == 0) {
+                if (level != depth - 1) {
+                    KMP_CPU_FREE(oldMask);
+                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
+                    return -1;
+                }
+                addr.labels[depth - level - 1] = apicId >> prev_shift;
+                level++;
+                break;
+            }
+            int shift = buf.eax & 0x1f;
+            int mask = (1 << shift) - 1;
+            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
+            prev_shift = shift;
+        }
+        if (level != depth) {
+            KMP_CPU_FREE(oldMask);
+            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
+            return -1;
+        }
+
+        retval[nApics] = AddrUnsPair(addr, proc);
+        nApics++;
+    }
+
+    //
+    // We've collected all the info we need.
+    // Restore the old affinity mask for this thread.
+    //
+    __kmp_set_system_affinity(oldMask, TRUE);
+
+    //
+    // If there's only one thread context to bind to, return now.
+    //
+    KMP_ASSERT(nApics > 0);
+    if (nApics == 1) {
+        __kmp_ncores = nPackages = 1;
+        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+        if (__kmp_affinity_verbose) {
+            char buf[KMP_AFFIN_MASK_PRINT_LEN];
+            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
+            if (__kmp_affinity_respect_mask) {
+                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+            } else {
+                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+            }
+            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+            KMP_INFORM(Uniform, "KMP_AFFINITY");
+            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+              __kmp_nThreadsPerCore, __kmp_ncores);
+        }
+
+        if (__kmp_affinity_type == affinity_none) {
+            __kmp_free(retval);
+            KMP_CPU_FREE(oldMask);
+            return 0;
+        }
+
+        //
+        // Form an Address object which only includes the package level.
+        //
+        Address addr(1);
+        addr.labels[0] = retval[0].first.labels[pkgLevel];
+        retval[0].first = addr;
+
+        if (__kmp_affinity_gran_levels < 0) {
+            __kmp_affinity_gran_levels = 0;
+        }
+
+        if (__kmp_affinity_verbose) {
+            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
+        }
+
+        *address2os = retval;
+        KMP_CPU_FREE(oldMask);
+        return 1;
+    }
+
+    //
+    // Sort the table by physical Id.
+    //
+    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
+
+    //
+    // Find the radix at each of the levels.
+    //
+    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+    for (level = 0; level < depth; level++) {
+        totals[level] = 1;
+        maxCt[level] = 1;
+        counts[level] = 1;
+        last[level] = retval[0].first.labels[level];
+    }
+
+    //
+    // From here on, the iteration variable "level" runs from the finest
+    // level to the coarsest, i.e. we iterate forward through
+    // (*address2os)[].first.labels[] - in the previous loops, we iterated
+    // backwards.
+    //
+    for (proc = 1; (int)proc < nApics; proc++) {
+        int level;
+        for (level = 0; level < depth; level++) {
+            if (retval[proc].first.labels[level] != last[level]) {
+                int j;
+                for (j = level + 1; j < depth; j++) {
+                    totals[j]++;
+                    counts[j] = 1;
+                    // The commented-out line below caused incorrect topology
+                    // information to be printed when the max value for some
+                    // level (maxCt[level]) was encountered earlier than some
+                    // smaller value while walking the array.  For example, if
+                    // pkg0 has 4 cores and pkg1 has 2 cores, maxCt[1] would
+                    // end up as 2 whereas it must be 4.
+                    // TODO!!! Check whether it can stay commented out safely.
+                    //maxCt[j] = 1;
+                    last[j] = retval[proc].first.labels[j];
+                }
+                totals[level]++;
+                counts[level]++;
+                if (counts[level] > maxCt[level]) {
+                    maxCt[level] = counts[level];
+                }
+                last[level] = retval[proc].first.labels[level];
+                break;
+            }
+            else if (level == depth - 1) {
+                __kmp_free(last);
+                __kmp_free(maxCt);
+                __kmp_free(counts);
+                __kmp_free(totals);
+                __kmp_free(retval);
+                KMP_CPU_FREE(oldMask);
+                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
+                return -1;
+            }
+        }
+    }
+
+    //
+    // When affinity is off, this routine will still be called to set
+    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
+    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
+    // correctly, and return if affinity is not enabled.
+    //
+    if (threadLevel >= 0) {
+        __kmp_nThreadsPerCore = maxCt[threadLevel];
+    }
+    else {
+        __kmp_nThreadsPerCore = 1;
+    }
+    nPackages = totals[pkgLevel];
+
+    if (coreLevel >= 0) {
+        __kmp_ncores = totals[coreLevel];
+        nCoresPerPkg = maxCt[coreLevel];
+    }
+    else {
+        __kmp_ncores = nPackages;
+        nCoresPerPkg = 1;
+    }
+
+    //
+    // Check to see if the machine topology is uniform
+    //
+    unsigned prod = maxCt[0];
+    for (level = 1; level < depth; level++) {
+       prod *= maxCt[level];
+    }
+    bool uniform = (prod == totals[level - 1]);
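+    // For example (hypothetical radii): maxCt == {2, 4, 2} gives prod == 16,
+    // so the topology is uniform only if totals[depth - 1] - the total count
+    // of finest-level objects - is also 16 (2 pkgs x 4 cores x 2 threads).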
+
+    //
+    // Print the machine topology summary.
+    //
+    if (__kmp_affinity_verbose) {
+        char mask[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
+        if (__kmp_affinity_respect_mask) {
+            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
+        } else {
+            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
+        }
+        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+        if (uniform) {
+            KMP_INFORM(Uniform, "KMP_AFFINITY");
+        } else {
+            KMP_INFORM(NonUniform, "KMP_AFFINITY");
+        }
+
+        kmp_str_buf_t buf;
+        __kmp_str_buf_init(&buf);
+
+        __kmp_str_buf_print(&buf, "%d", totals[0]);
+        for (level = 1; level <= pkgLevel; level++) {
+            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
+        }
+        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
+          __kmp_nThreadsPerCore, __kmp_ncores);
+
+        __kmp_str_buf_free(&buf);
+    }
+
+    if (__kmp_affinity_type == affinity_none) {
+        __kmp_free(last);
+        __kmp_free(maxCt);
+        __kmp_free(counts);
+        __kmp_free(totals);
+        __kmp_free(retval);
+        KMP_CPU_FREE(oldMask);
+        return 0;
+    }
+
+    //
+    // Find any levels with radix 1, and remove them from the map
+    // (except for the package level).
+    //
+    int new_depth = 0;
+    for (level = 0; level < depth; level++) {
+        if ((maxCt[level] == 1) && (level != pkgLevel)) {
+           continue;
+        }
+        new_depth++;
+    }
+
+    //
+    // If we are removing any levels, allocate a new vector to return,
+    // and copy the relevant information to it.
+    //
+    if (new_depth != depth) {
+        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
+          sizeof(AddrUnsPair) * nApics);
+        for (proc = 0; (int)proc < nApics; proc++) {
+            Address addr(new_depth);
+            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
+        }
+        int new_level = 0;
+        for (level = 0; level < depth; level++) {
+            if ((maxCt[level] == 1) && (level != pkgLevel)) {
+               if (level == threadLevel) {
+                   threadLevel = -1;
+               }
+               else if ((threadLevel >= 0) && (level < threadLevel)) {
+                   threadLevel--;
+               }
+               if (level == coreLevel) {
+                   coreLevel = -1;
+               }
+               else if ((coreLevel >= 0) && (level < coreLevel)) {
+                   coreLevel--;
+               }
+               if (level < pkgLevel) {
+                   pkgLevel--;
+               }
+               continue;
+            }
+            for (proc = 0; (int)proc < nApics; proc++) {
+                new_retval[proc].first.labels[new_level]
+                  = retval[proc].first.labels[level];
+            }
+            new_level++;
+        }
+
+        __kmp_free(retval);
+        retval = new_retval;
+        depth = new_depth;
+    }
+
+    if (__kmp_affinity_gran_levels < 0) {
+        //
+        // Set the granularity level based on what levels are modeled
+        // in the machine topology map.
+        //
+        __kmp_affinity_gran_levels = 0;
+        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
+            __kmp_affinity_gran_levels++;
+        }
+        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+            __kmp_affinity_gran_levels++;
+        }
+        if (__kmp_affinity_gran > affinity_gran_package) {
+            __kmp_affinity_gran_levels++;
+        }
+    }
+
+    if (__kmp_affinity_verbose) {
+        __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
+          coreLevel, threadLevel);
+    }
+
+    __kmp_free(last);
+    __kmp_free(maxCt);
+    __kmp_free(counts);
+    __kmp_free(totals);
+    KMP_CPU_FREE(oldMask);
+    *address2os = retval;
+    return depth;
+}
+
+
+# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+
+#define osIdIndex       0
+#define threadIdIndex   1
+#define coreIdIndex     2
+#define pkgIdIndex      3
+#define nodeIdIndex     4
+
+typedef unsigned *ProcCpuInfo;
+static unsigned maxIndex = pkgIdIndex;
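+
+// A typical /proc/cpuinfo record (abridged, hypothetical values), with the
+// index that each recognized field feeds:
+//
+//     processor       : 5      ->  osIdIndex
+//     physical id     : 1      ->  pkgIdIndex
+//     core id         : 2      ->  coreIdIndex
+//     thread id       : 0      ->  threadIdIndex (if present)
+//
+// node_<n> id fields, when present, feed nodeIdIndex + n.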
+
+
+static int
+__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
+{
+    const unsigned *aa = (const unsigned *)a;
+    const unsigned *bb = (const unsigned *)b;
+    if (aa[osIdIndex] < bb[osIdIndex]) return -1;
+    if (aa[osIdIndex] > bb[osIdIndex]) return 1;
+    return 0;
+}
+
+
+static int
+__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
+{
+    unsigned i;
+    const unsigned *aa = *((const unsigned **)a);
+    const unsigned *bb = *((const unsigned **)b);
+    for (i = maxIndex; ; i--) {
+        if (aa[i] < bb[i]) return -1;
+        if (aa[i] > bb[i]) return 1;
+        if (i == osIdIndex) break;
+    }
+    return 0;
+}
+
+
+//
+// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
+// affinity map.
+//
+static int
+__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
+  kmp_i18n_id_t *const msg_id, FILE *f)
+{
+    *address2os = NULL;
+    *msg_id = kmp_i18n_null;
+
+    //
+    // Scan the file once, counting the number of "processor" (osId) fields
+    // and finding the highest value of <n> for a node_<n> field.
+    //
+    char buf[256];
+    unsigned num_records = 0;
+    while (! feof(f)) {
+        buf[sizeof(buf) - 1] = 1;
+        if (! fgets(buf, sizeof(buf), f)) {
+            //
+            // Read error - presumably because of EOF.
+            //
+            break;
+        }
+
+        char s1[] = "processor";
+        if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
+            num_records++;
+            continue;
+        }
+
+        //
+        // FIXME - this will match "node_<n> <garbage>"
+        //
+        unsigned level;
+        if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
+            if (nodeIdIndex + level >= maxIndex) {
+                maxIndex = nodeIdIndex + level;
+            }
+            continue;
+        }
+    }
+
+    //
+    // Check for empty file / no valid processor records, or too many.
+    // The number of records can't exceed the number of valid bits in the
+    // affinity mask.
+    //
+    if (num_records == 0) {
+        *line = 0;
+        *msg_id = kmp_i18n_str_NoProcRecords;
+        return -1;
+    }
+    if (num_records > (unsigned)__kmp_xproc) {
+        *line = 0;
+        *msg_id = kmp_i18n_str_TooManyProcRecords;
+        return -1;
+    }
+
+    //
+    // Set the file pointer back to the beginning, so that we can scan the
+    // file again, this time performing a full parse of the data.
+    // Allocate a vector of ProcCpuInfo objects, where we will place the data.
+    // Adding an extra element at the end allows us to remove a lot of extra
+    // checks for termination conditions.
+    //
+    if (fseek(f, 0, SEEK_SET) != 0) {
+        *line = 0;
+        *msg_id = kmp_i18n_str_CantRewindCpuinfo;
+        return -1;
+    }
+
+    //
+    // Allocate the array of records to store the proc info in.  The dummy
+    // element at the end makes the logic in filling them out easier to code.
+    //
+    unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
+      * sizeof(unsigned *));
+    unsigned i;
+    for (i = 0; i <= num_records; i++) {
+        threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
+          * sizeof(unsigned));
+    }
+
+#define CLEANUP_THREAD_INFO \
+    for (i = 0; i <= num_records; i++) {                                \
+        __kmp_free(threadInfo[i]);                                      \
+    }                                                                   \
+    __kmp_free(threadInfo);
+
+    //
+    // A value of UINT_MAX means that we didn't find the field
+    //
+    unsigned __index;
+
+#define INIT_PROC_INFO(p) \
+    for (__index = 0; __index <= maxIndex; __index++) {                 \
+        (p)[__index] = UINT_MAX;                                        \
+    }
+
+    for (i = 0; i <= num_records; i++) {
+        INIT_PROC_INFO(threadInfo[i]);
+    }
+
+    unsigned num_avail = 0;
+    *line = 0;
+    while (! feof(f)) {
+        //
+        // Create an inner scoping level, so that all the goto targets at the
+        // end of the loop appear in an outer scoping level.  This avoids
+        // warnings about jumping past an initialization to a target in the
+        // same block.
+        //
+        {
+            buf[sizeof(buf) - 1] = 1;
+            bool long_line = false;
+            if (! fgets(buf, sizeof(buf), f)) {
+                //
+                // Read error - presumably because of EOF.
+                //
+                // If there is valid data in threadInfo[num_avail], then fake
+                // a blank line to ensure that the last address gets parsed.
+                //
+                bool valid = false;
+                for (i = 0; i <= maxIndex; i++) {
+                    if (threadInfo[num_avail][i] != UINT_MAX) {
+                        valid = true;
+                    }
+                }
+                if (! valid) {
+                    break;
+                }
+                buf[0] = 0;
+            } else if (!buf[sizeof(buf) - 1]) {
+                //
+                // The line is longer than the buffer.  Set a flag and don't
+                // emit an error if we were going to ignore the line, anyway.
+                //
+                long_line = true;
+
+#define CHECK_LINE \
+    if (long_line) {                                                    \
+        CLEANUP_THREAD_INFO;                                            \
+        *msg_id = kmp_i18n_str_LongLineCpuinfo;                         \
+        return -1;                                                      \
+    }
+            }
+            (*line)++;
+
+            char s1[] = "processor";
+            if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
+                CHECK_LINE;
+                char *p = strchr(buf + sizeof(s1) - 1, ':');
+                unsigned val;
+                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
+                if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
+                threadInfo[num_avail][osIdIndex] = val;
+#if KMP_OS_LINUX && USE_SYSFS_INFO
+                char path[256];
+                KMP_SNPRINTF(path, sizeof(path),
+                    "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
+                    threadInfo[num_avail][osIdIndex]);
+                __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
+
+                KMP_SNPRINTF(path, sizeof(path),
+                    "/sys/devices/system/cpu/cpu%u/topology/core_id",
+                    threadInfo[num_avail][osIdIndex]);
+                __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
+                continue;
+#else
+            }
+            char s2[] = "physical id";
+            if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
+                CHECK_LINE;
+                char *p = strchr(buf + sizeof(s2) - 1, ':');
+                unsigned val;
+                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
+                if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
+                threadInfo[num_avail][pkgIdIndex] = val;
+                continue;
+            }
+            char s3[] = "core id";
+            if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
+                CHECK_LINE;
+                char *p = strchr(buf + sizeof(s3) - 1, ':');
+                unsigned val;
+                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
+                if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
+                threadInfo[num_avail][coreIdIndex] = val;
+                continue;
+#endif // KMP_OS_LINUX && USE_SYSFS_INFO
+            }
+            char s4[] = "thread id";
+            if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
+                CHECK_LINE;
+                char *p = strchr(buf + sizeof(s4) - 1, ':');
+                unsigned val;
+                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
+                if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
+                threadInfo[num_avail][threadIdIndex] = val;
+                continue;
+            }
+            unsigned level;
+            if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
+                CHECK_LINE;
+                char *p = strchr(buf + sizeof(s4) - 1, ':');
+                unsigned val;
+                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
+                KMP_ASSERT(nodeIdIndex + level <= maxIndex);
+                if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
+                threadInfo[num_avail][nodeIdIndex + level] = val;
+                continue;
+            }
+
+            //
+            // We didn't recognize the leading token on the line.
+            // There are lots of leading tokens that we don't recognize -
+            // if the line isn't empty, go on to the next line.
+            //
+            if ((*buf != 0) && (*buf != '\n')) {
+                //
+                // If the line is longer than the buffer, read characters
+                // until we find a newline.
+                //
+                if (long_line) {
+                    int ch;
+                    while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
+                }
+                continue;
+            }
+
+            //
+            // A newline has signalled the end of the processor record.
+            // Check that there aren't too many procs specified.
+            //
+            if ((int)num_avail == __kmp_xproc) {
+                CLEANUP_THREAD_INFO;
+                *msg_id = kmp_i18n_str_TooManyEntries;
+                return -1;
+            }
+
+            //
+            // Check for missing fields.  The osId field must be there, and we
+            // currently require that the physical id field is specified, also.
+            //
+            if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
+                CLEANUP_THREAD_INFO;
+                *msg_id = kmp_i18n_str_MissingProcField;
+                return -1;
+            }
+            if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
+                CLEANUP_THREAD_INFO;
+                *msg_id = kmp_i18n_str_MissingPhysicalIDField;
+                return -1;
+            }
+
+            //
+            // Skip this proc if it is not included in the machine model.
+            //
+            if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
+                INIT_PROC_INFO(threadInfo[num_avail]);
+                continue;
+            }
+
+            //
+            // We have a successful parse of this proc's info.
+            // Increment the counter, and prepare for the next proc.
+            //
+            num_avail++;
+            KMP_ASSERT(num_avail <= num_records);
+            INIT_PROC_INFO(threadInfo[num_avail]);
+        }
+        continue;
+
+        no_val:
+        CLEANUP_THREAD_INFO;
+        *msg_id = kmp_i18n_str_MissingValCpuinfo;
+        return -1;
+
+        dup_field:
+        CLEANUP_THREAD_INFO;
+        *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
+        return -1;
+    }
+    *line = 0;
+
+# if KMP_MIC && REDUCE_TEAM_SIZE
+    unsigned teamSize = 0;
+# endif // KMP_MIC && REDUCE_TEAM_SIZE
+
+    // check for num_records == __kmp_xproc ???
+
+    //
+    // If there's only one thread context to bind to, form an Address object
+    // with depth 1 and return immediately (or, if affinity is off, set
+    // address2os to NULL and return).
+    //
+    // If it is configured to omit the package level when there is only a
+    // single package, the logic at the end of this routine won't work if
+    // there is only a single thread - it would try to form an Address
+    // object with depth 0.
+    //
+    KMP_ASSERT(num_avail > 0);
+    KMP_ASSERT(num_avail <= num_records);
+    if (num_avail == 1) {
+        __kmp_ncores = 1;
+        __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
+        if (__kmp_affinity_verbose) {
+            if (! KMP_AFFINITY_CAPABLE()) {
+                KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
+                KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+                KMP_INFORM(Uniform, "KMP_AFFINITY");
+            }
+            else {
+                char buf[KMP_AFFIN_MASK_PRINT_LEN];
+                __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                  fullMask);
+                KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
+                if (__kmp_affinity_respect_mask) {
+                    KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+                } else {
+                    KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+                }
+                KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+                KMP_INFORM(Uniform, "KMP_AFFINITY");
+            }
+            int index;
+            kmp_str_buf_t buf;
+            __kmp_str_buf_init(&buf);
+            __kmp_str_buf_print(&buf, "1");
+            for (index = maxIndex - 1; index > pkgIdIndex; index--) {
+                __kmp_str_buf_print(&buf, " x 1");
+            }
+            KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
+            __kmp_str_buf_free(&buf);
+        }
+
+        if (__kmp_affinity_type == affinity_none) {
+            CLEANUP_THREAD_INFO;
+            return 0;
+        }
+
+        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
+        Address addr(1);
+        addr.labels[0] = threadInfo[0][pkgIdIndex];
+        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
+
+        if (__kmp_affinity_gran_levels < 0) {
+            __kmp_affinity_gran_levels = 0;
+        }
+
+        if (__kmp_affinity_verbose) {
+            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
+        }
+
+        CLEANUP_THREAD_INFO;
+        return 1;
+    }
+
+    //
+    // Sort the threadInfo table by physical Id.
+    //
+    qsort(threadInfo, num_avail, sizeof(*threadInfo),
+      __kmp_affinity_cmp_ProcCpuInfo_phys_id);
+
+    //
+    // The table is now sorted by pkgId / coreId / threadId, but we really
+    // don't know the radix of any of the fields.  pkgId's may be sparsely
+    // assigned among the chips on a system.  Although coreId's are usually
+    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
+    // [0..threadsPerCore-1], we don't want to make any such assumptions.
+    //
+    // For that matter, we don't know what coresPerPkg and threadsPerCore
+    // (or the total # packages) are at this point - we want to determine
+    // that now.  We only have an upper bound on the first two figures.
+    //
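+    // As a hypothetical example: on a 2-package machine with 2 cores per
+    // package and 2 threads per core, the counting pass below should end
+    // with totals[pkgIdIndex] == 2, maxCt[coreIdIndex] == 2, and
+    // maxCt[threadIdIndex] == 2, however sparsely the OS assigned the
+    // actual id values.
+    //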
+    unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
+      * sizeof(unsigned));
+    unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
+      * sizeof(unsigned));
+    unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
+      * sizeof(unsigned));
+    unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
+      * sizeof(unsigned));
+
+    bool assign_thread_ids = false;
+    unsigned threadIdCt;
+    unsigned index;
+
+    restart_radix_check:
+    threadIdCt = 0;
+
+    //
+    // Initialize the counter arrays with data from threadInfo[0].
+    //
+    if (assign_thread_ids) {
+        if (threadInfo[0][threadIdIndex] == UINT_MAX) {
+            threadInfo[0][threadIdIndex] = threadIdCt++;
+        }
+        else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
+            threadIdCt = threadInfo[0][threadIdIndex] + 1;
+        }
+    }
+    for (index = 0; index <= maxIndex; index++) {
+        counts[index] = 1;
+        maxCt[index] = 1;
+        totals[index] = 1;
+        lastId[index] = threadInfo[0][index];
+    }
+
+    //
+    // Run through the rest of the OS procs.
+    //
+    for (i = 1; i < num_avail; i++) {
+        //
+        // Find the most significant index whose id differs
+        // from the id for the previous OS proc.
+        //
+        for (index = maxIndex; index >= threadIdIndex; index--) {
+            if (assign_thread_ids && (index == threadIdIndex)) {
+                //
+                // Auto-assign the thread id field if it wasn't specified.
+                //
+                if (threadInfo[i][threadIdIndex] == UINT_MAX) {
+                    threadInfo[i][threadIdIndex] = threadIdCt++;
+                }
+
+                //
+                // Apparently the thread id field was specified for some
+                // entries and not others.  Start the thread id counter
+                // off at the next higher thread id.
+                //
+                else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
+                    threadIdCt = threadInfo[i][threadIdIndex] + 1;
+                }
+            }
+            if (threadInfo[i][index] != lastId[index]) {
+                //
+                // Run through all indices which are less significant,
+                // and reset the counts to 1.
+                //
+                // At all levels up to and including index, we need to
+                // increment the totals and record the last id.
+                //
+                unsigned index2;
+                for (index2 = threadIdIndex; index2 < index; index2++) {
+                    totals[index2]++;
+                    if (counts[index2] > maxCt[index2]) {
+                        maxCt[index2] = counts[index2];
+                    }
+                    counts[index2] = 1;
+                    lastId[index2] = threadInfo[i][index2];
+                }
+                counts[index]++;
+                totals[index]++;
+                lastId[index] = threadInfo[i][index];
+
+                if (assign_thread_ids && (index > threadIdIndex)) {
+
+# if KMP_MIC && REDUCE_TEAM_SIZE
+                    //
+                    // The default team size is the total #threads in the machine
+                    // minus 1 thread for every core that has 3 or more threads.
+                    //
+                    teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
+# endif // KMP_MIC && REDUCE_TEAM_SIZE
+
+                    //
+                    // Restart the thread counter, as we are on a new core.
+                    //
+                    threadIdCt = 0;
+
+                    //
+                    // Auto-assign the thread id field if it wasn't specified.
+                    //
+                    if (threadInfo[i][threadIdIndex] == UINT_MAX) {
+                        threadInfo[i][threadIdIndex] = threadIdCt++;
+                    }
+
+                    //
+                    // Apparently the thread id field was specified for some
+                    // entries and not others.  Start the thread id counter
+                    // off at the next higher thread id.
+                    //
+                    else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
+                        threadIdCt = threadInfo[i][threadIdIndex] + 1;
+                    }
+                }
+                break;
+            }
+        }
+        if (index < threadIdIndex) {
+            //
+            // If thread ids were specified, it is an error if they are not
+            // unique.  Also, check that we haven't already restarted the
+            // loop (to be safe - shouldn't need to).
+            //
+            if ((threadInfo[i][threadIdIndex] != UINT_MAX)
+              || assign_thread_ids) {
+                __kmp_free(lastId);
+                __kmp_free(totals);
+                __kmp_free(maxCt);
+                __kmp_free(counts);
+                CLEANUP_THREAD_INFO;
+                *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
+                return -1;
+            }
+
+            //
+            // If the thread ids were not specified and we see entries
+            // that are duplicates, start the loop over and assign the
+            // thread ids manually.
+            //
+            assign_thread_ids = true;
+            goto restart_radix_check;
+        }
+    }
+
+# if KMP_MIC && REDUCE_TEAM_SIZE
+    //
+    // The default team size is the total #threads in the machine
+    // minus 1 thread for every core that has 3 or more threads.
+    //
+    teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
+# endif // KMP_MIC && REDUCE_TEAM_SIZE
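+
+    // To illustrate the REDUCE_TEAM_SIZE accounting above: a core that
+    // reported 4 HW threads contributes 3 to teamSize, while a core with
+    // 1 or 2 HW threads contributes all of them.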
+
+    for (index = threadIdIndex; index <= maxIndex; index++) {
+        if (counts[index] > maxCt[index]) {
+            maxCt[index] = counts[index];
+        }
+    }
+
+    __kmp_nThreadsPerCore = maxCt[threadIdIndex];
+    nCoresPerPkg = maxCt[coreIdIndex];
+    nPackages = totals[pkgIdIndex];
+
+    //
+    // Check to see if the machine topology is uniform
+    //
+    unsigned prod = totals[maxIndex];
+    for (index = threadIdIndex; index < maxIndex; index++) {
+       prod *= maxCt[index];
+    }
+    bool uniform = (prod == totals[threadIdIndex]);
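+
+    //
+    // E.g. for the hypothetical 2 x 2 x 2 machine above (with no node id
+    // fields), prod is 2 * 2 * 2 == 8 and totals[threadIdIndex] == 8, so
+    // the topology is deemed uniform; a machine mixing 2-thread and
+    // 4-thread cores would fail this test and be reported as non-uniform.
+    //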
+
+    //
+    // When affinity is off, this routine will still be called to set
+    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
+    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
+    // correctly, and return now if affinity is not enabled.
+    //
+    __kmp_ncores = totals[coreIdIndex];
+
+    if (__kmp_affinity_verbose) {
+        if (! KMP_AFFINITY_CAPABLE()) {
+            KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
+            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+            if (uniform) {
+                KMP_INFORM(Uniform, "KMP_AFFINITY");
+            } else {
+                KMP_INFORM(NonUniform, "KMP_AFFINITY");
+            }
+        }
+        else {
+            char buf[KMP_AFFIN_MASK_PRINT_LEN];
+            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
+            KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
+            if (__kmp_affinity_respect_mask) {
+                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+            } else {
+                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+            }
+            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+            if (uniform) {
+                KMP_INFORM(Uniform, "KMP_AFFINITY");
+            } else {
+                KMP_INFORM(NonUniform, "KMP_AFFINITY");
+            }
+        }
+        kmp_str_buf_t buf;
+        __kmp_str_buf_init(&buf);
+
+        __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
+        for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
+            __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
+        }
+        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str,  maxCt[coreIdIndex],
+          maxCt[threadIdIndex], __kmp_ncores);
+
+        __kmp_str_buf_free(&buf);
+    }
+
+# if KMP_MIC && REDUCE_TEAM_SIZE
+    //
+    // Set the default team size.
+    //
+    if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
+        __kmp_dflt_team_nth = teamSize;
+        KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
+          __kmp_dflt_team_nth));
+    }
+# endif // KMP_MIC && REDUCE_TEAM_SIZE
+
+    if (__kmp_affinity_type == affinity_none) {
+        __kmp_free(lastId);
+        __kmp_free(totals);
+        __kmp_free(maxCt);
+        __kmp_free(counts);
+        CLEANUP_THREAD_INFO;
+        return 0;
+    }
+
+    //
+    // Count the number of levels which have more nodes at that level than
+    // at the parent's level (with there being an implicit root node of
+    // the top level).  This is equivalent to saying that there is at least
+    // one node at this level which has a sibling.  These levels are in the
+    // map, and the package level is always in the map.
+    //
+    bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
+    int level = 0;
+    for (index = threadIdIndex; index < maxIndex; index++) {
+        KMP_ASSERT(totals[index] >= totals[index + 1]);
+        inMap[index] = (totals[index] > totals[index + 1]);
+    }
+    inMap[maxIndex] = (totals[maxIndex] > 1);
+    inMap[pkgIdIndex] = true;
+
+    int depth = 0;
+    for (index = threadIdIndex; index <= maxIndex; index++) {
+        if (inMap[index]) {
+            depth++;
+        }
+    }
+    KMP_ASSERT(depth > 0);
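+
+    //
+    // For instance, on a hypothetical single-package machine with 4 cores
+    // and 1 thread per core, only the core level has siblings: inMap
+    // selects the (always-present) package level plus the core level,
+    // giving depth == 2.
+    //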
+
+    //
+    // Construct the data structure that is to be returned.
+    //
+    *address2os = (AddrUnsPair*)
+      __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
+    int pkgLevel = -1;
+    int coreLevel = -1;
+    int threadLevel = -1;
+
+    for (i = 0; i < num_avail; ++i) {
+        Address addr(depth);
+        unsigned os = threadInfo[i][osIdIndex];
+        int src_index;
+        int dst_index = 0;
+
+        for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
+            if (! inMap[src_index]) {
+                continue;
+            }
+            addr.labels[dst_index] = threadInfo[i][src_index];
+            if (src_index == pkgIdIndex) {
+                pkgLevel = dst_index;
+            }
+            else if (src_index == coreIdIndex) {
+                coreLevel = dst_index;
+            }
+            else if (src_index == threadIdIndex) {
+                threadLevel = dst_index;
+            }
+            dst_index++;
+        }
+        (*address2os)[i] = AddrUnsPair(addr, os);
+    }
+
+    if (__kmp_affinity_gran_levels < 0) {
+        //
+        // Set the granularity level based on what levels are modeled
+        // in the machine topology map.
+        //
+        unsigned src_index;
+        __kmp_affinity_gran_levels = 0;
+        for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
+            if (! inMap[src_index]) {
+                continue;
+            }
+            switch (src_index) {
+                case threadIdIndex:
+                if (__kmp_affinity_gran > affinity_gran_thread) {
+                    __kmp_affinity_gran_levels++;
+                }
+
+                break;
+                case coreIdIndex:
+                if (__kmp_affinity_gran > affinity_gran_core) {
+                    __kmp_affinity_gran_levels++;
+                }
+                break;
+
+                case pkgIdIndex:
+                if (__kmp_affinity_gran > affinity_gran_package) {
+                    __kmp_affinity_gran_levels++;
+                }
+                break;
+            }
+        }
+    }
+
+    if (__kmp_affinity_verbose) {
+        __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
+          coreLevel, threadLevel);
+    }
+
+    __kmp_free(inMap);
+    __kmp_free(lastId);
+    __kmp_free(totals);
+    __kmp_free(maxCt);
+    __kmp_free(counts);
+    CLEANUP_THREAD_INFO;
+    return depth;
+}
+
+
+//
+// Create and return a table of affinity masks, indexed by OS thread ID.
+// This routine handles OR'ing together all the affinity masks of threads
+// that are sufficiently close, if granularity > fine.
+//
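+// For example, with granularity=core on a machine with 2 HW threads per
+// core, the masks of both hardware threads on a core are OR'd together,
+// so both OS thread ids end up mapping to the same two-bit mask.
+//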
+static kmp_affin_mask_t *
+__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
+  AddrUnsPair *address2os, unsigned numAddrs)
+{
+    //
+    // First form a table of affinity masks in order of OS thread id.
+    //
+    unsigned depth;
+    unsigned maxOsId;
+    unsigned i;
+
+    KMP_ASSERT(numAddrs > 0);
+    depth = address2os[0].first.depth;
+
+    maxOsId = 0;
+    for (i = 0; i < numAddrs; i++) {
+        unsigned osId = address2os[i].second;
+        if (osId > maxOsId) {
+            maxOsId = osId;
+        }
+    }
+    kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
+      (maxOsId + 1) * __kmp_affin_mask_size);
+
+    //
+    // Sort the address2os table according to physical order.  Doing so
+    // will put all threads on the same core/package/node in consecutive
+    // locations.
+    //
+    qsort(address2os, numAddrs, sizeof(*address2os),
+      __kmp_affinity_cmp_Address_labels);
+
+    KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
+    if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
+        KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY",  __kmp_affinity_gran_levels);
+    }
+    if (__kmp_affinity_gran_levels >= (int)depth) {
+        if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+          && (__kmp_affinity_type != affinity_none))) {
+            KMP_WARNING(AffThreadsMayMigrate);
+        }
+    }
+
+    //
+    // Run through the table, forming the masks for all threads on each
+    // core.  Threads on the same core will have identical "Address"
+    // objects, not considering the last level, which must be the thread
+    // id.  All threads on a core will appear consecutively.
+    //
+    unsigned unique = 0;
+    unsigned j = 0;                             // index of 1st thread on core
+    unsigned leader = 0;
+    Address *leaderAddr = &(address2os[0].first);
+    kmp_affin_mask_t *sum
+      = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+    KMP_CPU_ZERO(sum);
+    KMP_CPU_SET(address2os[0].second, sum);
+    for (i = 1; i < numAddrs; i++) {
+        //
+        // If this thread is sufficiently close to the leader (within the
+        // granularity setting), then set the bit for this os thread in the
+        // affinity mask for this group, and go on to the next thread.
+        //
+        if (leaderAddr->isClose(address2os[i].first,
+          __kmp_affinity_gran_levels)) {
+            KMP_CPU_SET(address2os[i].second, sum);
+            continue;
+        }
+
+        //
+        // For every thread in this group, copy the mask to the thread's
+        // entry in the osId2Mask table.  Mark the first address as a
+        // leader.
+        //
+        for (; j < i; j++) {
+            unsigned osId = address2os[j].second;
+            KMP_DEBUG_ASSERT(osId <= maxOsId);
+            kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
+            KMP_CPU_COPY(mask, sum);
+            address2os[j].first.leader = (j == leader);
+        }
+        unique++;
+
+        //
+        // Start a new mask.
+        //
+        leader = i;
+        leaderAddr = &(address2os[i].first);
+        KMP_CPU_ZERO(sum);
+        KMP_CPU_SET(address2os[i].second, sum);
+    }
+
+    //
+    // For every thread in last group, copy the mask to the thread's
+    // entry in the osId2Mask table.
+    //
+    for (; j < i; j++) {
+        unsigned osId = address2os[j].second;
+        KMP_DEBUG_ASSERT(osId <= maxOsId);
+        kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
+        KMP_CPU_COPY(mask, sum);
+        address2os[j].first.leader = (j == leader);
+    }
+    unique++;
+
+    *maxIndex = maxOsId;
+    *numUnique = unique;
+    return osId2Mask;
+}
+
+
+//
+// Stuff for the affinity proclist parsers.  It's easier to declare these vars
+// as file-static than to try to pass them through the calling sequence of
+// the recursive-descent OMP_PLACES parser.
+//
+static kmp_affin_mask_t *newMasks;
+static int numNewMasks;
+static int nextNewMask;
+
+#define ADD_MASK(_mask) \
+    {                                                                   \
+        if (nextNewMask >= numNewMasks) {                               \
+            numNewMasks *= 2;                                           \
+            newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
+              numNewMasks * __kmp_affin_mask_size);                     \
+        }                                                               \
+        KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));    \
+        nextNewMask++;                                                  \
+    }
+
+#define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
+    {                                                                   \
+        if (((_osId) > _maxOsId) ||                                     \
+          (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
+            if (__kmp_affinity_verbose || (__kmp_affinity_warnings      \
+              && (__kmp_affinity_type != affinity_none))) {             \
+                KMP_WARNING(AffIgnoreInvalidProcID, _osId);             \
+            }                                                           \
+        }                                                               \
+        else {                                                          \
+            ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));               \
+        }                                                               \
+    }
+
+
+//
+// Re-parse the proclist (for the explicit affinity type), and form the list
+// of affinity newMasks indexed by gtid.
+//
+static void
+__kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
+  unsigned int *out_numMasks, const char *proclist,
+  kmp_affin_mask_t *osId2Mask, int maxOsId)
+{
+    const char *scan = proclist;
+    const char *next = proclist;
+
+    //
+    // We use malloc() for the temporary mask vector,
+    // so that we can use realloc() to extend it.
+    //
+    numNewMasks = 2;
+    newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
+      * __kmp_affin_mask_size);
+    nextNewMask = 0;
+    kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
+      __kmp_affin_mask_size);
+    int setSize = 0;
+
+    for (;;) {
+        int start, end, stride;
+
+        SKIP_WS(scan);
+        next = scan;
+        if (*next == '\0') {
+            break;
+        }
+
+        if (*next == '{') {
+            int num;
+            setSize = 0;
+            next++;     // skip '{'
+            SKIP_WS(next);
+            scan = next;
+
+            //
+            // Read the first integer in the set.
+            //
+            KMP_ASSERT2((*next >= '0') && (*next <= '9'),
+              "bad proclist");
+            SKIP_DIGITS(next);
+            num = __kmp_str_to_int(scan, *next);
+            KMP_ASSERT2(num >= 0, "bad explicit proc list");
+
+            //
+            // Copy the mask for that osId to the sum (union) mask.
+            //
+            if ((num > maxOsId) ||
+              (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
+                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+                  && (__kmp_affinity_type != affinity_none))) {
+                    KMP_WARNING(AffIgnoreInvalidProcID, num);
+                }
+                KMP_CPU_ZERO(sumMask);
+            }
+            else {
+                KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
+                setSize = 1;
+            }
+
+            for (;;) {
+                //
+                // Check for end of set.
+                //
+                SKIP_WS(next);
+                if (*next == '}') {
+                    next++;     // skip '}'
+                    break;
+                }
+
+                //
+                // Skip optional comma.
+                //
+                if (*next == ',') {
+                    next++;
+                }
+                SKIP_WS(next);
+
+                //
+                // Read the next integer in the set.
+                //
+                scan = next;
+                KMP_ASSERT2((*next >= '0') && (*next <= '9'),
+                  "bad explicit proc list");
+
+                SKIP_DIGITS(next);
+                num = __kmp_str_to_int(scan, *next);
+                KMP_ASSERT2(num >= 0, "bad explicit proc list");
+
+                //
+                // Add the mask for that osId to the sum mask.
+                //
+                if ((num > maxOsId) ||
+                  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
+                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+                      && (__kmp_affinity_type != affinity_none))) {
+                        KMP_WARNING(AffIgnoreInvalidProcID, num);
+                    }
+                }
+                else {
+                    KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
+                    setSize++;
+                }
+            }
+            if (setSize > 0) {
+                ADD_MASK(sumMask);
+            }
+
+            SKIP_WS(next);
+            if (*next == ',') {
+                next++;
+            }
+            scan = next;
+            continue;
+        }
+
+        //
+        // Read the first integer.
+        //
+        KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
+        SKIP_DIGITS(next);
+        start = __kmp_str_to_int(scan, *next);
+        KMP_ASSERT2(start >= 0, "bad explicit proc list");
+        SKIP_WS(next);
+
+        //
+        // If this isn't a range, then add a mask to the list and go on.
+        //
+        if (*next != '-') {
+            ADD_MASK_OSID(start, osId2Mask, maxOsId);
+
+            //
+            // Skip optional comma.
+            //
+            if (*next == ',') {
+                next++;
+            }
+            scan = next;
+            continue;
+        }
+
+        //
+        // This is a range.  Skip over the '-' and read in the 2nd int.
+        //
+        next++;         // skip '-'
+        SKIP_WS(next);
+        scan = next;
+        KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
+        SKIP_DIGITS(next);
+        end = __kmp_str_to_int(scan, *next);
+        KMP_ASSERT2(end >= 0, "bad explicit proc list");
+
+        //
+        // Check for a stride parameter
+        //
+        stride = 1;
+        SKIP_WS(next);
+        if (*next == ':') {
+            //
+            // A stride is specified.  Skip over the ':' and read the 3rd int.
+            //
+            int sign = +1;
+            next++;         // skip ':'
+            SKIP_WS(next);
+            scan = next;
+            if (*next == '-') {
+                sign = -1;
+                next++;
+                SKIP_WS(next);
+                scan = next;
+            }
+            KMP_ASSERT2((*next >=  '0') && (*next <= '9'),
+              "bad explicit proc list");
+            SKIP_DIGITS(next);
+            stride = __kmp_str_to_int(scan, *next);
+            KMP_ASSERT2(stride >= 0, "bad explicit proc list");
+            stride *= sign;
+        }
+
+        //
+        // Do some range checks.
+        //
+        KMP_ASSERT2(stride != 0, "bad explicit proc list");
+        if (stride > 0) {
+            KMP_ASSERT2(start <= end, "bad explicit proc list");
+        }
+        else {
+            KMP_ASSERT2(start >= end, "bad explicit proc list");
+        }
+        KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
+
+        //
+        // Add the mask for each OS proc # to the list.
+        //
+        if (stride > 0) {
+            do {
+                ADD_MASK_OSID(start, osId2Mask, maxOsId);
+                start += stride;
+            } while (start <= end);
+        }
+        else {
+            do {
+                ADD_MASK_OSID(start, osId2Mask, maxOsId);
+                start += stride;
+            } while (start >= end);
+        }
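+
+        //
+        // E.g. the range "1-7:2" expands to masks for OS procs 1, 3, 5,
+        // and 7, while "7-1:-2" walks the same procs in reverse order.
+        //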
+
+        //
+        // Skip optional comma.
+        //
+        SKIP_WS(next);
+        if (*next == ',') {
+            next++;
+        }
+        scan = next;
+    }
+
+    *out_numMasks = nextNewMask;
+    if (nextNewMask == 0) {
+        *out_masks = NULL;
+        KMP_INTERNAL_FREE(newMasks);
+        return;
+    }
+    *out_masks
+      = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
+    KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
+    __kmp_free(sumMask);
+    KMP_INTERNAL_FREE(newMasks);
+}
+
+
+# if OMP_40_ENABLED
+
+/*-----------------------------------------------------------------------------
+
+Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
+places.  Again, here is the grammar:
+
+place_list := place
+place_list := place , place_list
+place := num
+place := place : num
+place := place : num : signed
+place := { subplace_list }
+place := ! place                  // (lowest priority)
+subplace_list := subplace
+subplace_list := subplace , subplace_list
+subplace := num
+subplace := num : num
+subplace := num : num : signed
+signed := num
+signed := + signed
+signed := - signed
+
+-----------------------------------------------------------------------------*/
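+
+// For example, under this grammar "{0,1},{2,3}" names two places of two
+// OS procs each, while "{0,1}:2:2" names the place {0,1} plus a second
+// place shifted up by two procs, i.e. {2,3}.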
+
+static void
+__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
+  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
+{
+    const char *next;
+
+    for (;;) {
+        int start, count, stride, i;
+
+        //
+        // Read in the starting proc id
+        //
+        SKIP_WS(*scan);
+        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
+          "bad explicit places list");
+        next = *scan;
+        SKIP_DIGITS(next);
+        start = __kmp_str_to_int(*scan, *next);
+        KMP_ASSERT(start >= 0);
+        *scan = next;
+
+        //
+        // valid follow sets are ',' ':' and '}'
+        //
+        SKIP_WS(*scan);
+        if (**scan == '}' || **scan == ',') {
+            if ((start > maxOsId) ||
+              (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
+                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+                  && (__kmp_affinity_type != affinity_none))) {
+                    KMP_WARNING(AffIgnoreInvalidProcID, start);
+                }
+            }
+            else {
+                KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
+                (*setSize)++;
+            }
+            if (**scan == '}') {
+                break;
+            }
+            (*scan)++;  // skip ','
+            continue;
+        }
+        KMP_ASSERT2(**scan == ':', "bad explicit places list");
+        (*scan)++;      // skip ':'
+
+        //
+        // Read count parameter
+        //
+        SKIP_WS(*scan);
+        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
+          "bad explicit places list");
+        next = *scan;
+        SKIP_DIGITS(next);
+        count = __kmp_str_to_int(*scan, *next);
+        KMP_ASSERT(count >= 0);
+        *scan = next;
+
+        //
+        // valid follow sets are ',' ':' and '}'
+        //
+        SKIP_WS(*scan);
+        if (**scan == '}' || **scan == ',') {
+            for (i = 0; i < count; i++) {
+                if ((start > maxOsId) ||
+                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
+                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+                      && (__kmp_affinity_type != affinity_none))) {
+                        KMP_WARNING(AffIgnoreInvalidProcID, start);
+                    }
+                    break;  // don't proliferate warnings for large count
+                }
+                else {
+                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
+                    start++;
+                    (*setSize)++;
+                }
+            }
+            if (**scan == '}') {
+                break;
+            }
+            (*scan)++;  // skip ','
+            continue;
+        }
+        KMP_ASSERT2(**scan == ':', "bad explicit places list");
+        (*scan)++;      // skip ':'
+
+        //
+        // Read stride parameter
+        //
+        int sign = +1;
+        for (;;) {
+            SKIP_WS(*scan);
+            if (**scan == '+') {
+                (*scan)++; // skip '+'
+                continue;
+            }
+            if (**scan == '-') {
+                sign *= -1;
+                (*scan)++; // skip '-'
+                continue;
+            }
+            break;
+        }
+        SKIP_WS(*scan);
+        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
+          "bad explicit places list");
+        next = *scan;
+        SKIP_DIGITS(next);
+        stride = __kmp_str_to_int(*scan, *next);
+        KMP_ASSERT(stride >= 0);
+        *scan = next;
+        stride *= sign;
+
+        //
+        // valid follow sets are ',' and '}'
+        //
+        SKIP_WS(*scan);
+        if (**scan == '}' || **scan == ',') {
+            for (i = 0; i < count; i++) {
+                if ((start > maxOsId) ||
+                  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
+                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+                      && (__kmp_affinity_type != affinity_none))) {
+                        KMP_WARNING(AffIgnoreInvalidProcID, start);
+                    }
+                    break;  // don't proliferate warnings for large count
+                }
+                else {
+                    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
+                    start += stride;
+                    (*setSize)++;
+                }
+            }
+            if (**scan == '}') {
+                break;
+            }
+            (*scan)++;  // skip ','
+            continue;
+        }
+
+        KMP_ASSERT2(0, "bad explicit places list");
+    }
+}
+
+
+static void
+__kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
+  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
+{
+    const char *next;
+
+    //
+    // valid follow sets are '{' '!' and num
+    //
+    SKIP_WS(*scan);
+    if (**scan == '{') {
+        (*scan)++;      // skip '{'
+        __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask,
+          setSize);
+        KMP_ASSERT2(**scan == '}', "bad explicit places list");
+        (*scan)++;      // skip '}'
+    }
+    else if (**scan == '!') {
+        (*scan)++;      // skip '!'
+        __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
+        KMP_CPU_COMPLEMENT(tempMask);
+    }
+    else if ((**scan >= '0') && (**scan <= '9')) {
+        next = *scan;
+        SKIP_DIGITS(next);
+        int num = __kmp_str_to_int(*scan, *next);
+        KMP_ASSERT(num >= 0);
+        if ((num > maxOsId) ||
+          (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
+            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+              && (__kmp_affinity_type != affinity_none))) {
+                KMP_WARNING(AffIgnoreInvalidProcID, num);
+            }
+        }
+        else {
+            KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
+            (*setSize)++;
+        }
+        *scan = next;  // skip num
+    }
+    else {
+        KMP_ASSERT2(0, "bad explicit places list");
+    }
+}
+
+
+void
+__kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
+  unsigned int *out_numMasks, const char *placelist,
+  kmp_affin_mask_t *osId2Mask, int maxOsId)
+{
+    const char *scan = placelist;
+    const char *next = placelist;
+
+    numNewMasks = 2;
+    newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
+      * __kmp_affin_mask_size);
+    nextNewMask = 0;
+
+    kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
+      __kmp_affin_mask_size);
+    KMP_CPU_ZERO(tempMask);
+    int setSize = 0;
+
+    for (;;) {
+        __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
+
+        //
+        // valid follow sets are ',' ':' and EOL
+        //
+        SKIP_WS(scan);
+        if (*scan == '\0' || *scan == ',') {
+            if (setSize > 0) {
+                ADD_MASK(tempMask);
+            }
+            KMP_CPU_ZERO(tempMask);
+            setSize = 0;
+            if (*scan == '\0') {
+                break;
+            }
+            scan++;     // skip ','
+            continue;
+        }
+
+        KMP_ASSERT2(*scan == ':', "bad explicit places list");
+        scan++;         // skip ':'
+
+        //
+        // Read count parameter
+        //
+        SKIP_WS(scan);
+        KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
+          "bad explicit places list");
+        next = scan;
+        SKIP_DIGITS(next);
+        int count = __kmp_str_to_int(scan, *next);
+        KMP_ASSERT(count >= 0);
+        scan = next;
+
+        //
+        // valid follow sets are ',' ':' and EOL
+        //
+        SKIP_WS(scan);
+        int stride;
+        if (*scan == '\0' || *scan == ',') {
+            stride = +1;
+        }
+        else {
+            KMP_ASSERT2(*scan == ':', "bad explicit places list");
+            scan++;         // skip ':'
+
+            //
+            // Read stride parameter
+            //
+            int sign = +1;
+            for (;;) {
+                SKIP_WS(scan);
+                if (*scan == '+') {
+                    scan++; // skip '+'
+                    continue;
+                }
+                if (*scan == '-') {
+                    sign *= -1;
+                    scan++; // skip '-'
+                    continue;
+                }
+                break;
+            }
+            SKIP_WS(scan);
+            KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
+              "bad explicit places list");
+            next = scan;
+            SKIP_DIGITS(next);
+            stride = __kmp_str_to_int(scan, *next);
+            KMP_DEBUG_ASSERT(stride >= 0);
+            scan = next;
+            stride *= sign;
+        }
+
+        if (stride > 0) {
+            int i;
+            for (i = 0; i < count; i++) {
+                int j;
+                if (setSize == 0) {
+                    break;
+                }
+                ADD_MASK(tempMask);
+                setSize = 0;
+                for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
+                    if (! KMP_CPU_ISSET(j - stride, tempMask)) {
+                        KMP_CPU_CLR(j, tempMask);
+                    }
+                    else if ((j > maxOsId) ||
+                      (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
+                        if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
+                          && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
+                            KMP_WARNING(AffIgnoreInvalidProcID, j);
+                        }
+                        KMP_CPU_CLR(j, tempMask);
+                    }
+                    else {
+                        KMP_CPU_SET(j, tempMask);
+                        setSize++;
+                    }
+                }
+                for (; j >= 0; j--) {
+                    KMP_CPU_CLR(j, tempMask);
+                }
+            }
+        }
+        else {
+            int i;
+            for (i = 0; i < count; i++) {
+                int j;
+                if (setSize == 0) {
+                    break;
+                }
+                ADD_MASK(tempMask);
+                setSize = 0;
+                for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
+                  j++) {
+                    if (! KMP_CPU_ISSET(j - stride, tempMask)) {
+                        KMP_CPU_CLR(j, tempMask);
+                    }
+                    else if ((j > maxOsId) ||
+                      (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
+                        if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
+                          && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
+                            KMP_WARNING(AffIgnoreInvalidProcID, j);
+                        }
+                        KMP_CPU_CLR(j, tempMask);
+                    }
+                    else {
+                        KMP_CPU_SET(j, tempMask);
+                        setSize++;
+                    }
+                }
+                for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
+                    KMP_CPU_CLR(j, tempMask);
+                }
+            }
+        }
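+
+        //
+        // At this point count masks have been emitted; e.g. a place list
+        // entry "{0,1}:2:2" produces the masks {0,1} and {2,3} via the
+        // bit-shifting loops above.
+        //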
+        KMP_CPU_ZERO(tempMask);
+        setSize = 0;
+
+        //
+        // valid follow sets are ',' and EOL
+        //
+        SKIP_WS(scan);
+        if (*scan == '\0') {
+            break;
+        }
+        if (*scan == ',') {
+            scan++;     // skip ','
+            continue;
+        }
+
+        KMP_ASSERT2(0, "bad explicit places list");
+    }
+
+    *out_numMasks = nextNewMask;
+    if (nextNewMask == 0) {
+        *out_masks = NULL;
+        KMP_INTERNAL_FREE(newMasks);
+        return;
+    }
+    *out_masks
+      = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
+    KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
+    __kmp_free(tempMask);
+    KMP_INTERNAL_FREE(newMasks);
+}
+
+# endif /* OMP_40_ENABLED */
+
+#undef ADD_MASK
+#undef ADD_MASK_OSID
+
+static void
+__kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
+{
+    if ( __kmp_place_num_cores == 0 ) {
+        if ( __kmp_place_num_threads_per_core == 0 ) {
+            return;   // no core-limiting actions requested, exit
+        }
+        __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
+    }
+    if ( !__kmp_affinity_uniform_topology() ) {
+        KMP_WARNING( AffThrPlaceNonUniform );
+        return; // don't support non-uniform topology
+    }
+    if ( depth != 3 ) {
+        KMP_WARNING( AffThrPlaceNonThreeLevel );
+        return; // don't support non-3-level topology
+    }
+    if ( __kmp_place_num_threads_per_core == 0 ) {
+        __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore;  // use all HW contexts
+    }
+    if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
+        KMP_WARNING( AffThrPlaceManyCores );
+        return;
+    }
+
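+    //
+    // Hypothetical example: with __kmp_place_num_cores == 2 and
+    // __kmp_place_num_threads_per_core == 1 on a uniform package of 4
+    // cores with 2 HW threads each, the copy loop below keeps the first
+    // HW context of cores 0 and 1 (shifted by __kmp_place_core_offset)
+    // and drops the rest.
+    //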
+    AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
+                            nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
+    int i, j, k, n_old = 0, n_new = 0;
+    for ( i = 0; i < nPackages; ++i ) {
+        for ( j = 0; j < nCoresPerPkg; ++j ) {
+            if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
+                n_old += __kmp_nThreadsPerCore;   // skip not-requested core
+            } else {
+                for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
+                    if ( k < __kmp_place_num_threads_per_core ) {
+                        newAddr[n_new] = (*pAddr)[n_old];   // copy requested core's data to new location
+                        n_new++;
+                    }
+                    n_old++;
+                }
+            }
+        }
+    }
+    nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
+    __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
+    __kmp_avail_proc = n_new;                                 // correct avail_proc
+    __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores
+
+    __kmp_free( *pAddr );
+    *pAddr = newAddr;      // replace old topology with new one
+}
+
+
+static AddrUnsPair *address2os = NULL;
+static int *procarr = NULL;
+static int __kmp_aff_depth = 0;
+
+static void
+__kmp_aux_affinity_initialize(void)
+{
+    if (__kmp_affinity_masks != NULL) {
+        KMP_ASSERT(fullMask != NULL);
+        return;
+    }
+
+    //
+    // Create the "full" mask - this defines all of the processors that we
+    // consider to be in the machine model.  If respect is set, then it is
+    // the initialization thread's affinity mask.  Otherwise, it is all
+    // processors that we know about on the machine.
+    //
+    if (fullMask == NULL) {
+        fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
+    }
+    if (KMP_AFFINITY_CAPABLE()) {
+        if (__kmp_affinity_respect_mask) {
+            __kmp_get_system_affinity(fullMask, TRUE);
+
+            //
+            // Count the number of available processors.
+            //
+            unsigned i;
+            __kmp_avail_proc = 0;
+            for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
+                if (! KMP_CPU_ISSET(i, fullMask)) {
+                    continue;
+                }
+                __kmp_avail_proc++;
+            }
+            if (__kmp_avail_proc > __kmp_xproc) {
+                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+                  && (__kmp_affinity_type != affinity_none))) {
+                    KMP_WARNING(ErrorInitializeAffinity);
+                }
+                __kmp_affinity_type = affinity_none;
+                KMP_AFFINITY_DISABLE();
+                return;
+            }
+        }
+        else {
+            __kmp_affinity_entire_machine_mask(fullMask);
+            __kmp_avail_proc = __kmp_xproc;
+        }
+    }
+
+    int depth = -1;
+    kmp_i18n_id_t msg_id = kmp_i18n_null;
+
+    //
+    // For backward compatibility, setting KMP_CPUINFO_FILE =>
+    // KMP_TOPOLOGY_METHOD=cpuinfo
+    //
+    if ((__kmp_cpuinfo_file != NULL) &&
+      (__kmp_affinity_top_method == affinity_top_method_all)) {
+        __kmp_affinity_top_method = affinity_top_method_cpuinfo;
+    }
+
+    if (__kmp_affinity_top_method == affinity_top_method_all) {
+        //
+        // In the default code path, errors are not fatal - we just try using
+        // another method.  We only emit a warning message if affinity is on,
+        // or the verbose flag is set, and the nowarnings flag was not set.
+        //
+        const char *file_name = NULL;
+        int line = 0;
+
+# if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
+        }
+
+        file_name = NULL;
+        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
+        if (depth == 0) {
+            KMP_ASSERT(__kmp_affinity_type == affinity_none);
+            KMP_ASSERT(address2os == NULL);
+            return;
+        }
+
+        if (depth < 0) {
+            if (__kmp_affinity_verbose) {
+                if (msg_id != kmp_i18n_null) {
+                    KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
+                      KMP_I18N_STR(DecodingLegacyAPIC));
+                }
+                else {
+                    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
+                }
+            }
+
+            file_name = NULL;
+            depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
+            if (depth == 0) {
+                KMP_ASSERT(__kmp_affinity_type == affinity_none);
+                KMP_ASSERT(address2os == NULL);
+                return;
+            }
+        }
+
+# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+# if KMP_OS_LINUX
+
+        if (depth < 0) {
+            if (__kmp_affinity_verbose) {
+                if (msg_id != kmp_i18n_null) {
+                    KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
+                }
+                else {
+                    KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
+                }
+            }
+
+            FILE *f = fopen("/proc/cpuinfo", "r");
+            if (f == NULL) {
+                msg_id = kmp_i18n_str_CantOpenCpuinfo;
+            }
+            else {
+                file_name = "/proc/cpuinfo";
+                depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
+                fclose(f);
+                if (depth == 0) {
+                    KMP_ASSERT(__kmp_affinity_type == affinity_none);
+                    KMP_ASSERT(address2os == NULL);
+                    return;
+                }
+            }
+        }
+
+# endif /* KMP_OS_LINUX */
+
+# if KMP_GROUP_AFFINITY
+
+        if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
+            if (__kmp_affinity_verbose) {
+                KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
+            }
+
+            depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
+            KMP_ASSERT(depth != 0);
+        }
+
+# endif /* KMP_GROUP_AFFINITY */
+
+        if (depth < 0) {
+            if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
+                if (file_name == NULL) {
+                    KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
+                }
+                else if (line == 0) {
+                    KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
+                }
+                else {
+                    KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
+                }
+            }
+            // FIXME - print msg if msg_id = kmp_i18n_null ???
+
+            file_name = "";
+            depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
+            if (depth == 0) {
+                KMP_ASSERT(__kmp_affinity_type == affinity_none);
+                KMP_ASSERT(address2os == NULL);
+                return;
+            }
+            KMP_ASSERT(depth > 0);
+            KMP_ASSERT(address2os != NULL);
+        }
+    }
+
+    //
+    // If the user has specified that a particular topology discovery method
+    // is to be used, then we abort if that method fails.  The exception is
+    // group affinity, which might have been implicitly set.
+    //
+
+# if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+    else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
+              KMP_I18N_STR(Decodingx2APIC));
+        }
+
+        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
+        if (depth == 0) {
+            KMP_ASSERT(__kmp_affinity_type == affinity_none);
+            KMP_ASSERT(address2os == NULL);
+            return;
+        }
+        if (depth < 0) {
+            KMP_ASSERT(msg_id != kmp_i18n_null);
+            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
+        }
+    }
+    else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
+              KMP_I18N_STR(DecodingLegacyAPIC));
+        }
+
+        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
+        if (depth == 0) {
+            KMP_ASSERT(__kmp_affinity_type == affinity_none);
+            KMP_ASSERT(address2os == NULL);
+            return;
+        }
+        if (depth < 0) {
+            KMP_ASSERT(msg_id != kmp_i18n_null);
+            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
+        }
+    }
+
+# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+    else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
+        const char *filename;
+        if (__kmp_cpuinfo_file != NULL) {
+            filename = __kmp_cpuinfo_file;
+        }
+        else {
+            filename = "/proc/cpuinfo";
+        }
+
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
+        }
+
+        FILE *f = fopen(filename, "r");
+        if (f == NULL) {
+            int code = errno;
+            if (__kmp_cpuinfo_file != NULL) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG(CantOpenFileForReading, filename),
+                    KMP_ERR(code),
+                    KMP_HNT(NameComesFrom_CPUINFO_FILE),
+                    __kmp_msg_null
+                );
+            }
+            else {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG(CantOpenFileForReading, filename),
+                    KMP_ERR(code),
+                    __kmp_msg_null
+                );
+            }
+        }
+        int line = 0;
+        depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
+        fclose(f);
+        if (depth < 0) {
+            KMP_ASSERT(msg_id != kmp_i18n_null);
+            if (line > 0) {
+                KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
+            }
+            else {
+                KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
+            }
+        }
+        if (__kmp_affinity_type == affinity_none) {
+            KMP_ASSERT(depth == 0);
+            KMP_ASSERT(address2os == NULL);
+            return;
+        }
+    }
+
+# if KMP_GROUP_AFFINITY
+
+    else if (__kmp_affinity_top_method == affinity_top_method_group) {
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
+        }
+
+        depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
+        KMP_ASSERT(depth != 0);
+        if (depth < 0) {
+            KMP_ASSERT(msg_id != kmp_i18n_null);
+            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
+        }
+    }
+
+# endif /* KMP_GROUP_AFFINITY */
+
+    else if (__kmp_affinity_top_method == affinity_top_method_flat) {
+        if (__kmp_affinity_verbose) {
+            KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
+        }
+
+        depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
+        if (depth == 0) {
+            KMP_ASSERT(__kmp_affinity_type == affinity_none);
+            KMP_ASSERT(address2os == NULL);
+            return;
+        }
+        // should not fail
+        KMP_ASSERT(depth > 0);
+        KMP_ASSERT(address2os != NULL);
+    }
+
+    if (address2os == NULL) {
+        if (KMP_AFFINITY_CAPABLE()
+          && (__kmp_affinity_verbose || (__kmp_affinity_warnings
+          && (__kmp_affinity_type != affinity_none)))) {
+            KMP_WARNING(ErrorInitializeAffinity);
+        }
+        __kmp_affinity_type = affinity_none;
+        KMP_AFFINITY_DISABLE();
+        return;
+    }
+
+    __kmp_apply_thread_places(&address2os, depth);
+
+    //
+    // Create the table of masks, indexed by thread Id.
+    //
+    unsigned maxIndex;
+    unsigned numUnique;
+    kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
+      address2os, __kmp_avail_proc);
+    if (__kmp_affinity_gran_levels == 0) {
+        KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
+    }
+
+    //
+    // Set the childNums vector in all Address objects.  This must be done
+    // before we can sort using __kmp_affinity_cmp_Address_child_num(),
+    // which takes into account the setting of __kmp_affinity_compact.
+    //
+    __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
+
+    switch (__kmp_affinity_type) {
+
+        case affinity_explicit:
+        KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
+# if OMP_40_ENABLED
+        if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
+# endif
+        {
+            __kmp_affinity_process_proclist(&__kmp_affinity_masks,
+              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
+              maxIndex);
+        }
+# if OMP_40_ENABLED
+        else {
+            __kmp_affinity_process_placelist(&__kmp_affinity_masks,
+              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
+              maxIndex);
+        }
+# endif
+        if (__kmp_affinity_num_masks == 0) {
+            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+              && (__kmp_affinity_type != affinity_none))) {
+                KMP_WARNING(AffNoValidProcID);
+            }
+            __kmp_affinity_type = affinity_none;
+            return;
+        }
+        break;
+
+        //
+        // The other affinity types rely on sorting the Addresses according
+        // to some permutation of the machine topology tree.  Set
+        // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
+        // then jump to a common code fragment to do the sort and create
+        // the array of affinity masks.
+        //
+
+        case affinity_logical:
+        __kmp_affinity_compact = 0;
+        if (__kmp_affinity_offset) {
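+            // Note: '*' and '%' have equal precedence and associate left to
+            // right, so the statement below computes (__kmp_nThreadsPerCore *
+            // __kmp_affinity_offset) % __kmp_avail_proc.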
+            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
+              % __kmp_avail_proc;
+        }
+        goto sortAddresses;
+
+        case affinity_physical:
+        if (__kmp_nThreadsPerCore > 1) {
+            __kmp_affinity_compact = 1;
+            if (__kmp_affinity_compact >= depth) {
+                __kmp_affinity_compact = 0;
+            }
+        } else {
+            __kmp_affinity_compact = 0;
+        }
+        if (__kmp_affinity_offset) {
+            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
+              % __kmp_avail_proc;
+        }
+        goto sortAddresses;
+
+        case affinity_scatter:
+        if (__kmp_affinity_compact >= depth) {
+            __kmp_affinity_compact = 0;
+        }
+        else {
+            __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
+        }
+        goto sortAddresses;
+
+        case affinity_compact:
+        if (__kmp_affinity_compact >= depth) {
+            __kmp_affinity_compact = depth - 1;
+        }
+        goto sortAddresses;
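+        // For illustration: with depth == 3 (package, core, thread), scatter
+        // maps a requested compact level of 0 to __kmp_affinity_compact == 2
+        // while compact leaves it at 0, so the sort at sortAddresses spreads
+        // consecutive masks across packages for scatter and keeps them
+        // adjacent within a core for compact.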
+
+        case affinity_balanced:
+        // Balanced affinity works only in the single-package case
+        if( nPackages > 1 ) {
+            if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
+                KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
+            }
+            __kmp_affinity_type = affinity_none;
+            return;
+        } else if( __kmp_affinity_uniform_topology() ) {
+            break;
+        } else { // Non-uniform topology
+
+            // Save the depth for later use
+            __kmp_aff_depth = depth;
+
+            // Number of hyper-threads per core on an HT machine
+            int nth_per_core = __kmp_nThreadsPerCore;
+
+            int core_level;
+            if( nth_per_core > 1 ) {
+                core_level = depth - 2;
+            } else {
+                core_level = depth - 1;
+            }
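+            // E.g. depth == 3 (package, core, thread) with nth_per_core > 1
+            // gives core_level == 1, and depth == 2 (package, core) with one
+            // thread per core also gives core_level == 1 -- in both cases the
+            // index of the core level in the address labels.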
+            int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
+            int nproc = nth_per_core * ncores;
+
+            procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
+            for( int i = 0; i < nproc; i++ ) {
+                procarr[ i ] = -1;
+            }
+
+            for( int i = 0; i < __kmp_avail_proc; i++ ) {
+                int proc = address2os[ i ].second;
+                // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
+                // If there is only one thread per core then depth == 2: level 0 - package,
+                // level 1 - core.
+                int level = depth - 1;
+
+                // Defaults for the one-thread-per-core case (__kmp_nth_per_core == 1):
+                int thread = 0;
+                int core = address2os[ i ].first.labels[ level ];
+                // If the thread level exists (i.e., there is more than one thread context per core)
+                if( nth_per_core > 1 ) {
+                    thread = address2os[ i ].first.labels[ level ] % nth_per_core;
+                    core = address2os[ i ].first.labels[ level - 1 ];
+                }
+                procarr[ core * nth_per_core + thread ] = proc;
+            }
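+            // procarr is indexed by core * nth_per_core + thread; slots left
+            // at -1 mark thread contexts absent on a non-uniform topology,
+            // which __kmp_balanced_affinity() below relies on.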
+
+            break;
+        }
+
+        sortAddresses:
+        //
+        // Allocate the gtid->affinity mask table.
+        //
+        if (__kmp_affinity_dups) {
+            __kmp_affinity_num_masks = __kmp_avail_proc;
+        }
+        else {
+            __kmp_affinity_num_masks = numUnique;
+        }
+
+# if OMP_40_ENABLED
+        if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
+          && ( __kmp_affinity_num_places > 0 )
+          && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
+            __kmp_affinity_num_masks = __kmp_affinity_num_places;
+        }
+# endif
+
+        __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
+          __kmp_affinity_num_masks * __kmp_affin_mask_size);
+
+        //
+        // Sort the address2os table according to the current setting of
+        // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
+        //
+        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
+          __kmp_affinity_cmp_Address_child_num);
+        {
+            int i;
+            unsigned j;
+            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
+                if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
+                    continue;
+                }
+                unsigned osId = address2os[i].second;
+                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
+                kmp_affin_mask_t *dest
+                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
+                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
+                KMP_CPU_COPY(dest, src);
+                if (++j >= __kmp_affinity_num_masks) {
+                    break;
+                }
+            }
+            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
+        }
+        break;
+
+        default:
+        KMP_ASSERT2(0, "Unexpected affinity setting");
+    }
+
+    __kmp_free(osId2Mask);
+    machine_hierarchy.init(address2os, __kmp_avail_proc);
+}
+
+
+void
+__kmp_affinity_initialize(void)
+{
+    //
+    // Much of the code above was written assuming that if a machine was not
+    // affinity capable, then __kmp_affinity_type == affinity_none.  We now
+    // explicitly represent this as __kmp_affinity_type == affinity_disabled.
+    //
+    // There are too many checks for __kmp_affinity_type == affinity_none
+    // in this code.  Instead of trying to change them all, check if
+    // __kmp_affinity_type == affinity_disabled, and if so, slam it with
+    // affinity_none, call the real initialization routine, then restore
+    // __kmp_affinity_type to affinity_disabled.
+    //
+    int disabled = (__kmp_affinity_type == affinity_disabled);
+    if (! KMP_AFFINITY_CAPABLE()) {
+        KMP_ASSERT(disabled);
+    }
+    if (disabled) {
+        __kmp_affinity_type = affinity_none;
+    }
+    __kmp_aux_affinity_initialize();
+    if (disabled) {
+        __kmp_affinity_type = affinity_disabled;
+    }
+}
+
+
+void
+__kmp_affinity_uninitialize(void)
+{
+    if (__kmp_affinity_masks != NULL) {
+        __kmp_free(__kmp_affinity_masks);
+        __kmp_affinity_masks = NULL;
+    }
+    if (fullMask != NULL) {
+        KMP_CPU_FREE(fullMask);
+        fullMask = NULL;
+    }
+    __kmp_affinity_num_masks = 0;
+# if OMP_40_ENABLED
+    __kmp_affinity_num_places = 0;
+# endif
+    if (__kmp_affinity_proclist != NULL) {
+        __kmp_free(__kmp_affinity_proclist);
+        __kmp_affinity_proclist = NULL;
+    }
+    if( address2os != NULL ) {
+        __kmp_free( address2os );
+        address2os = NULL;
+    }
+    if( procarr != NULL ) {
+        __kmp_free( procarr );
+        procarr = NULL;
+    }
+}
+
+
+void
+__kmp_affinity_set_init_mask(int gtid, int isa_root)
+{
+    if (! KMP_AFFINITY_CAPABLE()) {
+        return;
+    }
+
+    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+    if (th->th.th_affin_mask == NULL) {
+        KMP_CPU_ALLOC(th->th.th_affin_mask);
+    }
+    else {
+        KMP_CPU_ZERO(th->th.th_affin_mask);
+    }
+
+    //
+    // Copy the thread mask to the kmp_info_t structure.
+    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
+    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
+    // is set, then the full mask is the same as the mask of the initialization
+    // thread.
+    //
+    kmp_affin_mask_t *mask;
+    int i;
+
+# if OMP_40_ENABLED
+    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
+# endif
+    {
+        if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)) {
+# if KMP_GROUP_AFFINITY
+            if (__kmp_num_proc_groups > 1) {
+                return;
+            }
+# endif
+            KMP_ASSERT(fullMask != NULL);
+            i = KMP_PLACE_ALL;
+            mask = fullMask;
+        }
+        else {
+            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
+            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
+            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
+        }
+    }
+# if OMP_40_ENABLED
+    else {
+        if ((! isa_root)
+          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
+#  if KMP_GROUP_AFFINITY
+            if (__kmp_num_proc_groups > 1) {
+                return;
+            }
+#  endif
+            KMP_ASSERT(fullMask != NULL);
+            i = KMP_PLACE_ALL;
+            mask = fullMask;
+        }
+        else {
+            //
+            // int i = some hash function or just a counter that doesn't
+            // always start at 0.  Use gtid for now.
+            //
+            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
+            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
+            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
+        }
+    }
+# endif
+
+# if OMP_40_ENABLED
+    th->th.th_current_place = i;
+    if (isa_root) {
+        th->th.th_new_place = i;
+        th->th.th_first_place = 0;
+        th->th.th_last_place = __kmp_affinity_num_masks - 1;
+    }
+
+    if (i == KMP_PLACE_ALL) {
+        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
+          gtid));
+    }
+    else {
+        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
+          gtid, i));
+    }
+# else
+    if (i == -1) {
+        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
+          gtid));
+    }
+    else {
+        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
+          gtid, i));
+    }
+# endif /* OMP_40_ENABLED */
+
+    KMP_CPU_COPY(th->th.th_affin_mask, mask);
+
+    if (__kmp_affinity_verbose) {
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+          th->th.th_affin_mask);
+        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
+          buf);
+    }
+
+# if KMP_OS_WINDOWS
+    //
+    // On Windows* OS, the process affinity mask might have changed.
+    // If the user didn't request affinity and this call fails,
+    // just continue silently.  See CQ171393.
+    //
+    if ( __kmp_affinity_type == affinity_none ) {
+        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
+    }
+    else
+# endif
+    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
+}
+
+
+# if OMP_40_ENABLED
+
+void
+__kmp_affinity_set_place(int gtid)
+{
+    int retval;
+
+    if (! KMP_AFFINITY_CAPABLE()) {
+        return;
+    }
+
+    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+
+    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
+      gtid, th->th.th_new_place, th->th.th_current_place));
+
+    //
+    // Check that the new place is within this thread's partition.
+    //
+    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
+    KMP_ASSERT(th->th.th_new_place >= 0);
+    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
+    if (th->th.th_first_place <= th->th.th_last_place) {
+        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
+         && (th->th.th_new_place <= th->th.th_last_place));
+    }
+    else {
+        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
+         || (th->th.th_new_place >= th->th.th_last_place));
+    }
+
+    //
+    // Copy the thread mask to the kmp_info_t structure,
+    // and set this thread's affinity.
+    //
+    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
+      th->th.th_new_place);
+    KMP_CPU_COPY(th->th.th_affin_mask, mask);
+    th->th.th_current_place = th->th.th_new_place;
+
+    if (__kmp_affinity_verbose) {
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+          th->th.th_affin_mask);
+        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
+          gtid, buf);
+    }
+    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
+}
+
+# endif /* OMP_40_ENABLED */
+
+
+int
+__kmp_aux_set_affinity(void **mask)
+{
+    int gtid;
+    kmp_info_t *th;
+    int retval;
+
+    if (! KMP_AFFINITY_CAPABLE()) {
+        return -1;
+    }
+
+    gtid = __kmp_entry_gtid();
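+    // The ';{ ... }' argument form below lets KA_TRACE guard a whole compound
+    // statement (local print buffer plus print call) rather than the usual
+    // printf-style argument list.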
+    KA_TRACE(1000, ;{
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+          (kmp_affin_mask_t *)(*mask));
+        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
+          gtid, buf);
+    });
+
+    if (__kmp_env_consistency_check) {
+        if ((mask == NULL) || (*mask == NULL)) {
+            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+        }
+        else {
+            unsigned proc;
+            int num_procs = 0;
+
+            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
+                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
+                    continue;
+                }
+                num_procs++;
+                if (! KMP_CPU_ISSET(proc, fullMask)) {
+                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+                    break;
+                }
+            }
+            if (num_procs == 0) {
+                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+            }
+
+# if KMP_GROUP_AFFINITY
+            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
+                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+            }
+# endif /* KMP_GROUP_AFFINITY */
+
+        }
+    }
+
+    th = __kmp_threads[gtid];
+    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
+    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
+    if (retval == 0) {
+        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
+    }
+
+# if OMP_40_ENABLED
+    th->th.th_current_place = KMP_PLACE_UNDEFINED;
+    th->th.th_new_place = KMP_PLACE_UNDEFINED;
+    th->th.th_first_place = 0;
+    th->th.th_last_place = __kmp_affinity_num_masks - 1;
+
+    //
+    // Turn off 4.0 affinity for the current thread at this parallel level.
+    //
+    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
+# endif
+
+    return retval;
+}
+
+
+int
+__kmp_aux_get_affinity(void **mask)
+{
+    int gtid;
+    int retval;
+    kmp_info_t *th;
+
+    if (! KMP_AFFINITY_CAPABLE()) {
+        return -1;
+    }
+
+    gtid = __kmp_entry_gtid();
+    th = __kmp_threads[gtid];
+    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
+
+    KA_TRACE(1000, ;{
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+          th->th.th_affin_mask);
+        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
+    });
+
+    if (__kmp_env_consistency_check) {
+        if ((mask == NULL) || (*mask == NULL)) {
+            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
+        }
+    }
+
+# if !KMP_OS_WINDOWS
+
+    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
+    KA_TRACE(1000, ;{
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+          (kmp_affin_mask_t *)(*mask));
+        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
+    });
+    return retval;
+
+# else
+
+    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
+    return 0;
+
+# endif /* KMP_OS_WINDOWS */
+
+}
+
+int
+__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
+{
+    int retval;
+
+    if (! KMP_AFFINITY_CAPABLE()) {
+        return -1;
+    }
+
+    KA_TRACE(1000, ;{
+        int gtid = __kmp_entry_gtid();
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+          (kmp_affin_mask_t *)(*mask));
+        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
+          proc, gtid, buf);
+    });
+
+    if (__kmp_env_consistency_check) {
+        if ((mask == NULL) || (*mask == NULL)) {
+            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
+        }
+    }
+
+    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
+        return -1;
+    }
+    if (! KMP_CPU_ISSET(proc, fullMask)) {
+        return -2;
+    }
+
+    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
+    return 0;
+}
+
+
+int
+__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
+{
+    int retval;
+
+    if (! KMP_AFFINITY_CAPABLE()) {
+        return -1;
+    }
+
+    KA_TRACE(1000, ;{
+        int gtid = __kmp_entry_gtid();
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+          (kmp_affin_mask_t *)(*mask));
+        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
+          proc, gtid, buf);
+    });
+
+    if (__kmp_env_consistency_check) {
+        if ((mask == NULL) || (*mask == NULL)) {
+            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
+        }
+    }
+
+    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
+        return -1;
+    }
+    if (! KMP_CPU_ISSET(proc, fullMask)) {
+        return -2;
+    }
+
+    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
+    return 0;
+}
+
+
+int
+__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
+{
+    int retval;
+
+    if (! KMP_AFFINITY_CAPABLE()) {
+        return -1;
+    }
+
+    KA_TRACE(1000, ;{
+        int gtid = __kmp_entry_gtid();
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+          (kmp_affin_mask_t *)(*mask));
+        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
+          proc, gtid, buf);
+    });
+
+    if (__kmp_env_consistency_check) {
+        if ((mask == NULL) || (*mask == NULL)) {
+            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
+        }
+    }
+
+    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
+        return 0;
+    }
+    if (! KMP_CPU_ISSET(proc, fullMask)) {
+        return 0;
+    }
+
+    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
+}
+
+
+// Dynamic affinity settings - Affinity balanced
+void __kmp_balanced_affinity( int tid, int nthreads )
+{
+    if( __kmp_affinity_uniform_topology() ) {
+        int coreID;
+        int threadID;
+        // Number of hyper-threads per core on an HT machine
+        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
+        // Number of cores
+        int ncores = __kmp_ncores;
+        // How many threads will be bound to each core
+        int chunk = nthreads / ncores;
+        // How many cores will have an additional thread bound to them ("big cores")
+        int big_cores = nthreads % ncores;
+        // Number of threads on the big cores
+        int big_nth = ( chunk + 1 ) * big_cores;
+        if( tid < big_nth ) {
+            coreID = tid / (chunk + 1 );
+            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
+        } else { //tid >= big_nth
+            coreID = ( tid - big_cores ) / chunk;
+            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
+        }
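+        // Worked example (hypothetical numbers): nthreads == 10, ncores == 4
+        // and __kmp_nth_per_core == 2 give chunk == 2, big_cores == 2 and
+        // big_nth == 6, so tids 0-5 land three per core on cores 0 and 1
+        // (threadIDs 0,1,0) and tids 6-9 land two per core on cores 2 and 3.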
+
+        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
+          "Illegal set affinity operation when not capable");
+
+        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+        KMP_CPU_ZERO(mask);
+
+        // Granularity == thread
+        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
+            KMP_CPU_SET( osID, mask);
+        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
+            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
+                int osID;
+                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
+                KMP_CPU_SET( osID, mask);
+            }
+        }
+        if (__kmp_affinity_verbose) {
+            char buf[KMP_AFFIN_MASK_PRINT_LEN];
+            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
+            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
+              tid, buf);
+        }
+        __kmp_set_system_affinity( mask, TRUE );
+    } else { // Non-uniform topology
+
+        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+        KMP_CPU_ZERO(mask);
+
+        // Number of hyper-threads per core on an HT machine
+        int nth_per_core = __kmp_nThreadsPerCore;
+        int core_level;
+        if( nth_per_core > 1 ) {
+            core_level = __kmp_aff_depth - 2;
+        } else {
+            core_level = __kmp_aff_depth - 1;
+        }
+
+        // Number of cores - maximum value; trailing cores with 0 available processors are not counted
+        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
+
+        // As an optimization, handle the special case nthreads == __kmp_avail_proc separately
+        if( nthreads == __kmp_avail_proc ) {
+            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+                int osID = address2os[ tid ].second;
+                KMP_CPU_SET( osID, mask);
+            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
+                int coreID = address2os[ tid ].first.labels[ core_level ];
+                // Count the osIDs found for the current core; there can be at
+                // most nth_per_core of them. Since address2os is sorted, we
+                // can stop once cnt == nth_per_core.
+                int cnt = 0;
+                for( int i = 0; i < __kmp_avail_proc; i++ ) {
+                    int osID = address2os[ i ].second;
+                    int core = address2os[ i ].first.labels[ core_level ];
+                    if( core == coreID ) {
+                        KMP_CPU_SET( osID, mask);
+                        cnt++;
+                        if( cnt == nth_per_core ) {
+                            break;
+                        }
+                    }
+                }
+            }
+        } else if( nthreads <= __kmp_ncores ) {
+
+            int core = 0;
+            for( int i = 0; i < ncores; i++ ) {
+                // Check if this core from procarr[] is in the mask
+                int in_mask = 0;
+                for( int j = 0; j < nth_per_core; j++ ) {
+                    if( procarr[ i * nth_per_core + j ] != - 1 ) {
+                        in_mask = 1;
+                        break;
+                    }
+                }
+                if( in_mask ) {
+                    if( tid == core ) {
+                        for( int j = 0; j < nth_per_core; j++ ) {
+                            int osID = procarr[ i * nth_per_core + j ];
+                            if( osID != -1 ) {
+                                KMP_CPU_SET( osID, mask );
+                                // For granularity=thread it is enough to set the first available osID for this core
+                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+                                    break;
+                                }
+                            }
+                        }
+                        break;
+                    } else {
+                        core++;
+                    }
+                }
+            }
+
+        } else { // nthreads > __kmp_ncores
+
+            // Array to save the number of processors at each core
+            int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
+            // Array to save the number of cores with "x" available processors;
+            int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
+            // Array to save the number of cores with # procs from x to nth_per_core
+            int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
+
+            for( int i = 0; i <= nth_per_core; i++ ) {
+                ncores_with_x_procs[ i ] = 0;
+                ncores_with_x_to_max_procs[ i ] = 0;
+            }
+
+            for( int i = 0; i < ncores; i++ ) {
+                int cnt = 0;
+                for( int j = 0; j < nth_per_core; j++ ) {
+                    if( procarr[ i * nth_per_core + j ] != -1 ) {
+                        cnt++;
+                    }
+                }
+                nproc_at_core[ i ] = cnt;
+                ncores_with_x_procs[ cnt ]++;
+            }
+
+            for( int i = 0; i <= nth_per_core; i++ ) {
+                for( int j = i; j <= nth_per_core; j++ ) {
+                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
+                }
+            }
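+            // E.g. (hypothetical) nth_per_core == 2 and three cores with
+            // {2, 1, 2} available procs give nproc_at_core == {2, 1, 2},
+            // ncores_with_x_procs == {0, 1, 2} and
+            // ncores_with_x_to_max_procs == {3, 3, 2}.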
+
+            // Max number of processors
+            int nproc = nth_per_core * ncores;
+            // An array to keep the number of threads for each context
+            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
+            for( int i = 0; i < nproc; i++ ) {
+                newarr[ i ] = 0;
+            }
+
+            int nth = nthreads;
+            int flag = 0;
+            while( nth > 0 ) {
+                for( int j = 1; j <= nth_per_core; j++ ) {
+                    int cnt = ncores_with_x_to_max_procs[ j ];
+                    for( int i = 0; i < ncores; i++ ) {
+                        // Skip cores with 0 processors
+                        if( nproc_at_core[ i ] == 0 ) {
+                            continue;
+                        }
+                        for( int k = 0; k < nth_per_core; k++ ) {
+                            if( procarr[ i * nth_per_core + k ] != -1 ) {
+                                if( newarr[ i * nth_per_core + k ] == 0 ) {
+                                    newarr[ i * nth_per_core + k ] = 1;
+                                    cnt--;
+                                    nth--;
+                                    break;
+                                } else {
+                                    if( flag != 0 ) {
+                                        newarr[ i * nth_per_core + k ] ++;
+                                        cnt--;
+                                        nth--;
+                                        break;
+                                    }
+                                }
+                            }
+                        }
+                        if( cnt == 0 || nth == 0 ) {
+                            break;
+                        }
+                    }
+                    if( nth == 0 ) {
+                        break;
+                    }
+                }
+                flag = 1;
+            }
+            int sum = 0;
+            for( int i = 0; i < nproc; i++ ) {
+                sum += newarr[ i ];
+                if( sum > tid ) {
+                    // Granularity == thread
+                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
+                        int osID = procarr[ i ];
+                        KMP_CPU_SET( osID, mask);
+                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
+                        int coreID = i / nth_per_core;
+                        for( int ii = 0; ii < nth_per_core; ii++ ) {
+                            int osID = procarr[ coreID * nth_per_core + ii ];
+                            if( osID != -1 ) {
+                                KMP_CPU_SET( osID, mask);
+                            }
+                        }
+                    }
+                    break;
+                }
+            }
+            __kmp_free( newarr );
+        }
+
+        if (__kmp_affinity_verbose) {
+            char buf[KMP_AFFIN_MASK_PRINT_LEN];
+            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
+            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
+              tid, buf);
+        }
+        __kmp_set_system_affinity( mask, TRUE );
+    }
+}
+
+#else
+    // affinity not supported
+
+static const kmp_uint32 noaff_maxLevels=7;
+kmp_uint32 noaff_skipPerLevel[noaff_maxLevels];
+kmp_uint32 noaff_depth;
+kmp_uint8 noaff_leaf_kids;
+kmp_int8 noaff_uninitialized=1;
+
+void noaff_init(int nprocs)
+{
+    kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2);
+    if (result == 0) return; // Already initialized
+    else if (result == 2) { // Someone else is initializing
+        while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE();
+        return;
+    }
+    KMP_DEBUG_ASSERT(result==1);
+
+    kmp_uint32 numPerLevel[noaff_maxLevels];
+    noaff_depth = 1;
+    for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
+        numPerLevel[i] = 1;
+        noaff_skipPerLevel[i] = 1;
+    }
+
+    numPerLevel[0] = 4;
+    numPerLevel[1] = nprocs/4;
+    if (nprocs%4) numPerLevel[1]++;
+
+    for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth
+        if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1'
+            noaff_depth++;
+
+    kmp_uint32 branch = 4;
+    if (numPerLevel[0] == 1) branch = nprocs/4;
+    if (branch<4) branch=4;
+    for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width
+        while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
+            if (numPerLevel[d] & 1) numPerLevel[d]++;
+            numPerLevel[d] = numPerLevel[d] >> 1;
+            if (numPerLevel[d+1] == 1) noaff_depth++;
+            numPerLevel[d+1] = numPerLevel[d+1] << 1;
+        }
+        if(numPerLevel[0] == 1) {
+            branch = branch >> 1;
+            if (branch<4) branch = 4;
+        }
+    }
+
+    for (kmp_uint32 i=1; i<noaff_depth; ++i)
+        noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1];
+    // Fill in hierarchy in the case of oversubscription
+    for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i)
+        noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1];
+    noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
+    noaff_uninitialized = 0; // One writer
+
+}
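+// For illustration, noaff_init(8) yields numPerLevel == {4, 2, 1, ...},
+// noaff_depth == 3, noaff_skipPerLevel == {1, 4, 8, 16, 32, 64, 128} and
+// noaff_leaf_kids == 3.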
+
+void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
+    if (noaff_uninitialized)
+        noaff_init(nproc);
+
+    thr_bar->depth = noaff_depth;
+    thr_bar->base_leaf_kids = noaff_leaf_kids;
+    thr_bar->skip_per_level = noaff_skipPerLevel;
+}
+
+#endif // KMP_AFFINITY_SUPPORTED
diff --git a/final/runtime/src/kmp_alloc.c b/final/runtime/src/kmp_alloc.c
new file mode 100644
index 0000000..c9c57f3
--- /dev/null
+++ b/final/runtime/src/kmp_alloc.c
@@ -0,0 +1,2048 @@
+/*
+ * kmp_alloc.c -- private/shared dynamic memory allocation and management
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_wrapper_malloc.h"
+#include "kmp_io.h"
+
+// Disable bget when it is not used
+#if KMP_USE_BGET
+
+/* Thread private buffer management code */
+
+typedef int   (*bget_compact_t)(size_t, int);
+typedef void *(*bget_acquire_t)(size_t);
+typedef void  (*bget_release_t)(void *);
+
+/* NOTE: bufsize must be a signed datatype */
+
+#if KMP_OS_WINDOWS
+# if KMP_ARCH_X86 || KMP_ARCH_ARM
+   typedef kmp_int32 bufsize;
+# else
+   typedef kmp_int64 bufsize;
+# endif
+#else
+  typedef ssize_t bufsize;
+#endif
+
+/* The three modes of operation are fifo search, lifo search, and best-fit */
+
+typedef enum bget_mode {
+    bget_mode_fifo = 0,
+    bget_mode_lifo = 1,
+    bget_mode_best = 2
+} bget_mode_t;
+
+
+static void    bpool( kmp_info_t *th, void *buffer, bufsize len);
+static void   *bget( kmp_info_t *th, bufsize size);
+static void   *bgetz( kmp_info_t *th, bufsize size);
+static void   *bgetr( kmp_info_t *th, void *buffer, bufsize newsize);
+static void    brel( kmp_info_t *th, void *buf);
+static void    bectl(  kmp_info_t *th, bget_compact_t compact, bget_acquire_t acquire, bget_release_t release, bufsize pool_incr );
+
+#ifdef KMP_DEBUG
+static void    bstats( kmp_info_t *th, bufsize *curalloc, bufsize *totfree, bufsize *maxfree, long *nget, long *nrel);
+static void    bstatse( kmp_info_t *th, bufsize *pool_incr, long *npool, long *npget, long *nprel, long *ndget, long *ndrel);
+static void    bufdump( kmp_info_t *th, void *buf);
+static void    bpoold( kmp_info_t *th, void *pool, int dumpalloc, int dumpfree);
+static int     bpoolv( kmp_info_t *th, void *pool);
+#endif
+
+/* BGET CONFIGURATION */
+                                      /* Buffer allocation size quantum:
+                                         all buffers allocated are a
+                                         multiple of this size.  This
+                                         MUST be a power of two. */
+
+                                      /* On IA-32 architecture with Linux* OS,
+                                         malloc() does not
+                                         ensure 16 byte alignment */
+
+#if KMP_ARCH_X86 || !KMP_HAVE_QUAD
+
+#define SizeQuant   8
+#define AlignType   double
+
+#else
+
+#define SizeQuant   16
+#define AlignType   _Quad
+
+#endif
+
+#define BufStats    1                 /* Define this symbol to enable the
+                                         bstats() function which calculates
+                                         the total free space in the buffer
+                                         pool, the largest available
+                                         buffer, and the total space
+                                         currently allocated. */
+
+#ifdef KMP_DEBUG
+
+#define BufDump     1                 /* Define this symbol to enable the
+                                         bpoold() function which dumps the
+                                         buffers in a buffer pool. */
+
+#define BufValid    1                 /* Define this symbol to enable the
+                                         bpoolv() function for validating
+                                         a buffer pool. */
+
+#define DumpData    1                 /* Define this symbol to enable the
+                                         bufdump() function which allows
+                                         dumping the contents of an allocated
+                                         or free buffer. */
+#ifdef NOT_USED_NOW
+
+#define FreeWipe    1                 /* Wipe free buffers to a guaranteed
+                                         pattern of garbage to trip up
+                                         miscreants who attempt to use
+                                         pointers into released buffers. */
+
+#define BestFit     1                 /* Use a best fit algorithm when
+                                         searching for space for an
+                                         allocation request.  This uses
+                                         memory more efficiently, but
+                                         allocation will be much slower. */
+#endif /* NOT_USED_NOW */
+#endif /* KMP_DEBUG */
+
+
+static bufsize bget_bin_size[ ] = {
+    0,
+//    1 << 6,    /* .5 Cache line */
+    1 << 7,    /* 1 Cache line, new */
+    1 << 8,    /* 2 Cache lines */
+    1 << 9,    /* 4 Cache lines, new */
+    1 << 10,   /* 8 Cache lines */
+    1 << 11,   /* 16 Cache lines, new */
+    1 << 12,
+    1 << 13,   /* new */
+    1 << 14,
+    1 << 15,   /* new */
+    1 << 16,
+    1 << 17,
+    1 << 18,
+    1 << 19,
+    1 << 20,    /*  1MB */
+    1 << 21,    /*  2MB */
+    1 << 22,    /*  4MB */
+    1 << 23,    /*  8MB */
+    1 << 24,    /* 16MB */
+    1 << 25,    /* 32MB */
+};
+
+#define MAX_BGET_BINS   (int)(sizeof(bget_bin_size) / sizeof(bufsize))
+
+struct bfhead;
+
+/*  Declare the interface, including the requested buffer size type,
+    bufsize.  */
+
+/* Queue links */
+
+typedef struct qlinks {
+    struct bfhead *flink;             /* Forward link */
+    struct bfhead *blink;             /* Backward link */
+} qlinks_t;
+
+/* Header in allocated and free buffers */
+
+typedef struct bhead2 {
+    kmp_info_t *bthr;                 /* The thread which owns the buffer pool */
+    bufsize     prevfree;             /* Relative link back to previous
+                                         free buffer in memory or 0 if
+                                         previous buffer is allocated.  */
+    bufsize     bsize;                /* Buffer size: positive if free,
+                                         negative if allocated. */
+} bhead2_t;
+
+/* Make sure the bhead structure is a multiple of SizeQuant in size. */
+
+typedef union bhead {
+    KMP_ALIGN( SizeQuant )
+    AlignType           b_align;
+    char                b_pad[ sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant)) ];
+    bhead2_t            bb;
+} bhead_t;
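+/* Note: if sizeof(bhead2_t) is already a multiple of SizeQuant, b_pad adds a
+   full extra quantum of padding; the union is SizeQuant-aligned either way. */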
+#define BH(p)   ((bhead_t *) (p))
+
+/*  Header in directly allocated buffers (by acqfcn) */
+
+typedef struct bdhead
+{
+    bufsize tsize;                    /* Total size, including overhead */
+    bhead_t bh;                       /* Common header */
+} bdhead_t;
+#define BDH(p)  ((bdhead_t *) (p))
+
+/* Header in free buffers */
+
+typedef struct bfhead {
+    bhead_t  bh;                      /* Common allocated/free header */
+    qlinks_t ql;                      /* Links on free list */
+} bfhead_t;
+#define BFH(p)  ((bfhead_t *) (p))
+
+typedef struct thr_data {
+    bfhead_t freelist[ MAX_BGET_BINS ];
+#if BufStats
+    size_t totalloc;               /* Total space currently allocated */
+    long numget, numrel;           /* Number of bget() and brel() calls */
+    long numpblk;                  /* Number of pool blocks */
+    long numpget, numprel;         /* Number of block gets and rels */
+    long numdget, numdrel;         /* Number of direct gets and rels */
+#endif /* BufStats */
+
+    /* Automatic expansion block management functions */
+    bget_compact_t compfcn;
+    bget_acquire_t acqfcn;
+    bget_release_t relfcn;
+
+    bget_mode_t    mode;              /* what allocation mode to use? */
+
+    bufsize exp_incr;                 /* Expansion block size */
+    bufsize pool_len;                 /* 0: no bpool calls have been made
+                                         -1: not all pool blocks are
+                                             the same size
+                                         >0: (common) block size for all
+                                             bpool calls made so far
+                                      */
+    bfhead_t * last_pool;             /* Last pool owned by this thread (delayed deallocation) */
+} thr_data_t;
+
+/*  Minimum allocation quantum: */
+
+#define QLSize  (sizeof(qlinks_t))
+#define SizeQ   ((SizeQuant > QLSize) ? SizeQuant : QLSize)
+#define MaxSize (bufsize)( ~ ( ( (bufsize)( 1 ) << ( sizeof( bufsize ) * CHAR_BIT - 1 ) ) | ( SizeQuant - 1 ) ) )
+    // Maximum for the requested size: the largest positive bufsize that is a multiple of SizeQuant.
+
+/* End sentinel: value placed in bsize field of dummy block delimiting
+   end of pool block.  The most negative number which will  fit  in  a
+   bufsize, defined in a way that the compiler will accept. */
+
+#define ESent   ((bufsize) (-(((((bufsize)1)<<((int)sizeof(bufsize)*8-2))-1)*2)-2))
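+/* For a 32-bit bufsize this evaluates to -2147483648: starting from
+   (1 << 30) - 1 == 0x3FFFFFFF, doubling it, negating, and subtracting 2
+   reaches the minimum value with no intermediate overflow. */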
+
+/* ------------------------------------------------------------------------ */
+
+/* Thread Data management routines */
+
+static int
+bget_get_bin( bufsize size )
+{
+    // binary chop bins
+    int lo = 0, hi = MAX_BGET_BINS - 1;
+
+    KMP_DEBUG_ASSERT( size > 0 );
+
+    while ( (hi - lo) > 1 ) {
+        int mid = (lo + hi) >> 1;
+        if (size < bget_bin_size[ mid ])
+            hi = mid - 1;
+        else
+            lo = mid;
+    }
+
+    KMP_DEBUG_ASSERT( (lo >= 0) && (lo < MAX_BGET_BINS) );
+
+    return lo;
+}
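+// Illustration: for a 3000-byte request the chop narrows (lo, hi) from
+// (0, 19) to (0, 8), (4, 8) and finally (4, 5), returning bin 4 (1 << 10).
+// Because 'hi = mid - 1' can overshoot, the result may be one bin below the
+// tightest fit; that is harmless, since bget() scans upward from this bin.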
+
+static void
+set_thr_data( kmp_info_t *th )
+{
+    int i;
+    thr_data_t *data;
+
+    data =
+        (thr_data_t *)(
+            ( ! th->th.th_local.bget_data ) ? __kmp_allocate( sizeof( *data ) ) : th->th.th_local.bget_data
+        );
+
+    memset( data, '\0', sizeof( *data ) );
+
+    for (i = 0; i < MAX_BGET_BINS; ++i) {
+        data->freelist[ i ].ql.flink = & data->freelist[ i ];
+        data->freelist[ i ].ql.blink = & data->freelist[ i ];
+    }
+
+    th->th.th_local.bget_data = data;
+    th->th.th_local.bget_list = 0;
+#if ! USE_CMP_XCHG_FOR_BGET
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+    __kmp_init_lock( & th->th.th_local.bget_lock );
+#else
+    __kmp_init_bootstrap_lock( & th->th.th_local.bget_lock );
+#endif /* USE_QUEUING_LOCK_FOR_BGET */
+#endif /* ! USE_CMP_XCHG_FOR_BGET */
+}
+
+static thr_data_t *
+get_thr_data( kmp_info_t *th )
+{
+    thr_data_t *data;
+
+    data = (thr_data_t *) th->th.th_local.bget_data;
+
+    KMP_DEBUG_ASSERT( data != 0 );
+
+    return data;
+}
+
+
+#ifdef KMP_DEBUG
+
+static void
+__kmp_bget_validate_queue( kmp_info_t *th )
+{
+    /* NOTE: assume that the global_lock is held */
+
+    void *p = (void *) th->th.th_local.bget_list;
+
+    while (p != 0) {
+        bfhead_t *b = BFH(((char *) p) - sizeof(bhead_t));
+
+        KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
+        p = (void *) b->ql.flink;
+    }
+}
+
+#endif
+
+/* Walk the free list and release the enqueued buffers */
+
+static void
+__kmp_bget_dequeue( kmp_info_t *th )
+{
+    void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);
+
+    if (p != 0) {
+        #if USE_CMP_XCHG_FOR_BGET
+            {
+                volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
+                while ( ! KMP_COMPARE_AND_STORE_PTR(
+                    & th->th.th_local.bget_list, old_value, NULL ) )
+                {
+                    KMP_CPU_PAUSE();
+                    old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
+                }
+                p = (void *) old_value;
+            }
+        #else /* ! USE_CMP_XCHG_FOR_BGET */
+            #ifdef USE_QUEUING_LOCK_FOR_BGET
+                __kmp_acquire_lock( & th->th.th_local.bget_lock,
+                                    __kmp_gtid_from_thread(th) );
+            #else
+                __kmp_acquire_bootstrap_lock( & th->th.th_local.bget_lock );
+            #endif /* USE_QUEUING_LOCK_FOR_BGET */
+
+             p = (void *) th->th.th_local.bget_list;
+             th->th.th_local.bget_list = 0;
+
+            #ifdef USE_QUEUING_LOCK_FOR_BGET
+                __kmp_release_lock( & th->th.th_local.bget_lock,
+                                    __kmp_gtid_from_thread(th) );
+            #else
+                __kmp_release_bootstrap_lock( & th->th.th_local.bget_lock );
+            #endif
+        #endif /* USE_CMP_XCHG_FOR_BGET */
+
+        /* Check again to make sure the list is not empty */
+
+        while (p != 0) {
+            void *buf = p;
+            bfhead_t *b = BFH(((char *) p) - sizeof(bhead_t));
+
+            KMP_DEBUG_ASSERT( b->bh.bb.bsize != 0 );
+            KMP_DEBUG_ASSERT( ( (kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1 ) ==
+                                (kmp_uintptr_t)th ); // clear possible mark
+            KMP_DEBUG_ASSERT( b->ql.blink == 0 );
+
+            p = (void *) b->ql.flink;
+
+            brel( th, buf );
+        }
+    }
+}
+
+/* Chain together the free buffers by using the thread owner field */
+
+static void
+__kmp_bget_enqueue( kmp_info_t *th, void *buf
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+                    , kmp_int32 rel_gtid
+#endif
+                  )
+{
+    bfhead_t *b = BFH(((char *) buf) - sizeof(bhead_t));
+
+    KMP_DEBUG_ASSERT( b->bh.bb.bsize != 0 );
+    KMP_DEBUG_ASSERT( ( (kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1 ) ==
+                        (kmp_uintptr_t)th ); // clear possible mark
+
+    b->ql.blink = 0;
+
+    KC_TRACE( 10, ( "__kmp_bget_enqueue: moving buffer to T#%d list\n",
+                    __kmp_gtid_from_thread( th ) ) );
+
+#if USE_CMP_XCHG_FOR_BGET
+    {
+        volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);
+        /* the next pointer must be set before setting bget_list to buf to avoid
+           exposing a broken list to other threads, even for an instant. */
+        b->ql.flink = BFH( old_value );
+
+        while ( ! KMP_COMPARE_AND_STORE_PTR(
+            & th->th.th_local.bget_list, old_value, buf ) )
+        {
+            KMP_CPU_PAUSE();
+            old_value = TCR_PTR(th->th.th_local.bget_list);
+            /* the next pointer must be set before setting bget_list to buf to avoid
+               exposing a broken list to other threads, even for an instant. */
+            b->ql.flink = BFH( old_value );
+        }
+    }
+#else /* ! USE_CMP_XCHG_FOR_BGET */
+# ifdef USE_QUEUING_LOCK_FOR_BGET
+    __kmp_acquire_lock( & th->th.th_local.bget_lock, rel_gtid );
+# else
+    __kmp_acquire_bootstrap_lock( & th->th.th_local.bget_lock );
+# endif
+
+    b->ql.flink = BFH( th->th.th_local.bget_list );
+    th->th.th_local.bget_list = (void *) buf;
+
+# ifdef USE_QUEUING_LOCK_FOR_BGET
+    __kmp_release_lock( & th->th.th_local.bget_lock, rel_gtid );
+# else
+    __kmp_release_bootstrap_lock( & th->th.th_local.bget_lock );
+# endif
+#endif /* USE_CMP_XCHG_FOR_BGET */
+}
+
+/* insert buffer back onto a new freelist */
+
+static void
+__kmp_bget_insert_into_freelist( thr_data_t *thr, bfhead_t *b )
+{
+    int bin;
+
+    KMP_DEBUG_ASSERT( ((size_t)b ) % SizeQuant == 0 );
+    KMP_DEBUG_ASSERT( b->bh.bb.bsize % SizeQuant == 0 );
+
+    bin = bget_get_bin( b->bh.bb.bsize );
+
+    KMP_DEBUG_ASSERT(thr->freelist[ bin ].ql.blink->ql.flink == &thr->freelist[ bin ]);
+    KMP_DEBUG_ASSERT(thr->freelist[ bin ].ql.flink->ql.blink == &thr->freelist[ bin ]);
+
+    b->ql.flink = &thr->freelist[ bin ];
+    b->ql.blink = thr->freelist[ bin ].ql.blink;
+
+    thr->freelist[ bin ].ql.blink = b;
+    b->ql.blink->ql.flink = b;
+}
+
+/* unlink the buffer from the old freelist */
+
+static void
+__kmp_bget_remove_from_freelist( bfhead_t *b )
+{
+    KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
+    KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
+
+    b->ql.blink->ql.flink = b->ql.flink;
+    b->ql.flink->ql.blink = b->ql.blink;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/*  GET STATS -- check info on free list */
+
+static void
+bcheck(  kmp_info_t *th, bufsize *max_free, bufsize *total_free )
+{
+    thr_data_t *thr = get_thr_data( th );
+    int bin;
+
+    *total_free = *max_free = 0;
+
+    for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
+        bfhead_t *b, *best;
+
+        best = &thr->freelist[ bin ];
+        b = best->ql.flink;
+
+        while (b != &thr->freelist[ bin ]) {
+            *total_free += (b->bh.bb.bsize - sizeof( bhead_t ));
+            if ((best == &thr->freelist[ bin ]) || (b->bh.bb.bsize < best->bh.bb.bsize))
+                best = b;
+
+            /* Link to next buffer */
+            b = b->ql.flink;
+        }
+
+        if (*max_free < best->bh.bb.bsize)
+            *max_free = best->bh.bb.bsize;
+    }
+
+    if (*max_free > (bufsize)sizeof( bhead_t ))
+        *max_free -= sizeof( bhead_t );
+}
+
+/* ------------------------------------------------------------------------ */
+
+/*  BGET  --  Allocate a buffer.  */
+
+static void *
+bget(  kmp_info_t *th, bufsize requested_size )
+{
+    thr_data_t *thr = get_thr_data( th );
+    bufsize size = requested_size;
+    bfhead_t *b;
+    void *buf;
+    int compactseq = 0;
+    int use_blink = 0;
+/* For BestFit */
+    bfhead_t *best;
+
+    if ( size < 0 || size + sizeof( bhead_t ) > MaxSize ) {
+        return NULL;
+    }; // if
+
+    __kmp_bget_dequeue( th );         /* Release any queued buffers */
+
+    if (size < (bufsize)SizeQ) {      /* Need at least room for the */
+        size = SizeQ;                 /*    queue links.  */
+    }
+    #if defined( SizeQuant ) && ( SizeQuant > 1 )
+        size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
+    #endif
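+    /* E.g. with SizeQuant == 8, a 13-byte request is rounded up to 16 bytes
+       here, before the bhead_t overhead is added below. */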
+
+    size += sizeof(bhead_t);     /* Add overhead in allocated buffer
+                                         to size required. */
+    KMP_DEBUG_ASSERT( size >= 0 );
+    KMP_DEBUG_ASSERT( size % SizeQuant == 0 );
+
+    use_blink = ( thr->mode == bget_mode_lifo );
+
+    /* If a compact function was provided in the call to bectl(), wrap
+       a loop around the allocation process  to  allow  compaction  to
+       intervene in case we don't find a suitable buffer in the chain. */
+
+    for (;;) {
+        int bin;
+
+        for (bin = bget_get_bin( size ); bin < MAX_BGET_BINS; ++bin) {
+            /* Link to next buffer */
+            b = ( use_blink ? thr->freelist[ bin ].ql.blink : thr->freelist[ bin ].ql.flink );
+
+            if (thr->mode == bget_mode_best) {
+                best = &thr->freelist[ bin ];
+
+                /* Scan the free list searching for the first buffer big enough
+                   to hold the requested size buffer. */
+
+                while (b != &thr->freelist[ bin ]) {
+                    if (b->bh.bb.bsize >= (bufsize) size) {
+                        if ((best == &thr->freelist[ bin ]) || (b->bh.bb.bsize < best->bh.bb.bsize)) {
+                            best = b;
+                        }
+                    }
+
+                    /* Link to next buffer */
+                    b = ( use_blink ? b->ql.blink : b->ql.flink );
+                }
+                b = best;
+            }
+
+            while (b != &thr->freelist[ bin ]) {
+                if ((bufsize) b->bh.bb.bsize >= (bufsize) size) {
+
+                    /* Buffer  is big enough to satisfy  the request.  Allocate it
+                       to the caller.  We must decide whether the buffer is  large
+                       enough  to  split  into  the part given to the caller and a
+                       free buffer that remains on the free list, or  whether  the
+                       entire  buffer  should  be  removed  from the free list and
+                       given to the caller in its entirety.   We  only  split  the
+                       buffer if enough room remains for a header plus the minimum
+                       quantum of allocation. */
+
+                    if ((b->bh.bb.bsize - (bufsize) size) > (bufsize)(SizeQ + (sizeof(bhead_t)))) {
+                        bhead_t *ba, *bn;
+
+                        ba = BH(((char *) b) + (b->bh.bb.bsize - (bufsize) size));
+                        bn = BH(((char *) ba) + size);
+
+                        KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);
+
+                        /* Subtract size from length of free block. */
+                        b->bh.bb.bsize -= (bufsize) size;
+
+                        /* Link allocated buffer to the previous free buffer. */
+                        ba->bb.prevfree = b->bh.bb.bsize;
+
+                        /* Plug negative size into user buffer. */
+                        ba->bb.bsize = -size;
+
+                        /* Mark this buffer as owned by this thread. */
+                        TCW_PTR(ba->bb.bthr, th);   // not an allocated address (do not mark it)
+                        /* Mark buffer after this one not preceded by free block. */
+                        bn->bb.prevfree = 0;
+
+                        /* unlink the buffer from the old freelist, and reinsert it into the new freelist */
+                        __kmp_bget_remove_from_freelist( b );
+                        __kmp_bget_insert_into_freelist( thr, b );
+#if BufStats
+                        thr->totalloc += (size_t) size;
+                        thr->numget++;        /* Increment number of bget() calls */
+#endif
+                        buf = (void *) ((((char *) ba) + sizeof(bhead_t)));
+                        KMP_DEBUG_ASSERT( ((size_t)buf) % SizeQuant == 0 );
+                        return buf;
+                    } else {
+                        bhead_t *ba;
+
+                        ba = BH(((char *) b) + b->bh.bb.bsize);
+
+                        KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);
+
+                        /* The buffer isn't big enough to split.  Give  the  whole
+                           shebang to the caller and remove it from the free list. */
+
+                        __kmp_bget_remove_from_freelist( b );
+#if BufStats
+                        thr->totalloc += (size_t) b->bh.bb.bsize;
+                        thr->numget++;        /* Increment number of bget() calls */
+#endif
+                        /* Negate size to mark buffer allocated. */
+                        b->bh.bb.bsize = -(b->bh.bb.bsize);
+
+                        /* Mark this buffer as owned by this thread. */
+                        TCW_PTR(ba->bb.bthr, th);   // not an allocated address (do not mark it)
+                        /* Zero the back pointer in the next buffer in memory
+                           to indicate that this buffer is allocated. */
+                        ba->bb.prevfree = 0;
+
+                        /* Give user buffer starting at queue links. */
+                        buf =  (void *) &(b->ql);
+                        KMP_DEBUG_ASSERT( ((size_t)buf) % SizeQuant == 0 );
+                        return buf;
+                    }
+                }
+
+                /* Link to next buffer */
+                b = ( use_blink ? b->ql.blink : b->ql.flink );
+            }
+        }
+
+        /* We failed to find a buffer.  If there's a compact  function
+           defined,  notify  it  of the size requested.  If it returns
+           TRUE, try the allocation again. */
+
+        if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {
+            break;
+        }
+    }
+
+    /* No buffer available with requested size free. */
+
+    /* Don't give up yet -- look in the reserve supply. */
+
+    if (thr->acqfcn != 0) {
+        if (size > (bufsize) (thr->exp_incr - sizeof(bhead_t))) {
+
+            /* Request  is  too  large  to  fit in a single expansion
+               block.  Try to satisfy it by a direct buffer acquisition. */
+
+            bdhead_t *bdh;
+
+            size += sizeof(bdhead_t) - sizeof(bhead_t);
+
+            KE_TRACE( 10, ("%%%%%% MALLOC( %d )\n", (int) size ) );
+
+            /* richryan */
+            bdh = BDH((*thr->acqfcn)((bufsize) size));
+            if (bdh != NULL) {
+
+                /*  Mark the buffer special by setting the size field
+                    of its header to zero.  */
+                bdh->bh.bb.bsize = 0;
+
+                /* Mark this buffer as owned by this thread. */
+                TCW_PTR(bdh->bh.bb.bthr, th);  // don't mark buffer as allocated,
+                                               // because direct buffer never goes to free list
+                bdh->bh.bb.prevfree = 0;
+                bdh->tsize = size;
+#if BufStats
+                thr->totalloc += (size_t) size;
+                thr->numget++;        /* Increment number of bget() calls */
+                thr->numdget++;       /* Direct bget() call count */
+#endif
+                buf =  (void *) (bdh + 1);
+                KMP_DEBUG_ASSERT( ((size_t)buf) % SizeQuant == 0 );
+                return buf;
+            }
+
+        } else {
+
+            /*  Try to obtain a new expansion block */
+
+            void *newpool;
+
+            KE_TRACE( 10, ("%%%%%% MALLOCB( %d )\n", (int) thr->exp_incr ) );
+
+            /* richryan */
+            newpool = (*thr->acqfcn)((bufsize) thr->exp_incr);
+            KMP_DEBUG_ASSERT( ((size_t)newpool) % SizeQuant == 0 );
+            if (newpool != NULL) {
+                bpool( th, newpool, thr->exp_incr);
+                buf =  bget( th, requested_size);  /* This can't, I say, can't get into a loop. */
+                return buf;
+            }
+        }
+    }
+
+    /*  Still no buffer available */
+
+    return NULL;
+}
+
+/*  BGETZ  --  Allocate a buffer and clear its contents to zero.  We clear
+               the  entire  contents  of  the buffer to zero, not just the
+               region requested by the caller. */
+
+static void *
+bgetz(  kmp_info_t *th, bufsize size )
+{
+    char *buf = (char *) bget( th, size);
+
+    if (buf != NULL) {
+        bhead_t *b;
+        bufsize rsize;
+
+        b = BH(buf - sizeof(bhead_t));
+        rsize = -(b->bb.bsize);
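+        /* A stored bsize of zero marks a directly-acquired buffer (see bget()
+           above); its true length is kept in the bdhead_t, not the bhead_t. */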
+        if (rsize == 0) {
+            bdhead_t *bd;
+
+            bd = BDH(buf - sizeof(bdhead_t));
+            rsize = bd->tsize - (bufsize) sizeof(bdhead_t);
+        } else {
+            rsize -= sizeof(bhead_t);
+        }
+
+        KMP_DEBUG_ASSERT(rsize >= size);
+
+        (void) memset(buf, 0, (bufsize) rsize);
+    }
+    return ((void *) buf);
+}
+
+/*  BGETR  --  Reallocate a buffer.  This is a minimal implementation,
+               simply in terms of brel()  and  bget().   It  could  be
+               enhanced to allow the buffer to grow into adjacent free
+               blocks and to avoid moving data unnecessarily.  */
+
+static void *
+bgetr(  kmp_info_t *th, void *buf, bufsize size)
+{
+    void *nbuf;
+    bufsize osize;                    /* Old size of buffer */
+    bhead_t *b;
+
+    nbuf = bget( th, size );
+    if ( nbuf == NULL ) { /* Acquire new buffer */
+        return NULL;
+    }
+    if ( buf == NULL ) {
+        return nbuf;
+    }
+    b = BH(((char *) buf) - sizeof(bhead_t));
+    osize = -b->bb.bsize;
+    if (osize == 0) {
+        /*  Buffer acquired directly through acqfcn. */
+        bdhead_t *bd;
+
+        bd = BDH(((char *) buf) - sizeof(bdhead_t));
+        osize = bd->tsize - (bufsize) sizeof(bdhead_t);
+    } else {
+        osize -= sizeof(bhead_t);
+    };
+
+    KMP_DEBUG_ASSERT(osize > 0);
+
+    (void) KMP_MEMCPY((char *) nbuf, (char *) buf, /* Copy the data */
+             (size_t) ((size < osize) ? size : osize));
+    brel( th, buf );
+
+    return nbuf;
+}
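+
+/*  Illustrative (hypothetical) use of bget()/bgetr()/brel():
+
+        void *p = bget( th, (bufsize) 64 );    // allocate
+        p = bgetr( th, p, (bufsize) 256 );     // grow; the data may move
+        brel( th, p );                         // release
+
+    Note that bgetr() copies min(old size, new size) bytes, so shrinking a
+    buffer truncates its contents. */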
+
+/*  BREL  --  Release a buffer.  */
+
+static void
+brel(  kmp_info_t *th, void *buf )
+{
+    thr_data_t *thr = get_thr_data( th );
+    bfhead_t *b, *bn;
+    kmp_info_t *bth;
+
+    KMP_DEBUG_ASSERT(buf != NULL);
+    KMP_DEBUG_ASSERT( ((size_t)buf) % SizeQuant == 0 );
+
+    b = BFH(((char *) buf) - sizeof(bhead_t));
+
+    if (b->bh.bb.bsize == 0) {        /* Directly-acquired buffer? */
+        bdhead_t *bdh;
+
+        bdh = BDH(((char *) buf) - sizeof(bdhead_t));
+        KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
+#if BufStats
+        thr->totalloc -= (size_t) bdh->tsize;
+        thr->numdrel++;               /* Number of direct releases */
+        thr->numrel++;                /* Increment number of brel() calls */
+#endif /* BufStats */
+#ifdef FreeWipe
+        (void) memset((char *) buf, 0x55,
+                 (size_t) (bdh->tsize - sizeof(bdhead_t)));
+#endif /* FreeWipe */
+
+        KE_TRACE( 10, ("%%%%%% FREE( %p )\n", (void *) bdh ) );
+
+        KMP_DEBUG_ASSERT( thr->relfcn != 0 );
+        (*thr->relfcn)((void *) bdh);      /* Release it directly. */
+        return;
+    }
+
+    bth = (kmp_info_t *)( (kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1 ); // clear possible mark before comparison
+    if ( bth != th ) {
+        /* Add this buffer to be released by the owning thread later */
+        __kmp_bget_enqueue( bth, buf
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+                            , __kmp_gtid_from_thread( th )
+#endif
+        );
+        return;
+    }
+
+    /* Buffer size must be negative, indicating that the buffer is
+       allocated. */
+
+    if (b->bh.bb.bsize >= 0) {
+        bn = NULL;
+    }
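+    /* (bn is unconditionally reassigned before its next use below, so this
+        store has no effect; the assert that follows enforces the invariant.) */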
+    KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);
+
+    /*  Back pointer in next buffer must be zero, indicating the
+        same thing: */
+
+    KMP_DEBUG_ASSERT(BH((char *) b - b->bh.bb.bsize)->bb.prevfree == 0);
+
+#if BufStats
+    thr->numrel++;                    /* Increment number of brel() calls */
+    thr->totalloc += (size_t) b->bh.bb.bsize;
+#endif
+
+    /* If the back link is nonzero, the previous buffer is free.  */
+
+    if (b->bh.bb.prevfree != 0) {
+        /* The previous buffer is free.  Consolidate this buffer  with  it
+           by  adding  the  length  of  this  buffer  to the previous free
+           buffer.  Note that we subtract the size  in  the  buffer  being
+           released,  since  it's  negative to indicate that the buffer is
+           allocated. */
+
+        register bufsize size = b->bh.bb.bsize;
+
+        /* Make the previous buffer the one we're working on. */
+        KMP_DEBUG_ASSERT(BH((char *) b - b->bh.bb.prevfree)->bb.bsize == b->bh.bb.prevfree);
+        b = BFH(((char *) b) - b->bh.bb.prevfree);
+        b->bh.bb.bsize -= size;
+
+        /* unlink the buffer from the old freelist */
+        __kmp_bget_remove_from_freelist( b );
+    }
+    else {
+        /* The previous buffer is allocated (or this is the first buffer
+           in the pool).  Mark this buffer's size as positive (i.e. free)
+           and fall through to place the buffer on the free list as an
+           isolated free block. */
+
+        b->bh.bb.bsize = -b->bh.bb.bsize;
+    }
+
+    /* insert buffer back onto a new freelist */
+    __kmp_bget_insert_into_freelist( thr, b );
+
+
+    /* Now we look at the next buffer in memory, located by advancing from
+       the  start  of  this  buffer  by its size, to see if that buffer is
+       free.  If it is, we combine  this  buffer  with  the  next  one  in
+       memory, dechaining the second buffer from the free list. */
+
+    bn =  BFH(((char *) b) + b->bh.bb.bsize);
+    if (bn->bh.bb.bsize > 0) {
+
+        /* The buffer is free.  Remove it from the free list and add
+           its size to that of our buffer. */
+
+        KMP_DEBUG_ASSERT(BH((char *) bn + bn->bh.bb.bsize)->bb.prevfree == bn->bh.bb.bsize);
+
+        __kmp_bget_remove_from_freelist( bn );
+
+        b->bh.bb.bsize += bn->bh.bb.bsize;
+
+        /* unlink the buffer from the old freelist, and reinsert it into the new freelist */
+
+        __kmp_bget_remove_from_freelist( b );
+        __kmp_bget_insert_into_freelist( thr, b );
+
+        /* Finally,  advance  to   the  buffer  that   follows  the  newly
+           consolidated free block.  We must set its  backpointer  to  the
+           head  of  the  consolidated free block.  We know the next block
+           must be an allocated block because the process of recombination
+           guarantees  that  two  free  blocks will never be contiguous in
+           memory.  */
+
+        bn = BFH(((char *) b) + b->bh.bb.bsize);
+    }
+#ifdef FreeWipe
+    (void) memset(((char *) b) + sizeof(bfhead_t), 0x55,
+            (size_t) (b->bh.bb.bsize - sizeof(bfhead_t)));
+#endif
+    KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);
+
+    /* The next buffer is allocated.  Set the backpointer in it  to  point
+       to this buffer; the previous free buffer in memory. */
+
+    bn->bh.bb.prevfree = b->bh.bb.bsize;
+
+    /*  If  a  block-release function is defined, and this free buffer
+        constitutes the entire block, release it.  Note that  pool_len
+        is  defined  in  such a way that the test will fail unless all
+        pool blocks are the same size.  */
+
+    if (thr->relfcn != 0 &&
+        b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t)))
+    {
+#if BufStats
+        if (thr->numpblk != 1) {        /* Do not release the last buffer until finalization time */
+#endif
+
+            KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
+            KMP_DEBUG_ASSERT(BH((char *) b + b->bh.bb.bsize)->bb.bsize == ESent);
+            KMP_DEBUG_ASSERT(BH((char *) b + b->bh.bb.bsize)->bb.prevfree == b->bh.bb.bsize);
+
+            /*  Unlink the buffer from the free list  */
+            __kmp_bget_remove_from_freelist( b );
+
+            KE_TRACE( 10, ("%%%%%% FREE( %p )\n", (void *) b ) );
+
+            (*thr->relfcn)(b);
+#if BufStats
+            thr->numprel++;               /* Nr of expansion block releases */
+            thr->numpblk--;               /* Total number of blocks */
+            KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
+
+            /* avoid leaving stale last_pool pointer around if it is being dealloced */
+            if (thr->last_pool == b) thr->last_pool = 0;
+        }
+        else {
+            thr->last_pool = b;
+        }
+#endif /* BufStats */
+    }
+}
+
+/*  BECTL  --  Establish automatic pool expansion control  */
+
+static void
+bectl(  kmp_info_t *th, bget_compact_t compact, bget_acquire_t acquire, bget_release_t release, bufsize pool_incr)
+{
+    thr_data_t *thr = get_thr_data( th );
+
+    thr->compfcn = compact;
+    thr->acqfcn = acquire;
+    thr->relfcn = release;
+    thr->exp_incr = pool_incr;
+}
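+
+/* (For reference: __kmp_initialize_bget() below installs the standard wiring --
+    no compaction function, malloc()/free() for acquire/release, and
+    __kmp_malloc_pool_incr as the expansion increment.) */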
+
+/*  BPOOL  --  Add a region of memory to the buffer pool.  */
+
+static void
+bpool(  kmp_info_t *th, void *buf, bufsize len)
+{
+/*    int bin = 0; */
+    thr_data_t *thr = get_thr_data( th );
+    bfhead_t *b = BFH(buf);
+    bhead_t *bn;
+
+    __kmp_bget_dequeue( th );         /* Release any queued buffers */
+
+#ifdef SizeQuant
+    len &= ~(SizeQuant - 1);
+#endif
+    if (thr->pool_len == 0) {
+        thr->pool_len = len;
+    } else if (len != thr->pool_len) {
+        thr->pool_len = -1;
+    }
+#if BufStats
+    thr->numpget++;                   /* Number of block acquisitions */
+    thr->numpblk++;                   /* Number of blocks total */
+    KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
+#endif /* BufStats */
+
+    /* Since the block is initially occupied by a single free  buffer,
+       it  had  better  not  be  (much) larger than the largest buffer
+       whose size we can store in bhead.bb.bsize. */
+
+    KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize) ESent + 1));
+
+    /* Clear  the  backpointer at  the start of the block to indicate that
+       there  is  no  free  block  prior  to  this   one.    That   blocks
+       recombination when the first block in memory is released. */
+
+    b->bh.bb.prevfree = 0;
+
+    /* Create a dummy allocated buffer at the end of the pool.  This dummy
+       buffer is seen when a buffer at the end of the pool is released and
+       blocks  recombination  of  the last buffer with the dummy buffer at
+       the end.  The length in the dummy buffer  is  set  to  the  largest
+       negative  number  to  denote  the  end  of  the pool for diagnostic
+       routines (this specific value is  not  counted  on  by  the  actual
+       allocation and release functions). */
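+
+    /* Resulting pool layout (after 'len' is reduced by one header below):
+
+           b: free buffer                      bn: sentinel header
+           [ prevfree = 0 | bsize = len ] ... [ prevfree = len | bsize = ESent ]
+    */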
+
+    len -= sizeof(bhead_t);
+    b->bh.bb.bsize = (bufsize) len;
+    /* Set the owner of this buffer */
+    TCW_PTR( b->bh.bb.bthr, (kmp_info_t*)((kmp_uintptr_t)th | 1) ); // mark the buffer as allocated address
+
+    /* Chain the new block to the free list. */
+    __kmp_bget_insert_into_freelist( thr, b );
+
+#ifdef FreeWipe
+    (void) memset(((char *) b) + sizeof(bfhead_t), 0x55,
+             (size_t) (len - sizeof(bfhead_t)));
+#endif
+    bn = BH(((char *) b) + len);
+    bn->bb.prevfree = (bufsize) len;
+    /* Definition of ESent assumes two's complement! */
+    KMP_DEBUG_ASSERT( (~0) == -1 && (bn != 0) );
+
+    bn->bb.bsize = ESent;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/*  BFREED  --  Dump the free lists for this thread. */
+
+static void
+bfreed(  kmp_info_t *th )
+{
+    int bin = 0, count = 0;
+    int gtid = __kmp_gtid_from_thread( th );
+    thr_data_t *thr = get_thr_data( th );
+
+#if BufStats
+    __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC " get=%" KMP_INT64_SPEC " rel=%" \
+           KMP_INT64_SPEC " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC " prel=%" KMP_INT64_SPEC \
+           " dget=%" KMP_INT64_SPEC " drel=%" KMP_INT64_SPEC "\n",
+           gtid, (kmp_uint64) thr->totalloc,
+           (kmp_int64) thr->numget,  (kmp_int64) thr->numrel,
+           (kmp_int64) thr->numpblk,
+           (kmp_int64) thr->numpget, (kmp_int64) thr->numprel,
+           (kmp_int64) thr->numdget, (kmp_int64) thr->numdrel );
+#endif
+
+    for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
+        bfhead_t *b;
+
+        for (b = thr->freelist[ bin ].ql.flink; b != &thr->freelist[ bin ]; b = b->ql.flink) {
+            bufsize bs = b->bh.bb.bsize;
+
+            KMP_DEBUG_ASSERT( b->ql.blink->ql.flink == b );
+            KMP_DEBUG_ASSERT( b->ql.flink->ql.blink == b );
+            KMP_DEBUG_ASSERT( bs > 0 );
+
+            count += 1;
+
+            __kmp_printf_no_lock("__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b, (long) bs );
+#ifdef FreeWipe
+            {
+                char *lerr = ((char *) b) + sizeof(bfhead_t);
+                if ((bs > sizeof(bfhead_t)) && ((*lerr != 0x55) || (memcmp(lerr, lerr + 1, (size_t) (bs - (sizeof(bfhead_t) + 1))) != 0))) {
+                    __kmp_printf_no_lock( "__kmp_printpool: T#%d     (Contents of above free block have been overstored.)\n", gtid );
+                }
+            }
+#endif
+        }
+    }
+
+    if (count == 0)
+        __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid );
+}
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef KMP_DEBUG
+
+#if BufStats
+
+/*  BSTATS  --  Return buffer allocation free space statistics.  */
+
+static void
+bstats(  kmp_info_t *th, bufsize *curalloc,  bufsize *totfree,  bufsize *maxfree, long *nget, long *nrel)
+{
+    int bin = 0;
+    thr_data_t *thr = get_thr_data( th );
+
+    *nget = thr->numget;
+    *nrel = thr->numrel;
+    *curalloc = (bufsize) thr->totalloc;
+    *totfree = 0;
+    *maxfree = -1;
+
+    for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
+        bfhead_t *b = thr->freelist[ bin ].ql.flink;
+
+        while (b != &thr->freelist[ bin ]) {
+            KMP_DEBUG_ASSERT(b->bh.bb.bsize > 0);
+            *totfree += b->bh.bb.bsize;
+            if (b->bh.bb.bsize > *maxfree) {
+                *maxfree = b->bh.bb.bsize;
+            }
+            b = b->ql.flink;              /* Link to next buffer */
+        }
+    }
+}
+
+/*  BSTATSE  --  Return extended statistics  */
+
+static void
+bstatse(  kmp_info_t *th, bufsize *pool_incr, long *npool, long *npget, long *nprel, long *ndget, long *ndrel)
+{
+    thr_data_t *thr = get_thr_data( th );
+
+    *pool_incr = (thr->pool_len < 0) ? -thr->exp_incr : thr->exp_incr;
+    *npool = thr->numpblk;
+    *npget = thr->numpget;
+    *nprel = thr->numprel;
+    *ndget = thr->numdget;
+    *ndrel = thr->numdrel;
+}
+
+#endif /* BufStats */
+
+/*  BUFDUMP  --  Dump the data in a buffer.  This is called with the  user
+                 data pointer, and backs up to the buffer header.  It will
+                 dump either a free block or an allocated one.  */
+
+static void
+bufdump(  kmp_info_t *th, void *buf )
+{
+    bfhead_t *b;
+    unsigned char *bdump;
+    bufsize bdlen;
+
+    b = BFH(((char *) buf) - sizeof(bhead_t));
+    KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
+    if (b->bh.bb.bsize < 0) {
+        bdump = (unsigned char *) buf;
+        bdlen = (-b->bh.bb.bsize) - (bufsize) sizeof(bhead_t);
+    } else {
+        bdump = (unsigned char *) (((char *) b) + sizeof(bfhead_t));
+        bdlen = b->bh.bb.bsize - (bufsize) sizeof(bfhead_t);
+    }
+
+    while (bdlen > 0) {
+        int i, dupes = 0;
+        bufsize l = bdlen;
+        char bhex[50], bascii[20];
+
+        if (l > 16) {
+            l = 16;
+        }
+
+        for (i = 0; i < l; i++) {
+            (void) KMP_SNPRINTF(bhex + i * 3, sizeof(bhex), "%02X ", bdump[i]);
+            if (bdump[i] > 0x20 && bdump[i] < 0x7F)
+                bascii[ i ] = bdump[ i ];
+            else
+                bascii[ i ] = ' ';
+        }
+        bascii[i] = 0;
+        (void) __kmp_printf_no_lock("%-48s   %s\n", bhex, bascii);
+        bdump += l;
+        bdlen -= l;
+        while ((bdlen > 16) && (memcmp((char *) (bdump - 16),
+                                       (char *) bdump, 16) == 0)) {
+            dupes++;
+            bdump += 16;
+            bdlen -= 16;
+        }
+        if (dupes > 1) {
+            (void) __kmp_printf_no_lock(
+                "     (%d lines [%d bytes] identical to above line skipped)\n",
+                dupes, dupes * 16);
+        } else if (dupes == 1) {
+            bdump -= 16;
+            bdlen += 16;
+        }
+    }
+}
+
+/*  BPOOLD  --  Dump a buffer pool.  The buffer headers are always listed.
+                If DUMPALLOC is nonzero, the contents of allocated buffers
+                are  dumped.   If  DUMPFREE  is  nonzero,  free blocks are
+                dumped as well.  If FreeWipe  checking  is  enabled,  free
+                blocks  which  have  been clobbered will always be dumped. */
+
+static void
+bpoold(  kmp_info_t *th, void *buf, int dumpalloc, int dumpfree)
+{
+    bfhead_t *b = BFH( (char*)buf - sizeof(bhead_t));
+
+    while (b->bh.bb.bsize != ESent) {
+        bufsize bs = b->bh.bb.bsize;
+
+        if (bs < 0) {
+            bs = -bs;
+            (void) __kmp_printf_no_lock("Allocated buffer: size %6ld bytes.\n", (long) bs);
+            if (dumpalloc) {
+                bufdump( th, (void *) (((char *) b) + sizeof(bhead_t)));
+            }
+        } else {
+            const char *lerr = "";
+
+            KMP_DEBUG_ASSERT(bs > 0);
+            if ((b->ql.blink->ql.flink != b) || (b->ql.flink->ql.blink != b)) {
+                lerr = "  (Bad free list links)";
+            }
+            (void) __kmp_printf_no_lock("Free block:       size %6ld bytes.%s\n",
+                (long) bs, lerr);
+#ifdef FreeWipe
+            lerr = ((char *) b) + sizeof(bfhead_t);
+            if ((bs > sizeof(bfhead_t)) && ((*lerr != 0x55) ||
+                (memcmp(lerr, lerr + 1,
+                  (size_t) (bs - (sizeof(bfhead_t) + 1))) != 0))) {
+                (void) __kmp_printf_no_lock(
+                    "(Contents of above free block have been overstored.)\n");
+                bufdump( th, (void *) (((char *) b) + sizeof(bhead_t)));
+            } else
+#endif
+            if (dumpfree) {
+                bufdump( th, (void *) (((char *) b) + sizeof(bhead_t)));
+            }
+        }
+        b = BFH(((char *) b) + bs);
+    }
+}
+
+/*  BPOOLV  --  Validate a buffer pool. */
+
+static int
+bpoolv(  kmp_info_t *th, void *buf )
+{
+    bfhead_t *b = BFH(buf);
+
+    while (b->bh.bb.bsize != ESent) {
+        bufsize bs = b->bh.bb.bsize;
+
+        if (bs < 0) {
+            bs = -bs;
+        } else {
+#ifdef FreeWipe
+            char *lerr = "";
+#endif
+
+            KMP_DEBUG_ASSERT(bs > 0);
+            if (bs <= 0) {
+                return 0;
+            }
+            if ((b->ql.blink->ql.flink != b) || (b->ql.flink->ql.blink != b)) {
+                (void) __kmp_printf_no_lock("Free block: size %6ld bytes.  (Bad free list links)\n",
+                     (long) bs);
+                KMP_DEBUG_ASSERT(0);
+                return 0;
+            }
+#ifdef FreeWipe
+            lerr = ((char *) b) + sizeof(bfhead_t);
+            if ((bs > sizeof(bfhead_t)) && ((*lerr != 0x55) ||
+                (memcmp(lerr, lerr + 1,
+                  (size_t) (bs - (sizeof(bfhead_t) + 1))) != 0))) {
+                (void) __kmp_printf_no_lock(
+                    "(Contents of above free block have been overstored.)\n");
+                bufdump( th, (void *) (((char *) b) + sizeof(bhead_t)));
+                KMP_DEBUG_ASSERT(0);
+                return 0;
+            }
+#endif /* FreeWipe */
+        }
+        b = BFH(((char *) b) + bs);
+    }
+    return 1;
+}
+
+#endif /* KMP_DEBUG */
+
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_initialize_bget( kmp_info_t *th )
+{
+    KMP_DEBUG_ASSERT( SizeQuant >= sizeof( void * ) && (th != 0) );
+
+    set_thr_data( th );
+
+    bectl( th, (bget_compact_t) 0, (bget_acquire_t) malloc, (bget_release_t) free,
+           (bufsize) __kmp_malloc_pool_incr );
+}
+
+void
+__kmp_finalize_bget( kmp_info_t *th )
+{
+    thr_data_t *thr;
+    bfhead_t *b;
+
+    KMP_DEBUG_ASSERT( th != 0 );
+
+#if BufStats
+    thr = (thr_data_t *) th->th.th_local.bget_data;
+    KMP_DEBUG_ASSERT( thr != NULL );
+    b = thr->last_pool;
+
+    /*  If  a  block-release function is defined, and this free buffer
+        constitutes the entire block, release it.  Note that  pool_len
+        is  defined  in  such a way that the test will fail unless all
+        pool blocks are the same size.  */
+
+    /* Deallocate the last pool if one exists because we no longer do it in brel() */
+    if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&
+        b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t)))
+    {
+        KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
+        KMP_DEBUG_ASSERT(BH((char *) b + b->bh.bb.bsize)->bb.bsize == ESent);
+        KMP_DEBUG_ASSERT(BH((char *) b + b->bh.bb.bsize)->bb.prevfree == b->bh.bb.bsize);
+
+        /*  Unlink the buffer from the free list  */
+        __kmp_bget_remove_from_freelist( b );
+
+        KE_TRACE( 10, ("%%%%%% FREE( %p )\n", (void *) b ) );
+
+        (*thr->relfcn)(b);
+        thr->numprel++;               /* Nr of expansion block releases */
+        thr->numpblk--;               /* Total number of blocks */
+        KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
+    }
+#endif /* BufStats */
+
+    /* Deallocate bget_data */
+    if ( th->th.th_local.bget_data != NULL ) {
+        __kmp_free( th->th.th_local.bget_data );
+        th->th.th_local.bget_data = NULL;
+    }; // if
+}
+
+void
+kmpc_set_poolsize( size_t size )
+{
+    bectl( __kmp_get_thread(), (bget_compact_t) 0, (bget_acquire_t) malloc,
+           (bget_release_t) free, (bufsize) size );
+}
+
+size_t
+kmpc_get_poolsize( void )
+{
+    thr_data_t *p;
+
+    p = get_thr_data( __kmp_get_thread() );
+
+    return p->exp_incr;
+}
+
+void
+kmpc_set_poolmode( int mode )
+{
+    thr_data_t *p;
+
+    if (mode == bget_mode_fifo || mode == bget_mode_lifo || mode == bget_mode_best) {
+        p = get_thr_data( __kmp_get_thread() );
+        p->mode = (bget_mode_t) mode;
+    }
+}
+
+int
+kmpc_get_poolmode( void )
+{
+    thr_data_t *p;
+
+    p = get_thr_data( __kmp_get_thread() );
+
+    return p->mode;
+}
+
+void
+kmpc_get_poolstat( size_t *maxmem, size_t *allmem )
+{
+    kmp_info_t *th = __kmp_get_thread();
+    bufsize a, b;
+
+    __kmp_bget_dequeue( th );         /* Release any queued buffers */
+
+    bcheck( th, &a, &b );
+
+    *maxmem = a;
+    *allmem = b;
+}
+
+void
+kmpc_poolprint( void )
+{
+    kmp_info_t *th = __kmp_get_thread();
+
+    __kmp_bget_dequeue( th );         /* Release any queued buffers */
+
+    bfreed( th );
+}
+
+#endif // #if KMP_USE_BGET
+
+/* ------------------------------------------------------------------------ */
+
+void *
+kmpc_malloc( size_t size )
+{
+    void * ptr;
+        ptr = bget( __kmp_entry_thread(), (bufsize) size );
+
+    return ptr;
+}
+
+void *
+kmpc_calloc( size_t nelem, size_t elsize )
+{
+    void * ptr;
+        ptr = bgetz( __kmp_entry_thread(), (bufsize) (nelem * elsize) );
+
+    return ptr;
+}
+
+void *
+kmpc_realloc( void * ptr, size_t size )
+{
+    void * result = NULL;
+
+        if ( ptr == NULL ) {
+            // If pointer is NULL, realloc behaves like malloc.
+            result = bget( __kmp_entry_thread(), (bufsize) size );
+        } else if ( size == 0 ) {
+            // If size is 0, realloc behaves like free.
+            // The thread must have been registered by an earlier call to kmpc_malloc() or
+            // kmpc_calloc(), so it is safe to call __kmp_get_thread() instead of __kmp_entry_thread().
+            brel( __kmp_get_thread(), ptr );
+        } else {
+            result = bgetr( __kmp_entry_thread(), ptr, (bufsize) size );
+        }; // if
+
+    return result;
+}
+
+/* NOTE: the library must have already been initialized by a previous allocate */
+
+void
+kmpc_free( void * ptr )
+{
+    if ( ! __kmp_init_serial ) {
+        return;
+    }; // if
+    if ( ptr != NULL ) {
+            kmp_info_t *th = __kmp_get_thread();
+            __kmp_bget_dequeue( th );         /* Release any queued buffers */
+            brel( th, ptr );
+    };
+}
+
+
+/* ------------------------------------------------------------------------ */
+
+void *
+___kmp_thread_malloc( kmp_info_t *th, size_t size KMP_SRC_LOC_DECL )
+{
+    void * ptr;
+    KE_TRACE( 30, (
+        "-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n",
+        th,
+        (int) size
+        KMP_SRC_LOC_PARM
+    ) );
+        ptr = bget( th, (bufsize) size );
+    KE_TRACE( 30, ( "<- __kmp_thread_malloc() returns %p\n", ptr ) );
+    return ptr;
+}
+
+void *
+___kmp_thread_calloc( kmp_info_t *th, size_t nelem, size_t elsize KMP_SRC_LOC_DECL )
+{
+    void * ptr;
+    KE_TRACE( 30, (
+        "-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n",
+        th,
+        (int) nelem,
+        (int) elsize
+        KMP_SRC_LOC_PARM
+    ) );
+        ptr = bgetz( th, (bufsize) (nelem * elsize) );
+    KE_TRACE( 30, ( "<- __kmp_thread_calloc() returns %p\n", ptr ) );
+    return ptr;
+}
+
+void *
+___kmp_thread_realloc( kmp_info_t *th, void *ptr, size_t size KMP_SRC_LOC_DECL )
+{
+    KE_TRACE( 30, (
+        "-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n",
+        th,
+        ptr,
+        (int) size
+        KMP_SRC_LOC_PARM
+    ) );
+        ptr = bgetr( th, ptr, (bufsize) size );
+    KE_TRACE( 30, ( "<- __kmp_thread_realloc() returns %p\n", ptr ) );
+    return ptr;
+}
+
+void
+___kmp_thread_free( kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL )
+{
+    KE_TRACE( 30, (
+        "-> __kmp_thread_free( %p, %p ) called from %s:%d\n",
+        th,
+        ptr
+        KMP_SRC_LOC_PARM
+    ) );
+    if ( ptr != NULL ) {
+            __kmp_bget_dequeue( th );         /* Release any queued buffers */
+            brel( th, ptr );
+    }
+    KE_TRACE( 30, ( "<- __kmp_thread_free()\n" ) );
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+/*
+    If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. This causes memory leaks, but
+    may be useful for debugging memory corruption, use of freed pointers, etc.
+*/
+/* #define LEAK_MEMORY */
+
+struct kmp_mem_descr {      // Memory block descriptor.
+    void * ptr_allocated;   // Pointer returned by malloc(), subject for free().
+    size_t size_allocated;  // Size of allocated memory block.
+    void * ptr_aligned;     // Pointer to aligned memory, to be used by client code.
+    size_t size_aligned;    // Size of aligned memory block.
+};
+typedef struct kmp_mem_descr kmp_mem_descr_t;
+
+/*
+    Allocate memory on requested boundary, fill allocated memory with 0x00.
+    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation error.
+    Must use __kmp_free when freeing memory allocated by this routine!
+ */
+static
+void *
+___kmp_allocate_align( size_t size, size_t alignment KMP_SRC_LOC_DECL )
+{
+    /*
+            __kmp_allocate() allocates (by a call to malloc()) a bigger memory block than requested
+        in order to return a properly aligned pointer. The original pointer returned by malloc() and
+        the size of the allocated block are saved in a descriptor just before the aligned pointer.
+        This information is used by __kmp_free() -- it has to pass the original pointer, not the
+        aligned one, to free().
+
+            +---------+------------+-----------------------------------+---------+
+            | padding | descriptor |           aligned block           | padding |
+            +---------+------------+-----------------------------------+---------+
+            ^                      ^
+            |                      |
+            |                      +- Aligned pointer returned to caller
+            +- Pointer returned by malloc()
+
+        Aligned block is filled with zeros, paddings are filled with 0xEF.
+    */
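+
+    /* Worked example (hypothetical numbers; a 64-bit build where
+       sizeof( kmp_mem_descr_t ) == 32): if malloc() returns 0x1000 and
+       alignment == 64, then
+           addr_aligned = ( 0x1000 + 32 + 64 ) & ~63 = 0x1040
+           addr_descr   = 0x1040 - 32               = 0x1020
+       so the descriptor lies wholly inside the allocation and the pointer
+       returned to the caller is 64-byte aligned. */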
+
+    kmp_mem_descr_t  descr;
+    kmp_uintptr_t    addr_allocated;        // Address returned by malloc().
+    kmp_uintptr_t    addr_aligned;          // Aligned address to return to caller.
+    kmp_uintptr_t    addr_descr;            // Address of memory block descriptor.
+
+    KE_TRACE( 25, (
+        "-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
+        (int) size,
+        (int) alignment
+        KMP_SRC_LOC_PARM
+    ) );
+
+    KMP_DEBUG_ASSERT( alignment < 32 * 1024 ); // Alignment should not be too large.
+    KMP_DEBUG_ASSERT( sizeof( void * ) <= sizeof( kmp_uintptr_t ) );
+        // Make sure kmp_uintptr_t is enough to store addresses.
+
+    descr.size_aligned = size;
+    descr.size_allocated = descr.size_aligned + sizeof( kmp_mem_descr_t ) + alignment;
+
+    #if KMP_DEBUG
+        descr.ptr_allocated = _malloc_src_loc( descr.size_allocated, _file_, _line_ );
+    #else
+    descr.ptr_allocated = malloc_src_loc( descr.size_allocated KMP_SRC_LOC_PARM );
+    #endif
+    KE_TRACE( 10, (
+        "   malloc( %d ) returned %p\n",
+        (int) descr.size_allocated,
+        descr.ptr_allocated
+    ) );
+    if ( descr.ptr_allocated == NULL ) {
+        KMP_FATAL( OutOfHeapMemory );
+    };
+
+    addr_allocated = (kmp_uintptr_t) descr.ptr_allocated;
+    addr_aligned =
+        ( addr_allocated + sizeof( kmp_mem_descr_t ) + alignment )
+        & ~ ( alignment - 1 );
+    addr_descr = addr_aligned - sizeof( kmp_mem_descr_t );
+
+    descr.ptr_aligned = (void *) addr_aligned;
+
+    KE_TRACE( 26, (
+        "   ___kmp_allocate_align: "
+            "ptr_allocated=%p, size_allocated=%d, "
+            "ptr_aligned=%p, size_aligned=%d\n",
+        descr.ptr_allocated,
+        (int) descr.size_allocated,
+        descr.ptr_aligned,
+        (int) descr.size_aligned
+    ) );
+
+    KMP_DEBUG_ASSERT( addr_allocated <= addr_descr );
+    KMP_DEBUG_ASSERT( addr_descr + sizeof( kmp_mem_descr_t ) == addr_aligned );
+    KMP_DEBUG_ASSERT( addr_aligned + descr.size_aligned <= addr_allocated + descr.size_allocated );
+    KMP_DEBUG_ASSERT( addr_aligned % alignment == 0 );
+
+    #ifdef KMP_DEBUG
+        memset( descr.ptr_allocated, 0xEF, descr.size_allocated );
+            // Fill allocated memory block with 0xEF.
+    #endif
+    memset( descr.ptr_aligned, 0x00, descr.size_aligned );
+        // Fill the aligned memory block (which is intended for use by the caller) with 0x00. Do not
+        // put this fill under a KMP_DEBUG condition! Many callers expect zeroed memory. (Padding
+        // bytes remain filled with 0xEF in the debugging library.)
+    * ( (kmp_mem_descr_t *) addr_descr ) = descr;
+
+    KMP_MB();
+
+    KE_TRACE( 25, ( "<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned ) );
+    return descr.ptr_aligned;
+
+} // func ___kmp_allocate_align
+
+
+/*
+    Allocate memory on cache line boundary, fill allocated memory with 0x00.
+    Do not call this func directly! Use __kmp_allocate macro instead.
+    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation error.
+    Must use __kmp_free when freeing memory allocated by this routine!
+ */
+void *
+___kmp_allocate( size_t size KMP_SRC_LOC_DECL )
+{
+
+    void * ptr;
+    KE_TRACE( 25, ( "-> __kmp_allocate( %d ) called from %s:%d\n", (int) size KMP_SRC_LOC_PARM ) );
+        ptr = ___kmp_allocate_align( size, __kmp_align_alloc KMP_SRC_LOC_PARM );
+    KE_TRACE( 25, ( "<- __kmp_allocate() returns %p\n", ptr ) );
+    return ptr;
+
+} // func ___kmp_allocate
+
+#if (BUILD_MEMORY==FIRST_TOUCH)
+void *
+__kmp_ft_page_allocate(size_t size)
+{
+  void *adr, *aadr;
+#if KMP_OS_LINUX
+  /* TODO: Use this function to get page size everywhere */
+  int page_size = getpagesize();
+#else
+  /* TODO: Find windows function to get page size and use it everywhere */
+  int page_size = PAGE_SIZE;
+#endif /* KMP_OS_LINUX */
+
+  adr = (void *) __kmp_thread_malloc( __kmp_get_thread(),
+                                    size + page_size + KMP_PTR_SKIP);
+  if ( adr == 0 )
+    KMP_FATAL( OutOfHeapMemory );
+
+  /* check to see if adr is on a page boundary. */
+  if ( ( (kmp_uintptr_t) adr & (page_size - 1)) == 0)
+    /* nothing to do if adr is already on a page boundary. */
+    aadr = adr;
+  else
+    /* else set aadr to the first page boundary in the allocated memory. */
+    aadr = (void *) ( ( (kmp_uintptr_t) adr + page_size) & ~(page_size - 1) );
+
+  /* the first touch by the owner thread. */
+  *((void**)aadr) = adr;
+
+  /* skip the memory space used for storing adr above. */
+  return (void*)((char*)aadr + KMP_PTR_SKIP);
+}
+#endif
+
+/*
+    Allocate memory on page boundary, fill allocated memory with 0x00.
+    Do not call this func directly! Use __kmp_page_allocate macro instead.
+    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation error.
+    Must use __kmp_free when freeing memory allocated by this routine!
+ */
+void *
+___kmp_page_allocate( size_t size KMP_SRC_LOC_DECL )
+{
+    int    page_size = 8 * 1024;
+    void * ptr;
+
+    KE_TRACE( 25, (
+        "-> __kmp_page_allocate( %d ) called from %s:%d\n",
+        (int) size
+        KMP_SRC_LOC_PARM
+    ) );
+        ptr = ___kmp_allocate_align( size, page_size KMP_SRC_LOC_PARM );
+    KE_TRACE( 25, ( "<- __kmp_page_allocate( %d ) returns %p\n", (int) size, ptr ) );
+    return ptr;
+} // ___kmp_page_allocate
+
+/*
+    Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
+    In debug mode, fill the memory block with 0xEF before call to free().
+*/
+void
+___kmp_free( void * ptr KMP_SRC_LOC_DECL )
+{
+
+        kmp_mem_descr_t descr;
+        kmp_uintptr_t   addr_allocated;        // Address returned by malloc().
+        kmp_uintptr_t   addr_aligned;          // Aligned address passed by caller.
+
+        KE_TRACE( 25, ( "-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM ) );
+        KMP_ASSERT( ptr != NULL );
+
+        descr = * ( kmp_mem_descr_t *) ( (kmp_uintptr_t) ptr - sizeof( kmp_mem_descr_t ) );
+
+        KE_TRACE( 26, ( "   __kmp_free:     "
+                        "ptr_allocated=%p, size_allocated=%d, "
+                        "ptr_aligned=%p, size_aligned=%d\n",
+                        descr.ptr_allocated, (int) descr.size_allocated,
+                        descr.ptr_aligned, (int) descr.size_aligned ));
+
+        addr_allocated = (kmp_uintptr_t) descr.ptr_allocated;
+        addr_aligned   = (kmp_uintptr_t) descr.ptr_aligned;
+
+        KMP_DEBUG_ASSERT( addr_aligned % CACHE_LINE == 0 );
+        KMP_DEBUG_ASSERT( descr.ptr_aligned == ptr );
+        KMP_DEBUG_ASSERT( addr_allocated + sizeof( kmp_mem_descr_t ) <= addr_aligned );
+        KMP_DEBUG_ASSERT( descr.size_aligned < descr.size_allocated );
+        KMP_DEBUG_ASSERT( addr_aligned + descr.size_aligned <= addr_allocated + descr.size_allocated );
+
+        #ifdef KMP_DEBUG
+            memset( descr.ptr_allocated, 0xEF, descr.size_allocated );
+                // Fill memory block with 0xEF, it helps catch using freed memory.
+        #endif
+
+        #ifndef LEAK_MEMORY
+            KE_TRACE( 10, ( "   free( %p )\n", descr.ptr_allocated ) );
+        # ifdef KMP_DEBUG
+            _free_src_loc( descr.ptr_allocated, _file_, _line_ );
+        # else
+            free_src_loc( descr.ptr_allocated KMP_SRC_LOC_PARM );
+        # endif
+        #endif
+
+    KMP_MB();
+
+    KE_TRACE( 25, ( "<- __kmp_free() returns\n" ) );
+
+} // func ___kmp_free
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#if USE_FAST_MEMORY == 3
+// Allocate fast memory by first scanning the thread's free lists.
+// If a chunk of the right size exists, grab it off the free list.
+// Otherwise allocate normally using kmp_thread_malloc.
+
+// AC: How to choose the limit? Just get 16 for now...
+#define KMP_FREE_LIST_LIMIT 16
+
+// Always use 128 bytes for determining buckets for caching memory blocks
+#define DCACHE_LINE  128
+
+void *
+___kmp_fast_allocate( kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL )
+{
+    void            * ptr;
+    int               num_lines;
+    int               idx;
+    int               index;
+    void            * alloc_ptr;
+    size_t            alloc_size;
+    kmp_mem_descr_t * descr;
+
+    KE_TRACE( 25, ( "-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
+      __kmp_gtid_from_thread(this_thr), (int) size KMP_SRC_LOC_PARM ) );
+
+    num_lines = ( size + DCACHE_LINE - 1 ) / DCACHE_LINE;
+    idx = num_lines - 1;
+    KMP_DEBUG_ASSERT( idx >= 0 );
+    if ( idx < 2 ) {
+        index = 0;       // idx is [ 0, 1 ], use first free list
+        num_lines = 2;   // 1, 2 cache lines or less than cache line
+    } else if ( ( idx >>= 2 ) == 0 ) {
+        index = 1;       // idx is [ 2, 3 ], use second free list
+        num_lines = 4;   // 3, 4 cache lines
+    } else if ( ( idx >>= 2 ) == 0 ) {
+        index = 2;       // idx is [ 4, 15 ], use third free list
+        num_lines = 16;  // 5, 6, ..., 16 cache lines
+    } else if ( ( idx >>= 2 ) == 0 ) {
+        index = 3;       // idx is [ 16, 63 ], use fourth free list
+        num_lines = 64;  // 17, 18, ..., 64 cache lines
+    } else {
+        goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
+    }
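+
+    /* Worked example: a 1000-byte request gives num_lines = 8, idx = 7;
+       7 >> 2 == 1 (not 0), then 1 >> 2 == 0, so index = 2 and the request
+       is rounded up to 16 cache lines (2048 bytes) -- the third free list. */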
+
+    ptr = this_thr->th.th_free_lists[index].th_free_list_self;
+    if ( ptr != NULL ) {
+        // pop the head of no-sync free list
+        this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
+        KMP_DEBUG_ASSERT( this_thr ==
+            ((kmp_mem_descr_t *)( (kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t) ))->ptr_aligned );
+        goto end;
+    };
+    ptr = TCR_SYNC_PTR( this_thr->th.th_free_lists[index].th_free_list_sync );
+    if ( ptr != NULL ) {
+        // no-sync free list is empty, use sync free list (filled in by other threads only)
+        // pop the head of the sync free list, push NULL instead
+        while ( ! KMP_COMPARE_AND_STORE_PTR(
+            &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, NULL ) )
+        {
+            KMP_CPU_PAUSE();
+            ptr = TCR_SYNC_PTR( this_thr->th.th_free_lists[index].th_free_list_sync );
+        }
+        // push the rest of the chain onto the no-sync free list (can be NULL if there was only one block)
+        this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
+        KMP_DEBUG_ASSERT( this_thr ==
+            ((kmp_mem_descr_t *)( (kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t) ))->ptr_aligned );
+        goto end;
+    }
+
+    alloc_call:
+    // haven't found block in the free lists, thus allocate it
+    size = num_lines * DCACHE_LINE;
+
+    alloc_size = size + sizeof( kmp_mem_descr_t ) + DCACHE_LINE;
+    KE_TRACE( 25, ( "__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with alloc_size %d\n",
+                   __kmp_gtid_from_thread( this_thr ), alloc_size ) );
+    alloc_ptr = bget( this_thr, (bufsize) alloc_size );
+
+    // align ptr to DCACHE_LINE
+    ptr = (void *)(( ((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) + DCACHE_LINE ) & ~( DCACHE_LINE - 1 ));
+    descr = (kmp_mem_descr_t *)( ((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t) );
+
+    descr->ptr_allocated = alloc_ptr;        // remember allocated pointer
+    // we don't need size_allocated
+    descr->ptr_aligned   = (void *)this_thr; // remember allocating thread
+                                             // (it is already saved in bget buffer,
+                                             // but we may want to use another allocator in future)
+    descr->size_aligned  = size;
+
+    end:
+    KE_TRACE( 25, ( "<- __kmp_fast_allocate( T#%d ) returns %p\n",
+                    __kmp_gtid_from_thread( this_thr ), ptr ) );
+    return ptr;
+} // func __kmp_fast_allocate
+
+// Free fast memory and place it on the thread's free list if it is of
+// the correct size.
+void
+___kmp_fast_free( kmp_info_t *this_thr, void * ptr KMP_SRC_LOC_DECL )
+{
+    kmp_mem_descr_t * descr;
+    kmp_info_t      * alloc_thr;
+    size_t            size;
+    size_t            idx;
+    int               index;
+
+    KE_TRACE( 25, ( "-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
+      __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM ) );
+    KMP_ASSERT( ptr != NULL );
+
+    descr = (kmp_mem_descr_t *)( ((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t) );
+
+    KE_TRACE(26, ("   __kmp_fast_free:     size_aligned=%d\n",
+                  (int) descr->size_aligned ) );
+
+    size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines
+
+    idx = DCACHE_LINE * 2; // 2 cache lines is the minimal block size
+    if ( idx == size ) {
+        index = 0;       // 2 cache lines
+    } else if ( ( idx <<= 1 ) == size ) {
+        index = 1;       // 4 cache lines
+    } else if ( ( idx <<= 2 ) == size ) {
+        index = 2;       // 16 cache lines
+    } else if ( ( idx <<= 2 ) == size ) {
+        index = 3;       // 64 cache lines
+    } else {
+        KMP_DEBUG_ASSERT( size > DCACHE_LINE * 64 );
+        goto free_call;  // 65 or more cache lines ( > 8KB )
+    }
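+
+    /* E.g. the 2048-byte block from the allocation example above: idx runs
+       256 -> 512 -> 2048 == size, so index = 2, matching the allocate side. */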
+
+    alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
+    if ( alloc_thr == this_thr ) {
+        // push block to self no-sync free list, linking previous head (LIFO)
+        *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
+        this_thr->th.th_free_lists[index].th_free_list_self = ptr;
+    } else {
+        void * head = this_thr->th.th_free_lists[index].th_free_list_other;
+        if ( head == NULL ) {
+            // Create new free list
+            this_thr->th.th_free_lists[index].th_free_list_other = ptr;
+            *((void **)ptr) = NULL;             // mark the tail of the list
+            descr->size_allocated = (size_t)1;  // head of the list keeps its length
+        } else {
+            // need to check the existing "other" list's owner thread and queue size
+            kmp_mem_descr_t * dsc  = (kmp_mem_descr_t *)( (char*)head - sizeof(kmp_mem_descr_t) );
+            kmp_info_t      * q_th = (kmp_info_t *)(dsc->ptr_aligned); // allocating thread, same for all queue nodes
+            size_t            q_sz = dsc->size_allocated + 1;          // new size in case we add current task
+            if ( q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT ) {
+                // we can add current task to "other" list, no sync needed
+                *((void **)ptr) = head;
+                descr->size_allocated = q_sz;
+                this_thr->th.th_free_lists[index].th_free_list_other = ptr;
+            } else {
+                // either the queue's owning thread is changing or the size limit was exceeded:
+                // return the old queue to the allocating thread (q_th) synchronously,
+                // and start a new list for alloc_thr's tasks
+                void * old_ptr;
+                void * tail = head;
+                void * next = *((void **)head);
+                while ( next != NULL ) {
+                    KMP_DEBUG_ASSERT(
+                        // queue size should decrease by 1 each step through the list
+                        ((kmp_mem_descr_t*)((char*)next - sizeof(kmp_mem_descr_t)))->size_allocated + 1 ==
+                        ((kmp_mem_descr_t*)((char*)tail - sizeof(kmp_mem_descr_t)))->size_allocated );
+                    tail = next;   // remember tail node
+                    next = *((void **)next);
+                }
+                KMP_DEBUG_ASSERT( q_th != NULL );
+                // push block to owner's sync free list
+                old_ptr = TCR_PTR( q_th->th.th_free_lists[index].th_free_list_sync );
+                /* the next pointer must be set before setting free_list to ptr to avoid
+                   exposing a broken list to other threads, even for an instant. */
+                *((void **)tail) = old_ptr;
+
+                while ( ! KMP_COMPARE_AND_STORE_PTR(
+                    &q_th->th.th_free_lists[index].th_free_list_sync,
+                    old_ptr,
+                    head ) )
+                {
+                    KMP_CPU_PAUSE();
+                    old_ptr = TCR_PTR( q_th->th.th_free_lists[index].th_free_list_sync );
+                    *((void **)tail) = old_ptr;
+                }
+
+                // start a new list of not-self tasks
+                this_thr->th.th_free_lists[index].th_free_list_other = ptr;
+                *((void **)ptr) = NULL;
+                descr->size_allocated = (size_t)1;  // head of queue keeps its length
+            }
+        }
+    }
+    goto end;
+
+    free_call:
+    KE_TRACE(25, ( "__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n",
+                   __kmp_gtid_from_thread( this_thr), size ) );
+    __kmp_bget_dequeue( this_thr );         /* Release any queued buffers */
+    brel( this_thr, descr->ptr_allocated );
+
+    end:
+    KE_TRACE( 25, ( "<- __kmp_fast_free() returns\n" ) );
+
+} // func __kmp_fast_free
+
+
+// Initialize the thread free lists related to fast memory
+// Only do this when a thread is initially created.
+void
+__kmp_initialize_fast_memory( kmp_info_t *this_thr )
+{
+    KE_TRACE(10, ( "__kmp_initialize_fast_memory: Called from th %p\n", this_thr ) );
+
+    memset ( this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof( kmp_free_list_t ) );
+}
+
+// Free the memory in the thread free lists related to fast memory
+// Only do this when a thread is being reaped (destroyed).
+void
+__kmp_free_fast_memory( kmp_info_t *th )
+{
+    // Assuming the underlying allocator is BGET, walk through its structures...
+    int          bin;
+    thr_data_t * thr = get_thr_data( th );
+    void      ** lst = NULL;
+
+    KE_TRACE(5, ( "__kmp_free_fast_memory: Called T#%d\n",
+                   __kmp_gtid_from_thread( th ) ) );
+
+    __kmp_bget_dequeue( th );         // Release any queued buffers
+
+    // Dig through free lists and extract all allocated blocks
+    for ( bin = 0; bin < MAX_BGET_BINS; ++bin ) {
+        bfhead_t * b = thr->freelist[ bin ].ql.flink;
+        while ( b != &thr->freelist[ bin ] ) {
+            if ( (kmp_uintptr_t)b->bh.bb.bthr & 1 ) {   // is the buffer marked as an allocated address?
+                *((void**)b) = lst;   // link the list (overwrites bthr, but flink is still intact)
+                lst = (void**)b;      // push b onto lst
+            }
+            b = b->ql.flink;          // get next buffer
+        }
+    }
+    while ( lst != NULL ) {
+        void * next = *lst;
+        KE_TRACE(10, ( "__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
+                      lst, next, th, __kmp_gtid_from_thread( th ) ) );
+        (*thr->relfcn)(lst);
+        #if BufStats
+            // count blocks to prevent problems in __kmp_finalize_bget()
+            thr->numprel++;       /* Nr of expansion block releases */
+            thr->numpblk--;       /* Total number of blocks */
+        #endif
+        lst = (void**)next;
+    }
+
+    KE_TRACE(5, ( "__kmp_free_fast_memory: Freed T#%d\n",
+                  __kmp_gtid_from_thread( th ) ) );
+}
+
+#endif // USE_FAST_MEMORY
diff --git a/final/runtime/src/kmp_atomic.c b/final/runtime/src/kmp_atomic.c
new file mode 100644
index 0000000..5d5d344
--- /dev/null
+++ b/final/runtime/src/kmp_atomic.c
@@ -0,0 +1,2907 @@
+/*
+ * kmp_atomic.c -- ATOMIC implementation routines
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp_atomic.h"
+#include "kmp.h"                  // TRUE, asm routines prototypes
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+
+/*!
+@defgroup ATOMIC_OPS Atomic Operations
+These functions are used for implementing the many different varieties of atomic operations.
+
+The compiler is at liberty to inline atomic operations that are naturally supported
+by the target architecture. For instance on IA-32 architecture an atomic like this can be inlined
+@code
+static int s = 0;
+#pragma omp atomic
+    s++;
+@endcode
+using the single instruction: `lock; incl s`
+
+However the runtime does provide entrypoints for these operations to support compilers that choose
+not to inline them. (For instance, `__kmpc_atomic_fixed4_add` could be used to perform the
+increment above.)
+
+The names of the functions are encoded by using the data type name and the operation name, as in these tables.
+
+Data Type  | Data type encoding
+-----------|---------------
+int8_t     | `fixed1`
+uint8_t    | `fixed1u`
+int16_t    | `fixed2`
+uint16_t   | `fixed2u`
+int32_t    | `fixed4`
+uint32_t   | `fixed4u`
+int64_t    | `fixed8`
+uint64_t   | `fixed8u`
+float      | `float4`
+double     | `float8`
+long double (80-bit 8087 extended float)  | `float10`
+complex<float>   |  `cmplx4`
+complex<double>  | `cmplx8`
+complex<float10> | `cmplx10`
+<br>
+
+Operation | Operation encoding
+----------|-------------------
++ | add
+- | sub
+\* | mul
+/ | div
+& | andb
+<< | shl
+\>\> | shr
+\| | orb
+^  | xor
+&& | andl
+\|\| | orl
+maximum | max
+minimum | min
+.eqv.   | eqv
+.neqv.  | neqv
+
+<br>
+For non-commutative operations, `_rev` can also be added for the reversed operation.
+For the functions that capture the result, the suffix `_cpt` is added.
+
+Update Functions
+================
+The general form of an atomic function that just performs an update (without a `capture`)
+@code
+void __kmpc_atomic_<datatype>_<operation>( ident_t *id_ref, int gtid, TYPE * lhs, TYPE rhs );
+@endcode
+@param ident_t  a pointer to source location
+@param gtid  the global thread id
+@param lhs   a pointer to the left operand
+@param rhs   the right operand
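+
+For example, given the encodings above, a compiler could translate
+@code
+#pragma omp atomic
+    x += y;   // x and y are doubles
+@endcode
+into a call such as `__kmpc_atomic_float8_add( &loc, gtid, &x, y );`
+(illustrative; `loc` and `gtid` are supplied by the compiler).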
+
+`capture` functions
+===================
+The capture functions perform an atomic update and return a result, which is either the value
+before the capture, or that after. They take an additional argument to determine which result is returned.
+Their general form is therefore
+@code
+TYPE __kmpc_atomic_<datatype>_<operation>_cpt( ident_t *id_ref, int gtid, TYPE * lhs, TYPE rhs, int flag );
+@endcode
+@param ident_t  a pointer to source location
+@param gtid  the global thread id
+@param lhs   a pointer to the left operand
+@param rhs   the right operand
+@param flag  one if the result is to be captured *after* the operation, zero if captured *before*.
+
+The one set of exceptions to this is the `complex<float>` type, where the value is not returned;
+instead an extra pointer argument is passed.
+
+They look like
+@code
+void __kmpc_atomic_cmplx4_<op>_cpt(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs, kmp_cmplx32 * out, int flag );
+@endcode
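+
+For example, `v = x++;` on a 32-bit integer could map to
+@code
+    v = __kmpc_atomic_fixed4_add_cpt( &loc, gtid, &x, 1, 0 );
+@endcode
+with `flag == 0` because the value *before* the increment is captured (illustrative).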
+
+Read and Write Operations
+=========================
+The OpenMP<sup>*</sup> standard now supports atomic operations that simply ensure that the
+value is read or written atomically, with no modification
+performed. In many cases on IA-32 architecture these operations can be inlined since
+the architecture guarantees that no tearing occurs on aligned objects
+accessed with a single memory operation of up to 64 bits in size.
+
+The general form of the read operations is
+@code
+TYPE __kmpc_atomic_<type>_rd ( ident_t *id_ref, int gtid, TYPE * loc );
+@endcode
+
+For the write operations the form is
+@code
+void __kmpc_atomic_<type>_wr ( ident_t *id_ref, int gtid, TYPE * lhs, TYPE rhs );
+@endcode
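+
+For example, an atomic read of a double could be
+`v = __kmpc_atomic_float8_rd( &loc, gtid, &x );` and the corresponding write
+`__kmpc_atomic_float8_wr( &loc, gtid, &x, v );` (illustrative).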
+
+Full list of functions
+======================
+This leads to the generation of 376 atomic functions, as follows.
+
+Functions for integers
+---------------------
+There are versions here for integers of size 1,2,4 and 8 bytes both signed and unsigned (where that matters).
+@code
+    __kmpc_atomic_fixed1_add
+    __kmpc_atomic_fixed1_add_cpt
+    __kmpc_atomic_fixed1_add_fp
+    __kmpc_atomic_fixed1_andb
+    __kmpc_atomic_fixed1_andb_cpt
+    __kmpc_atomic_fixed1_andl
+    __kmpc_atomic_fixed1_andl_cpt
+    __kmpc_atomic_fixed1_div
+    __kmpc_atomic_fixed1_div_cpt
+    __kmpc_atomic_fixed1_div_cpt_rev
+    __kmpc_atomic_fixed1_div_float8
+    __kmpc_atomic_fixed1_div_fp
+    __kmpc_atomic_fixed1_div_rev
+    __kmpc_atomic_fixed1_eqv
+    __kmpc_atomic_fixed1_eqv_cpt
+    __kmpc_atomic_fixed1_max
+    __kmpc_atomic_fixed1_max_cpt
+    __kmpc_atomic_fixed1_min
+    __kmpc_atomic_fixed1_min_cpt
+    __kmpc_atomic_fixed1_mul
+    __kmpc_atomic_fixed1_mul_cpt
+    __kmpc_atomic_fixed1_mul_float8
+    __kmpc_atomic_fixed1_mul_fp
+    __kmpc_atomic_fixed1_neqv
+    __kmpc_atomic_fixed1_neqv_cpt
+    __kmpc_atomic_fixed1_orb
+    __kmpc_atomic_fixed1_orb_cpt
+    __kmpc_atomic_fixed1_orl
+    __kmpc_atomic_fixed1_orl_cpt
+    __kmpc_atomic_fixed1_rd
+    __kmpc_atomic_fixed1_shl
+    __kmpc_atomic_fixed1_shl_cpt
+    __kmpc_atomic_fixed1_shl_cpt_rev
+    __kmpc_atomic_fixed1_shl_rev
+    __kmpc_atomic_fixed1_shr
+    __kmpc_atomic_fixed1_shr_cpt
+    __kmpc_atomic_fixed1_shr_cpt_rev
+    __kmpc_atomic_fixed1_shr_rev
+    __kmpc_atomic_fixed1_sub
+    __kmpc_atomic_fixed1_sub_cpt
+    __kmpc_atomic_fixed1_sub_cpt_rev
+    __kmpc_atomic_fixed1_sub_fp
+    __kmpc_atomic_fixed1_sub_rev
+    __kmpc_atomic_fixed1_swp
+    __kmpc_atomic_fixed1_wr
+    __kmpc_atomic_fixed1_xor
+    __kmpc_atomic_fixed1_xor_cpt
+    __kmpc_atomic_fixed1u_div
+    __kmpc_atomic_fixed1u_div_cpt
+    __kmpc_atomic_fixed1u_div_cpt_rev
+    __kmpc_atomic_fixed1u_div_fp
+    __kmpc_atomic_fixed1u_div_rev
+    __kmpc_atomic_fixed1u_shr
+    __kmpc_atomic_fixed1u_shr_cpt
+    __kmpc_atomic_fixed1u_shr_cpt_rev
+    __kmpc_atomic_fixed1u_shr_rev
+    __kmpc_atomic_fixed2_add
+    __kmpc_atomic_fixed2_add_cpt
+    __kmpc_atomic_fixed2_add_fp
+    __kmpc_atomic_fixed2_andb
+    __kmpc_atomic_fixed2_andb_cpt
+    __kmpc_atomic_fixed2_andl
+    __kmpc_atomic_fixed2_andl_cpt
+    __kmpc_atomic_fixed2_div
+    __kmpc_atomic_fixed2_div_cpt
+    __kmpc_atomic_fixed2_div_cpt_rev
+    __kmpc_atomic_fixed2_div_float8
+    __kmpc_atomic_fixed2_div_fp
+    __kmpc_atomic_fixed2_div_rev
+    __kmpc_atomic_fixed2_eqv
+    __kmpc_atomic_fixed2_eqv_cpt
+    __kmpc_atomic_fixed2_max
+    __kmpc_atomic_fixed2_max_cpt
+    __kmpc_atomic_fixed2_min
+    __kmpc_atomic_fixed2_min_cpt
+    __kmpc_atomic_fixed2_mul
+    __kmpc_atomic_fixed2_mul_cpt
+    __kmpc_atomic_fixed2_mul_float8
+    __kmpc_atomic_fixed2_mul_fp
+    __kmpc_atomic_fixed2_neqv
+    __kmpc_atomic_fixed2_neqv_cpt
+    __kmpc_atomic_fixed2_orb
+    __kmpc_atomic_fixed2_orb_cpt
+    __kmpc_atomic_fixed2_orl
+    __kmpc_atomic_fixed2_orl_cpt
+    __kmpc_atomic_fixed2_rd
+    __kmpc_atomic_fixed2_shl
+    __kmpc_atomic_fixed2_shl_cpt
+    __kmpc_atomic_fixed2_shl_cpt_rev
+    __kmpc_atomic_fixed2_shl_rev
+    __kmpc_atomic_fixed2_shr
+    __kmpc_atomic_fixed2_shr_cpt
+    __kmpc_atomic_fixed2_shr_cpt_rev
+    __kmpc_atomic_fixed2_shr_rev
+    __kmpc_atomic_fixed2_sub
+    __kmpc_atomic_fixed2_sub_cpt
+    __kmpc_atomic_fixed2_sub_cpt_rev
+    __kmpc_atomic_fixed2_sub_fp
+    __kmpc_atomic_fixed2_sub_rev
+    __kmpc_atomic_fixed2_swp
+    __kmpc_atomic_fixed2_wr
+    __kmpc_atomic_fixed2_xor
+    __kmpc_atomic_fixed2_xor_cpt
+    __kmpc_atomic_fixed2u_div
+    __kmpc_atomic_fixed2u_div_cpt
+    __kmpc_atomic_fixed2u_div_cpt_rev
+    __kmpc_atomic_fixed2u_div_fp
+    __kmpc_atomic_fixed2u_div_rev
+    __kmpc_atomic_fixed2u_shr
+    __kmpc_atomic_fixed2u_shr_cpt
+    __kmpc_atomic_fixed2u_shr_cpt_rev
+    __kmpc_atomic_fixed2u_shr_rev
+    __kmpc_atomic_fixed4_add
+    __kmpc_atomic_fixed4_add_cpt
+    __kmpc_atomic_fixed4_add_fp
+    __kmpc_atomic_fixed4_andb
+    __kmpc_atomic_fixed4_andb_cpt
+    __kmpc_atomic_fixed4_andl
+    __kmpc_atomic_fixed4_andl_cpt
+    __kmpc_atomic_fixed4_div
+    __kmpc_atomic_fixed4_div_cpt
+    __kmpc_atomic_fixed4_div_cpt_rev
+    __kmpc_atomic_fixed4_div_float8
+    __kmpc_atomic_fixed4_div_fp
+    __kmpc_atomic_fixed4_div_rev
+    __kmpc_atomic_fixed4_eqv
+    __kmpc_atomic_fixed4_eqv_cpt
+    __kmpc_atomic_fixed4_max
+    __kmpc_atomic_fixed4_max_cpt
+    __kmpc_atomic_fixed4_min
+    __kmpc_atomic_fixed4_min_cpt
+    __kmpc_atomic_fixed4_mul
+    __kmpc_atomic_fixed4_mul_cpt
+    __kmpc_atomic_fixed4_mul_float8
+    __kmpc_atomic_fixed4_mul_fp
+    __kmpc_atomic_fixed4_neqv
+    __kmpc_atomic_fixed4_neqv_cpt
+    __kmpc_atomic_fixed4_orb
+    __kmpc_atomic_fixed4_orb_cpt
+    __kmpc_atomic_fixed4_orl
+    __kmpc_atomic_fixed4_orl_cpt
+    __kmpc_atomic_fixed4_rd
+    __kmpc_atomic_fixed4_shl
+    __kmpc_atomic_fixed4_shl_cpt
+    __kmpc_atomic_fixed4_shl_cpt_rev
+    __kmpc_atomic_fixed4_shl_rev
+    __kmpc_atomic_fixed4_shr
+    __kmpc_atomic_fixed4_shr_cpt
+    __kmpc_atomic_fixed4_shr_cpt_rev
+    __kmpc_atomic_fixed4_shr_rev
+    __kmpc_atomic_fixed4_sub
+    __kmpc_atomic_fixed4_sub_cpt
+    __kmpc_atomic_fixed4_sub_cpt_rev
+    __kmpc_atomic_fixed4_sub_fp
+    __kmpc_atomic_fixed4_sub_rev
+    __kmpc_atomic_fixed4_swp
+    __kmpc_atomic_fixed4_wr
+    __kmpc_atomic_fixed4_xor
+    __kmpc_atomic_fixed4_xor_cpt
+    __kmpc_atomic_fixed4u_div
+    __kmpc_atomic_fixed4u_div_cpt
+    __kmpc_atomic_fixed4u_div_cpt_rev
+    __kmpc_atomic_fixed4u_div_fp
+    __kmpc_atomic_fixed4u_div_rev
+    __kmpc_atomic_fixed4u_shr
+    __kmpc_atomic_fixed4u_shr_cpt
+    __kmpc_atomic_fixed4u_shr_cpt_rev
+    __kmpc_atomic_fixed4u_shr_rev
+    __kmpc_atomic_fixed8_add
+    __kmpc_atomic_fixed8_add_cpt
+    __kmpc_atomic_fixed8_add_fp
+    __kmpc_atomic_fixed8_andb
+    __kmpc_atomic_fixed8_andb_cpt
+    __kmpc_atomic_fixed8_andl
+    __kmpc_atomic_fixed8_andl_cpt
+    __kmpc_atomic_fixed8_div
+    __kmpc_atomic_fixed8_div_cpt
+    __kmpc_atomic_fixed8_div_cpt_rev
+    __kmpc_atomic_fixed8_div_float8
+    __kmpc_atomic_fixed8_div_fp
+    __kmpc_atomic_fixed8_div_rev
+    __kmpc_atomic_fixed8_eqv
+    __kmpc_atomic_fixed8_eqv_cpt
+    __kmpc_atomic_fixed8_max
+    __kmpc_atomic_fixed8_max_cpt
+    __kmpc_atomic_fixed8_min
+    __kmpc_atomic_fixed8_min_cpt
+    __kmpc_atomic_fixed8_mul
+    __kmpc_atomic_fixed8_mul_cpt
+    __kmpc_atomic_fixed8_mul_float8
+    __kmpc_atomic_fixed8_mul_fp
+    __kmpc_atomic_fixed8_neqv
+    __kmpc_atomic_fixed8_neqv_cpt
+    __kmpc_atomic_fixed8_orb
+    __kmpc_atomic_fixed8_orb_cpt
+    __kmpc_atomic_fixed8_orl
+    __kmpc_atomic_fixed8_orl_cpt
+    __kmpc_atomic_fixed8_rd
+    __kmpc_atomic_fixed8_shl
+    __kmpc_atomic_fixed8_shl_cpt
+    __kmpc_atomic_fixed8_shl_cpt_rev
+    __kmpc_atomic_fixed8_shl_rev
+    __kmpc_atomic_fixed8_shr
+    __kmpc_atomic_fixed8_shr_cpt
+    __kmpc_atomic_fixed8_shr_cpt_rev
+    __kmpc_atomic_fixed8_shr_rev
+    __kmpc_atomic_fixed8_sub
+    __kmpc_atomic_fixed8_sub_cpt
+    __kmpc_atomic_fixed8_sub_cpt_rev
+    __kmpc_atomic_fixed8_sub_fp
+    __kmpc_atomic_fixed8_sub_rev
+    __kmpc_atomic_fixed8_swp
+    __kmpc_atomic_fixed8_wr
+    __kmpc_atomic_fixed8_xor
+    __kmpc_atomic_fixed8_xor_cpt
+    __kmpc_atomic_fixed8u_div
+    __kmpc_atomic_fixed8u_div_cpt
+    __kmpc_atomic_fixed8u_div_cpt_rev
+    __kmpc_atomic_fixed8u_div_fp
+    __kmpc_atomic_fixed8u_div_rev
+    __kmpc_atomic_fixed8u_shr
+    __kmpc_atomic_fixed8u_shr_cpt
+    __kmpc_atomic_fixed8u_shr_cpt_rev
+    __kmpc_atomic_fixed8u_shr_rev
+@endcode
+
+Functions for floating point
+----------------------------
+There are versions here for floating point numbers of sizes 4, 8, 10 and 16 bytes.
+(Ten-byte floats are used by the x87 floating point unit, but are now rare.)
+@code
+    __kmpc_atomic_float4_add
+    __kmpc_atomic_float4_add_cpt
+    __kmpc_atomic_float4_add_float8
+    __kmpc_atomic_float4_add_fp
+    __kmpc_atomic_float4_div
+    __kmpc_atomic_float4_div_cpt
+    __kmpc_atomic_float4_div_cpt_rev
+    __kmpc_atomic_float4_div_float8
+    __kmpc_atomic_float4_div_fp
+    __kmpc_atomic_float4_div_rev
+    __kmpc_atomic_float4_max
+    __kmpc_atomic_float4_max_cpt
+    __kmpc_atomic_float4_min
+    __kmpc_atomic_float4_min_cpt
+    __kmpc_atomic_float4_mul
+    __kmpc_atomic_float4_mul_cpt
+    __kmpc_atomic_float4_mul_float8
+    __kmpc_atomic_float4_mul_fp
+    __kmpc_atomic_float4_rd
+    __kmpc_atomic_float4_sub
+    __kmpc_atomic_float4_sub_cpt
+    __kmpc_atomic_float4_sub_cpt_rev
+    __kmpc_atomic_float4_sub_float8
+    __kmpc_atomic_float4_sub_fp
+    __kmpc_atomic_float4_sub_rev
+    __kmpc_atomic_float4_swp
+    __kmpc_atomic_float4_wr
+    __kmpc_atomic_float8_add
+    __kmpc_atomic_float8_add_cpt
+    __kmpc_atomic_float8_add_fp
+    __kmpc_atomic_float8_div
+    __kmpc_atomic_float8_div_cpt
+    __kmpc_atomic_float8_div_cpt_rev
+    __kmpc_atomic_float8_div_fp
+    __kmpc_atomic_float8_div_rev
+    __kmpc_atomic_float8_max
+    __kmpc_atomic_float8_max_cpt
+    __kmpc_atomic_float8_min
+    __kmpc_atomic_float8_min_cpt
+    __kmpc_atomic_float8_mul
+    __kmpc_atomic_float8_mul_cpt
+    __kmpc_atomic_float8_mul_fp
+    __kmpc_atomic_float8_rd
+    __kmpc_atomic_float8_sub
+    __kmpc_atomic_float8_sub_cpt
+    __kmpc_atomic_float8_sub_cpt_rev
+    __kmpc_atomic_float8_sub_fp
+    __kmpc_atomic_float8_sub_rev
+    __kmpc_atomic_float8_swp
+    __kmpc_atomic_float8_wr
+    __kmpc_atomic_float10_add
+    __kmpc_atomic_float10_add_cpt
+    __kmpc_atomic_float10_add_fp
+    __kmpc_atomic_float10_div
+    __kmpc_atomic_float10_div_cpt
+    __kmpc_atomic_float10_div_cpt_rev
+    __kmpc_atomic_float10_div_fp
+    __kmpc_atomic_float10_div_rev
+    __kmpc_atomic_float10_mul
+    __kmpc_atomic_float10_mul_cpt
+    __kmpc_atomic_float10_mul_fp
+    __kmpc_atomic_float10_rd
+    __kmpc_atomic_float10_sub
+    __kmpc_atomic_float10_sub_cpt
+    __kmpc_atomic_float10_sub_cpt_rev
+    __kmpc_atomic_float10_sub_fp
+    __kmpc_atomic_float10_sub_rev
+    __kmpc_atomic_float10_swp
+    __kmpc_atomic_float10_wr
+    __kmpc_atomic_float16_add
+    __kmpc_atomic_float16_add_cpt
+    __kmpc_atomic_float16_div
+    __kmpc_atomic_float16_div_cpt
+    __kmpc_atomic_float16_div_cpt_rev
+    __kmpc_atomic_float16_div_rev
+    __kmpc_atomic_float16_max
+    __kmpc_atomic_float16_max_cpt
+    __kmpc_atomic_float16_min
+    __kmpc_atomic_float16_min_cpt
+    __kmpc_atomic_float16_mul
+    __kmpc_atomic_float16_mul_cpt
+    __kmpc_atomic_float16_rd
+    __kmpc_atomic_float16_sub
+    __kmpc_atomic_float16_sub_cpt
+    __kmpc_atomic_float16_sub_cpt_rev
+    __kmpc_atomic_float16_sub_rev
+    __kmpc_atomic_float16_swp
+    __kmpc_atomic_float16_wr
+@endcode
+
+Functions for complex types
+---------------------------
+Functions for complex types whose component floating point variables are of sizes 4, 8, 10 or 16 bytes.
+The names here are based on the size of the component float, *not* the size of the complex type. So
+`__kmpc_atomic_cmplx8_add` is an operation on a `complex<double>` or `complex(kind=8)`, *not* `complex<float>`.
+
+@code
+    __kmpc_atomic_cmplx4_add
+    __kmpc_atomic_cmplx4_add_cmplx8
+    __kmpc_atomic_cmplx4_add_cpt
+    __kmpc_atomic_cmplx4_div
+    __kmpc_atomic_cmplx4_div_cmplx8
+    __kmpc_atomic_cmplx4_div_cpt
+    __kmpc_atomic_cmplx4_div_cpt_rev
+    __kmpc_atomic_cmplx4_div_rev
+    __kmpc_atomic_cmplx4_mul
+    __kmpc_atomic_cmplx4_mul_cmplx8
+    __kmpc_atomic_cmplx4_mul_cpt
+    __kmpc_atomic_cmplx4_rd
+    __kmpc_atomic_cmplx4_sub
+    __kmpc_atomic_cmplx4_sub_cmplx8
+    __kmpc_atomic_cmplx4_sub_cpt
+    __kmpc_atomic_cmplx4_sub_cpt_rev
+    __kmpc_atomic_cmplx4_sub_rev
+    __kmpc_atomic_cmplx4_swp
+    __kmpc_atomic_cmplx4_wr
+    __kmpc_atomic_cmplx8_add
+    __kmpc_atomic_cmplx8_add_cpt
+    __kmpc_atomic_cmplx8_div
+    __kmpc_atomic_cmplx8_div_cpt
+    __kmpc_atomic_cmplx8_div_cpt_rev
+    __kmpc_atomic_cmplx8_div_rev
+    __kmpc_atomic_cmplx8_mul
+    __kmpc_atomic_cmplx8_mul_cpt
+    __kmpc_atomic_cmplx8_rd
+    __kmpc_atomic_cmplx8_sub
+    __kmpc_atomic_cmplx8_sub_cpt
+    __kmpc_atomic_cmplx8_sub_cpt_rev
+    __kmpc_atomic_cmplx8_sub_rev
+    __kmpc_atomic_cmplx8_swp
+    __kmpc_atomic_cmplx8_wr
+    __kmpc_atomic_cmplx10_add
+    __kmpc_atomic_cmplx10_add_cpt
+    __kmpc_atomic_cmplx10_div
+    __kmpc_atomic_cmplx10_div_cpt
+    __kmpc_atomic_cmplx10_div_cpt_rev
+    __kmpc_atomic_cmplx10_div_rev
+    __kmpc_atomic_cmplx10_mul
+    __kmpc_atomic_cmplx10_mul_cpt
+    __kmpc_atomic_cmplx10_rd
+    __kmpc_atomic_cmplx10_sub
+    __kmpc_atomic_cmplx10_sub_cpt
+    __kmpc_atomic_cmplx10_sub_cpt_rev
+    __kmpc_atomic_cmplx10_sub_rev
+    __kmpc_atomic_cmplx10_swp
+    __kmpc_atomic_cmplx10_wr
+    __kmpc_atomic_cmplx16_add
+    __kmpc_atomic_cmplx16_add_cpt
+    __kmpc_atomic_cmplx16_div
+    __kmpc_atomic_cmplx16_div_cpt
+    __kmpc_atomic_cmplx16_div_cpt_rev
+    __kmpc_atomic_cmplx16_div_rev
+    __kmpc_atomic_cmplx16_mul
+    __kmpc_atomic_cmplx16_mul_cpt
+    __kmpc_atomic_cmplx16_rd
+    __kmpc_atomic_cmplx16_sub
+    __kmpc_atomic_cmplx16_sub_cpt
+    __kmpc_atomic_cmplx16_sub_cpt_rev
+    __kmpc_atomic_cmplx16_swp
+    __kmpc_atomic_cmplx16_wr
+@endcode
+*/
+
+/*!
+@ingroup ATOMIC_OPS
+@{
+*/
+
+/*
+ * Global vars
+ */
+
+#ifndef KMP_GOMP_COMPAT
+int __kmp_atomic_mode = 1;      // Intel perf
+#else
+int __kmp_atomic_mode = 2;      // GOMP compatibility
+#endif /* KMP_GOMP_COMPAT */
+
+KMP_ALIGN(128)
+
+kmp_atomic_lock_t __kmp_atomic_lock;     /* Control access to all user coded atomics in Gnu compat mode   */
+kmp_atomic_lock_t __kmp_atomic_lock_1i;  /* Control access to all user coded atomics for 1-byte fixed data types */
+kmp_atomic_lock_t __kmp_atomic_lock_2i;  /* Control access to all user coded atomics for 2-byte fixed data types */
+kmp_atomic_lock_t __kmp_atomic_lock_4i;  /* Control access to all user coded atomics for 4-byte fixed data types */
+kmp_atomic_lock_t __kmp_atomic_lock_4r;  /* Control access to all user coded atomics for kmp_real32 data type    */
+kmp_atomic_lock_t __kmp_atomic_lock_8i;  /* Control access to all user coded atomics for 8-byte fixed data types */
+kmp_atomic_lock_t __kmp_atomic_lock_8r;  /* Control access to all user coded atomics for kmp_real64 data type    */
+kmp_atomic_lock_t __kmp_atomic_lock_8c;  /* Control access to all user coded atomics for complex byte data type  */
+kmp_atomic_lock_t __kmp_atomic_lock_10r; /* Control access to all user coded atomics for long double data type   */
+kmp_atomic_lock_t __kmp_atomic_lock_16r; /* Control access to all user coded atomics for _Quad data type         */
+kmp_atomic_lock_t __kmp_atomic_lock_16c; /* Control access to all user coded atomics for double complex data type*/
+kmp_atomic_lock_t __kmp_atomic_lock_20c; /* Control access to all user coded atomics for long double complex type*/
+kmp_atomic_lock_t __kmp_atomic_lock_32c; /* Control access to all user coded atomics for _Quad complex data type */
+
+
+/*
+  2007-03-02:
+  Without the "volatile" specifier in OP_CMPXCHG and MIN_MAX_CMPXCHG we have a
+  bug on *_32 and *_32e. This is just a temporary workaround for the problem.
+  It seems the right solution is to write the OP_CMPXCHG and MIN_MAX_CMPXCHG
+  routines in assembly language.
+*/
+#define KMP_ATOMIC_VOLATILE volatile
+
+#if ( KMP_ARCH_X86 ) && KMP_HAVE_QUAD
+
+    static inline void operator +=( Quad_a4_t & lhs, Quad_a4_t & rhs ) { lhs.q += rhs.q; };
+    static inline void operator -=( Quad_a4_t & lhs, Quad_a4_t & rhs ) { lhs.q -= rhs.q; };
+    static inline void operator *=( Quad_a4_t & lhs, Quad_a4_t & rhs ) { lhs.q *= rhs.q; };
+    static inline void operator /=( Quad_a4_t & lhs, Quad_a4_t & rhs ) { lhs.q /= rhs.q; };
+    static inline bool operator < ( Quad_a4_t & lhs, Quad_a4_t & rhs ) { return lhs.q < rhs.q; }
+    static inline bool operator > ( Quad_a4_t & lhs, Quad_a4_t & rhs ) { return lhs.q > rhs.q; }
+
+    static inline void operator +=( Quad_a16_t & lhs, Quad_a16_t & rhs ) { lhs.q += rhs.q; };
+    static inline void operator -=( Quad_a16_t & lhs, Quad_a16_t & rhs ) { lhs.q -= rhs.q; };
+    static inline void operator *=( Quad_a16_t & lhs, Quad_a16_t & rhs ) { lhs.q *= rhs.q; };
+    static inline void operator /=( Quad_a16_t & lhs, Quad_a16_t & rhs ) { lhs.q /= rhs.q; };
+    static inline bool operator < ( Quad_a16_t & lhs, Quad_a16_t & rhs ) { return lhs.q < rhs.q; }
+    static inline bool operator > ( Quad_a16_t & lhs, Quad_a16_t & rhs ) { return lhs.q > rhs.q; }
+
+    static inline void operator +=( kmp_cmplx128_a4_t & lhs, kmp_cmplx128_a4_t & rhs ) { lhs.q += rhs.q; };
+    static inline void operator -=( kmp_cmplx128_a4_t & lhs, kmp_cmplx128_a4_t & rhs ) { lhs.q -= rhs.q; };
+    static inline void operator *=( kmp_cmplx128_a4_t & lhs, kmp_cmplx128_a4_t & rhs ) { lhs.q *= rhs.q; };
+    static inline void operator /=( kmp_cmplx128_a4_t & lhs, kmp_cmplx128_a4_t & rhs ) { lhs.q /= rhs.q; };
+
+    static inline void operator +=( kmp_cmplx128_a16_t & lhs, kmp_cmplx128_a16_t & rhs ) { lhs.q += rhs.q; };
+    static inline void operator -=( kmp_cmplx128_a16_t & lhs, kmp_cmplx128_a16_t & rhs ) { lhs.q -= rhs.q; };
+    static inline void operator *=( kmp_cmplx128_a16_t & lhs, kmp_cmplx128_a16_t & rhs ) { lhs.q *= rhs.q; };
+    static inline void operator /=( kmp_cmplx128_a16_t & lhs, kmp_cmplx128_a16_t & rhs ) { lhs.q /= rhs.q; };
+
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ATOMIC implementation routines                                           */
+/* one routine for each operation and operand type                          */
+/* ------------------------------------------------------------------------ */
+
+// All routine declarations look like
+// void __kmpc_atomic_RTYPE_OP( ident_t*, int, TYPE *lhs, TYPE rhs );
+// ------------------------------------------------------------------------
+
+#define KMP_CHECK_GTID                                                    \
+    if ( gtid == KMP_GTID_UNKNOWN ) {                                     \
+        gtid = __kmp_entry_gtid();                                        \
+    } // check and get gtid when needed
+
+// Beginning of a definition (provides name, parameters, debug trace)
+//     TYPE_ID - operands type and size (fixed*, fixed*u for signed, unsigned fixed)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - operands' type
+#define ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE, RET_TYPE) \
+RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID( ident_t *id_ref, int gtid, TYPE * lhs, TYPE rhs ) \
+{                                                                                         \
+    KMP_DEBUG_ASSERT( __kmp_init_serial );                                                \
+    KA_TRACE(100,("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid ));
+
+// ------------------------------------------------------------------------
+// Lock variables used for critical sections for various size operands
+#define ATOMIC_LOCK0   __kmp_atomic_lock       // all types, for Gnu compat
+#define ATOMIC_LOCK1i  __kmp_atomic_lock_1i    // char
+#define ATOMIC_LOCK2i  __kmp_atomic_lock_2i    // short
+#define ATOMIC_LOCK4i  __kmp_atomic_lock_4i    // long int
+#define ATOMIC_LOCK4r  __kmp_atomic_lock_4r    // float
+#define ATOMIC_LOCK8i  __kmp_atomic_lock_8i    // long long int
+#define ATOMIC_LOCK8r  __kmp_atomic_lock_8r    // double
+#define ATOMIC_LOCK8c  __kmp_atomic_lock_8c    // float complex
+#define ATOMIC_LOCK10r __kmp_atomic_lock_10r   // long double
+#define ATOMIC_LOCK16r __kmp_atomic_lock_16r   // _Quad
+#define ATOMIC_LOCK16c __kmp_atomic_lock_16c   // double complex
+#define ATOMIC_LOCK20c __kmp_atomic_lock_20c   // long double complex
+#define ATOMIC_LOCK32c __kmp_atomic_lock_32c   // _Quad complex
+
+// ------------------------------------------------------------------------
+// Operation on *lhs, rhs bound by critical section
+//     OP     - operator (it's supposed to contain an assignment)
+//     LCK_ID - lock identifier
+// Note: gtid is not checked here because it should always be valid.
+// For 1- and 2-byte operands a valid gtid is expected; for others, check it before this macro.
+#define OP_CRITICAL(OP,LCK_ID) \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );                    \
+                                                                          \
+    (*lhs) OP (rhs);                                                      \
+                                                                          \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );
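+
+// For example, OP_CRITICAL(+=,4i) expands (roughly) to:
+//     __kmp_acquire_atomic_lock( & __kmp_atomic_lock_4i, gtid );
+//     (*lhs) += (rhs);
+//     __kmp_release_atomic_lock( & __kmp_atomic_lock_4i, gtid );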
+
+// ------------------------------------------------------------------------
+// For GNU compatibility, we may need to use a critical section,
+// even though it is not required by the ISA.
+//
+// On IA-32 architecture, all atomic operations except for fixed 4 byte add,
+// sub, and bitwise logical ops, and 1 & 2 byte logical ops use a common
+// critical section.  On Intel(R) 64, all atomic operations are done with fetch
+// and add or compare and exchange.  Therefore, the FLAG parameter to this
+// macro is either KMP_ARCH_X86 or 0 (or 1, for Intel-specific extensions which
+// require a critical section, where we predict that they will be implemented
+// in the Gnu codegen by calling GOMP_atomic_start() / GOMP_atomic_end()).
+//
+// When the OP_GOMP_CRITICAL macro is used in a *CRITICAL* macro construct,
+// the FLAG parameter should always be 1.  If we know that we will be using
+// a critical section, then we want to make certain that we use the generic
+// lock __kmp_atomic_lock to protect the atomic update, and not one of the
+// locks that are specialized based upon the size or type of the data.
+//
+// If FLAG is 0, then we are relying on dead code elimination by the build
+// compiler to get rid of the useless block of code, and save a needless
+// branch at runtime.
+//
+
+#ifdef KMP_GOMP_COMPAT
+# define OP_GOMP_CRITICAL(OP,FLAG)                                        \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL( OP, 0 );                                             \
+        return;                                                           \
+    }
+# else
+# define OP_GOMP_CRITICAL(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+#if KMP_MIC
+# define KMP_DO_PAUSE _mm_delay_32( 1 )
+#else
+# define KMP_DO_PAUSE KMP_CPU_PAUSE()
+#endif /* KMP_MIC */
+
+// ------------------------------------------------------------------------
+// Operation on *lhs, rhs using "compare_and_store" routine
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator
+#define OP_CMPXCHG(TYPE,BITS,OP)                                          \
+    {                                                                     \
+        TYPE old_value, new_value;                                        \
+        old_value = *(TYPE volatile *)lhs;                                \
+        new_value = old_value OP rhs;                                     \
+        while ( ! KMP_COMPARE_AND_STORE_ACQ##BITS( (kmp_int##BITS *) lhs, \
+                      *VOLATILE_CAST(kmp_int##BITS *) &old_value,         \
+                      *VOLATILE_CAST(kmp_int##BITS *) &new_value ) )      \
+        {                                                                 \
+                KMP_DO_PAUSE;                                             \
+                                                                          \
+            old_value = *(TYPE volatile *)lhs;                            \
+            new_value = old_value OP rhs;                                 \
+        }                                                                 \
+    }
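+
+/*
+  For example, OP_CMPXCHG(kmp_real32,32,+) expands (roughly) to the retry loop
+  below: read the old value, compute the new one, and attempt the swap until no
+  other thread has changed *lhs in between.
+
+    kmp_real32 old_value, new_value;
+    old_value = *(kmp_real32 volatile *)lhs;
+    new_value = old_value + rhs;
+    while ( ! KMP_COMPARE_AND_STORE_ACQ32( (kmp_int32 *) lhs,
+                  *VOLATILE_CAST(kmp_int32 *) &old_value,
+                  *VOLATILE_CAST(kmp_int32 *) &new_value ) )
+    {
+        KMP_DO_PAUSE;
+        old_value = *(kmp_real32 volatile *)lhs;
+        new_value = old_value + rhs;
+    }
+*/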
+
+#if USE_CMPXCHG_FIX
+// 2007-06-25:
+// workaround for C78287 (complex(kind=4) data type)
+// lin_32, lin_32e, win_32 and win_32e are affected (I verified the asm)
+// The compiler ignores the volatile qualifier of temp_val in the OP_CMPXCHG macro.
+// This is a compiler bug.
+// The related tracker is C76005, targeted for 11.0.
+// I verified the asm of the workaround.
+#define OP_CMPXCHG_WORKAROUND(TYPE,BITS,OP)                               \
+    {                                                                     \
+	struct _sss {                                                     \
+	    TYPE            cmp;                                          \
+	    kmp_int##BITS   *vvv;                                         \
+	};                                                                \
+        struct _sss old_value, new_value;                                 \
+        old_value.vvv = ( kmp_int##BITS * )&old_value.cmp;                \
+        new_value.vvv = ( kmp_int##BITS * )&new_value.cmp;                \
+        *old_value.vvv = * ( volatile kmp_int##BITS * ) lhs;              \
+        new_value.cmp = old_value.cmp OP rhs;                             \
+        while ( ! KMP_COMPARE_AND_STORE_ACQ##BITS( (kmp_int##BITS *) lhs, \
+                      *VOLATILE_CAST(kmp_int##BITS *) old_value.vvv,      \
+                      *VOLATILE_CAST(kmp_int##BITS *) new_value.vvv ) )   \
+        {                                                                 \
+            KMP_DO_PAUSE;                                                 \
+                                                                          \
+	    *old_value.vvv = * ( volatile kmp_int##BITS * ) lhs;          \
+	    new_value.cmp = old_value.cmp OP rhs;                         \
+        }                                                                 \
+    }
+// end of the first part of the workaround for C78287
+#endif // USE_CMPXCHG_FIX
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// ------------------------------------------------------------------------
+// X86 or X86_64: no alignment problems ====================================
+#define ATOMIC_FIXED_ADD(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG) \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                      \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)                                      \
+    /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */      \
+    KMP_TEST_THEN_ADD##BITS( lhs, OP rhs );                                \
+}
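+
+/*
+  For example, ATOMIC_FIXED_ADD(fixed4, add, kmp_int32, 32, +, 4i, 3, 0)
+  expands (roughly, with the dead GOMP-compat branch elided since GOMP_FLAG
+  is 0) to:
+
+    void __kmpc_atomic_fixed4_add( ident_t *id_ref, int gtid,
+                                   kmp_int32 * lhs, kmp_int32 rhs )
+    {
+        KMP_DEBUG_ASSERT( __kmp_init_serial );
+        KA_TRACE(100,("__kmpc_atomic_fixed4_add: T#%d\n", gtid ));
+        KMP_TEST_THEN_ADD32( lhs, + rhs );
+    }
+*/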
+// -------------------------------------------------------------------------
+#define ATOMIC_CMPXCHG(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG)   \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                      \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)                                      \
+    OP_CMPXCHG(TYPE,BITS,OP)                                               \
+}
+#if USE_CMPXCHG_FIX
+// -------------------------------------------------------------------------
+// workaround for C78287 (complex(kind=4) data type)
+#define ATOMIC_CMPXCHG_WORKAROUND(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG)   \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                                 \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)                                                 \
+    OP_CMPXCHG_WORKAROUND(TYPE,BITS,OP)                                               \
+}
+// end of the second part of the workaround for C78287
+#endif
+
+#else
+// -------------------------------------------------------------------------
+// Code for other architectures that don't handle unaligned accesses.
+#define ATOMIC_FIXED_ADD(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG) \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                      \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)                                      \
+    if ( ! ( (kmp_uintptr_t) lhs & 0x##MASK) ) {                           \
+        /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */  \
+        KMP_TEST_THEN_ADD##BITS( lhs, OP rhs );                            \
+    } else {                                                               \
+        KMP_CHECK_GTID;                                                    \
+        OP_CRITICAL(OP##=,LCK_ID)  /* unaligned address - use critical */  \
+    }                                                                      \
+}
+// -------------------------------------------------------------------------
+#define ATOMIC_CMPXCHG(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG)   \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                      \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)                                      \
+    if ( ! ( (kmp_uintptr_t) lhs & 0x##MASK) ) {                           \
+        OP_CMPXCHG(TYPE,BITS,OP)     /* aligned address */                 \
+    } else {                                                               \
+        KMP_CHECK_GTID;                                                    \
+        OP_CRITICAL(OP##=,LCK_ID)  /* unaligned address - use critical */  \
+    }                                                                      \
+}
+#if USE_CMPXCHG_FIX
+// -------------------------------------------------------------------------
+// workaround for C78287 (complex(kind=4) data type)
+#define ATOMIC_CMPXCHG_WORKAROUND(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG)   \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                                 \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)                                                 \
+    if ( ! ( (kmp_uintptr_t) lhs & 0x##MASK) ) {                                      \
+        OP_CMPXCHG(TYPE,BITS,OP)             /* aligned address */                    \
+    } else {                                                                          \
+        KMP_CHECK_GTID;                                                               \
+        OP_CRITICAL(OP##=,LCK_ID)  /* unaligned address - use critical */             \
+    }                                                                                 \
+}
+// end of the second part of the workaround for C78287
+#endif // USE_CMPXCHG_FIX
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+// Routines for ATOMIC 4-byte operands addition and subtraction
+ATOMIC_FIXED_ADD( fixed4, add, kmp_int32,  32, +, 4i, 3, 0            )  // __kmpc_atomic_fixed4_add
+ATOMIC_FIXED_ADD( fixed4, sub, kmp_int32,  32, -, 4i, 3, 0            )  // __kmpc_atomic_fixed4_sub
+
+ATOMIC_CMPXCHG( float4,  add, kmp_real32, 32, +,  4r, 3, KMP_ARCH_X86 )  // __kmpc_atomic_float4_add
+ATOMIC_CMPXCHG( float4,  sub, kmp_real32, 32, -,  4r, 3, KMP_ARCH_X86 )  // __kmpc_atomic_float4_sub
+
+// Routines for ATOMIC 8-byte operands addition and subtraction
+ATOMIC_FIXED_ADD( fixed8, add, kmp_int64,  64, +, 8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_add
+ATOMIC_FIXED_ADD( fixed8, sub, kmp_int64,  64, -, 8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_sub
+
+ATOMIC_CMPXCHG( float8,  add, kmp_real64, 64, +,  8r, 7, KMP_ARCH_X86 )  // __kmpc_atomic_float8_add
+ATOMIC_CMPXCHG( float8,  sub, kmp_real64, 64, -,  8r, 7, KMP_ARCH_X86 )  // __kmpc_atomic_float8_sub
+
+// ------------------------------------------------------------------------
+// Entries definition for integer operands
+//     TYPE_ID - operands type and size (fixed4, float4)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - operand type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator (used in critical section)
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+//     MASK    - used for alignment check
+
+//               TYPE_ID,OP_ID,  TYPE,   BITS,OP,LCK_ID,MASK,GOMP_FLAG
+// ------------------------------------------------------------------------
+// Routines for ATOMIC integer operands, other operators
+// ------------------------------------------------------------------------
+//              TYPE_ID,OP_ID, TYPE,   BITS, OP, LCK_ID, MASK, GOMP_FLAG
+ATOMIC_CMPXCHG( fixed1,  add, kmp_int8,    8, +,  1i, 0, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_add
+ATOMIC_CMPXCHG( fixed1, andb, kmp_int8,    8, &,  1i, 0, 0            )  // __kmpc_atomic_fixed1_andb
+ATOMIC_CMPXCHG( fixed1,  div, kmp_int8,    8, /,  1i, 0, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_div
+ATOMIC_CMPXCHG( fixed1u, div, kmp_uint8,   8, /,  1i, 0, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1u_div
+ATOMIC_CMPXCHG( fixed1,  mul, kmp_int8,    8, *,  1i, 0, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_mul
+ATOMIC_CMPXCHG( fixed1,  orb, kmp_int8,    8, |,  1i, 0, 0            )  // __kmpc_atomic_fixed1_orb
+ATOMIC_CMPXCHG( fixed1,  shl, kmp_int8,    8, <<, 1i, 0, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_shl
+ATOMIC_CMPXCHG( fixed1,  shr, kmp_int8,    8, >>, 1i, 0, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_shr
+ATOMIC_CMPXCHG( fixed1u, shr, kmp_uint8,   8, >>, 1i, 0, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1u_shr
+ATOMIC_CMPXCHG( fixed1,  sub, kmp_int8,    8, -,  1i, 0, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_sub
+ATOMIC_CMPXCHG( fixed1,  xor, kmp_int8,    8, ^,  1i, 0, 0            )  // __kmpc_atomic_fixed1_xor
+ATOMIC_CMPXCHG( fixed2,  add, kmp_int16,  16, +,  2i, 1, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_add
+ATOMIC_CMPXCHG( fixed2, andb, kmp_int16,  16, &,  2i, 1, 0            )  // __kmpc_atomic_fixed2_andb
+ATOMIC_CMPXCHG( fixed2,  div, kmp_int16,  16, /,  2i, 1, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_div
+ATOMIC_CMPXCHG( fixed2u, div, kmp_uint16, 16, /,  2i, 1, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2u_div
+ATOMIC_CMPXCHG( fixed2,  mul, kmp_int16,  16, *,  2i, 1, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_mul
+ATOMIC_CMPXCHG( fixed2,  orb, kmp_int16,  16, |,  2i, 1, 0            )  // __kmpc_atomic_fixed2_orb
+ATOMIC_CMPXCHG( fixed2,  shl, kmp_int16,  16, <<, 2i, 1, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_shl
+ATOMIC_CMPXCHG( fixed2,  shr, kmp_int16,  16, >>, 2i, 1, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_shr
+ATOMIC_CMPXCHG( fixed2u, shr, kmp_uint16, 16, >>, 2i, 1, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2u_shr
+ATOMIC_CMPXCHG( fixed2,  sub, kmp_int16,  16, -,  2i, 1, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_sub
+ATOMIC_CMPXCHG( fixed2,  xor, kmp_int16,  16, ^,  2i, 1, 0            )  // __kmpc_atomic_fixed2_xor
+ATOMIC_CMPXCHG( fixed4, andb, kmp_int32,  32, &,  4i, 3, 0            )  // __kmpc_atomic_fixed4_andb
+ATOMIC_CMPXCHG( fixed4,  div, kmp_int32,  32, /,  4i, 3, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_div
+ATOMIC_CMPXCHG( fixed4u, div, kmp_uint32, 32, /,  4i, 3, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4u_div
+ATOMIC_CMPXCHG( fixed4,  mul, kmp_int32,  32, *,  4i, 3, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_mul
+ATOMIC_CMPXCHG( fixed4,  orb, kmp_int32,  32, |,  4i, 3, 0            )  // __kmpc_atomic_fixed4_orb
+ATOMIC_CMPXCHG( fixed4,  shl, kmp_int32,  32, <<, 4i, 3, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_shl
+ATOMIC_CMPXCHG( fixed4,  shr, kmp_int32,  32, >>, 4i, 3, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_shr
+ATOMIC_CMPXCHG( fixed4u, shr, kmp_uint32, 32, >>, 4i, 3, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4u_shr
+ATOMIC_CMPXCHG( fixed4,  xor, kmp_int32,  32, ^,  4i, 3, 0            )  // __kmpc_atomic_fixed4_xor
+ATOMIC_CMPXCHG( fixed8, andb, kmp_int64,  64, &,  8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_andb
+ATOMIC_CMPXCHG( fixed8,  div, kmp_int64,  64, /,  8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_div
+ATOMIC_CMPXCHG( fixed8u, div, kmp_uint64, 64, /,  8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8u_div
+ATOMIC_CMPXCHG( fixed8,  mul, kmp_int64,  64, *,  8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_mul
+ATOMIC_CMPXCHG( fixed8,  orb, kmp_int64,  64, |,  8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_orb
+ATOMIC_CMPXCHG( fixed8,  shl, kmp_int64,  64, <<, 8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_shl
+ATOMIC_CMPXCHG( fixed8,  shr, kmp_int64,  64, >>, 8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_shr
+ATOMIC_CMPXCHG( fixed8u, shr, kmp_uint64, 64, >>, 8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8u_shr
+ATOMIC_CMPXCHG( fixed8,  xor, kmp_int64,  64, ^,  8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_xor
+ATOMIC_CMPXCHG( float4,  div, kmp_real32, 32, /,  4r, 3, KMP_ARCH_X86 )  // __kmpc_atomic_float4_div
+ATOMIC_CMPXCHG( float4,  mul, kmp_real32, 32, *,  4r, 3, KMP_ARCH_X86 )  // __kmpc_atomic_float4_mul
+ATOMIC_CMPXCHG( float8,  div, kmp_real64, 64, /,  8r, 7, KMP_ARCH_X86 )  // __kmpc_atomic_float8_div
+ATOMIC_CMPXCHG( float8,  mul, kmp_real64, 64, *,  8r, 7, KMP_ARCH_X86 )  // __kmpc_atomic_float8_mul
+//              TYPE_ID,OP_ID, TYPE,   BITS, OP, LCK_ID, MASK, GOMP_FLAG
+
+
+/* ------------------------------------------------------------------------ */
+/* Routines for C/C++ Reduction operators && and ||                         */
+/* ------------------------------------------------------------------------ */
+
+// ------------------------------------------------------------------------
+// Need separate macros for &&, || because there is no combined assignment
+//   TODO: eliminate ATOMIC_CRIT_{L,EQV} macros as not used
+#define ATOMIC_CRIT_L(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG)             \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL( = *lhs OP, GOMP_FLAG )                              \
+    OP_CRITICAL( = *lhs OP, LCK_ID )                                      \
+}
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// ------------------------------------------------------------------------
+// X86 or X86_64: no alignment problems ===================================
+#define ATOMIC_CMPX_L(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG)   \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL( = *lhs OP, GOMP_FLAG )                              \
+    OP_CMPXCHG(TYPE,BITS,OP)                                              \
+}
+
+#else
+// ------------------------------------------------------------------------
+// Code for other architectures that don't handle unaligned accesses.
+#define ATOMIC_CMPX_L(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG)   \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL(= *lhs OP,GOMP_FLAG)                                 \
+    if ( ! ( (kmp_uintptr_t) lhs & 0x##MASK) ) {                          \
+        OP_CMPXCHG(TYPE,BITS,OP)       /* aligned address */              \
+    } else {                                                              \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL(= *lhs OP,LCK_ID)  /* unaligned - use critical */     \
+    }                                                                     \
+}
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+ATOMIC_CMPX_L( fixed1, andl, char,       8, &&, 1i, 0, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_andl
+ATOMIC_CMPX_L( fixed1,  orl, char,       8, ||, 1i, 0, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_orl
+ATOMIC_CMPX_L( fixed2, andl, short,     16, &&, 2i, 1, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_andl
+ATOMIC_CMPX_L( fixed2,  orl, short,     16, ||, 2i, 1, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_orl
+ATOMIC_CMPX_L( fixed4, andl, kmp_int32, 32, &&, 4i, 3, 0 )             // __kmpc_atomic_fixed4_andl
+ATOMIC_CMPX_L( fixed4,  orl, kmp_int32, 32, ||, 4i, 3, 0 )             // __kmpc_atomic_fixed4_orl
+ATOMIC_CMPX_L( fixed8, andl, kmp_int64, 64, &&, 8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_andl
+ATOMIC_CMPX_L( fixed8,  orl, kmp_int64, 64, ||, 8i, 7, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_orl
+
+
+/* ------------------------------------------------------------------------- */
+/* Routines for Fortran operators that have no counterpart in C:             */
+/* MAX, MIN, .EQV., .NEQV.                                                   */
+/* Operators .AND., .OR. are covered by __kmpc_atomic_*_{andl,orl}           */
+/* Intrinsics IAND, IOR, IEOR are covered by __kmpc_atomic_*_{andb,orb,xor}  */
+/* ------------------------------------------------------------------------- */
+
+// -------------------------------------------------------------------------
+// MIN and MAX need separate macros
+// OP - operator used to check whether any action is still needed
+#define MIN_MAX_CRITSECT(OP,LCK_ID)                                        \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );                     \
+                                                                           \
+    if ( *lhs OP rhs ) {                 /* still need actions? */         \
+        *lhs = rhs;                                                        \
+    }                                                                      \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );
+
+// -------------------------------------------------------------------------
+#ifdef KMP_GOMP_COMPAT
+#define GOMP_MIN_MAX_CRITSECT(OP,FLAG)                                     \
+    if (( FLAG ) && ( __kmp_atomic_mode == 2 )) {                          \
+        KMP_CHECK_GTID;                                                    \
+        MIN_MAX_CRITSECT( OP, 0 );                                         \
+        return;                                                            \
+    }
+#else
+#define GOMP_MIN_MAX_CRITSECT(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// -------------------------------------------------------------------------
+#define MIN_MAX_CMPXCHG(TYPE,BITS,OP)                                      \
+    {                                                                      \
+        TYPE KMP_ATOMIC_VOLATILE temp_val;                                 \
+        TYPE old_value;                                                    \
+        temp_val = *lhs;                                                   \
+        old_value = temp_val;                                              \
+        while ( old_value OP rhs &&          /* still need actions? */     \
+            ! KMP_COMPARE_AND_STORE_ACQ##BITS( (kmp_int##BITS *) lhs,      \
+                      *VOLATILE_CAST(kmp_int##BITS *) &old_value,          \
+                      *VOLATILE_CAST(kmp_int##BITS *) &rhs ) )             \
+        {                                                                  \
+            KMP_CPU_PAUSE();                                               \
+            temp_val = *lhs;                                               \
+            old_value = temp_val;                                          \
+        }                                                                  \
+    }
+
+// -------------------------------------------------------------------------
+// 1-byte, 2-byte operands - use critical section
+#define MIN_MAX_CRITICAL(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG)           \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                      \
+    if ( *lhs OP rhs ) {     /* need actions? */                           \
+        GOMP_MIN_MAX_CRITSECT(OP,GOMP_FLAG)                                \
+        MIN_MAX_CRITSECT(OP,LCK_ID)                                        \
+    }                                                                      \
+}
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// -------------------------------------------------------------------------
+// X86 or X86_64: no alignment problems ====================================
+#define MIN_MAX_COMPXCHG(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG) \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                      \
+    if ( *lhs OP rhs ) {                                                   \
+        GOMP_MIN_MAX_CRITSECT(OP,GOMP_FLAG)                                \
+        MIN_MAX_CMPXCHG(TYPE,BITS,OP)                                      \
+    }                                                                      \
+}
+
+#else
+// -------------------------------------------------------------------------
+// Code for other architectures that don't handle unaligned accesses.
+#define MIN_MAX_COMPXCHG(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG) \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                      \
+    if ( *lhs OP rhs ) {                                                   \
+        GOMP_MIN_MAX_CRITSECT(OP,GOMP_FLAG)                                \
+        if ( ! ( (kmp_uintptr_t) lhs & 0x##MASK) ) {                       \
+            MIN_MAX_CMPXCHG(TYPE,BITS,OP) /* aligned address */            \
+        } else {                                                           \
+            KMP_CHECK_GTID;                                                \
+            MIN_MAX_CRITSECT(OP,LCK_ID)   /* unaligned address */          \
+        }                                                                  \
+    }                                                                      \
+}
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+MIN_MAX_COMPXCHG( fixed1,  max, char,        8, <, 1i, 0, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_max
+MIN_MAX_COMPXCHG( fixed1,  min, char,        8, >, 1i, 0, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_min
+MIN_MAX_COMPXCHG( fixed2,  max, short,      16, <, 2i, 1, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_max
+MIN_MAX_COMPXCHG( fixed2,  min, short,      16, >, 2i, 1, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_min
+MIN_MAX_COMPXCHG( fixed4,  max, kmp_int32,  32, <, 4i, 3, 0 )            // __kmpc_atomic_fixed4_max
+MIN_MAX_COMPXCHG( fixed4,  min, kmp_int32,  32, >, 4i, 3, 0 )            // __kmpc_atomic_fixed4_min
+MIN_MAX_COMPXCHG( fixed8,  max, kmp_int64,  64, <, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_max
+MIN_MAX_COMPXCHG( fixed8,  min, kmp_int64,  64, >, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_min
+MIN_MAX_COMPXCHG( float4,  max, kmp_real32, 32, <, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_max
+MIN_MAX_COMPXCHG( float4,  min, kmp_real32, 32, >, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_min
+MIN_MAX_COMPXCHG( float8,  max, kmp_real64, 64, <, 8r, 7, KMP_ARCH_X86 ) // __kmpc_atomic_float8_max
+MIN_MAX_COMPXCHG( float8,  min, kmp_real64, 64, >, 8r, 7, KMP_ARCH_X86 ) // __kmpc_atomic_float8_min
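+
+// For example, __kmpc_atomic_fixed4_max (generated with OP '<') atomically
+// performs:  if ( *lhs < rhs ) *lhs = rhs;  i.e. *lhs = max(*lhs, rhs).
+// Using '<' for max and '>' for min lets the same macros serve both operations.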
+#if KMP_HAVE_QUAD
+MIN_MAX_CRITICAL( float16, max,     QUAD_LEGACY,      <, 16r,   1 )            // __kmpc_atomic_float16_max
+MIN_MAX_CRITICAL( float16, min,     QUAD_LEGACY,      >, 16r,   1 )            // __kmpc_atomic_float16_min
+#if ( KMP_ARCH_X86 )
+    MIN_MAX_CRITICAL( float16, max_a16, Quad_a16_t,     <, 16r,   1 )            // __kmpc_atomic_float16_max_a16
+    MIN_MAX_CRITICAL( float16, min_a16, Quad_a16_t,     >, 16r,   1 )            // __kmpc_atomic_float16_min_a16
+#endif
+#endif
+// ------------------------------------------------------------------------
+// Need separate macros for .EQV. because it requires the complement (~):
+//   a .EQV. b  ==  ~(a ^ b)  ==  a ^ ~b, so the atomic update is *lhs ^= ~rhs.
+// The OP parameter is ignored on the critical-section path; ^=~ is used instead.
+#define ATOMIC_CRIT_EQV(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG)           \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL(^=~,GOMP_FLAG)  /* send assignment */                \
+    OP_CRITICAL(^=~,LCK_ID)    /* send assignment and complement */       \
+}
+
+// ------------------------------------------------------------------------
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// ------------------------------------------------------------------------
+// X86 or X86_64: no alignment problems ===================================
+#define ATOMIC_CMPX_EQV(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG) \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL(^=~,GOMP_FLAG)  /* send assignment */                \
+    OP_CMPXCHG(TYPE,BITS,OP)                                              \
+}
+// ------------------------------------------------------------------------
+#else
+// ------------------------------------------------------------------------
+// Code for other architectures that don't handle unaligned accesses.
+#define ATOMIC_CMPX_EQV(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG) \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL(^=~,GOMP_FLAG)                                       \
+    if ( ! ( (kmp_uintptr_t) lhs & 0x##MASK) ) {                          \
+        OP_CMPXCHG(TYPE,BITS,OP)   /* aligned address */                  \
+    } else {                                                              \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL(^=~,LCK_ID)    /* unaligned address - use critical */ \
+    }                                                                     \
+}
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+ATOMIC_CMPXCHG(  fixed1, neqv, kmp_int8,   8,   ^, 1i, 0, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_neqv
+ATOMIC_CMPXCHG(  fixed2, neqv, kmp_int16, 16,   ^, 2i, 1, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_neqv
+ATOMIC_CMPXCHG(  fixed4, neqv, kmp_int32, 32,   ^, 4i, 3, KMP_ARCH_X86 ) // __kmpc_atomic_fixed4_neqv
+ATOMIC_CMPXCHG(  fixed8, neqv, kmp_int64, 64,   ^, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_neqv
+ATOMIC_CMPX_EQV( fixed1, eqv,  kmp_int8,   8,  ^~, 1i, 0, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_eqv
+ATOMIC_CMPX_EQV( fixed2, eqv,  kmp_int16, 16,  ^~, 2i, 1, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_eqv
+ATOMIC_CMPX_EQV( fixed4, eqv,  kmp_int32, 32,  ^~, 4i, 3, KMP_ARCH_X86 ) // __kmpc_atomic_fixed4_eqv
+ATOMIC_CMPX_EQV( fixed8, eqv,  kmp_int64, 64,  ^~, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_eqv
+
+
+// ------------------------------------------------------------------------
+// Routines for Extended types: long double, _Quad, complex flavours (use critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP      - operator
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+#define ATOMIC_CRITICAL(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG)           \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)  /* send assignment */              \
+    OP_CRITICAL(OP##=,LCK_ID)          /* send assignment */              \
+}
+
+/* ------------------------------------------------------------------------- */
+// routines for long double type
+ATOMIC_CRITICAL( float10, add, long double,     +, 10r,   1 )            // __kmpc_atomic_float10_add
+ATOMIC_CRITICAL( float10, sub, long double,     -, 10r,   1 )            // __kmpc_atomic_float10_sub
+ATOMIC_CRITICAL( float10, mul, long double,     *, 10r,   1 )            // __kmpc_atomic_float10_mul
+ATOMIC_CRITICAL( float10, div, long double,     /, 10r,   1 )            // __kmpc_atomic_float10_div
+#if KMP_HAVE_QUAD
+// routines for _Quad type
+ATOMIC_CRITICAL( float16, add, QUAD_LEGACY,     +, 16r,   1 )            // __kmpc_atomic_float16_add
+ATOMIC_CRITICAL( float16, sub, QUAD_LEGACY,     -, 16r,   1 )            // __kmpc_atomic_float16_sub
+ATOMIC_CRITICAL( float16, mul, QUAD_LEGACY,     *, 16r,   1 )            // __kmpc_atomic_float16_mul
+ATOMIC_CRITICAL( float16, div, QUAD_LEGACY,     /, 16r,   1 )            // __kmpc_atomic_float16_div
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CRITICAL( float16, add_a16, Quad_a16_t, +, 16r, 1 )           // __kmpc_atomic_float16_add_a16
+    ATOMIC_CRITICAL( float16, sub_a16, Quad_a16_t, -, 16r, 1 )           // __kmpc_atomic_float16_sub_a16
+    ATOMIC_CRITICAL( float16, mul_a16, Quad_a16_t, *, 16r, 1 )           // __kmpc_atomic_float16_mul_a16
+    ATOMIC_CRITICAL( float16, div_a16, Quad_a16_t, /, 16r, 1 )           // __kmpc_atomic_float16_div_a16
+#endif
+#endif
+// routines for complex types
+
+#if USE_CMPXCHG_FIX
+// workaround for C78287 (complex(kind=4) data type)
+ATOMIC_CMPXCHG_WORKAROUND( cmplx4, add, kmp_cmplx32, 64, +, 8c, 7, 1 )   // __kmpc_atomic_cmplx4_add
+ATOMIC_CMPXCHG_WORKAROUND( cmplx4, sub, kmp_cmplx32, 64, -, 8c, 7, 1 )   // __kmpc_atomic_cmplx4_sub
+ATOMIC_CMPXCHG_WORKAROUND( cmplx4, mul, kmp_cmplx32, 64, *, 8c, 7, 1 )   // __kmpc_atomic_cmplx4_mul
+ATOMIC_CMPXCHG_WORKAROUND( cmplx4, div, kmp_cmplx32, 64, /, 8c, 7, 1 )   // __kmpc_atomic_cmplx4_div
+// end of the workaround for C78287
+#else
+ATOMIC_CRITICAL( cmplx4,  add, kmp_cmplx32,     +,  8c,   1 )            // __kmpc_atomic_cmplx4_add
+ATOMIC_CRITICAL( cmplx4,  sub, kmp_cmplx32,     -,  8c,   1 )            // __kmpc_atomic_cmplx4_sub
+ATOMIC_CRITICAL( cmplx4,  mul, kmp_cmplx32,     *,  8c,   1 )            // __kmpc_atomic_cmplx4_mul
+ATOMIC_CRITICAL( cmplx4,  div, kmp_cmplx32,     /,  8c,   1 )            // __kmpc_atomic_cmplx4_div
+#endif // USE_CMPXCHG_FIX
+
+ATOMIC_CRITICAL( cmplx8,  add, kmp_cmplx64,     +, 16c,   1 )            // __kmpc_atomic_cmplx8_add
+ATOMIC_CRITICAL( cmplx8,  sub, kmp_cmplx64,     -, 16c,   1 )            // __kmpc_atomic_cmplx8_sub
+ATOMIC_CRITICAL( cmplx8,  mul, kmp_cmplx64,     *, 16c,   1 )            // __kmpc_atomic_cmplx8_mul
+ATOMIC_CRITICAL( cmplx8,  div, kmp_cmplx64,     /, 16c,   1 )            // __kmpc_atomic_cmplx8_div
+ATOMIC_CRITICAL( cmplx10, add, kmp_cmplx80,     +, 20c,   1 )            // __kmpc_atomic_cmplx10_add
+ATOMIC_CRITICAL( cmplx10, sub, kmp_cmplx80,     -, 20c,   1 )            // __kmpc_atomic_cmplx10_sub
+ATOMIC_CRITICAL( cmplx10, mul, kmp_cmplx80,     *, 20c,   1 )            // __kmpc_atomic_cmplx10_mul
+ATOMIC_CRITICAL( cmplx10, div, kmp_cmplx80,     /, 20c,   1 )            // __kmpc_atomic_cmplx10_div
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL( cmplx16, add, CPLX128_LEG,     +, 32c,   1 )            // __kmpc_atomic_cmplx16_add
+ATOMIC_CRITICAL( cmplx16, sub, CPLX128_LEG,     -, 32c,   1 )            // __kmpc_atomic_cmplx16_sub
+ATOMIC_CRITICAL( cmplx16, mul, CPLX128_LEG,     *, 32c,   1 )            // __kmpc_atomic_cmplx16_mul
+ATOMIC_CRITICAL( cmplx16, div, CPLX128_LEG,     /, 32c,   1 )            // __kmpc_atomic_cmplx16_div
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CRITICAL( cmplx16, add_a16, kmp_cmplx128_a16_t, +, 32c, 1 )   // __kmpc_atomic_cmplx16_add_a16
+    ATOMIC_CRITICAL( cmplx16, sub_a16, kmp_cmplx128_a16_t, -, 32c, 1 )   // __kmpc_atomic_cmplx16_sub_a16
+    ATOMIC_CRITICAL( cmplx16, mul_a16, kmp_cmplx128_a16_t, *, 32c, 1 )   // __kmpc_atomic_cmplx16_mul_a16
+    ATOMIC_CRITICAL( cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, 1 )   // __kmpc_atomic_cmplx16_div_a16
+#endif
+#endif
+
+#if OMP_40_ENABLED
+
+// OpenMP 4.0: x = expr binop x for non-commutative operations.
+// Supported only on IA-32 architecture and Intel(R) 64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// ------------------------------------------------------------------------
+// Operation on *lhs, rhs bound by critical section
+//     OP     - operator (it's supposed to contain an assignment)
+//     LCK_ID - lock identifier
+// Note: gtid is not checked here because it should always be valid.
+// For 1- and 2-byte operands a valid gtid is expected; for others, check it before this macro.
+#define OP_CRITICAL_REV(OP,LCK_ID) \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+                                                                          \
+    (*lhs) = (rhs) OP (*lhs);                                             \
+                                                                          \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );
+
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_REV(OP,FLAG)                                     \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL_REV( OP, 0 );                                         \
+        return;                                                           \
+    }
+#else
+#define OP_GOMP_CRITICAL_REV(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+
+// Beginning of a definition (provides name, parameters, debug trace)
+//     TYPE_ID - operands type and size (fixed*, fixed*u for signed, unsigned fixed)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - operands' type
+#define ATOMIC_BEGIN_REV(TYPE_ID,OP_ID,TYPE, RET_TYPE) \
+RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID##_rev( ident_t *id_ref, int gtid, TYPE * lhs, TYPE rhs ) \
+{                                                                                         \
+    KMP_DEBUG_ASSERT( __kmp_init_serial );                                                \
+    KA_TRACE(100,("__kmpc_atomic_" #TYPE_ID "_" #OP_ID "_rev: T#%d\n", gtid ));
+
+// ------------------------------------------------------------------------
+// Operation on *lhs, rhs using "compare_and_store" routine
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator
+// Note: temp_val introduced in order to force the compiler to read
+//       *lhs only once (w/o it the compiler reads *lhs twice)
+#define OP_CMPXCHG_REV(TYPE,BITS,OP)                                      \
+    {                                                                     \
+        TYPE KMP_ATOMIC_VOLATILE temp_val;                                \
+        TYPE old_value, new_value;                                        \
+        temp_val = *lhs;                                                  \
+        old_value = temp_val;                                             \
+        new_value = rhs OP old_value;                                     \
+        while ( ! KMP_COMPARE_AND_STORE_ACQ##BITS( (kmp_int##BITS *) lhs, \
+                      *VOLATILE_CAST(kmp_int##BITS *) &old_value,         \
+                      *VOLATILE_CAST(kmp_int##BITS *) &new_value ) )      \
+        {                                                                 \
+            KMP_DO_PAUSE;                                                 \
+                                                                          \
+            temp_val = *lhs;                                              \
+            old_value = temp_val;                                         \
+            new_value = rhs OP old_value;                                 \
+        }                                                                 \
+    }
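+// Illustration (sketch, not part of the build): with TYPE=kmp_int32, BITS=32
+// and OP=-, the loop above keeps recomputing new_value = rhs - old_value and
+// retrying the 32-bit compare-and-store until *lhs is observed unchanged
+// between the read and the store attempt.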
+
+// -------------------------------------------------------------------------
+#define ATOMIC_CMPXCHG_REV(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,GOMP_FLAG)   \
+ATOMIC_BEGIN_REV(TYPE_ID,OP_ID,TYPE,void)                                 \
+    OP_GOMP_CRITICAL_REV(OP,GOMP_FLAG)                                    \
+    OP_CMPXCHG_REV(TYPE,BITS,OP)                                          \
+}
+
+// ------------------------------------------------------------------------
+// Entries definition for integer operands
+//     TYPE_ID - operands type and size (fixed4, float4)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - operand type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator (used in critical section)
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+
+//               TYPE_ID,OP_ID,  TYPE,   BITS,OP,LCK_ID,GOMP_FLAG
+// ------------------------------------------------------------------------
+// Routines for ATOMIC integer operands, other operators
+// ------------------------------------------------------------------------
+//                  TYPE_ID,OP_ID, TYPE,    BITS, OP, LCK_ID, GOMP_FLAG
+ATOMIC_CMPXCHG_REV( fixed1,  div, kmp_int8,    8, /,  1i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_div_rev
+ATOMIC_CMPXCHG_REV( fixed1u, div, kmp_uint8,   8, /,  1i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1u_div_rev
+ATOMIC_CMPXCHG_REV( fixed1,  shl, kmp_int8,    8, <<, 1i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_shl_rev
+ATOMIC_CMPXCHG_REV( fixed1,  shr, kmp_int8,    8, >>, 1i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_shr_rev
+ATOMIC_CMPXCHG_REV( fixed1u, shr, kmp_uint8,   8, >>, 1i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1u_shr_rev
+ATOMIC_CMPXCHG_REV( fixed1,  sub, kmp_int8,    8, -,  1i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_sub_rev
+
+ATOMIC_CMPXCHG_REV( fixed2,  div, kmp_int16,  16, /,  2i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_div_rev
+ATOMIC_CMPXCHG_REV( fixed2u, div, kmp_uint16, 16, /,  2i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2u_div_rev
+ATOMIC_CMPXCHG_REV( fixed2,  shl, kmp_int16,  16, <<, 2i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_shl_rev
+ATOMIC_CMPXCHG_REV( fixed2,  shr, kmp_int16,  16, >>, 2i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_shr_rev
+ATOMIC_CMPXCHG_REV( fixed2u, shr, kmp_uint16, 16, >>, 2i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2u_shr_rev
+ATOMIC_CMPXCHG_REV( fixed2,  sub, kmp_int16,  16, -,  2i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_sub_rev
+
+ATOMIC_CMPXCHG_REV( fixed4,  div, kmp_int32,  32, /,  4i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_div_rev
+ATOMIC_CMPXCHG_REV( fixed4u, div, kmp_uint32, 32, /,  4i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4u_div_rev
+ATOMIC_CMPXCHG_REV( fixed4,  shl, kmp_int32,  32, <<, 4i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_shl_rev
+ATOMIC_CMPXCHG_REV( fixed4,  shr, kmp_int32,  32, >>, 4i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_shr_rev
+ATOMIC_CMPXCHG_REV( fixed4u, shr, kmp_uint32, 32, >>, 4i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4u_shr_rev
+ATOMIC_CMPXCHG_REV( fixed4,  sub, kmp_int32,  32, -,  4i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_sub_rev
+
+ATOMIC_CMPXCHG_REV( fixed8,  div, kmp_int64,  64, /,  8i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_div_rev
+ATOMIC_CMPXCHG_REV( fixed8u, div, kmp_uint64, 64, /,  8i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8u_div_rev
+ATOMIC_CMPXCHG_REV( fixed8,  shl, kmp_int64,  64, <<, 8i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_shl_rev
+ATOMIC_CMPXCHG_REV( fixed8,  shr, kmp_int64,  64, >>, 8i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_shr_rev
+ATOMIC_CMPXCHG_REV( fixed8u, shr, kmp_uint64, 64, >>, 8i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8u_shr_rev
+ATOMIC_CMPXCHG_REV( fixed8,  sub, kmp_int64,  64, -,  8i, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_sub_rev
+
+ATOMIC_CMPXCHG_REV( float4,  div, kmp_real32, 32, /,  4r, KMP_ARCH_X86 )  // __kmpc_atomic_float4_div_rev
+ATOMIC_CMPXCHG_REV( float4,  sub, kmp_real32, 32, -,  4r, KMP_ARCH_X86 )  // __kmpc_atomic_float4_sub_rev
+
+ATOMIC_CMPXCHG_REV( float8,  div, kmp_real64, 64, /,  8r, KMP_ARCH_X86 )  // __kmpc_atomic_float8_div_rev
+ATOMIC_CMPXCHG_REV( float8,  sub, kmp_real64, 64, -,  8r, KMP_ARCH_X86 )  // __kmpc_atomic_float8_sub_rev
+//                  TYPE_ID,OP_ID, TYPE,     BITS,OP,LCK_ID, GOMP_FLAG
+
+// ------------------------------------------------------------------------
+// Routines for Extended types: long double, _Quad, complex flavours (use critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP      - operator
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+#define ATOMIC_CRITICAL_REV(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG)           \
+ATOMIC_BEGIN_REV(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL_REV(OP,GOMP_FLAG)                                        \
+    OP_CRITICAL_REV(OP,LCK_ID)                                                \
+}
+
+/* ------------------------------------------------------------------------- */
+// routines for long double type
+ATOMIC_CRITICAL_REV( float10, sub, long double,     -, 10r,   1 )            // __kmpc_atomic_float10_sub_rev
+ATOMIC_CRITICAL_REV( float10, div, long double,     /, 10r,   1 )            // __kmpc_atomic_float10_div_rev
+#if KMP_HAVE_QUAD
+// routines for _Quad type
+ATOMIC_CRITICAL_REV( float16, sub, QUAD_LEGACY,     -, 16r,   1 )            // __kmpc_atomic_float16_sub_rev
+ATOMIC_CRITICAL_REV( float16, div, QUAD_LEGACY,     /, 16r,   1 )            // __kmpc_atomic_float16_div_rev
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CRITICAL_REV( float16, sub_a16, Quad_a16_t, -, 16r, 1 )           // __kmpc_atomic_float16_sub_a16_rev
+    ATOMIC_CRITICAL_REV( float16, div_a16, Quad_a16_t, /, 16r, 1 )           // __kmpc_atomic_float16_div_a16_rev
+#endif
+#endif
+
+// routines for complex types
+ATOMIC_CRITICAL_REV( cmplx4,  sub, kmp_cmplx32,     -, 8c,    1 )            // __kmpc_atomic_cmplx4_sub_rev
+ATOMIC_CRITICAL_REV( cmplx4,  div, kmp_cmplx32,     /, 8c,    1 )            // __kmpc_atomic_cmplx4_div_rev
+ATOMIC_CRITICAL_REV( cmplx8,  sub, kmp_cmplx64,     -, 16c,   1 )            // __kmpc_atomic_cmplx8_sub_rev
+ATOMIC_CRITICAL_REV( cmplx8,  div, kmp_cmplx64,     /, 16c,   1 )            // __kmpc_atomic_cmplx8_div_rev
+ATOMIC_CRITICAL_REV( cmplx10, sub, kmp_cmplx80,     -, 20c,   1 )            // __kmpc_atomic_cmplx10_sub_rev
+ATOMIC_CRITICAL_REV( cmplx10, div, kmp_cmplx80,     /, 20c,   1 )            // __kmpc_atomic_cmplx10_div_rev
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_REV( cmplx16, sub, CPLX128_LEG,     -, 32c,   1 )            // __kmpc_atomic_cmplx16_sub_rev
+ATOMIC_CRITICAL_REV( cmplx16, div, CPLX128_LEG,     /, 32c,   1 )            // __kmpc_atomic_cmplx16_div_rev
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CRITICAL_REV( cmplx16, sub_a16, kmp_cmplx128_a16_t, -, 32c, 1 )   // __kmpc_atomic_cmplx16_sub_a16_rev
+    ATOMIC_CRITICAL_REV( cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, 1 )   // __kmpc_atomic_cmplx16_div_a16_rev
+#endif
+#endif
+
+
+#endif //KMP_ARCH_X86 || KMP_ARCH_X86_64
+// End of OpenMP 4.0: x = expr binop x for non-commutative operations.
+
+#endif //OMP_40_ENABLED
+
+
+/* ------------------------------------------------------------------------ */
+/* Routines for mixed types of LHS and RHS, when RHS is "larger"             */
+/* Note: in order to reduce the total number of type combinations            */
+/*       it is assumed that the compiler converts RHS to the longest         */
+/*       floating type, that is _Quad, before calling any of these routines  */
+/* Conversion to _Quad is done by the compiler during the calculation,       */
+/*    conversion back to TYPE - before the assignment, like:                 */
+/*    *lhs = (TYPE)( (_Quad)(*lhs) OP rhs )                                  */
+/* A performance penalty is expected because of software emulation           */
+/* ------------------------------------------------------------------------ */
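+
+// For example (illustrative only), for
+//
+//     float x; _Quad q;
+//     #pragma omp atomic
+//     x = x * q;
+//
+// the compiler is expected to emit a call like
+//
+//     __kmpc_atomic_float4_mul_fp( id_ref, gtid, &x, q );
+//
+// where the promotion of *lhs to _Quad and the narrowing back to float both
+// happen inside the generated routine.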
+
+#define ATOMIC_BEGIN_MIX(TYPE_ID,TYPE,OP_ID,RTYPE_ID,RTYPE)                                             \
+void __kmpc_atomic_##TYPE_ID##_##OP_ID##_##RTYPE_ID( ident_t *id_ref, int gtid, TYPE * lhs, RTYPE rhs ) \
+{                                                                                                       \
+    KMP_DEBUG_ASSERT( __kmp_init_serial );                                                              \
+    KA_TRACE(100,("__kmpc_atomic_" #TYPE_ID "_" #OP_ID "_" #RTYPE_ID ": T#%d\n", gtid ));
+
+// -------------------------------------------------------------------------
+#define ATOMIC_CRITICAL_FP(TYPE_ID,TYPE,OP_ID,OP,RTYPE_ID,RTYPE,LCK_ID,GOMP_FLAG)         \
+ATOMIC_BEGIN_MIX(TYPE_ID,TYPE,OP_ID,RTYPE_ID,RTYPE)                                       \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)  /* send assignment */                              \
+    OP_CRITICAL(OP##=,LCK_ID)  /* send assignment */                                      \
+}
+
+// -------------------------------------------------------------------------
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// -------------------------------------------------------------------------
+// X86 or X86_64: no alignment problems ====================================
+#define ATOMIC_CMPXCHG_MIX(TYPE_ID,TYPE,OP_ID,BITS,OP,RTYPE_ID,RTYPE,LCK_ID,MASK,GOMP_FLAG) \
+ATOMIC_BEGIN_MIX(TYPE_ID,TYPE,OP_ID,RTYPE_ID,RTYPE)                                         \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)                                                       \
+    OP_CMPXCHG(TYPE,BITS,OP)                                                                \
+}
+// -------------------------------------------------------------------------
+#else
+// ------------------------------------------------------------------------
+// Code for other architectures that don't handle unaligned accesses.
+#define ATOMIC_CMPXCHG_MIX(TYPE_ID,TYPE,OP_ID,BITS,OP,RTYPE_ID,RTYPE,LCK_ID,MASK,GOMP_FLAG) \
+ATOMIC_BEGIN_MIX(TYPE_ID,TYPE,OP_ID,RTYPE_ID,RTYPE)                                         \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)                                                       \
+    if ( ! ( (kmp_uintptr_t) lhs & 0x##MASK) ) {                                            \
+        OP_CMPXCHG(TYPE,BITS,OP)     /* aligned address */                                  \
+    } else {                                                                                \
+        KMP_CHECK_GTID;                                                                     \
+        OP_CRITICAL(OP##=,LCK_ID)  /* unaligned address - use critical */                   \
+    }                                                                                       \
+}
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
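+
+// Note: MASK is the hexadecimal alignment mask for the operand size (e.g. 7
+// for 8-byte operands); if any of those low address bits of lhs are set, the
+// address is unaligned and, on architectures that cannot do the unaligned
+// cmpxchg, the critical-section fallback is taken instead.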
+
+// RHS=float8
+ATOMIC_CMPXCHG_MIX( fixed1, char,       mul,  8, *, float8, kmp_real64, 1i, 0, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_mul_float8
+ATOMIC_CMPXCHG_MIX( fixed1, char,       div,  8, /, float8, kmp_real64, 1i, 0, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_div_float8
+ATOMIC_CMPXCHG_MIX( fixed2, short,      mul, 16, *, float8, kmp_real64, 2i, 1, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_mul_float8
+ATOMIC_CMPXCHG_MIX( fixed2, short,      div, 16, /, float8, kmp_real64, 2i, 1, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_div_float8
+ATOMIC_CMPXCHG_MIX( fixed4, kmp_int32,  mul, 32, *, float8, kmp_real64, 4i, 3, 0 )            // __kmpc_atomic_fixed4_mul_float8
+ATOMIC_CMPXCHG_MIX( fixed4, kmp_int32,  div, 32, /, float8, kmp_real64, 4i, 3, 0 )            // __kmpc_atomic_fixed4_div_float8
+ATOMIC_CMPXCHG_MIX( fixed8, kmp_int64,  mul, 64, *, float8, kmp_real64, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_mul_float8
+ATOMIC_CMPXCHG_MIX( fixed8, kmp_int64,  div, 64, /, float8, kmp_real64, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_div_float8
+ATOMIC_CMPXCHG_MIX( float4, kmp_real32, add, 32, +, float8, kmp_real64, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_add_float8
+ATOMIC_CMPXCHG_MIX( float4, kmp_real32, sub, 32, -, float8, kmp_real64, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_sub_float8
+ATOMIC_CMPXCHG_MIX( float4, kmp_real32, mul, 32, *, float8, kmp_real64, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_mul_float8
+ATOMIC_CMPXCHG_MIX( float4, kmp_real32, div, 32, /, float8, kmp_real64, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_div_float8
+
+// RHS=float16 (deprecated; to be removed once we are sure the compiler does not use these routines)
+#if KMP_HAVE_QUAD
+ATOMIC_CMPXCHG_MIX( fixed1,  char,       add,  8, +, fp, _Quad, 1i, 0, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_add_fp
+ATOMIC_CMPXCHG_MIX( fixed1,  char,       sub,  8, -, fp, _Quad, 1i, 0, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_sub_fp
+ATOMIC_CMPXCHG_MIX( fixed1,  char,       mul,  8, *, fp, _Quad, 1i, 0, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_mul_fp
+ATOMIC_CMPXCHG_MIX( fixed1,  char,       div,  8, /, fp, _Quad, 1i, 0, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_div_fp
+ATOMIC_CMPXCHG_MIX( fixed1u, uchar,      div,  8, /, fp, _Quad, 1i, 0, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1u_div_fp
+
+ATOMIC_CMPXCHG_MIX( fixed2,  short,      add, 16, +, fp, _Quad, 2i, 1, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_add_fp
+ATOMIC_CMPXCHG_MIX( fixed2,  short,      sub, 16, -, fp, _Quad, 2i, 1, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_sub_fp
+ATOMIC_CMPXCHG_MIX( fixed2,  short,      mul, 16, *, fp, _Quad, 2i, 1, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_mul_fp
+ATOMIC_CMPXCHG_MIX( fixed2,  short,      div, 16, /, fp, _Quad, 2i, 1, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_div_fp
+ATOMIC_CMPXCHG_MIX( fixed2u, ushort,     div, 16, /, fp, _Quad, 2i, 1, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2u_div_fp
+
+ATOMIC_CMPXCHG_MIX( fixed4,  kmp_int32,  add, 32, +, fp, _Quad, 4i, 3, 0 )            // __kmpc_atomic_fixed4_add_fp
+ATOMIC_CMPXCHG_MIX( fixed4,  kmp_int32,  sub, 32, -, fp, _Quad, 4i, 3, 0 )            // __kmpc_atomic_fixed4_sub_fp
+ATOMIC_CMPXCHG_MIX( fixed4,  kmp_int32,  mul, 32, *, fp, _Quad, 4i, 3, 0 )            // __kmpc_atomic_fixed4_mul_fp
+ATOMIC_CMPXCHG_MIX( fixed4,  kmp_int32,  div, 32, /, fp, _Quad, 4i, 3, 0 )            // __kmpc_atomic_fixed4_div_fp
+ATOMIC_CMPXCHG_MIX( fixed4u, kmp_uint32, div, 32, /, fp, _Quad, 4i, 3, 0 )            // __kmpc_atomic_fixed4u_div_fp
+
+ATOMIC_CMPXCHG_MIX( fixed8,  kmp_int64,  add, 64, +, fp, _Quad, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_add_fp
+ATOMIC_CMPXCHG_MIX( fixed8,  kmp_int64,  sub, 64, -, fp, _Quad, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_sub_fp
+ATOMIC_CMPXCHG_MIX( fixed8,  kmp_int64,  mul, 64, *, fp, _Quad, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_mul_fp
+ATOMIC_CMPXCHG_MIX( fixed8,  kmp_int64,  div, 64, /, fp, _Quad, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_div_fp
+ATOMIC_CMPXCHG_MIX( fixed8u, kmp_uint64, div, 64, /, fp, _Quad, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8u_div_fp
+
+ATOMIC_CMPXCHG_MIX( float4,  kmp_real32, add, 32, +, fp, _Quad, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_add_fp
+ATOMIC_CMPXCHG_MIX( float4,  kmp_real32, sub, 32, -, fp, _Quad, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_sub_fp
+ATOMIC_CMPXCHG_MIX( float4,  kmp_real32, mul, 32, *, fp, _Quad, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_mul_fp
+ATOMIC_CMPXCHG_MIX( float4,  kmp_real32, div, 32, /, fp, _Quad, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_div_fp
+
+ATOMIC_CMPXCHG_MIX( float8,  kmp_real64, add, 64, +, fp, _Quad, 8r, 7, KMP_ARCH_X86 ) // __kmpc_atomic_float8_add_fp
+ATOMIC_CMPXCHG_MIX( float8,  kmp_real64, sub, 64, -, fp, _Quad, 8r, 7, KMP_ARCH_X86 ) // __kmpc_atomic_float8_sub_fp
+ATOMIC_CMPXCHG_MIX( float8,  kmp_real64, mul, 64, *, fp, _Quad, 8r, 7, KMP_ARCH_X86 ) // __kmpc_atomic_float8_mul_fp
+ATOMIC_CMPXCHG_MIX( float8,  kmp_real64, div, 64, /, fp, _Quad, 8r, 7, KMP_ARCH_X86 ) // __kmpc_atomic_float8_div_fp
+
+ATOMIC_CRITICAL_FP( float10, long double,    add, +, fp, _Quad, 10r,   1 )            // __kmpc_atomic_float10_add_fp
+ATOMIC_CRITICAL_FP( float10, long double,    sub, -, fp, _Quad, 10r,   1 )            // __kmpc_atomic_float10_sub_fp
+ATOMIC_CRITICAL_FP( float10, long double,    mul, *, fp, _Quad, 10r,   1 )            // __kmpc_atomic_float10_mul_fp
+ATOMIC_CRITICAL_FP( float10, long double,    div, /, fp, _Quad, 10r,   1 )            // __kmpc_atomic_float10_div_fp
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// ------------------------------------------------------------------------
+// X86 or X86_64: no alignment problems ====================================
+#if USE_CMPXCHG_FIX
+// workaround for C78287 (complex(kind=4) data type)
+#define ATOMIC_CMPXCHG_CMPLX(TYPE_ID,TYPE,OP_ID,BITS,OP,RTYPE_ID,RTYPE,LCK_ID,MASK,GOMP_FLAG) \
+ATOMIC_BEGIN_MIX(TYPE_ID,TYPE,OP_ID,RTYPE_ID,RTYPE)                                           \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)                                                         \
+    OP_CMPXCHG_WORKAROUND(TYPE,BITS,OP)                                                       \
+}
+// end of the second part of the workaround for C78287
+#else
+#define ATOMIC_CMPXCHG_CMPLX(TYPE_ID,TYPE,OP_ID,BITS,OP,RTYPE_ID,RTYPE,LCK_ID,MASK,GOMP_FLAG) \
+ATOMIC_BEGIN_MIX(TYPE_ID,TYPE,OP_ID,RTYPE_ID,RTYPE)                                           \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)                                                         \
+    OP_CMPXCHG(TYPE,BITS,OP)                                                                  \
+}
+#endif // USE_CMPXCHG_FIX
+#else
+// ------------------------------------------------------------------------
+// Code for other architectures that don't handle unaligned accesses.
+#define ATOMIC_CMPXCHG_CMPLX(TYPE_ID,TYPE,OP_ID,BITS,OP,RTYPE_ID,RTYPE,LCK_ID,MASK,GOMP_FLAG) \
+ATOMIC_BEGIN_MIX(TYPE_ID,TYPE,OP_ID,RTYPE_ID,RTYPE)                                           \
+    OP_GOMP_CRITICAL(OP##=,GOMP_FLAG)                                                         \
+    if ( ! ( (kmp_uintptr_t) lhs & 0x##MASK) ) {                                              \
+        OP_CMPXCHG(TYPE,BITS,OP)     /* aligned address */                                    \
+    } else {                                                                                  \
+        KMP_CHECK_GTID;                                                                       \
+        OP_CRITICAL(OP##=,LCK_ID)  /* unaligned address - use critical */                     \
+    }                                                                                         \
+}
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+ATOMIC_CMPXCHG_CMPLX( cmplx4, kmp_cmplx32, add, 64, +, cmplx8,  kmp_cmplx64,  8c, 7, KMP_ARCH_X86 ) // __kmpc_atomic_cmplx4_add_cmplx8
+ATOMIC_CMPXCHG_CMPLX( cmplx4, kmp_cmplx32, sub, 64, -, cmplx8,  kmp_cmplx64,  8c, 7, KMP_ARCH_X86 ) // __kmpc_atomic_cmplx4_sub_cmplx8
+ATOMIC_CMPXCHG_CMPLX( cmplx4, kmp_cmplx32, mul, 64, *, cmplx8,  kmp_cmplx64,  8c, 7, KMP_ARCH_X86 ) // __kmpc_atomic_cmplx4_mul_cmplx8
+ATOMIC_CMPXCHG_CMPLX( cmplx4, kmp_cmplx32, div, 64, /, cmplx8,  kmp_cmplx64,  8c, 7, KMP_ARCH_X86 ) // __kmpc_atomic_cmplx4_div_cmplx8
+
+// READ, WRITE, CAPTURE are supported only on IA-32 architecture and Intel(R) 64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+// ------------------------------------------------------------------------
+// Atomic READ routines
+// ------------------------------------------------------------------------
+
+// ------------------------------------------------------------------------
+// Beginning of a definition (provides name, parameters, debug trace)
+//     TYPE_ID - operands' type and size (fixed*, fixed*u for signed, unsigned fixed)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - operands' type
+#define ATOMIC_BEGIN_READ(TYPE_ID,OP_ID,TYPE, RET_TYPE) \
+RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID( ident_t *id_ref, int gtid, TYPE * loc ) \
+{                                                                                   \
+    KMP_DEBUG_ASSERT( __kmp_init_serial );                                          \
+    KA_TRACE(100,("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid ));
+
+// ------------------------------------------------------------------------
+// Atomic read of *loc using "compare_and_store_ret" routine
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator
+// Note: temp_val introduced in order to force the compiler to read
+//       *loc only once (w/o it the compiler reads *loc twice)
+// TODO: check if it is still necessary
+// Return the old value regardless of the result of the "compare & swap" operation
+
+#define OP_CMPXCHG_READ(TYPE,BITS,OP)                                     \
+    {                                                                     \
+        TYPE KMP_ATOMIC_VOLATILE temp_val;                                \
+        union f_i_union {                                                 \
+            TYPE f_val;                                                   \
+            kmp_int##BITS i_val;                                          \
+        };                                                                \
+        union f_i_union old_value;                                        \
+        temp_val = *loc;                                                  \
+        old_value.f_val = temp_val;                                       \
+        old_value.i_val = KMP_COMPARE_AND_STORE_RET##BITS( (kmp_int##BITS *) loc, \
+                      *VOLATILE_CAST(kmp_int##BITS *) &old_value.i_val,   \
+                      *VOLATILE_CAST(kmp_int##BITS *) &old_value.i_val ); \
+        new_value = old_value.f_val;                                      \
+        return new_value;                                                 \
+    }
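+// Illustration (sketch): compare-and-storing a location's current value
+// against itself leaves *loc unchanged while returning its contents
+// atomically; e.g. for TYPE=kmp_real32, BITS=32 the read is effectively
+//
+//     old_value.i_val = KMP_COMPARE_AND_STORE_RET32( (kmp_int32 *) loc,
+//                           old_value.i_val, old_value.i_val );
+//
+// and f_i_union reinterprets the returned integer bits as the floating value.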
+
+// -------------------------------------------------------------------------
+// Atomic read of *loc bound by critical section
+//     OP     - operator (it's supposed to contain an assignment)
+//     LCK_ID - lock identifier
+// Note: don't check gtid as it should always be valid
+// For 1- and 2-byte operands a valid gtid is expected; for other types,
+// check it before this macro
+#define OP_CRITICAL_READ(OP,LCK_ID)                                       \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+                                                                          \
+    new_value = (*loc);                                                   \
+                                                                          \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );
+
+// -------------------------------------------------------------------------
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_READ(OP,FLAG)                                    \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL_READ( OP, 0 );                                        \
+        return new_value;                                                 \
+    }
+#else
+#define OP_GOMP_CRITICAL_READ(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// -------------------------------------------------------------------------
+#define ATOMIC_FIXED_READ(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG)           \
+ATOMIC_BEGIN_READ(TYPE_ID,OP_ID,TYPE,TYPE)                                \
+    TYPE new_value;                                                       \
+    OP_GOMP_CRITICAL_READ(OP##=,GOMP_FLAG)                                \
+    new_value = KMP_TEST_THEN_ADD##BITS( loc, OP 0 );                     \
+    return new_value;                                                     \
+}
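+// Note: the read is implemented as an atomic fetch-and-add of zero ("OP 0"
+// expands to "+ 0"), so KMP_TEST_THEN_ADD##BITS returns the current value of
+// *loc without modifying it.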
+// -------------------------------------------------------------------------
+#define ATOMIC_CMPXCHG_READ(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG)         \
+ATOMIC_BEGIN_READ(TYPE_ID,OP_ID,TYPE,TYPE)                                \
+    TYPE new_value;                                                       \
+    OP_GOMP_CRITICAL_READ(OP##=,GOMP_FLAG)                                \
+    OP_CMPXCHG_READ(TYPE,BITS,OP)                                         \
+}
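+// For example (illustrative only),
+//
+//     #pragma omp atomic read
+//     v = x;                             // x is a 64-bit double
+//
+// may be lowered to
+//
+//     v = __kmpc_atomic_float8_rd( id_ref, gtid, &x );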
+// ------------------------------------------------------------------------
+// Routines for Extended types: long double, _Quad, complex flavours (use critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP      - operator
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+#define ATOMIC_CRITICAL_READ(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG)      \
+ATOMIC_BEGIN_READ(TYPE_ID,OP_ID,TYPE,TYPE)                                \
+    TYPE new_value;                                                       \
+    OP_GOMP_CRITICAL_READ(OP##=,GOMP_FLAG)  /* send assignment */         \
+    OP_CRITICAL_READ(OP,LCK_ID)          /* send assignment */            \
+    return new_value;                                                     \
+}
+
+// ------------------------------------------------------------------------
+// Fix for cmplx4 read (CQ220361) on Windows* OS. Regular routine with return value doesn't work.
+// Let's return the read value through the additional parameter.
+
+#if ( KMP_OS_WINDOWS )
+
+#define OP_CRITICAL_READ_WRK(OP,LCK_ID)                                   \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+                                                                          \
+    (*out) = (*loc);                                                      \
+                                                                          \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );
+// ------------------------------------------------------------------------
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_READ_WRK(OP,FLAG)                                \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL_READ_WRK( OP, 0 );                                    \
+    }
+#else
+#define OP_GOMP_CRITICAL_READ_WRK(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+// ------------------------------------------------------------------------
+#define ATOMIC_BEGIN_READ_WRK(TYPE_ID,OP_ID,TYPE) \
+void __kmpc_atomic_##TYPE_ID##_##OP_ID( TYPE * out, ident_t *id_ref, int gtid, TYPE * loc ) \
+{                                                                                   \
+    KMP_DEBUG_ASSERT( __kmp_init_serial );                                          \
+    KA_TRACE(100,("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid ));
+
+// ------------------------------------------------------------------------
+#define ATOMIC_CRITICAL_READ_WRK(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG)      \
+ATOMIC_BEGIN_READ_WRK(TYPE_ID,OP_ID,TYPE)                                     \
+    OP_GOMP_CRITICAL_READ_WRK(OP##=,GOMP_FLAG)  /* send assignment */         \
+    OP_CRITICAL_READ_WRK(OP,LCK_ID)          /* send assignment */            \
+}
+
+#endif // KMP_OS_WINDOWS
+
+// ------------------------------------------------------------------------
+//                 TYPE_ID, OP_ID, TYPE,      BITS, OP, GOMP_FLAG
+ATOMIC_FIXED_READ( fixed4, rd, kmp_int32,  32, +, 0            )      // __kmpc_atomic_fixed4_rd
+ATOMIC_FIXED_READ( fixed8, rd, kmp_int64,  64, +, KMP_ARCH_X86 )      // __kmpc_atomic_fixed8_rd
+ATOMIC_CMPXCHG_READ( float4, rd, kmp_real32, 32, +, KMP_ARCH_X86 )    // __kmpc_atomic_float4_rd
+ATOMIC_CMPXCHG_READ( float8, rd, kmp_real64, 64, +, KMP_ARCH_X86 )    // __kmpc_atomic_float8_rd
+
+// !!! TODO: Remove lock operations for "char" since it can't be non-atomic
+ATOMIC_CMPXCHG_READ( fixed1,  rd, kmp_int8,    8, +,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_rd
+ATOMIC_CMPXCHG_READ( fixed2,  rd, kmp_int16,  16, +,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_rd
+
+ATOMIC_CRITICAL_READ( float10, rd, long double, +, 10r,   1 )         // __kmpc_atomic_float10_rd
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_READ( float16, rd, QUAD_LEGACY, +, 16r,   1 )         // __kmpc_atomic_float16_rd
+#endif // KMP_HAVE_QUAD
+
+// Fix for CQ220361 on Windows* OS
+#if ( KMP_OS_WINDOWS )
+    ATOMIC_CRITICAL_READ_WRK( cmplx4,  rd, kmp_cmplx32, +,  8c, 1 )   // __kmpc_atomic_cmplx4_rd
+#else
+    ATOMIC_CRITICAL_READ( cmplx4,  rd, kmp_cmplx32, +,  8c, 1 )       // __kmpc_atomic_cmplx4_rd
+#endif
+ATOMIC_CRITICAL_READ( cmplx8,  rd, kmp_cmplx64, +, 16c, 1 )           // __kmpc_atomic_cmplx8_rd
+ATOMIC_CRITICAL_READ( cmplx10, rd, kmp_cmplx80, +, 20c, 1 )           // __kmpc_atomic_cmplx10_rd
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_READ( cmplx16, rd, CPLX128_LEG, +, 32c, 1 )           // __kmpc_atomic_cmplx16_rd
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CRITICAL_READ( float16, a16_rd, Quad_a16_t, +, 16r, 1 )         // __kmpc_atomic_float16_a16_rd
+    ATOMIC_CRITICAL_READ( cmplx16, a16_rd, kmp_cmplx128_a16_t, +, 32c, 1 ) // __kmpc_atomic_cmplx16_a16_rd
+#endif
+#endif
+
+
+// ------------------------------------------------------------------------
+// Atomic WRITE routines
+// ------------------------------------------------------------------------
+
+#define ATOMIC_XCHG_WR(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG)              \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL(OP,GOMP_FLAG)                                        \
+    KMP_XCHG_FIXED##BITS( lhs, rhs );                                     \
+}
+// ------------------------------------------------------------------------
+#define ATOMIC_XCHG_FLOAT_WR(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG)        \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL(OP,GOMP_FLAG)                                        \
+    KMP_XCHG_REAL##BITS( lhs, rhs );                                      \
+}
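+// For example (illustrative only),
+//
+//     #pragma omp atomic write
+//     x = rhs;                           // x is a 32-bit integer
+//
+// may be lowered to __kmpc_atomic_fixed4_wr( id_ref, gtid, &x, rhs ), which
+// performs a single atomic exchange and discards the previous value.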
+
+
+// ------------------------------------------------------------------------
+// Operation on *lhs, rhs using "compare_and_store" routine
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator
+// Note: temp_val introduced in order to force the compiler to read
+//       *lhs only once (w/o it the compiler reads *lhs twice)
+#define OP_CMPXCHG_WR(TYPE,BITS,OP)                                       \
+    {                                                                     \
+        TYPE KMP_ATOMIC_VOLATILE temp_val;                                \
+        TYPE old_value, new_value;                                        \
+        temp_val = *lhs;                                                  \
+        old_value = temp_val;                                             \
+        new_value = rhs;                                                  \
+        while ( ! KMP_COMPARE_AND_STORE_ACQ##BITS( (kmp_int##BITS *) lhs, \
+                      *VOLATILE_CAST(kmp_int##BITS *) &old_value,         \
+                      *VOLATILE_CAST(kmp_int##BITS *) &new_value ) )      \
+        {                                                                 \
+            KMP_CPU_PAUSE();                                              \
+                                                                          \
+            temp_val = *lhs;                                              \
+            old_value = temp_val;                                         \
+            new_value = rhs;                                              \
+        }                                                                 \
+    }
+
+// -------------------------------------------------------------------------
+#define ATOMIC_CMPXCHG_WR(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG)           \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL(OP,GOMP_FLAG)                                        \
+    OP_CMPXCHG_WR(TYPE,BITS,OP)                                           \
+}
+
+// ------------------------------------------------------------------------
+// Routines for Extended types: long double, _Quad, complex flavours (use critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP      - operator
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+#define ATOMIC_CRITICAL_WR(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG)        \
+ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void)                                     \
+    OP_GOMP_CRITICAL(OP,GOMP_FLAG)       /* send assignment */            \
+    OP_CRITICAL(OP,LCK_ID)               /* send assignment */            \
+}
+// -------------------------------------------------------------------------
+
+ATOMIC_XCHG_WR( fixed1,  wr, kmp_int8,    8, =,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_wr
+ATOMIC_XCHG_WR( fixed2,  wr, kmp_int16,  16, =,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_wr
+ATOMIC_XCHG_WR( fixed4,  wr, kmp_int32,  32, =,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_wr
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CMPXCHG_WR( fixed8,  wr, kmp_int64,  64, =,  KMP_ARCH_X86 )      // __kmpc_atomic_fixed8_wr
+#else
+    ATOMIC_XCHG_WR( fixed8,  wr, kmp_int64,  64, =,  KMP_ARCH_X86 )         // __kmpc_atomic_fixed8_wr
+#endif
+
+ATOMIC_XCHG_FLOAT_WR( float4, wr, kmp_real32, 32, =, KMP_ARCH_X86 )         // __kmpc_atomic_float4_wr
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CMPXCHG_WR( float8,  wr, kmp_real64,  64, =,  KMP_ARCH_X86 )     // __kmpc_atomic_float8_wr
+#else
+    ATOMIC_XCHG_FLOAT_WR( float8,  wr, kmp_real64,  64, =,  KMP_ARCH_X86 )  // __kmpc_atomic_float8_wr
+#endif
+
+ATOMIC_CRITICAL_WR( float10, wr, long double, =, 10r,   1 )         // __kmpc_atomic_float10_wr
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_WR( float16, wr, QUAD_LEGACY, =, 16r,   1 )         // __kmpc_atomic_float16_wr
+#endif
+ATOMIC_CRITICAL_WR( cmplx4,  wr, kmp_cmplx32, =,  8c,   1 )         // __kmpc_atomic_cmplx4_wr
+ATOMIC_CRITICAL_WR( cmplx8,  wr, kmp_cmplx64, =, 16c,   1 )         // __kmpc_atomic_cmplx8_wr
+ATOMIC_CRITICAL_WR( cmplx10, wr, kmp_cmplx80, =, 20c,   1 )         // __kmpc_atomic_cmplx10_wr
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_WR( cmplx16, wr, CPLX128_LEG, =, 32c,   1 )         // __kmpc_atomic_cmplx16_wr
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CRITICAL_WR( float16, a16_wr, Quad_a16_t,         =, 16r, 1 ) // __kmpc_atomic_float16_a16_wr
+    ATOMIC_CRITICAL_WR( cmplx16, a16_wr, kmp_cmplx128_a16_t, =, 32c, 1 ) // __kmpc_atomic_cmplx16_a16_wr
+#endif
+#endif
+
+
+// ------------------------------------------------------------------------
+// Atomic CAPTURE routines
+// ------------------------------------------------------------------------
+
+// Beginning of a definition (provides name, parameters, debug trace)
+//     TYPE_ID - operands' type and size (fixed*, fixed*u for signed, unsigned fixed)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - operands' type
+#define ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,RET_TYPE)                                    \
+RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID( ident_t *id_ref, int gtid, TYPE * lhs, TYPE rhs, int flag ) \
+{                                                                                         \
+    KMP_DEBUG_ASSERT( __kmp_init_serial );                                                \
+    KA_TRACE(100,("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid ));
+
+// -------------------------------------------------------------------------
+// Operation on *lhs, rhs bound by critical section
+//     OP     - operator (it's supposed to contain an assignment)
+//     LCK_ID - lock identifier
+// Note: don't check gtid as it should always be valid
+// For 1- and 2-byte operands a valid gtid is expected; for other types,
+// check it before this macro
+#define OP_CRITICAL_CPT(OP,LCK_ID)                                        \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+                                                                          \
+    if( flag ) {                                                          \
+        (*lhs) OP rhs;                                                    \
+        new_value = (*lhs);                                               \
+    } else {                                                              \
+        new_value = (*lhs);                                               \
+        (*lhs) OP rhs;                                                    \
+    }                                                                     \
+                                                                          \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+    return new_value;
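+
+// For example (illustrative only), the two capture forms
+//
+//     #pragma omp atomic capture
+//     { v = x; x += rhs; }               // capture the old value: flag == 0
+//
+//     #pragma omp atomic capture
+//     v = x += rhs;                      // capture the new value: flag == 1
+//
+// are expected to be lowered to the same routine, e.g.
+//
+//     v = __kmpc_atomic_fixed4_add_cpt( id_ref, gtid, &x, rhs, flag );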
+
+// ------------------------------------------------------------------------
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_CPT(OP,FLAG)                                     \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL_CPT( OP##=, 0 );                                      \
+    }
+#else
+#define OP_GOMP_CRITICAL_CPT(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// ------------------------------------------------------------------------
+// Operation on *lhs, rhs using "compare_and_store" routine
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator
+// Note: temp_val introduced in order to force the compiler to read
+//       *lhs only once (w/o it the compiler reads *lhs twice)
+#define OP_CMPXCHG_CPT(TYPE,BITS,OP)                                      \
+    {                                                                     \
+        TYPE KMP_ATOMIC_VOLATILE temp_val;                                \
+        TYPE old_value, new_value;                                        \
+        temp_val = *lhs;                                                  \
+        old_value = temp_val;                                             \
+        new_value = old_value OP rhs;                                     \
+        while ( ! KMP_COMPARE_AND_STORE_ACQ##BITS( (kmp_int##BITS *) lhs, \
+                      *VOLATILE_CAST(kmp_int##BITS *) &old_value,         \
+                      *VOLATILE_CAST(kmp_int##BITS *) &new_value ) )      \
+        {                                                                 \
+            KMP_CPU_PAUSE();                                              \
+                                                                          \
+            temp_val = *lhs;                                              \
+            old_value = temp_val;                                         \
+            new_value = old_value OP rhs;                                 \
+        }                                                                 \
+        if( flag ) {                                                      \
+            return new_value;                                             \
+        } else                                                            \
+            return old_value;                                             \
+    }
+
+// -------------------------------------------------------------------------
+#define ATOMIC_CMPXCHG_CPT(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG)           \
+ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,TYPE)                                  \
+    TYPE new_value;                                                        \
+    OP_GOMP_CRITICAL_CPT(OP,GOMP_FLAG)                                     \
+    OP_CMPXCHG_CPT(TYPE,BITS,OP)                                           \
+}
+
+// -------------------------------------------------------------------------
+#define ATOMIC_FIXED_ADD_CPT(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG)         \
+ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,TYPE)                                  \
+    TYPE old_value, new_value;                                             \
+    OP_GOMP_CRITICAL_CPT(OP,GOMP_FLAG)                                     \
+    /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */      \
+    old_value = KMP_TEST_THEN_ADD##BITS( lhs, OP rhs );                    \
+    if( flag ) {                                                           \
+        return old_value OP rhs;                                           \
+    } else                                                                 \
+        return old_value;                                                  \
+}
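+// Note: KMP_TEST_THEN_ADD##BITS returns the value *lhs held before the atomic
+// add, so "old_value OP rhs" recomputes the captured new value without a
+// second (non-atomic) read of *lhs.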
+// -------------------------------------------------------------------------
+
+ATOMIC_FIXED_ADD_CPT( fixed4, add_cpt, kmp_int32,  32, +, 0            )  // __kmpc_atomic_fixed4_add_cpt
+ATOMIC_FIXED_ADD_CPT( fixed4, sub_cpt, kmp_int32,  32, -, 0            )  // __kmpc_atomic_fixed4_sub_cpt
+ATOMIC_FIXED_ADD_CPT( fixed8, add_cpt, kmp_int64,  64, +, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_add_cpt
+ATOMIC_FIXED_ADD_CPT( fixed8, sub_cpt, kmp_int64,  64, -, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_sub_cpt
+
+ATOMIC_CMPXCHG_CPT( float4, add_cpt, kmp_real32, 32, +, KMP_ARCH_X86 )  // __kmpc_atomic_float4_add_cpt
+ATOMIC_CMPXCHG_CPT( float4, sub_cpt, kmp_real32, 32, -, KMP_ARCH_X86 )  // __kmpc_atomic_float4_sub_cpt
+ATOMIC_CMPXCHG_CPT( float8, add_cpt, kmp_real64, 64, +, KMP_ARCH_X86 )  // __kmpc_atomic_float8_add_cpt
+ATOMIC_CMPXCHG_CPT( float8, sub_cpt, kmp_real64, 64, -, KMP_ARCH_X86 )  // __kmpc_atomic_float8_sub_cpt
+
+// ------------------------------------------------------------------------
+// Entries definition for integer operands
+//     TYPE_ID - operands type and size (fixed4, float4)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - operand type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator (used in critical section)
+//               TYPE_ID,OP_ID,  TYPE,   BITS,OP,GOMP_FLAG
+// ------------------------------------------------------------------------
+// Routines for ATOMIC integer operands, other operators
+// ------------------------------------------------------------------------
+//              TYPE_ID, OP_ID,  TYPE,      BITS, OP,  GOMP_FLAG
+ATOMIC_CMPXCHG_CPT( fixed1,  add_cpt, kmp_int8,    8, +,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_add_cpt
+ATOMIC_CMPXCHG_CPT( fixed1, andb_cpt, kmp_int8,    8, &,  0            )  // __kmpc_atomic_fixed1_andb_cpt
+ATOMIC_CMPXCHG_CPT( fixed1,  div_cpt, kmp_int8,    8, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_div_cpt
+ATOMIC_CMPXCHG_CPT( fixed1u, div_cpt, kmp_uint8,   8, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed1u_div_cpt
+ATOMIC_CMPXCHG_CPT( fixed1,  mul_cpt, kmp_int8,    8, *,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_mul_cpt
+ATOMIC_CMPXCHG_CPT( fixed1,  orb_cpt, kmp_int8,    8, |,  0            )  // __kmpc_atomic_fixed1_orb_cpt
+ATOMIC_CMPXCHG_CPT( fixed1,  shl_cpt, kmp_int8,    8, <<, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_shl_cpt
+ATOMIC_CMPXCHG_CPT( fixed1,  shr_cpt, kmp_int8,    8, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_shr_cpt
+ATOMIC_CMPXCHG_CPT( fixed1u, shr_cpt, kmp_uint8,   8, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1u_shr_cpt
+ATOMIC_CMPXCHG_CPT( fixed1,  sub_cpt, kmp_int8,    8, -,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_sub_cpt
+ATOMIC_CMPXCHG_CPT( fixed1,  xor_cpt, kmp_int8,    8, ^,  0            )  // __kmpc_atomic_fixed1_xor_cpt
+ATOMIC_CMPXCHG_CPT( fixed2,  add_cpt, kmp_int16,  16, +,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_add_cpt
+ATOMIC_CMPXCHG_CPT( fixed2, andb_cpt, kmp_int16,  16, &,  0            )  // __kmpc_atomic_fixed2_andb_cpt
+ATOMIC_CMPXCHG_CPT( fixed2,  div_cpt, kmp_int16,  16, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_div_cpt
+ATOMIC_CMPXCHG_CPT( fixed2u, div_cpt, kmp_uint16, 16, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed2u_div_cpt
+ATOMIC_CMPXCHG_CPT( fixed2,  mul_cpt, kmp_int16,  16, *,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_mul_cpt
+ATOMIC_CMPXCHG_CPT( fixed2,  orb_cpt, kmp_int16,  16, |,  0            )  // __kmpc_atomic_fixed2_orb_cpt
+ATOMIC_CMPXCHG_CPT( fixed2,  shl_cpt, kmp_int16,  16, <<, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_shl_cpt
+ATOMIC_CMPXCHG_CPT( fixed2,  shr_cpt, kmp_int16,  16, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_shr_cpt
+ATOMIC_CMPXCHG_CPT( fixed2u, shr_cpt, kmp_uint16, 16, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2u_shr_cpt
+ATOMIC_CMPXCHG_CPT( fixed2,  sub_cpt, kmp_int16,  16, -,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_sub_cpt
+ATOMIC_CMPXCHG_CPT( fixed2,  xor_cpt, kmp_int16,  16, ^,  0            )  // __kmpc_atomic_fixed2_xor_cpt
+ATOMIC_CMPXCHG_CPT( fixed4, andb_cpt, kmp_int32,  32, &,  0            )  // __kmpc_atomic_fixed4_andb_cpt
+ATOMIC_CMPXCHG_CPT( fixed4,  div_cpt, kmp_int32,  32, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_div_cpt
+ATOMIC_CMPXCHG_CPT( fixed4u, div_cpt, kmp_uint32, 32, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed4u_div_cpt
+ATOMIC_CMPXCHG_CPT( fixed4,  mul_cpt, kmp_int32,  32, *,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_mul_cpt
+ATOMIC_CMPXCHG_CPT( fixed4,  orb_cpt, kmp_int32,  32, |,  0            )  // __kmpc_atomic_fixed4_orb_cpt
+ATOMIC_CMPXCHG_CPT( fixed4,  shl_cpt, kmp_int32,  32, <<, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_shl_cpt
+ATOMIC_CMPXCHG_CPT( fixed4,  shr_cpt, kmp_int32,  32, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_shr_cpt
+ATOMIC_CMPXCHG_CPT( fixed4u, shr_cpt, kmp_uint32, 32, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4u_shr_cpt
+ATOMIC_CMPXCHG_CPT( fixed4,  xor_cpt, kmp_int32,  32, ^,  0            )  // __kmpc_atomic_fixed4_xor_cpt
+ATOMIC_CMPXCHG_CPT( fixed8, andb_cpt, kmp_int64,  64, &,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_andb_cpt
+ATOMIC_CMPXCHG_CPT( fixed8,  div_cpt, kmp_int64,  64, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_div_cpt
+ATOMIC_CMPXCHG_CPT( fixed8u, div_cpt, kmp_uint64, 64, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed8u_div_cpt
+ATOMIC_CMPXCHG_CPT( fixed8,  mul_cpt, kmp_int64,  64, *,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_mul_cpt
+ATOMIC_CMPXCHG_CPT( fixed8,  orb_cpt, kmp_int64,  64, |,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_orb_cpt
+ATOMIC_CMPXCHG_CPT( fixed8,  shl_cpt, kmp_int64,  64, <<, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_shl_cpt
+ATOMIC_CMPXCHG_CPT( fixed8,  shr_cpt, kmp_int64,  64, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_shr_cpt
+ATOMIC_CMPXCHG_CPT( fixed8u, shr_cpt, kmp_uint64, 64, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8u_shr_cpt
+ATOMIC_CMPXCHG_CPT( fixed8,  xor_cpt, kmp_int64,  64, ^,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_xor_cpt
+ATOMIC_CMPXCHG_CPT( float4,  div_cpt, kmp_real32, 32, /,  KMP_ARCH_X86 )  // __kmpc_atomic_float4_div_cpt
+ATOMIC_CMPXCHG_CPT( float4,  mul_cpt, kmp_real32, 32, *,  KMP_ARCH_X86 )  // __kmpc_atomic_float4_mul_cpt
+ATOMIC_CMPXCHG_CPT( float8,  div_cpt, kmp_real64, 64, /,  KMP_ARCH_X86 )  // __kmpc_atomic_float8_div_cpt
+ATOMIC_CMPXCHG_CPT( float8,  mul_cpt, kmp_real64, 64, *,  KMP_ARCH_X86 )  // __kmpc_atomic_float8_mul_cpt
+//              TYPE_ID, OP_ID,  TYPE,      BITS, OP,  GOMP_FLAG
+
+// ------------------------------------------------------------------------
+// Routines for C/C++ Reduction operators && and ||
+// ------------------------------------------------------------------------
+
+// -------------------------------------------------------------------------
+// Operation on *lhs, rhs bound by critical section
+//     OP     - operator (it's supposed to contain an assignment)
+//     LCK_ID - lock identifier
+// Note: don't check gtid as it should always be valid
+// For 1- and 2-byte operands a valid gtid is expected; for other types,
+// check it before this macro
+#define OP_CRITICAL_L_CPT(OP,LCK_ID)                                      \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+                                                                          \
+    if( flag ) {                                                          \
+        new_value OP rhs;                                                 \
+    } else                                                                \
+        new_value = (*lhs);                                               \
+                                                                          \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );
+
+// ------------------------------------------------------------------------
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_L_CPT(OP,FLAG)                                   \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL_L_CPT( OP, 0 );                                       \
+        return new_value;                                                 \
+    }
+#else
+#define OP_GOMP_CRITICAL_L_CPT(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// ------------------------------------------------------------------------
+// Need separate macros for &&, || because there is no combined assignment
+#define ATOMIC_CMPX_L_CPT(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG)           \
+ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,TYPE)                                 \
+    TYPE new_value;                                                       \
+    OP_GOMP_CRITICAL_L_CPT( = *lhs OP, GOMP_FLAG )                        \
+    OP_CMPXCHG_CPT(TYPE,BITS,OP)                                          \
+}
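+// Note: the OP argument passed above is the token sequence "= *lhs OP", so in
+// the GOMP critical path "new_value OP rhs" expands to
+// "new_value = *lhs && rhs" (or "||"), synthesizing the missing combined
+// assignment.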
+
+ATOMIC_CMPX_L_CPT( fixed1, andl_cpt, char,       8, &&, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_andl_cpt
+ATOMIC_CMPX_L_CPT( fixed1,  orl_cpt, char,       8, ||, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_orl_cpt
+ATOMIC_CMPX_L_CPT( fixed2, andl_cpt, short,     16, &&, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_andl_cpt
+ATOMIC_CMPX_L_CPT( fixed2,  orl_cpt, short,     16, ||, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_orl_cpt
+ATOMIC_CMPX_L_CPT( fixed4, andl_cpt, kmp_int32, 32, &&, 0 )             // __kmpc_atomic_fixed4_andl_cpt
+ATOMIC_CMPX_L_CPT( fixed4,  orl_cpt, kmp_int32, 32, ||, 0 )             // __kmpc_atomic_fixed4_orl_cpt
+ATOMIC_CMPX_L_CPT( fixed8, andl_cpt, kmp_int64, 64, &&, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_andl_cpt
+ATOMIC_CMPX_L_CPT( fixed8,  orl_cpt, kmp_int64, 64, ||, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_orl_cpt
+
+
+// -------------------------------------------------------------------------
+// Routines for Fortran operators that have no direct C counterpart:
+// MAX, MIN, .EQV., .NEQV.
+// Operators .AND., .OR. are covered by __kmpc_atomic_*_{andl,orl}_cpt
+// Intrinsics IAND, IOR, IEOR are covered by __kmpc_atomic_*_{andb,orb,xor}_cpt
+// -------------------------------------------------------------------------
+
+// -------------------------------------------------------------------------
+// MIN and MAX need separate macros
+//     OP - operator used to check whether an update is still needed
+#define MIN_MAX_CRITSECT_CPT(OP,LCK_ID)                                    \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );              \
+                                                                           \
+    if ( *lhs OP rhs ) {                 /* still need actions? */         \
+        old_value = *lhs;                                                  \
+        *lhs = rhs;                                                        \
+        if ( flag )                                                        \
+            new_value = rhs;                                               \
+        else                                                               \
+            new_value = old_value;                                         \
+    }                                                                      \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );              \
+    return new_value;
+
+// -------------------------------------------------------------------------
+#ifdef KMP_GOMP_COMPAT
+#define GOMP_MIN_MAX_CRITSECT_CPT(OP,FLAG)                                 \
+    if (( FLAG ) && ( __kmp_atomic_mode == 2 )) {                          \
+        KMP_CHECK_GTID;                                                    \
+        MIN_MAX_CRITSECT_CPT( OP, 0 );                                     \
+    }
+#else
+#define GOMP_MIN_MAX_CRITSECT_CPT(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// -------------------------------------------------------------------------
+#define MIN_MAX_CMPXCHG_CPT(TYPE,BITS,OP)                                  \
+    {                                                                      \
+        TYPE KMP_ATOMIC_VOLATILE temp_val;                                 \
+        /*TYPE old_value; */                                               \
+        temp_val = *lhs;                                                   \
+        old_value = temp_val;                                              \
+        while ( old_value OP rhs &&          /* still need actions? */     \
+            ! KMP_COMPARE_AND_STORE_ACQ##BITS( (kmp_int##BITS *) lhs,      \
+                      *VOLATILE_CAST(kmp_int##BITS *) &old_value,          \
+                      *VOLATILE_CAST(kmp_int##BITS *) &rhs ) )             \
+        {                                                                  \
+            KMP_CPU_PAUSE();                                               \
+            temp_val = *lhs;                                               \
+            old_value = temp_val;                                          \
+        }                                                                  \
+        if( flag )                                                         \
+            return rhs;                                                    \
+        else                                                               \
+            return old_value;                                              \
+    }
+
+// -------------------------------------------------------------------------
+// 1-byte, 2-byte operands - use critical section
+#define MIN_MAX_CRITICAL_CPT(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG)       \
+ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,TYPE)                                  \
+    TYPE new_value, old_value;                                             \
+    if ( *lhs OP rhs ) {     /* need actions? */                           \
+        GOMP_MIN_MAX_CRITSECT_CPT(OP,GOMP_FLAG)                            \
+        MIN_MAX_CRITSECT_CPT(OP,LCK_ID)                                    \
+    }                                                                      \
+    return *lhs;                                                           \
+}
+
+#define MIN_MAX_COMPXCHG_CPT(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG)         \
+ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,TYPE)                                  \
+    TYPE new_value, old_value;                                             \
+    if ( *lhs OP rhs ) {                                                   \
+        GOMP_MIN_MAX_CRITSECT_CPT(OP,GOMP_FLAG)                            \
+        MIN_MAX_CMPXCHG_CPT(TYPE,BITS,OP)                                  \
+    }                                                                      \
+    return *lhs;                                                           \
+}
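+// For example (illustrative only),
+//
+//     old = __kmpc_atomic_fixed4_max_cpt( id_ref, gtid, &x, 10, 0 );
+//
+// atomically performs x = max(x, 10) and, with flag == 0, returns the value
+// x held beforehand; if x was already >= 10, no store is performed.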
+
+
+MIN_MAX_COMPXCHG_CPT( fixed1,  max_cpt, char,        8, <, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_max_cpt
+MIN_MAX_COMPXCHG_CPT( fixed1,  min_cpt, char,        8, >, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_min_cpt
+MIN_MAX_COMPXCHG_CPT( fixed2,  max_cpt, short,      16, <, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_max_cpt
+MIN_MAX_COMPXCHG_CPT( fixed2,  min_cpt, short,      16, >, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_min_cpt
+MIN_MAX_COMPXCHG_CPT( fixed4,  max_cpt, kmp_int32,  32, <, 0 )            // __kmpc_atomic_fixed4_max_cpt
+MIN_MAX_COMPXCHG_CPT( fixed4,  min_cpt, kmp_int32,  32, >, 0 )            // __kmpc_atomic_fixed4_min_cpt
+MIN_MAX_COMPXCHG_CPT( fixed8,  max_cpt, kmp_int64,  64, <, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_max_cpt
+MIN_MAX_COMPXCHG_CPT( fixed8,  min_cpt, kmp_int64,  64, >, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_min_cpt
+MIN_MAX_COMPXCHG_CPT( float4,  max_cpt, kmp_real32, 32, <, KMP_ARCH_X86 ) // __kmpc_atomic_float4_max_cpt
+MIN_MAX_COMPXCHG_CPT( float4,  min_cpt, kmp_real32, 32, >, KMP_ARCH_X86 ) // __kmpc_atomic_float4_min_cpt
+MIN_MAX_COMPXCHG_CPT( float8,  max_cpt, kmp_real64, 64, <, KMP_ARCH_X86 ) // __kmpc_atomic_float8_max_cpt
+MIN_MAX_COMPXCHG_CPT( float8,  min_cpt, kmp_real64, 64, >, KMP_ARCH_X86 ) // __kmpc_atomic_float8_min_cpt
+#if KMP_HAVE_QUAD
+MIN_MAX_CRITICAL_CPT( float16, max_cpt, QUAD_LEGACY,    <, 16r,   1 )     // __kmpc_atomic_float16_max_cpt
+MIN_MAX_CRITICAL_CPT( float16, min_cpt, QUAD_LEGACY,    >, 16r,   1 )     // __kmpc_atomic_float16_min_cpt
+#if ( KMP_ARCH_X86 )
+    MIN_MAX_CRITICAL_CPT( float16, max_a16_cpt, Quad_a16_t, <, 16r,  1 )  // __kmpc_atomic_float16_max_a16_cpt
+    MIN_MAX_CRITICAL_CPT( float16, min_a16_cpt, Quad_a16_t, >, 16r,  1 )  // __kmpc_atomic_float16_min_a16_cpt
+#endif
+#endif
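+
+// A hedged caller-side sketch for the capture forms above (loc and gtid are
+// supplied by the compiler; variable names here are illustrative only):
+//
+//   kmp_int32 x = 5, v;
+//   /* atomic capture:  { x = max(x, 9); v = x; } */
+//   v = __kmpc_atomic_fixed4_max_cpt( &loc, gtid, &x, 9, 1 );  /* v == 9 */
+//   /* with flag == 0 the same call would capture the old value, v == 5 */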
+
+// ------------------------------------------------------------------------
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_EQV_CPT(OP,FLAG)                                 \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL_CPT( OP, 0 );                                         \
+    }
+#else
+#define OP_GOMP_CRITICAL_EQV_CPT(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+// ------------------------------------------------------------------------
+#define ATOMIC_CMPX_EQV_CPT(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG)         \
+ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,TYPE)                                 \
+    TYPE new_value;                                                       \
+    OP_GOMP_CRITICAL_EQV_CPT(^=~,GOMP_FLAG)  /* send assignment */        \
+    OP_CMPXCHG_CPT(TYPE,BITS,OP)                                          \
+}
+
+// ------------------------------------------------------------------------
+
+ATOMIC_CMPXCHG_CPT(  fixed1, neqv_cpt, kmp_int8,   8,   ^, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_neqv_cpt
+ATOMIC_CMPXCHG_CPT(  fixed2, neqv_cpt, kmp_int16, 16,   ^, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_neqv_cpt
+ATOMIC_CMPXCHG_CPT(  fixed4, neqv_cpt, kmp_int32, 32,   ^, KMP_ARCH_X86 ) // __kmpc_atomic_fixed4_neqv_cpt
+ATOMIC_CMPXCHG_CPT(  fixed8, neqv_cpt, kmp_int64, 64,   ^, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_neqv_cpt
+ATOMIC_CMPX_EQV_CPT( fixed1, eqv_cpt,  kmp_int8,   8,  ^~, KMP_ARCH_X86 ) // __kmpc_atomic_fixed1_eqv_cpt
+ATOMIC_CMPX_EQV_CPT( fixed2, eqv_cpt,  kmp_int16, 16,  ^~, KMP_ARCH_X86 ) // __kmpc_atomic_fixed2_eqv_cpt
+ATOMIC_CMPX_EQV_CPT( fixed4, eqv_cpt,  kmp_int32, 32,  ^~, KMP_ARCH_X86 ) // __kmpc_atomic_fixed4_eqv_cpt
+ATOMIC_CMPX_EQV_CPT( fixed8, eqv_cpt,  kmp_int64, 64,  ^~, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_eqv_cpt
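+
+// Note on the operators above: NEQV maps directly onto bitwise XOR, and EQV
+// is its complement (XNOR). The '^~' spelling relies on the identity
+// a ^ ~b == ~(a ^ b). A self-contained check (illustrative only):
+//
+//   #include <assert.h>
+//   int main( void ) {
+//       int a = 0x5A, b = 0x3C;
+//       assert( ( a ^ ~b ) == ~( a ^ b ) );   /* XNOR identity */
+//       return 0;
+//   }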
+
+// ------------------------------------------------------------------------
+// Routines for Extended types: long double, _Quad, complex flavours (use critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP      - operator
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+#define ATOMIC_CRITICAL_CPT(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG) \
+ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,TYPE)                           \
+    TYPE new_value;                                                 \
+    OP_GOMP_CRITICAL_CPT(OP,GOMP_FLAG)  /* send assignment */       \
+    OP_CRITICAL_CPT(OP##=,LCK_ID)          /* send assignment */    \
+}
+
+// ------------------------------------------------------------------------
+
+// Workaround for cmplx4. Regular routines with a return value don't work
+// on Win_32e, so the captured value is returned through an additional parameter.
+#define OP_CRITICAL_CPT_WRK(OP,LCK_ID)                                    \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+                                                                          \
+    if( flag ) {                                                          \
+        (*lhs) OP rhs;                                                    \
+        (*out) = (*lhs);                                                  \
+    } else {                                                              \
+        (*out) = (*lhs);                                                  \
+        (*lhs) OP rhs;                                                    \
+    }                                                                     \
+                                                                          \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+    return;
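+
+// The 'flag' argument selects which value is captured: nonzero captures the
+// value of *lhs after the update ( { x OP= rhs; v = x; } ), zero captures the
+// value before it ( { v = x; x OP= rhs; } ). Both branches run while the lock
+// is held, so the capture and the update are observed as a single atomic step.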
+// ------------------------------------------------------------------------
+
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_CPT_WRK(OP,FLAG)                                 \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL_CPT_WRK( OP##=, 0 );                                  \
+    }
+#else
+#define OP_GOMP_CRITICAL_CPT_WRK(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+// ------------------------------------------------------------------------
+
+#define ATOMIC_BEGIN_WRK(TYPE_ID,OP_ID,TYPE)                              \
+void __kmpc_atomic_##TYPE_ID##_##OP_ID( ident_t *id_ref, int gtid, TYPE * lhs, TYPE rhs, TYPE * out, int flag ) \
+{                                                                         \
+    KMP_DEBUG_ASSERT( __kmp_init_serial );                                \
+    KA_TRACE(100,("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid ));
+// ------------------------------------------------------------------------
+
+#define ATOMIC_CRITICAL_CPT_WRK(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG)   \
+ATOMIC_BEGIN_WRK(TYPE_ID,OP_ID,TYPE)                                      \
+    OP_GOMP_CRITICAL_CPT_WRK(OP,GOMP_FLAG)                                \
+    OP_CRITICAL_CPT_WRK(OP##=,LCK_ID)                                     \
+}
+// The end of workaround for cmplx4
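+
+// Hedged usage sketch for the '_wrk' entry points generated above (the 'out'
+// parameter stands in for the return value; names are illustrative only):
+//
+//   kmp_cmplx32 x, v;
+//   /* atomically: { x = x + rhs; v = x; } */
+//   __kmpc_atomic_cmplx4_add_cpt( &loc, gtid, &x, rhs, &v, 1 );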
+
+/* ------------------------------------------------------------------------- */
+// routines for long double type
+ATOMIC_CRITICAL_CPT( float10, add_cpt, long double,     +, 10r,   1 )            // __kmpc_atomic_float10_add_cpt
+ATOMIC_CRITICAL_CPT( float10, sub_cpt, long double,     -, 10r,   1 )            // __kmpc_atomic_float10_sub_cpt
+ATOMIC_CRITICAL_CPT( float10, mul_cpt, long double,     *, 10r,   1 )            // __kmpc_atomic_float10_mul_cpt
+ATOMIC_CRITICAL_CPT( float10, div_cpt, long double,     /, 10r,   1 )            // __kmpc_atomic_float10_div_cpt
+#if KMP_HAVE_QUAD
+// routines for _Quad type
+ATOMIC_CRITICAL_CPT( float16, add_cpt, QUAD_LEGACY,     +, 16r,   1 )            // __kmpc_atomic_float16_add_cpt
+ATOMIC_CRITICAL_CPT( float16, sub_cpt, QUAD_LEGACY,     -, 16r,   1 )            // __kmpc_atomic_float16_sub_cpt
+ATOMIC_CRITICAL_CPT( float16, mul_cpt, QUAD_LEGACY,     *, 16r,   1 )            // __kmpc_atomic_float16_mul_cpt
+ATOMIC_CRITICAL_CPT( float16, div_cpt, QUAD_LEGACY,     /, 16r,   1 )            // __kmpc_atomic_float16_div_cpt
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CRITICAL_CPT( float16, add_a16_cpt, Quad_a16_t, +, 16r,  1 )          // __kmpc_atomic_float16_add_a16_cpt
+    ATOMIC_CRITICAL_CPT( float16, sub_a16_cpt, Quad_a16_t, -, 16r,  1 )          // __kmpc_atomic_float16_sub_a16_cpt
+    ATOMIC_CRITICAL_CPT( float16, mul_a16_cpt, Quad_a16_t, *, 16r,  1 )          // __kmpc_atomic_float16_mul_a16_cpt
+    ATOMIC_CRITICAL_CPT( float16, div_a16_cpt, Quad_a16_t, /, 16r,  1 )          // __kmpc_atomic_float16_div_a16_cpt
+#endif
+#endif
+
+// routines for complex types
+
+// cmplx4 routines to return void
+ATOMIC_CRITICAL_CPT_WRK( cmplx4,  add_cpt, kmp_cmplx32, +, 8c,    1 )            // __kmpc_atomic_cmplx4_add_cpt
+ATOMIC_CRITICAL_CPT_WRK( cmplx4,  sub_cpt, kmp_cmplx32, -, 8c,    1 )            // __kmpc_atomic_cmplx4_sub_cpt
+ATOMIC_CRITICAL_CPT_WRK( cmplx4,  mul_cpt, kmp_cmplx32, *, 8c,    1 )            // __kmpc_atomic_cmplx4_mul_cpt
+ATOMIC_CRITICAL_CPT_WRK( cmplx4,  div_cpt, kmp_cmplx32, /, 8c,    1 )            // __kmpc_atomic_cmplx4_div_cpt
+
+ATOMIC_CRITICAL_CPT( cmplx8,  add_cpt, kmp_cmplx64, +, 16c,   1 )            // __kmpc_atomic_cmplx8_add_cpt
+ATOMIC_CRITICAL_CPT( cmplx8,  sub_cpt, kmp_cmplx64, -, 16c,   1 )            // __kmpc_atomic_cmplx8_sub_cpt
+ATOMIC_CRITICAL_CPT( cmplx8,  mul_cpt, kmp_cmplx64, *, 16c,   1 )            // __kmpc_atomic_cmplx8_mul_cpt
+ATOMIC_CRITICAL_CPT( cmplx8,  div_cpt, kmp_cmplx64, /, 16c,   1 )            // __kmpc_atomic_cmplx8_div_cpt
+ATOMIC_CRITICAL_CPT( cmplx10, add_cpt, kmp_cmplx80, +, 20c,   1 )            // __kmpc_atomic_cmplx10_add_cpt
+ATOMIC_CRITICAL_CPT( cmplx10, sub_cpt, kmp_cmplx80, -, 20c,   1 )            // __kmpc_atomic_cmplx10_sub_cpt
+ATOMIC_CRITICAL_CPT( cmplx10, mul_cpt, kmp_cmplx80, *, 20c,   1 )            // __kmpc_atomic_cmplx10_mul_cpt
+ATOMIC_CRITICAL_CPT( cmplx10, div_cpt, kmp_cmplx80, /, 20c,   1 )            // __kmpc_atomic_cmplx10_div_cpt
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_CPT( cmplx16, add_cpt, CPLX128_LEG, +, 32c,   1 )            // __kmpc_atomic_cmplx16_add_cpt
+ATOMIC_CRITICAL_CPT( cmplx16, sub_cpt, CPLX128_LEG, -, 32c,   1 )            // __kmpc_atomic_cmplx16_sub_cpt
+ATOMIC_CRITICAL_CPT( cmplx16, mul_cpt, CPLX128_LEG, *, 32c,   1 )            // __kmpc_atomic_cmplx16_mul_cpt
+ATOMIC_CRITICAL_CPT( cmplx16, div_cpt, CPLX128_LEG, /, 32c,   1 )            // __kmpc_atomic_cmplx16_div_cpt
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CRITICAL_CPT( cmplx16, add_a16_cpt, kmp_cmplx128_a16_t, +, 32c,   1 )   // __kmpc_atomic_cmplx16_add_a16_cpt
+    ATOMIC_CRITICAL_CPT( cmplx16, sub_a16_cpt, kmp_cmplx128_a16_t, -, 32c,   1 )   // __kmpc_atomic_cmplx16_sub_a16_cpt
+    ATOMIC_CRITICAL_CPT( cmplx16, mul_a16_cpt, kmp_cmplx128_a16_t, *, 32c,   1 )   // __kmpc_atomic_cmplx16_mul_a16_cpt
+    ATOMIC_CRITICAL_CPT( cmplx16, div_a16_cpt, kmp_cmplx128_a16_t, /, 32c,   1 )   // __kmpc_atomic_cmplx16_div_a16_cpt
+#endif
+#endif
+
+#if OMP_40_ENABLED
+
+// OpenMP 4.0 capture-reverse: x = expr binop x, captured either as
+// { v = x; x = expr binop x; } or { x = expr binop x; v = x; } -- needed only for non-commutative operations.
+// Supported only on IA-32 architecture and Intel(R) 64
+
+// -------------------------------------------------------------------------
+// Operation on *lhs, rhs bound by critical section
+//     OP     - operator (it's supposed to contain an assignment)
+//     LCK_ID - lock identifier
+// Note: don't check gtid as it should always be valid
+// For 1- and 2-byte operands the parameters are expected to be valid; for other sizes, check before invoking this macro
+#define OP_CRITICAL_CPT_REV(OP,LCK_ID)                                    \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+                                                                          \
+    if( flag ) {                                                          \
+        (*lhs) = (rhs) OP (*lhs);                                         \
+        new_value = (*lhs);                                               \
+    } else {                                                              \
+        new_value = (*lhs);                                               \
+        (*lhs) = (rhs) OP (*lhs);                                         \
+    }                                                                     \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+    return new_value;
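+
+// The '_rev' forms compute  x = rhs OP x  instead of  x = x OP rhs, which
+// only matters for non-commutative OPs. For example, with OP '-' the two
+// capture shapes are (illustrative):
+//
+//   /* flag != 0:  { x = rhs - x; v = x; }   -- capture the new value */
+//   /* flag == 0:  { v = x; x = rhs - x; }   -- capture the old value */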
+
+// ------------------------------------------------------------------------
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_CPT_REV(OP,FLAG)                                 \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL_CPT_REV( OP, 0 );                                     \
+    }
+#else
+#define OP_GOMP_CRITICAL_CPT_REV(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// ------------------------------------------------------------------------
+// Operation on *lhs, rhs using "compare_and_store" routine
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator
+// Note: temp_val introduced in order to force the compiler to read
+//       *lhs only once (w/o it the compiler reads *lhs twice)
+#define OP_CMPXCHG_CPT_REV(TYPE,BITS,OP)                                  \
+    {                                                                     \
+        TYPE KMP_ATOMIC_VOLATILE temp_val;                                \
+        TYPE old_value, new_value;                                        \
+        temp_val = *lhs;                                                  \
+        old_value = temp_val;                                             \
+        new_value = rhs OP old_value;                                     \
+        while ( ! KMP_COMPARE_AND_STORE_ACQ##BITS( (kmp_int##BITS *) lhs, \
+                      *VOLATILE_CAST(kmp_int##BITS *) &old_value,         \
+                      *VOLATILE_CAST(kmp_int##BITS *) &new_value ) )      \
+        {                                                                 \
+            KMP_CPU_PAUSE();                                              \
+                                                                          \
+            temp_val = *lhs;                                              \
+            old_value = temp_val;                                         \
+            new_value = rhs OP old_value;                                 \
+        }                                                                 \
+        if( flag )                                                        \
+            return new_value;                                             \
+        else                                                              \
+            return old_value;                                             \
+    }
+
+// -------------------------------------------------------------------------
+#define ATOMIC_CMPXCHG_CPT_REV(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG)       \
+ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,TYPE)                                  \
+    TYPE new_value;                                                        \
+    OP_GOMP_CRITICAL_CPT_REV(OP,GOMP_FLAG)                                 \
+    OP_CMPXCHG_CPT_REV(TYPE,BITS,OP)                                       \
+}
+
+
+ATOMIC_CMPXCHG_CPT_REV( fixed1,  div_cpt_rev, kmp_int8,    8, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed1u, div_cpt_rev, kmp_uint8,   8, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed1u_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed1,  shl_cpt_rev, kmp_int8,    8, <<, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_shl_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed1,  shr_cpt_rev, kmp_int8,    8, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed1u, shr_cpt_rev, kmp_uint8,   8, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1u_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed1,  sub_cpt_rev, kmp_int8,    8, -,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_sub_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed2,  div_cpt_rev, kmp_int16,  16, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed2u, div_cpt_rev, kmp_uint16, 16, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed2u_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed2,  shl_cpt_rev, kmp_int16,  16, <<, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_shl_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed2,  shr_cpt_rev, kmp_int16,  16, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed2u, shr_cpt_rev, kmp_uint16, 16, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2u_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed2,  sub_cpt_rev, kmp_int16,  16, -,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_sub_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed4,  div_cpt_rev, kmp_int32,  32, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed4u, div_cpt_rev, kmp_uint32, 32, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed4u_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed4,  shl_cpt_rev, kmp_int32,  32, <<, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_shl_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed4,  shr_cpt_rev, kmp_int32,  32, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed4u, shr_cpt_rev, kmp_uint32, 32, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4u_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed4,  sub_cpt_rev, kmp_int32,  32, -,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_sub_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed8,  div_cpt_rev, kmp_int64,  64, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed8u, div_cpt_rev, kmp_uint64, 64, /,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed8u_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed8,  shl_cpt_rev, kmp_int64,  64, <<, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_shl_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed8,  shr_cpt_rev, kmp_int64,  64, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed8u, shr_cpt_rev, kmp_uint64, 64, >>, KMP_ARCH_X86 )  // __kmpc_atomic_fixed8u_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( fixed8,  sub_cpt_rev, kmp_int64,  64, -,  KMP_ARCH_X86 )  // __kmpc_atomic_fixed8_sub_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( float4,  div_cpt_rev, kmp_real32, 32, /,  KMP_ARCH_X86 )  // __kmpc_atomic_float4_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( float4,  sub_cpt_rev, kmp_real32, 32, -,  KMP_ARCH_X86 )  // __kmpc_atomic_float4_sub_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( float8,  div_cpt_rev, kmp_real64, 64, /,  KMP_ARCH_X86 )  // __kmpc_atomic_float8_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV( float8,  sub_cpt_rev, kmp_real64, 64, -,  KMP_ARCH_X86 )  // __kmpc_atomic_float8_sub_cpt_rev
+
+// ------------------------------------------------------------------------
+// Routines for Extended types: long double, _Quad, complex flavours (use critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP      - operator
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+#define ATOMIC_CRITICAL_CPT_REV(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG) \
+ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,TYPE)                               \
+    TYPE new_value;                                                     \
+    OP_GOMP_CRITICAL_CPT_REV(OP,GOMP_FLAG)                              \
+    OP_CRITICAL_CPT_REV(OP,LCK_ID)                                      \
+}
+
+
+/* ------------------------------------------------------------------------- */
+// routines for long double type
+ATOMIC_CRITICAL_CPT_REV( float10, sub_cpt_rev, long double,     -, 10r,   1 )            // __kmpc_atomic_float10_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV( float10, div_cpt_rev, long double,     /, 10r,   1 )            // __kmpc_atomic_float10_div_cpt_rev
+#if KMP_HAVE_QUAD
+// routines for _Quad type
+ATOMIC_CRITICAL_CPT_REV( float16, sub_cpt_rev, QUAD_LEGACY,     -, 16r,   1 )            // __kmpc_atomic_float16_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV( float16, div_cpt_rev, QUAD_LEGACY,     /, 16r,   1 )            // __kmpc_atomic_float16_div_cpt_rev
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CRITICAL_CPT_REV( float16, sub_a16_cpt_rev, Quad_a16_t, -, 16r,  1 )          // __kmpc_atomic_float16_sub_a16_cpt_rev
+    ATOMIC_CRITICAL_CPT_REV( float16, div_a16_cpt_rev, Quad_a16_t, /, 16r,  1 )          // __kmpc_atomic_float16_div_a16_cpt_rev
+#endif
+#endif
+
+// routines for complex types
+
+// ------------------------------------------------------------------------
+
+// Workaround for cmplx4. Regular routines with a return value don't work
+// on Win_32e, so the captured value is returned through an additional parameter.
+#define OP_CRITICAL_CPT_REV_WRK(OP,LCK_ID)                                \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+                                                                          \
+    if( flag ) {                                                          \
+        (*lhs) = (rhs) OP (*lhs);                                         \
+        (*out) = (*lhs);                                                  \
+    } else {                                                              \
+        (*out) = (*lhs);                                                  \
+        (*lhs) = (rhs) OP (*lhs);                                         \
+    }                                                                     \
+                                                                          \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+    return;
+// ------------------------------------------------------------------------
+
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_CPT_REV_WRK(OP,FLAG)                             \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        OP_CRITICAL_CPT_REV_WRK( OP, 0 );                                 \
+    }
+#else
+#define OP_GOMP_CRITICAL_CPT_REV_WRK(OP,FLAG)
+#endif /* KMP_GOMP_COMPAT */
+// ------------------------------------------------------------------------
+
+#define ATOMIC_CRITICAL_CPT_REV_WRK(TYPE_ID,OP_ID,TYPE,OP,LCK_ID,GOMP_FLAG)   \
+ATOMIC_BEGIN_WRK(TYPE_ID,OP_ID,TYPE)                                          \
+    OP_GOMP_CRITICAL_CPT_REV_WRK(OP,GOMP_FLAG)                                \
+    OP_CRITICAL_CPT_REV_WRK(OP,LCK_ID)                                        \
+}
+// The end of workaround for cmplx4
+
+
+// !!! TODO: check if we need to return void for cmplx4 routines
+// cmplx4 routines to return void
+ATOMIC_CRITICAL_CPT_REV_WRK( cmplx4,  sub_cpt_rev, kmp_cmplx32, -, 8c,    1 )            // __kmpc_atomic_cmplx4_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV_WRK( cmplx4,  div_cpt_rev, kmp_cmplx32, /, 8c,    1 )            // __kmpc_atomic_cmplx4_div_cpt_rev
+
+ATOMIC_CRITICAL_CPT_REV( cmplx8,  sub_cpt_rev, kmp_cmplx64, -, 16c,   1 )            // __kmpc_atomic_cmplx8_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV( cmplx8,  div_cpt_rev, kmp_cmplx64, /, 16c,   1 )            // __kmpc_atomic_cmplx8_div_cpt_rev
+ATOMIC_CRITICAL_CPT_REV( cmplx10, sub_cpt_rev, kmp_cmplx80, -, 20c,   1 )            // __kmpc_atomic_cmplx10_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV( cmplx10, div_cpt_rev, kmp_cmplx80, /, 20c,   1 )            // __kmpc_atomic_cmplx10_div_cpt_rev
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_CPT_REV( cmplx16, sub_cpt_rev, CPLX128_LEG, -, 32c,   1 )            // __kmpc_atomic_cmplx16_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV( cmplx16, div_cpt_rev, CPLX128_LEG, /, 32c,   1 )            // __kmpc_atomic_cmplx16_div_cpt_rev
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CRITICAL_CPT_REV( cmplx16, sub_a16_cpt_rev, kmp_cmplx128_a16_t, -, 32c,   1 )   // __kmpc_atomic_cmplx16_sub_a16_cpt_rev
+    ATOMIC_CRITICAL_CPT_REV( cmplx16, div_a16_cpt_rev, kmp_cmplx128_a16_t, /, 32c,   1 )   // __kmpc_atomic_cmplx16_div_a16_cpt_rev
+#endif
+#endif
+
+//   OpenMP 4.0 Capture-write (swap): {v = x; x = expr;}
+
+#define ATOMIC_BEGIN_SWP(TYPE_ID,TYPE)                                                    \
+TYPE __kmpc_atomic_##TYPE_ID##_swp( ident_t *id_ref, int gtid, TYPE * lhs, TYPE rhs )     \
+{                                                                                         \
+    KMP_DEBUG_ASSERT( __kmp_init_serial );                                                \
+    KA_TRACE(100,("__kmpc_atomic_" #TYPE_ID "_swp: T#%d\n", gtid ));
+
+#define CRITICAL_SWP(LCK_ID)                                              \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+                                                                          \
+    old_value = (*lhs);                                                   \
+    (*lhs) = rhs;                                                         \
+                                                                          \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+    return old_value;
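+
+// Capture-write implements  { v = x; x = rhs; }  in one atomic step and
+// returns the old value. A hedged caller-side sketch (names illustrative):
+//
+//   long double x = 1.0L, v;
+//   v = __kmpc_atomic_float10_swp( &loc, gtid, &x, 2.0L );
+//   /* now v == 1.0L and x == 2.0L */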
+
+// ------------------------------------------------------------------------
+#ifdef KMP_GOMP_COMPAT
+#define GOMP_CRITICAL_SWP(FLAG)                                           \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        CRITICAL_SWP( 0 );                                                \
+    }
+#else
+#define GOMP_CRITICAL_SWP(FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+
+#define ATOMIC_XCHG_SWP(TYPE_ID,TYPE,BITS,GOMP_FLAG)                      \
+ATOMIC_BEGIN_SWP(TYPE_ID,TYPE)                                            \
+    TYPE old_value;                                                       \
+    GOMP_CRITICAL_SWP(GOMP_FLAG)                                          \
+    old_value = KMP_XCHG_FIXED##BITS( lhs, rhs );                         \
+    return old_value;                                                     \
+}
+// ------------------------------------------------------------------------
+#define ATOMIC_XCHG_FLOAT_SWP(TYPE_ID,TYPE,BITS,GOMP_FLAG)                \
+ATOMIC_BEGIN_SWP(TYPE_ID,TYPE)                                            \
+    TYPE old_value;                                                       \
+    GOMP_CRITICAL_SWP(GOMP_FLAG)                                          \
+    old_value = KMP_XCHG_REAL##BITS( lhs, rhs );                          \
+    return old_value;                                                     \
+}
+
+// ------------------------------------------------------------------------
+#define CMPXCHG_SWP(TYPE,BITS)                                            \
+    {                                                                     \
+        TYPE KMP_ATOMIC_VOLATILE temp_val;                                \
+        TYPE old_value, new_value;                                        \
+        temp_val = *lhs;                                                  \
+        old_value = temp_val;                                             \
+        new_value = rhs;                                                  \
+        while ( ! KMP_COMPARE_AND_STORE_ACQ##BITS( (kmp_int##BITS *) lhs, \
+                      *VOLATILE_CAST(kmp_int##BITS *) &old_value,         \
+                      *VOLATILE_CAST(kmp_int##BITS *) &new_value ) )      \
+        {                                                                 \
+            KMP_CPU_PAUSE();                                              \
+                                                                          \
+            temp_val = *lhs;                                              \
+            old_value = temp_val;                                         \
+            new_value = rhs;                                              \
+        }                                                                 \
+        return old_value;                                                 \
+    }
+
+// -------------------------------------------------------------------------
+#define ATOMIC_CMPXCHG_SWP(TYPE_ID,TYPE,BITS,GOMP_FLAG)                   \
+ATOMIC_BEGIN_SWP(TYPE_ID,TYPE)                                            \
+    TYPE old_value;                                                       \
+    GOMP_CRITICAL_SWP(GOMP_FLAG)                                          \
+    CMPXCHG_SWP(TYPE,BITS)                                                \
+}
+
+ATOMIC_XCHG_SWP( fixed1, kmp_int8,    8, KMP_ARCH_X86 )  // __kmpc_atomic_fixed1_swp
+ATOMIC_XCHG_SWP( fixed2, kmp_int16,  16, KMP_ARCH_X86 )  // __kmpc_atomic_fixed2_swp
+ATOMIC_XCHG_SWP( fixed4, kmp_int32,  32, KMP_ARCH_X86 )  // __kmpc_atomic_fixed4_swp
+
+ATOMIC_XCHG_FLOAT_SWP( float4, kmp_real32, 32, KMP_ARCH_X86 )      // __kmpc_atomic_float4_swp
+
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CMPXCHG_SWP( fixed8, kmp_int64, 64, KMP_ARCH_X86 )      // __kmpc_atomic_fixed8_swp
+    ATOMIC_CMPXCHG_SWP( float8, kmp_real64, 64, KMP_ARCH_X86 )     // __kmpc_atomic_float8_swp
+#else
+    ATOMIC_XCHG_SWP(       fixed8, kmp_int64, 64, KMP_ARCH_X86 )   // __kmpc_atomic_fixed8_swp
+    ATOMIC_XCHG_FLOAT_SWP( float8, kmp_real64, 64, KMP_ARCH_X86 )  // __kmpc_atomic_float8_swp
+#endif
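+
+// Note: 32-bit IA-32 has no 8-byte atomic exchange instruction, so the
+// 64-bit swaps above fall back to a compare-and-store (cmpxchg8b) loop;
+// on the other targets a plain atomic exchange suffices.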
+
+// ------------------------------------------------------------------------
+// Routines for Extended types: long double, _Quad, complex flavours (use critical section)
+#define ATOMIC_CRITICAL_SWP(TYPE_ID,TYPE,LCK_ID,GOMP_FLAG)              \
+ATOMIC_BEGIN_SWP(TYPE_ID,TYPE)                                          \
+    TYPE old_value;                                                     \
+    GOMP_CRITICAL_SWP(GOMP_FLAG)                                        \
+    CRITICAL_SWP(LCK_ID)                                                \
+}
+
+// ------------------------------------------------------------------------
+
+// !!! TODO: check if we need to return void for cmplx4 routines
+// Workaround for cmplx4. Regular routines with return value don't work
+// on Win_32e. Let's return captured values through the additional parameter.
+
+#define ATOMIC_BEGIN_SWP_WRK(TYPE_ID,TYPE)                                                \
+void __kmpc_atomic_##TYPE_ID##_swp( ident_t *id_ref, int gtid, TYPE * lhs, TYPE rhs, TYPE * out )     \
+{                                                                                         \
+    KMP_DEBUG_ASSERT( __kmp_init_serial );                                                \
+    KA_TRACE(100,("__kmpc_atomic_" #TYPE_ID "_swp: T#%d\n", gtid ));
+
+
+#define CRITICAL_SWP_WRK(LCK_ID)                                          \
+    __kmp_acquire_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+                                                                          \
+    tmp = (*lhs);                                                         \
+    (*lhs) = (rhs);                                                       \
+    (*out) = tmp;                                                         \
+    __kmp_release_atomic_lock( & ATOMIC_LOCK##LCK_ID, gtid );             \
+    return;
+
+// ------------------------------------------------------------------------
+
+#ifdef KMP_GOMP_COMPAT
+#define GOMP_CRITICAL_SWP_WRK(FLAG)                                       \
+    if ( (FLAG) && (__kmp_atomic_mode == 2) ) {                           \
+        KMP_CHECK_GTID;                                                   \
+        CRITICAL_SWP_WRK( 0 );                                            \
+    }
+#else
+#define GOMP_CRITICAL_SWP_WRK(FLAG)
+#endif /* KMP_GOMP_COMPAT */
+// ------------------------------------------------------------------------
+
+#define ATOMIC_CRITICAL_SWP_WRK(TYPE_ID, TYPE,LCK_ID,GOMP_FLAG)           \
+ATOMIC_BEGIN_SWP_WRK(TYPE_ID,TYPE)                                        \
+    TYPE tmp;                                                             \
+    GOMP_CRITICAL_SWP_WRK(GOMP_FLAG)                                      \
+    CRITICAL_SWP_WRK(LCK_ID)                                              \
+}
+// The end of workaround for cmplx4
+
+
+ATOMIC_CRITICAL_SWP( float10, long double, 10r,   1 )              // __kmpc_atomic_float10_swp
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_SWP( float16, QUAD_LEGACY, 16r,   1 )              // __kmpc_atomic_float16_swp
+#endif
+// cmplx4 routine to return void
+ATOMIC_CRITICAL_SWP_WRK( cmplx4, kmp_cmplx32,  8c,   1 )           // __kmpc_atomic_cmplx4_swp
+
+ATOMIC_CRITICAL_SWP( cmplx8,  kmp_cmplx64, 16c,   1 )              // __kmpc_atomic_cmplx8_swp
+ATOMIC_CRITICAL_SWP( cmplx10, kmp_cmplx80, 20c,   1 )              // __kmpc_atomic_cmplx10_swp
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_SWP( cmplx16, CPLX128_LEG, 32c,   1 )              // __kmpc_atomic_cmplx16_swp
+#if ( KMP_ARCH_X86 )
+    ATOMIC_CRITICAL_SWP( float16_a16, Quad_a16_t,         16r, 1 )  // __kmpc_atomic_float16_a16_swp
+    ATOMIC_CRITICAL_SWP( cmplx16_a16, kmp_cmplx128_a16_t, 32c, 1 )  // __kmpc_atomic_cmplx16_a16_swp
+#endif
+#endif
+
+
+// End of OpenMP 4.0 Capture
+
+#endif //OMP_40_ENABLED
+
+#endif //KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+
+#undef OP_CRITICAL
+
+/* ------------------------------------------------------------------------ */
+/* Generic atomic routines                                                  */
+/* ------------------------------------------------------------------------ */
+
+void
+__kmpc_atomic_1( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    if (
+#if KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
+        FALSE                                   /* must use lock */
+#else
+        TRUE
+#endif
+	)
+    {
+	kmp_int8 old_value, new_value;
+
+	old_value = *(kmp_int8 *) lhs;
+	(*f)( &new_value, &old_value, rhs );
+
+	/* TODO: Should this be acquire or release? */
+	while ( !  KMP_COMPARE_AND_STORE_ACQ8 ( (kmp_int8 *) lhs,
+		    		*(kmp_int8 *) &old_value, *(kmp_int8 *) &new_value ) )
+	{
+	    KMP_CPU_PAUSE();
+
+	    old_value = *(kmp_int8 *) lhs;
+	    (*f)( &new_value, &old_value, rhs );
+	}
+
+	return;
+    }
+    else {
+        //
+        // All 1-byte data is of integer data type.
+        //
+
+#ifdef KMP_GOMP_COMPAT
+        if ( __kmp_atomic_mode == 2 ) {
+            __kmp_acquire_atomic_lock( & __kmp_atomic_lock, gtid );
+        }
+        else
+#endif /* KMP_GOMP_COMPAT */
+	__kmp_acquire_atomic_lock( & __kmp_atomic_lock_1i, gtid );
+
+	(*f)( lhs, lhs, rhs );
+
+#ifdef KMP_GOMP_COMPAT
+        if ( __kmp_atomic_mode == 2 ) {
+            __kmp_release_atomic_lock( & __kmp_atomic_lock, gtid );
+        }
+        else
+#endif /* KMP_GOMP_COMPAT */
+	__kmp_release_atomic_lock( & __kmp_atomic_lock_1i, gtid );
+    }
+}
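+
+// Hedged usage sketch for the generic entry points: the compiler outlines
+// the operation into a helper f(dest, op1, op2) computing *dest = *op1 OP *op2,
+// then passes it in (names below are illustrative only):
+//
+//   static void add_char( void *dest, void *op1, void *op2 ) {
+//       *(char *) dest = *(char *) op1 + *(char *) op2;
+//   }
+//
+//   char x = 1, y = 2;
+//   __kmpc_atomic_1( &loc, gtid, &x, &y, add_char );   /* x becomes 3 */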
+
+void
+__kmpc_atomic_2( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    if (
+#if KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
+        FALSE                                   /* must use lock */
+#elif KMP_ARCH_X86 || KMP_ARCH_X86_64
+	TRUE					/* no alignment problems */
+#else
+	! ( (kmp_uintptr_t) lhs & 0x1)		/* make sure address is 2-byte aligned */
+#endif
+	)
+    {
+	kmp_int16 old_value, new_value;
+
+	old_value = *(kmp_int16 *) lhs;
+	(*f)( &new_value, &old_value, rhs );
+
+	/* TODO: Should this be acquire or release? */
+	while ( !  KMP_COMPARE_AND_STORE_ACQ16 ( (kmp_int16 *) lhs,
+		    		*(kmp_int16 *) &old_value, *(kmp_int16 *) &new_value ) )
+	{
+	    KMP_CPU_PAUSE();
+
+	    old_value = *(kmp_int16 *) lhs;
+	    (*f)( &new_value, &old_value, rhs );
+	}
+
+	return;
+    }
+    else {
+        //
+        // All 2-byte data is of integer data type.
+        //
+
+#ifdef KMP_GOMP_COMPAT
+        if ( __kmp_atomic_mode == 2 ) {
+            __kmp_acquire_atomic_lock( & __kmp_atomic_lock, gtid );
+        }
+        else
+#endif /* KMP_GOMP_COMPAT */
+	__kmp_acquire_atomic_lock( & __kmp_atomic_lock_2i, gtid );
+
+	(*f)( lhs, lhs, rhs );
+
+#ifdef KMP_GOMP_COMPAT
+        if ( __kmp_atomic_mode == 2 ) {
+            __kmp_release_atomic_lock( & __kmp_atomic_lock, gtid );
+        }
+        else
+#endif /* KMP_GOMP_COMPAT */
+	__kmp_release_atomic_lock( & __kmp_atomic_lock_2i, gtid );
+    }
+}
+
+void
+__kmpc_atomic_4( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    if (
+        //
+        // FIXME: On IA-32 architecture, gcc uses cmpxchg only for 4-byte ints.
+        // Gomp compatibility is broken if this routine is called for floats.
+        //
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+	TRUE					/* no alignment problems */
+#else
+	! ( (kmp_uintptr_t) lhs & 0x3)		/* make sure address is 4-byte aligned */
+#endif
+	)
+    {
+	kmp_int32 old_value, new_value;
+
+	old_value = *(kmp_int32 *) lhs;
+	(*f)( &new_value, &old_value, rhs );
+
+	/* TODO: Should this be acquire or release? */
+	while ( !  KMP_COMPARE_AND_STORE_ACQ32 ( (kmp_int32 *) lhs,
+		    		*(kmp_int32 *) &old_value, *(kmp_int32 *) &new_value ) )
+	{
+	    KMP_CPU_PAUSE();
+
+	    old_value = *(kmp_int32 *) lhs;
+	    (*f)( &new_value, &old_value, rhs );
+	}
+
+	return;
+    }
+    else {
+        //
+        // Use __kmp_atomic_lock_4i for all 4-byte data,
+        // even if it isn't of integer data type.
+        //
+
+#ifdef KMP_GOMP_COMPAT
+        if ( __kmp_atomic_mode == 2 ) {
+            __kmp_acquire_atomic_lock( & __kmp_atomic_lock, gtid );
+        }
+        else
+#endif /* KMP_GOMP_COMPAT */
+	__kmp_acquire_atomic_lock( & __kmp_atomic_lock_4i, gtid );
+
+	(*f)( lhs, lhs, rhs );
+
+#ifdef KMP_GOMP_COMPAT
+        if ( __kmp_atomic_mode == 2 ) {
+            __kmp_release_atomic_lock( & __kmp_atomic_lock, gtid );
+        }
+        else
+#endif /* KMP_GOMP_COMPAT */
+	__kmp_release_atomic_lock( & __kmp_atomic_lock_4i, gtid );
+    }
+}
+
+void
+__kmpc_atomic_8( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    if (
+#if KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
+        FALSE                                   /* must use lock */
+#elif KMP_ARCH_X86 || KMP_ARCH_X86_64
+	TRUE					/* no alignment problems */
+#else
+	! ( (kmp_uintptr_t) lhs & 0x7)		/* make sure address is 8-byte aligned */
+#endif
+	)
+    {
+	kmp_int64 old_value, new_value;
+
+	old_value = *(kmp_int64 *) lhs;
+	(*f)( &new_value, &old_value, rhs );
+	/* TODO: Should this be acquire or release? */
+	while ( !  KMP_COMPARE_AND_STORE_ACQ64 ( (kmp_int64 *) lhs,
+					       *(kmp_int64 *) &old_value,
+					       *(kmp_int64 *) &new_value ) )
+	{
+	    KMP_CPU_PAUSE();
+
+	    old_value = *(kmp_int64 *) lhs;
+	    (*f)( &new_value, &old_value, rhs );
+	}
+
+	return;
+    } else {
+        //
+        // Use __kmp_atomic_lock_8i for all 8-byte data,
+        // even if it isn't of integer data type.
+        //
+
+#ifdef KMP_GOMP_COMPAT
+        if ( __kmp_atomic_mode == 2 ) {
+            __kmp_acquire_atomic_lock( & __kmp_atomic_lock, gtid );
+        }
+        else
+#endif /* KMP_GOMP_COMPAT */
+	__kmp_acquire_atomic_lock( & __kmp_atomic_lock_8i, gtid );
+
+	(*f)( lhs, lhs, rhs );
+
+#ifdef KMP_GOMP_COMPAT
+        if ( __kmp_atomic_mode == 2 ) {
+            __kmp_release_atomic_lock( & __kmp_atomic_lock, gtid );
+        }
+        else
+#endif /* KMP_GOMP_COMPAT */
+	__kmp_release_atomic_lock( & __kmp_atomic_lock_8i, gtid );
+    }
+}
+
+void
+__kmpc_atomic_10( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+#ifdef KMP_GOMP_COMPAT
+    if ( __kmp_atomic_mode == 2 ) {
+        __kmp_acquire_atomic_lock( & __kmp_atomic_lock, gtid );
+    }
+    else
+#endif /* KMP_GOMP_COMPAT */
+    __kmp_acquire_atomic_lock( & __kmp_atomic_lock_10r, gtid );
+
+    (*f)( lhs, lhs, rhs );
+
+#ifdef KMP_GOMP_COMPAT
+    if ( __kmp_atomic_mode == 2 ) {
+        __kmp_release_atomic_lock( & __kmp_atomic_lock, gtid );
+    }
+    else
+#endif /* KMP_GOMP_COMPAT */
+    __kmp_release_atomic_lock( & __kmp_atomic_lock_10r, gtid );
+}
+
+void
+__kmpc_atomic_16( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+#ifdef KMP_GOMP_COMPAT
+    if ( __kmp_atomic_mode == 2 ) {
+        __kmp_acquire_atomic_lock( & __kmp_atomic_lock, gtid );
+    }
+    else
+#endif /* KMP_GOMP_COMPAT */
+    __kmp_acquire_atomic_lock( & __kmp_atomic_lock_16c, gtid );
+
+    (*f)( lhs, lhs, rhs );
+
+#ifdef KMP_GOMP_COMPAT
+    if ( __kmp_atomic_mode == 2 ) {
+        __kmp_release_atomic_lock( & __kmp_atomic_lock, gtid );
+    }
+    else
+#endif /* KMP_GOMP_COMPAT */
+    __kmp_release_atomic_lock( & __kmp_atomic_lock_16c, gtid );
+}
+
+void
+__kmpc_atomic_20( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+#ifdef KMP_GOMP_COMPAT
+    if ( __kmp_atomic_mode == 2 ) {
+        __kmp_acquire_atomic_lock( & __kmp_atomic_lock, gtid );
+    }
+    else
+#endif /* KMP_GOMP_COMPAT */
+    __kmp_acquire_atomic_lock( & __kmp_atomic_lock_20c, gtid );
+
+    (*f)( lhs, lhs, rhs );
+
+#ifdef KMP_GOMP_COMPAT
+    if ( __kmp_atomic_mode == 2 ) {
+        __kmp_release_atomic_lock( & __kmp_atomic_lock, gtid );
+    }
+    else
+#endif /* KMP_GOMP_COMPAT */
+    __kmp_release_atomic_lock( & __kmp_atomic_lock_20c, gtid );
+}
+
+void
+__kmpc_atomic_32( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+#ifdef KMP_GOMP_COMPAT
+    if ( __kmp_atomic_mode == 2 ) {
+        __kmp_acquire_atomic_lock( & __kmp_atomic_lock, gtid );
+    }
+    else
+#endif /* KMP_GOMP_COMPAT */
+    __kmp_acquire_atomic_lock( & __kmp_atomic_lock_32c, gtid );
+
+    (*f)( lhs, lhs, rhs );
+
+#ifdef KMP_GOMP_COMPAT
+    if ( __kmp_atomic_mode == 2 ) {
+        __kmp_release_atomic_lock( & __kmp_atomic_lock, gtid );
+    }
+    else
+#endif /* KMP_GOMP_COMPAT */
+    __kmp_release_atomic_lock( & __kmp_atomic_lock_32c, gtid );
+}
+
+// AC: same two routines as GOMP_atomic_start/end, but these will be called by our compiler;
+//     duplicated in order not to use third-party names in pure Intel code.
+// TODO: consider adding GTID parameter after consultation with Ernesto/Xinmin.
+void
+__kmpc_atomic_start(void)
+{
+    int gtid = __kmp_entry_gtid();
+    KA_TRACE(20, ("__kmpc_atomic_start: T#%d\n", gtid));
+    __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid);
+}
+
+
+void
+__kmpc_atomic_end(void)
+{
+    int gtid = __kmp_get_gtid();
+    KA_TRACE(20, ("__kmpc_atomic_end: T#%d\n", gtid));
+    __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid);
+}
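+
+// Hedged sketch of the pattern the compiler is expected to emit around an
+// atomic construct that has no specialized runtime entry point:
+//
+//   __kmpc_atomic_start();
+//   x = foo( x, y );          /* arbitrary update done under the global lock */
+//   __kmpc_atomic_end();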
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+/*!
+@}
+*/
+
+// end of file
diff --git a/final/runtime/src/kmp_atomic.h b/final/runtime/src/kmp_atomic.h
new file mode 100644
index 0000000..419ad08
--- /dev/null
+++ b/final/runtime/src/kmp_atomic.h
@@ -0,0 +1,1038 @@
+/*
+ * kmp_atomic.h - ATOMIC header file
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_ATOMIC_H
+#define KMP_ATOMIC_H
+
+#include "kmp_os.h"
+#include "kmp_lock.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+// C++ build port.
+// The Intel compiler does not support the _Complex datatype on win,
+// but does support it on lin and mac.
+// On the other hand, there is a stack-alignment problem on lin_32 and mac_32
+// if the rhs is a cmplx80 or cmplx128 typedef'ed datatype.
+// The decision is: use the compiler-supported _Complex type on lin and mac,
+//                  and the typedef'ed types on win.
+// The condition for WIN64 was modified in anticipation of the 10.1 build compiler.
+
+#if defined( __cplusplus ) && ( KMP_OS_WINDOWS )
+    // create shortcuts for c99 complex types
+
+    #if (_MSC_VER < 1600) && defined(_DEBUG)
+        // Workaround for the problem of the _DebugHeapTag unresolved external.
+        // This problem prevented us from using our static debug library for C tests
+        // compiled with the /MDd option (the library itself is built with /MTd).
+        #undef _DEBUG
+        #define _DEBUG_TEMPORARILY_UNSET_
+    #endif
+
+    #include <complex>
+
+    template< typename type_lhs, typename type_rhs >
+    std::complex< type_lhs > __kmp_lhs_div_rhs(
+                const std::complex< type_lhs >& lhs,
+                const std::complex< type_rhs >& rhs ) {
+        type_lhs a = lhs.real();
+        type_lhs b = lhs.imag();
+        type_rhs c = rhs.real();
+        type_rhs d = rhs.imag();
+        type_rhs den = c*c + d*d;
+        type_rhs r = ( a*c + b*d );
+        type_rhs i = ( b*c - a*d );
+        std::complex< type_lhs > ret( r/den, i/den );
+        return ret;
+    }
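+
+    // The helper above computes the textbook complex quotient
+    //   (a + b*i) / (c + d*i) = ( (a*c + b*d) + (b*c - a*d)*i ) / (c*c + d*d)
+    // in its naive form; for operands with extreme component magnitudes the
+    // intermediate c*c + d*d can over- or underflow.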
+
+    // complex8
+    struct __kmp_cmplx64_t : std::complex< double > {
+
+        __kmp_cmplx64_t() : std::complex< double > () {}
+
+        __kmp_cmplx64_t( const std::complex< double >& cd )
+            : std::complex< double > ( cd ) {}
+
+        void operator /= ( const __kmp_cmplx64_t& rhs ) {
+            std::complex< double > lhs = *this;
+            *this = __kmp_lhs_div_rhs( lhs, rhs );
+        }
+
+        __kmp_cmplx64_t operator / ( const __kmp_cmplx64_t& rhs ) {
+            std::complex< double > lhs = *this;
+            return __kmp_lhs_div_rhs( lhs, rhs );
+        }
+
+    };
+    typedef struct __kmp_cmplx64_t kmp_cmplx64;
+
+    // complex4
+    struct __kmp_cmplx32_t : std::complex< float > {
+
+        __kmp_cmplx32_t() : std::complex< float > () {}
+
+        __kmp_cmplx32_t( const std::complex<float>& cf )
+            : std::complex< float > ( cf ) {}
+
+        __kmp_cmplx32_t operator + ( const __kmp_cmplx32_t& b ) {
+            std::complex< float > lhs = *this;
+            std::complex< float > rhs = b;
+            return ( lhs + rhs );
+        }
+        __kmp_cmplx32_t operator - ( const __kmp_cmplx32_t& b ) {
+            std::complex< float > lhs = *this;
+            std::complex< float > rhs = b;
+            return ( lhs - rhs );
+        }
+        __kmp_cmplx32_t operator * ( const __kmp_cmplx32_t& b ) {
+            std::complex< float > lhs = *this;
+            std::complex< float > rhs = b;
+            return ( lhs * rhs );
+        }
+
+        __kmp_cmplx32_t operator + ( const kmp_cmplx64& b ) {
+            kmp_cmplx64 t = kmp_cmplx64( *this ) + b;
+            std::complex< double > d( t );
+            std::complex< float > f( d );
+            __kmp_cmplx32_t r( f );
+            return r;
+        }
+        __kmp_cmplx32_t operator - ( const kmp_cmplx64& b ) {
+            kmp_cmplx64 t = kmp_cmplx64( *this ) - b;
+            std::complex< double > d( t );
+            std::complex< float > f( d );
+            __kmp_cmplx32_t r( f );
+            return r;
+        }
+        __kmp_cmplx32_t operator * ( const kmp_cmplx64& b ) {
+            kmp_cmplx64 t = kmp_cmplx64( *this ) * b;
+            std::complex< double > d( t );
+            std::complex< float > f( d );
+            __kmp_cmplx32_t r( f );
+            return r;
+        }
+
+        void operator /= ( const __kmp_cmplx32_t& rhs ) {
+            std::complex< float > lhs = *this;
+            *this = __kmp_lhs_div_rhs( lhs, rhs );
+        }
+
+        __kmp_cmplx32_t operator / ( const __kmp_cmplx32_t& rhs ) {
+            std::complex< float > lhs = *this;
+            return __kmp_lhs_div_rhs( lhs, rhs );
+        }
+
+        void operator /= ( const kmp_cmplx64& rhs ) {
+            std::complex< float > lhs = *this;
+            *this = __kmp_lhs_div_rhs( lhs, rhs );
+        }
+
+        __kmp_cmplx32_t operator / ( const kmp_cmplx64& rhs ) {
+            std::complex< float > lhs = *this;
+            return __kmp_lhs_div_rhs( lhs, rhs );
+        }
+    };
+    typedef struct __kmp_cmplx32_t kmp_cmplx32;
+
+    // complex10
+    struct KMP_DO_ALIGN( 16 ) __kmp_cmplx80_t : std::complex< long double > {
+
+        __kmp_cmplx80_t() : std::complex< long double > () {}
+
+        __kmp_cmplx80_t( const std::complex< long double >& cld )
+            : std::complex< long double > ( cld ) {}
+
+        void operator /= ( const __kmp_cmplx80_t& rhs ) {
+            std::complex< long double > lhs = *this;
+            *this = __kmp_lhs_div_rhs( lhs, rhs );
+        }
+
+        __kmp_cmplx80_t operator / ( const __kmp_cmplx80_t& rhs ) {
+            std::complex< long double > lhs = *this;
+            return __kmp_lhs_div_rhs( lhs, rhs );
+        }
+
+    };
+    typedef KMP_DO_ALIGN( 16 )  struct __kmp_cmplx80_t kmp_cmplx80;
+
+    // complex16
+    #if KMP_HAVE_QUAD
+    struct __kmp_cmplx128_t : std::complex< _Quad > {
+
+        __kmp_cmplx128_t() : std::complex< _Quad > () {}
+
+        __kmp_cmplx128_t( const std::complex< _Quad >& cq )
+            : std::complex< _Quad > ( cq ) {}
+
+        void operator /= ( const __kmp_cmplx128_t& rhs ) {
+            std::complex< _Quad > lhs = *this;
+            *this = __kmp_lhs_div_rhs( lhs, rhs );
+        }
+
+        __kmp_cmplx128_t operator / ( const __kmp_cmplx128_t& rhs ) {
+            std::complex< _Quad > lhs = *this;
+            return __kmp_lhs_div_rhs( lhs, rhs );
+        }
+
+    };
+    typedef struct __kmp_cmplx128_t kmp_cmplx128;
+    #endif /* KMP_HAVE_QUAD */
+
+    #ifdef _DEBUG_TEMPORARILY_UNSET_
+        #undef _DEBUG_TEMPORARILY_UNSET_
+        // Set it back now
+        #define _DEBUG 1
+    #endif
+
+#else
+    // create shortcuts for c99 complex types
+    typedef float _Complex       kmp_cmplx32;
+    typedef double _Complex      kmp_cmplx64;
+    typedef long double _Complex kmp_cmplx80;
+    #if KMP_HAVE_QUAD
+    typedef _Quad _Complex       kmp_cmplx128;
+    #endif
+#endif
+
+// Compiler 12.0 changed alignment of 16 and 32-byte arguments (like _Quad
+// and kmp_cmplx128) on IA-32 architecture. The following aligned structures
+// are implemented to support the old alignment in 10.1, 11.0, 11.1 and 
+// introduce the new alignment in 12.0. See CQ88405.
+#if KMP_ARCH_X86 && KMP_HAVE_QUAD
+
+    // 4-byte aligned structures for backward compatibility.
+
+    #pragma pack( push, 4 )
+
+    struct KMP_DO_ALIGN( 4 ) Quad_a4_t {
+        _Quad q;
+
+        Quad_a4_t(  ) : q(  ) {}
+        Quad_a4_t( const _Quad & cq ) : q ( cq ) {}
+
+        Quad_a4_t operator + ( const Quad_a4_t& b ) {
+            _Quad lhs = (*this).q;
+            _Quad rhs = b.q;
+            return (Quad_a4_t)( lhs + rhs );
+        }
+
+        Quad_a4_t operator - ( const Quad_a4_t& b ) {
+            _Quad lhs = (*this).q;
+            _Quad rhs = b.q;
+            return (Quad_a4_t)( lhs - rhs );
+        }
+
+        Quad_a4_t operator * ( const Quad_a4_t& b ) {
+            _Quad lhs = (*this).q;
+            _Quad rhs = b.q;
+            return (Quad_a4_t)( lhs * rhs );
+        }
+
+        Quad_a4_t operator / ( const Quad_a4_t& b ) {
+            _Quad lhs = (*this).q;
+            _Quad rhs = b.q;
+            return (Quad_a4_t)( lhs / rhs );
+        }
+
+    };
+
+    struct KMP_DO_ALIGN( 4 ) kmp_cmplx128_a4_t {
+        kmp_cmplx128 q;
+
+        kmp_cmplx128_a4_t() : q () {}
+
+        kmp_cmplx128_a4_t( const kmp_cmplx128 & c128 ) : q ( c128 ) {}
+
+        kmp_cmplx128_a4_t operator + ( const kmp_cmplx128_a4_t& b ) {
+            kmp_cmplx128 lhs = (*this).q;
+            kmp_cmplx128 rhs = b.q;
+            return (kmp_cmplx128_a4_t)( lhs + rhs );
+        }
+        kmp_cmplx128_a4_t operator - ( const kmp_cmplx128_a4_t& b ) {
+            kmp_cmplx128 lhs = (*this).q;
+            kmp_cmplx128 rhs = b.q;
+            return (kmp_cmplx128_a4_t)( lhs - rhs );
+        }
+        kmp_cmplx128_a4_t operator * ( const kmp_cmplx128_a4_t& b ) {
+            kmp_cmplx128 lhs = (*this).q;
+            kmp_cmplx128 rhs = b.q;
+            return (kmp_cmplx128_a4_t)( lhs * rhs );
+        }
+
+        kmp_cmplx128_a4_t operator / ( const kmp_cmplx128_a4_t& b ) {
+            kmp_cmplx128 lhs = (*this).q;
+            kmp_cmplx128 rhs = b.q;
+            return (kmp_cmplx128_a4_t)( lhs / rhs );
+        }
+
+    };
+
+    #pragma pack( pop )
+
+    // New 16-byte aligned structures for 12.0 compiler.
+    struct KMP_DO_ALIGN( 16 ) Quad_a16_t {
+        _Quad q;
+
+        Quad_a16_t(  ) : q(  ) {}
+        Quad_a16_t( const _Quad & cq ) : q ( cq ) {}
+
+        Quad_a16_t operator + ( const Quad_a16_t& b ) {
+            _Quad lhs = (*this).q;
+            _Quad rhs = b.q;
+            return (Quad_a16_t)( lhs + rhs );
+        }
+
+        Quad_a16_t operator - ( const Quad_a16_t& b ) {
+            _Quad lhs = (*this).q;
+            _Quad rhs = b.q;
+            return (Quad_a16_t)( lhs - rhs );
+        }
+
+        Quad_a16_t operator * ( const Quad_a16_t& b ) {
+            _Quad lhs = (*this).q;
+            _Quad rhs = b.q;
+            return (Quad_a16_t)( lhs * rhs );
+        }
+
+        Quad_a16_t operator / ( const Quad_a16_t& b ) {
+            _Quad lhs = (*this).q;
+            _Quad rhs = b.q;
+            return (Quad_a16_t)( lhs / rhs );
+        }
+    };
+
+    struct KMP_DO_ALIGN( 16 ) kmp_cmplx128_a16_t {
+        kmp_cmplx128 q;
+
+        kmp_cmplx128_a16_t() : q () {}
+
+        kmp_cmplx128_a16_t( const kmp_cmplx128 & c128 ) : q ( c128 ) {}
+
+        kmp_cmplx128_a16_t operator + ( const kmp_cmplx128_a16_t& b ) {
+            kmp_cmplx128 lhs = (*this).q;
+            kmp_cmplx128 rhs = b.q;
+            return (kmp_cmplx128_a16_t)( lhs + rhs );
+        }
+        kmp_cmplx128_a16_t operator - ( const kmp_cmplx128_a16_t& b ) {
+            kmp_cmplx128 lhs = (*this).q;
+            kmp_cmplx128 rhs = b.q;
+            return (kmp_cmplx128_a16_t)( lhs - rhs );
+        }
+        kmp_cmplx128_a16_t operator * ( const kmp_cmplx128_a16_t& b ) {
+            kmp_cmplx128 lhs = (*this).q;
+            kmp_cmplx128 rhs = b.q;
+            return (kmp_cmplx128_a16_t)( lhs * rhs );
+        }
+
+        kmp_cmplx128_a16_t operator / ( const kmp_cmplx128_a16_t& b ) {
+            kmp_cmplx128 lhs = (*this).q;
+            kmp_cmplx128 rhs = b.q;
+            return (kmp_cmplx128_a16_t)( lhs / rhs );
+        }
+    };
+
+#endif
+
+#if ( KMP_ARCH_X86 )
+    #define QUAD_LEGACY Quad_a4_t
+    #define CPLX128_LEG kmp_cmplx128_a4_t
+#else
+    #define QUAD_LEGACY _Quad
+    #define CPLX128_LEG kmp_cmplx128
+#endif
+
+#ifdef __cplusplus
+    extern "C" {
+#endif
+
+extern int __kmp_atomic_mode;
+
+//
+// Atomic locks can easily become contended, so we use queuing locks for them.
+//
+
+typedef kmp_queuing_lock_t kmp_atomic_lock_t;
+
+static inline void
+__kmp_acquire_atomic_lock( kmp_atomic_lock_t *lck, kmp_int32 gtid )
+{
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_wait_atomic)) {
+        ompt_callbacks.ompt_callback(ompt_event_wait_atomic)(
+            (ompt_wait_id_t) lck);
+    }
+#endif
+
+    __kmp_acquire_queuing_lock( lck, gtid );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_acquired_atomic)) {
+        ompt_callbacks.ompt_callback(ompt_event_acquired_atomic)(
+            (ompt_wait_id_t) lck);
+    }
+#endif
+}
+
+static inline int
+__kmp_test_atomic_lock( kmp_atomic_lock_t *lck, kmp_int32 gtid )
+{
+    return __kmp_test_queuing_lock( lck, gtid );
+}
+
+static inline void
+__kmp_release_atomic_lock( kmp_atomic_lock_t *lck, kmp_int32 gtid )
+{
+    __kmp_release_queuing_lock( lck, gtid );
+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_atomic)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_atomic)(
+            (ompt_wait_id_t) lck);
+    }
+#endif
+}
+
+static inline void
+__kmp_init_atomic_lock( kmp_atomic_lock_t *lck )
+{
+    __kmp_init_queuing_lock( lck );
+}
+
+static inline void
+__kmp_destroy_atomic_lock( kmp_atomic_lock_t *lck )
+{
+    __kmp_destroy_queuing_lock( lck );
+}
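+
+// Hedged sketch of the canonical pattern used by the routines in
+// kmp_atomic.c (the lock suffix encodes the operand size/kind):
+//
+//   __kmp_acquire_atomic_lock( &__kmp_atomic_lock_10r, gtid );
+//   *lhs = *lhs + rhs;                 /* long double update */
+//   __kmp_release_atomic_lock( &__kmp_atomic_lock_10r, gtid );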
+
+// Global Locks
+
+extern kmp_atomic_lock_t __kmp_atomic_lock;    /* Control access to all user coded atomics in Gnu compat mode   */
+extern kmp_atomic_lock_t __kmp_atomic_lock_1i;  /* Control access to all user coded atomics for 1-byte fixed data types */
+extern kmp_atomic_lock_t __kmp_atomic_lock_2i;  /* Control access to all user coded atomics for 2-byte fixed data types */
+extern kmp_atomic_lock_t __kmp_atomic_lock_4i;  /* Control access to all user coded atomics for 4-byte fixed data types */
+extern kmp_atomic_lock_t __kmp_atomic_lock_4r;  /* Control access to all user coded atomics for kmp_real32 data type    */
+extern kmp_atomic_lock_t __kmp_atomic_lock_8i;  /* Control access to all user coded atomics for 8-byte fixed data types */
+extern kmp_atomic_lock_t __kmp_atomic_lock_8r;  /* Control access to all user coded atomics for kmp_real64 data type    */
+extern kmp_atomic_lock_t __kmp_atomic_lock_8c;  /* Control access to all user coded atomics for float complex data type */
+extern kmp_atomic_lock_t __kmp_atomic_lock_10r; /* Control access to all user coded atomics for long double data type   */
+extern kmp_atomic_lock_t __kmp_atomic_lock_16r; /* Control access to all user coded atomics for _Quad data type         */
+extern kmp_atomic_lock_t __kmp_atomic_lock_16c; /* Control access to all user coded atomics for double complex data type*/
+extern kmp_atomic_lock_t __kmp_atomic_lock_20c; /* Control access to all user coded atomics for long double complex type*/
+extern kmp_atomic_lock_t __kmp_atomic_lock_32c; /* Control access to all user coded atomics for _Quad complex data type */
+
+//
+//  Below routines for atomic UPDATE are listed
+//
+
+// 1-byte
+void __kmpc_atomic_fixed1_add(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1_andb( ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1_div(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1u_div( ident_t *id_ref, int gtid, unsigned char * lhs, unsigned char rhs );
+void __kmpc_atomic_fixed1_mul(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1_orb(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1_shl(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1_shr(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1u_shr( ident_t *id_ref, int gtid, unsigned char * lhs, unsigned char rhs );
+void __kmpc_atomic_fixed1_sub(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1_xor(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+// 2-byte
+void __kmpc_atomic_fixed2_add(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2_andb( ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2_div(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2u_div( ident_t *id_ref, int gtid, unsigned short * lhs, unsigned short rhs );
+void __kmpc_atomic_fixed2_mul(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2_orb(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2_shl(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2_shr(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2u_shr( ident_t *id_ref, int gtid, unsigned short * lhs, unsigned short rhs );
+void __kmpc_atomic_fixed2_sub(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2_xor(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+// 4-byte add / sub fixed
+void __kmpc_atomic_fixed4_add(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4_sub(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+// 4-byte add / sub float
+void __kmpc_atomic_float4_add(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs );
+void __kmpc_atomic_float4_sub(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs );
+// 8-byte add / sub fixed
+void __kmpc_atomic_fixed8_add(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8_sub(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+// 8-byte add / sub float
+void __kmpc_atomic_float8_add(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs );
+void __kmpc_atomic_float8_sub(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs );
+// 4-byte fixed
+void __kmpc_atomic_fixed4_andb( ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4_div(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4u_div( ident_t *id_ref, int gtid, kmp_uint32 * lhs, kmp_uint32 rhs );
+void __kmpc_atomic_fixed4_mul(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4_orb(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4_shl(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4_shr(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4u_shr( ident_t *id_ref, int gtid, kmp_uint32 * lhs, kmp_uint32 rhs );
+void __kmpc_atomic_fixed4_xor(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+// 8-byte fixed
+void __kmpc_atomic_fixed8_andb( ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8_div(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8u_div( ident_t *id_ref, int gtid, kmp_uint64 * lhs, kmp_uint64 rhs );
+void __kmpc_atomic_fixed8_mul(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8_orb(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8_shl(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8_shr(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8u_shr( ident_t *id_ref, int gtid, kmp_uint64 * lhs, kmp_uint64 rhs );
+void __kmpc_atomic_fixed8_xor(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+// 4-byte float
+void __kmpc_atomic_float4_div(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs );
+void __kmpc_atomic_float4_mul(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs );
+// 8-byte float
+void __kmpc_atomic_float8_div(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs );
+void __kmpc_atomic_float8_mul(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs );
+// 1-, 2-, 4-, 8-byte logical (&&, ||)
+void __kmpc_atomic_fixed1_andl( ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1_orl(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed2_andl( ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2_orl(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed4_andl( ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4_orl(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed8_andl( ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8_orl(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+// MIN / MAX
+void __kmpc_atomic_fixed1_max(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1_min(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed2_max(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2_min(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed4_max(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4_min(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed8_max(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8_min(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_float4_max(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs );
+void __kmpc_atomic_float4_min(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs );
+void __kmpc_atomic_float8_max(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs );
+void __kmpc_atomic_float8_min(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs );
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_float16_max( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs );
+void __kmpc_atomic_float16_min( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs );
+#if ( KMP_ARCH_X86 )
+    // Routines with 16-byte arguments aligned to 16-byte boundary; IA-32 architecture only
+    void __kmpc_atomic_float16_max_a16( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs );
+    void __kmpc_atomic_float16_min_a16( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs );
+#endif
+#endif
+// .NEQV. (same as xor)
+void __kmpc_atomic_fixed1_neqv( ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed2_neqv( ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed4_neqv( ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed8_neqv( ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+// .EQV. (same as ~xor)
+void __kmpc_atomic_fixed1_eqv(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed2_eqv(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed4_eqv(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed8_eqv(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+// long double type
+void __kmpc_atomic_float10_add( ident_t *id_ref, int gtid, long double * lhs, long double rhs );
+void __kmpc_atomic_float10_sub( ident_t *id_ref, int gtid, long double * lhs, long double rhs );
+void __kmpc_atomic_float10_mul( ident_t *id_ref, int gtid, long double * lhs, long double rhs );
+void __kmpc_atomic_float10_div( ident_t *id_ref, int gtid, long double * lhs, long double rhs );
+// _Quad type
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_float16_add( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs );
+void __kmpc_atomic_float16_sub( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs );
+void __kmpc_atomic_float16_mul( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs );
+void __kmpc_atomic_float16_div( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs );
+#if ( KMP_ARCH_X86 )
+    // Routines with 16-byte arguments aligned to 16-byte boundary
+    void __kmpc_atomic_float16_add_a16( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs );
+    void __kmpc_atomic_float16_sub_a16( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs );
+    void __kmpc_atomic_float16_mul_a16( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs );
+    void __kmpc_atomic_float16_div_a16( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs );
+#endif
+#endif
+// routines for complex types
+void __kmpc_atomic_cmplx4_add(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
+void __kmpc_atomic_cmplx4_sub(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
+void __kmpc_atomic_cmplx4_mul(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
+void __kmpc_atomic_cmplx4_div(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
+void __kmpc_atomic_cmplx8_add(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+void __kmpc_atomic_cmplx8_sub(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+void __kmpc_atomic_cmplx8_mul(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+void __kmpc_atomic_cmplx8_div(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+void __kmpc_atomic_cmplx10_add( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+void __kmpc_atomic_cmplx10_sub( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+void __kmpc_atomic_cmplx10_mul( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+void __kmpc_atomic_cmplx10_div( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_cmplx16_add( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs );
+void __kmpc_atomic_cmplx16_sub( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs );
+void __kmpc_atomic_cmplx16_mul( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs );
+void __kmpc_atomic_cmplx16_div( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs );
+#if ( KMP_ARCH_X86 )
+    // Routines with 16-byte arguments aligned to 16-byte boundary
+    void __kmpc_atomic_cmplx16_add_a16( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs );
+    void __kmpc_atomic_cmplx16_sub_a16( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs );
+    void __kmpc_atomic_cmplx16_mul_a16( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs );
+    void __kmpc_atomic_cmplx16_div_a16( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs );
+#endif
+#endif
+
+#if OMP_40_ENABLED
+
+// OpenMP 4.0: x = expr binop x for non-commutative operations.
+// Supported only on IA-32 architecture and Intel(R) 64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
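+// Reverse-operand sketch (illustrative): "x = expr binop x" keeps x on the
+// right of the operator, so e.g. "x = e - x" on a 4-byte integer lowers to
+//
+//   __kmpc_atomic_fixed4_sub_rev( &loc, gtid, &x, e );   // x = e - x
+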
+void __kmpc_atomic_fixed1_sub_rev(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1_div_rev(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1u_div_rev( ident_t *id_ref, int gtid, unsigned char * lhs, unsigned char rhs );
+void __kmpc_atomic_fixed1_shl_rev(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1_shr_rev(  ident_t *id_ref, int gtid, char * lhs, char rhs );
+void __kmpc_atomic_fixed1u_shr_rev( ident_t *id_ref, int gtid, unsigned char * lhs, unsigned char rhs );
+void __kmpc_atomic_fixed2_sub_rev(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2_div_rev(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2u_div_rev( ident_t *id_ref, int gtid, unsigned short * lhs, unsigned short rhs );
+void __kmpc_atomic_fixed2_shl_rev(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2_shr_rev(  ident_t *id_ref, int gtid, short * lhs, short rhs );
+void __kmpc_atomic_fixed2u_shr_rev( ident_t *id_ref, int gtid, unsigned short * lhs, unsigned short rhs );
+void __kmpc_atomic_fixed4_sub_rev(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4_div_rev(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4u_div_rev( ident_t *id_ref, int gtid, kmp_uint32 * lhs, kmp_uint32 rhs );
+void __kmpc_atomic_fixed4_shl_rev(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4_shr_rev(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs );
+void __kmpc_atomic_fixed4u_shr_rev( ident_t *id_ref, int gtid, kmp_uint32 * lhs, kmp_uint32 rhs );
+void __kmpc_atomic_fixed8_sub_rev(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8_div_rev(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8u_div_rev( ident_t *id_ref, int gtid, kmp_uint64 * lhs, kmp_uint64 rhs );
+void __kmpc_atomic_fixed8_shl_rev(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8_shr_rev(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs );
+void __kmpc_atomic_fixed8u_shr_rev( ident_t *id_ref, int gtid, kmp_uint64 * lhs, kmp_uint64 rhs );
+void __kmpc_atomic_float4_sub_rev(  ident_t *id_ref, int gtid, float * lhs, float rhs );
+void __kmpc_atomic_float4_div_rev(  ident_t *id_ref, int gtid, float * lhs, float rhs );
+void __kmpc_atomic_float8_sub_rev(  ident_t *id_ref, int gtid, double * lhs, double rhs );
+void __kmpc_atomic_float8_div_rev(  ident_t *id_ref, int gtid, double * lhs, double rhs );
+void __kmpc_atomic_float10_sub_rev( ident_t *id_ref, int gtid, long double * lhs, long double rhs );
+void __kmpc_atomic_float10_div_rev( ident_t *id_ref, int gtid, long double * lhs, long double rhs );
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_float16_sub_rev( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs );
+void __kmpc_atomic_float16_div_rev( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs );
+#endif
+void __kmpc_atomic_cmplx4_sub_rev(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
+void __kmpc_atomic_cmplx4_div_rev(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
+void __kmpc_atomic_cmplx8_sub_rev(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+void __kmpc_atomic_cmplx8_div_rev(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+void __kmpc_atomic_cmplx10_sub_rev( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+void __kmpc_atomic_cmplx10_div_rev( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_cmplx16_sub_rev( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs );
+void __kmpc_atomic_cmplx16_div_rev( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs );
+#if ( KMP_ARCH_X86 )
+    // Routines with 16-byte arguments aligned to 16-byte boundary
+    void __kmpc_atomic_float16_sub_a16_rev( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs );
+    void __kmpc_atomic_float16_div_a16_rev( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs );
+    void __kmpc_atomic_cmplx16_sub_a16_rev( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs );
+    void __kmpc_atomic_cmplx16_div_a16_rev( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs );
+#endif
+#endif // KMP_HAVE_QUAD
+
+#endif //KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+#endif //OMP_40_ENABLED
+
+// routines for mixed types
+
+// RHS=float8
+void __kmpc_atomic_fixed1_mul_float8( ident_t *id_ref, int gtid, char * lhs, kmp_real64 rhs );
+void __kmpc_atomic_fixed1_div_float8( ident_t *id_ref, int gtid, char * lhs, kmp_real64 rhs );
+void __kmpc_atomic_fixed2_mul_float8( ident_t *id_ref, int gtid, short * lhs, kmp_real64 rhs );
+void __kmpc_atomic_fixed2_div_float8( ident_t *id_ref, int gtid, short * lhs, kmp_real64 rhs );
+void __kmpc_atomic_fixed4_mul_float8( ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_real64 rhs );
+void __kmpc_atomic_fixed4_div_float8( ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_real64 rhs );
+void __kmpc_atomic_fixed8_mul_float8( ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_real64 rhs );
+void __kmpc_atomic_fixed8_div_float8( ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_real64 rhs );
+void __kmpc_atomic_float4_add_float8( ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real64 rhs );
+void __kmpc_atomic_float4_sub_float8( ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real64 rhs );
+void __kmpc_atomic_float4_mul_float8( ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real64 rhs );
+void __kmpc_atomic_float4_div_float8( ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real64 rhs );
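+
+// Mixed-type sketch (illustrative): when the right-hand side is a double, the
+// promotion happens inside the runtime rather than at the call site, e.g.
+// "x /= d" with kmp_real32 x and double d lowers to
+//
+//   __kmpc_atomic_float4_div_float8( &loc, gtid, &x, d );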
+
+// RHS=float16 (deprecated, to be removed when we are sure the compiler does not use them)
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_fixed1_add_fp(  ident_t *id_ref, int gtid, char * lhs, _Quad rhs );
+void __kmpc_atomic_fixed1_sub_fp(  ident_t *id_ref, int gtid, char * lhs, _Quad rhs );
+void __kmpc_atomic_fixed1_mul_fp(  ident_t *id_ref, int gtid, char * lhs, _Quad rhs );
+void __kmpc_atomic_fixed1_div_fp(  ident_t *id_ref, int gtid, char * lhs, _Quad rhs );
+void __kmpc_atomic_fixed1u_div_fp( ident_t *id_ref, int gtid, unsigned char * lhs, _Quad rhs );
+
+void __kmpc_atomic_fixed2_add_fp(  ident_t *id_ref, int gtid, short * lhs, _Quad rhs );
+void __kmpc_atomic_fixed2_sub_fp(  ident_t *id_ref, int gtid, short * lhs, _Quad rhs );
+void __kmpc_atomic_fixed2_mul_fp(  ident_t *id_ref, int gtid, short * lhs, _Quad rhs );
+void __kmpc_atomic_fixed2_div_fp(  ident_t *id_ref, int gtid, short * lhs, _Quad rhs );
+void __kmpc_atomic_fixed2u_div_fp( ident_t *id_ref, int gtid, unsigned short * lhs, _Quad rhs );
+
+void __kmpc_atomic_fixed4_add_fp(  ident_t *id_ref, int gtid, kmp_int32 * lhs, _Quad rhs );
+void __kmpc_atomic_fixed4_sub_fp(  ident_t *id_ref, int gtid, kmp_int32 * lhs, _Quad rhs );
+void __kmpc_atomic_fixed4_mul_fp(  ident_t *id_ref, int gtid, kmp_int32 * lhs, _Quad rhs );
+void __kmpc_atomic_fixed4_div_fp(  ident_t *id_ref, int gtid, kmp_int32 * lhs, _Quad rhs );
+void __kmpc_atomic_fixed4u_div_fp( ident_t *id_ref, int gtid, kmp_uint32 * lhs, _Quad rhs );
+
+void __kmpc_atomic_fixed8_add_fp(  ident_t *id_ref, int gtid, kmp_int64 * lhs, _Quad rhs );
+void __kmpc_atomic_fixed8_sub_fp(  ident_t *id_ref, int gtid, kmp_int64 * lhs, _Quad rhs );
+void __kmpc_atomic_fixed8_mul_fp(  ident_t *id_ref, int gtid, kmp_int64 * lhs, _Quad rhs );
+void __kmpc_atomic_fixed8_div_fp(  ident_t *id_ref, int gtid, kmp_int64 * lhs, _Quad rhs );
+void __kmpc_atomic_fixed8u_div_fp( ident_t *id_ref, int gtid, kmp_uint64 * lhs, _Quad rhs );
+
+void __kmpc_atomic_float4_add_fp(  ident_t *id_ref, int gtid, kmp_real32 * lhs, _Quad rhs );
+void __kmpc_atomic_float4_sub_fp(  ident_t *id_ref, int gtid, kmp_real32 * lhs, _Quad rhs );
+void __kmpc_atomic_float4_mul_fp(  ident_t *id_ref, int gtid, kmp_real32 * lhs, _Quad rhs );
+void __kmpc_atomic_float4_div_fp(  ident_t *id_ref, int gtid, kmp_real32 * lhs, _Quad rhs );
+
+void __kmpc_atomic_float8_add_fp(  ident_t *id_ref, int gtid, kmp_real64 * lhs, _Quad rhs );
+void __kmpc_atomic_float8_sub_fp(  ident_t *id_ref, int gtid, kmp_real64 * lhs, _Quad rhs );
+void __kmpc_atomic_float8_mul_fp(  ident_t *id_ref, int gtid, kmp_real64 * lhs, _Quad rhs );
+void __kmpc_atomic_float8_div_fp(  ident_t *id_ref, int gtid, kmp_real64 * lhs, _Quad rhs );
+
+void __kmpc_atomic_float10_add_fp( ident_t *id_ref, int gtid, long double * lhs, _Quad rhs );
+void __kmpc_atomic_float10_sub_fp( ident_t *id_ref, int gtid, long double * lhs, _Quad rhs );
+void __kmpc_atomic_float10_mul_fp( ident_t *id_ref, int gtid, long double * lhs, _Quad rhs );
+void __kmpc_atomic_float10_div_fp( ident_t *id_ref, int gtid, long double * lhs, _Quad rhs );
+#endif // KMP_HAVE_QUAD
+
+// RHS=cmplx8
+void __kmpc_atomic_cmplx4_add_cmplx8( ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx64 rhs );
+void __kmpc_atomic_cmplx4_sub_cmplx8( ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx64 rhs );
+void __kmpc_atomic_cmplx4_mul_cmplx8( ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx64 rhs );
+void __kmpc_atomic_cmplx4_div_cmplx8( ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx64 rhs );
+
+// generic atomic routines
+void __kmpc_atomic_1(  ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) );
+void __kmpc_atomic_2(  ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) );
+void __kmpc_atomic_4(  ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) );
+void __kmpc_atomic_8(  ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) );
+void __kmpc_atomic_10( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) );
+void __kmpc_atomic_16( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) );
+void __kmpc_atomic_20( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) );
+void __kmpc_atomic_32( ident_t *id_ref, int gtid, void* lhs, void* rhs, void (*f)( void *, void *, void * ) );
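+
+// Generic-path sketch (illustrative): the caller supplies a combiner that the
+// runtime applies atomically; per the runtime's convention it is invoked as
+// f( &result, &old_value, rhs ), so the callback writes op(*old, *rhs) through
+// its first argument. The combiner below is a hypothetical example for an
+// 8-byte addition.
+//
+//   static void add_i64( void *out, void *in, void *rhs )
+//   {
+//       *(kmp_int64 *)out = *(kmp_int64 *)in + *(kmp_int64 *)rhs;
+//   }
+//   // __kmpc_atomic_8( &loc, gtid, &x, &v, add_i64 );   // x += v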
+
+// READ, WRITE, CAPTURE are supported only on IA-32 architecture and Intel(R) 64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+//
+//  Below routines for atomic READ are listed
+//
+
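+// Sketch (illustrative): "#pragma omp atomic read" (v = x) lowers to a _rd
+// routine that returns the atomically loaded value, e.g.
+//
+//   kmp_int32 v = __kmpc_atomic_fixed4_rd( &loc, gtid, &x );
+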
+char         __kmpc_atomic_fixed1_rd(  ident_t *id_ref, int gtid, char        * loc );
+short        __kmpc_atomic_fixed2_rd(  ident_t *id_ref, int gtid, short       * loc );
+kmp_int32    __kmpc_atomic_fixed4_rd(  ident_t *id_ref, int gtid, kmp_int32   * loc );
+kmp_int64    __kmpc_atomic_fixed8_rd(  ident_t *id_ref, int gtid, kmp_int64   * loc );
+kmp_real32   __kmpc_atomic_float4_rd(  ident_t *id_ref, int gtid, kmp_real32  * loc );
+kmp_real64   __kmpc_atomic_float8_rd(  ident_t *id_ref, int gtid, kmp_real64  * loc );
+long double  __kmpc_atomic_float10_rd( ident_t *id_ref, int gtid, long double * loc );
+#if KMP_HAVE_QUAD
+QUAD_LEGACY  __kmpc_atomic_float16_rd( ident_t *id_ref, int gtid, QUAD_LEGACY * loc );
+#endif
+// Fix for CQ220361: cmplx4 READ will return void on Windows* OS; read value will be
+// returned through an additional parameter
+#if ( KMP_OS_WINDOWS )
+    void  __kmpc_atomic_cmplx4_rd(  kmp_cmplx32 * out, ident_t *id_ref, int gtid, kmp_cmplx32 * loc );
+#else
+    kmp_cmplx32  __kmpc_atomic_cmplx4_rd(  ident_t *id_ref, int gtid, kmp_cmplx32 * loc );
+#endif
+kmp_cmplx64  __kmpc_atomic_cmplx8_rd(  ident_t *id_ref, int gtid, kmp_cmplx64 * loc );
+kmp_cmplx80  __kmpc_atomic_cmplx10_rd( ident_t *id_ref, int gtid, kmp_cmplx80 * loc );
+#if KMP_HAVE_QUAD
+CPLX128_LEG  __kmpc_atomic_cmplx16_rd( ident_t *id_ref, int gtid, CPLX128_LEG * loc );
+#if ( KMP_ARCH_X86 )
+    // Routines with 16-byte arguments aligned to 16-byte boundary
+    Quad_a16_t         __kmpc_atomic_float16_a16_rd( ident_t * id_ref, int gtid, Quad_a16_t         * loc );
+    kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_a16_rd( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * loc );
+#endif
+#endif
+
+
+//
+//  Below routines for atomic WRITE are listed
+//
+
+void __kmpc_atomic_fixed1_wr(  ident_t *id_ref, int gtid, char        * lhs, char        rhs );
+void __kmpc_atomic_fixed2_wr(  ident_t *id_ref, int gtid, short       * lhs, short       rhs );
+void __kmpc_atomic_fixed4_wr(  ident_t *id_ref, int gtid, kmp_int32   * lhs, kmp_int32   rhs );
+void __kmpc_atomic_fixed8_wr(  ident_t *id_ref, int gtid, kmp_int64   * lhs, kmp_int64   rhs );
+void __kmpc_atomic_float4_wr(  ident_t *id_ref, int gtid, kmp_real32  * lhs, kmp_real32  rhs );
+void __kmpc_atomic_float8_wr(  ident_t *id_ref, int gtid, kmp_real64  * lhs, kmp_real64  rhs );
+void __kmpc_atomic_float10_wr( ident_t *id_ref, int gtid, long double * lhs, long double rhs );
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_float16_wr( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs );
+#endif
+void __kmpc_atomic_cmplx4_wr(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs );
+void __kmpc_atomic_cmplx8_wr(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+void __kmpc_atomic_cmplx10_wr( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_cmplx16_wr( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs );
+#if ( KMP_ARCH_X86 )
+    // Routines with 16-byte arguments aligned to 16-byte boundary
+    void __kmpc_atomic_float16_a16_wr( ident_t * id_ref, int gtid, Quad_a16_t         * lhs, Quad_a16_t         rhs );
+    void __kmpc_atomic_cmplx16_a16_wr( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs );
+#endif
+#endif
+
+//
+//  Below routines for atomic CAPTURE are listed
+//
+
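+// Capture sketch (illustrative): the trailing "flag" selects which value is
+// returned -- non-zero captures the updated value ({x = x op e; v = x;}),
+// zero captures the prior value ({v = x; x = x op e;}). For example:
+//
+//   v = __kmpc_atomic_fixed4_add_cpt( &loc, gtid, &x, e, 1 );   // v = x += e
+//   v = __kmpc_atomic_fixed4_add_cpt( &loc, gtid, &x, e, 0 );   // v = x; x += e
+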
+// 1-byte
+char __kmpc_atomic_fixed1_add_cpt(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag);
+char __kmpc_atomic_fixed1_andb_cpt( ident_t *id_ref, int gtid, char * lhs, char rhs, int flag);
+char __kmpc_atomic_fixed1_div_cpt(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag);
+unsigned char __kmpc_atomic_fixed1u_div_cpt( ident_t *id_ref, int gtid, unsigned char * lhs, unsigned char rhs, int flag);
+char __kmpc_atomic_fixed1_mul_cpt(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag);
+char __kmpc_atomic_fixed1_orb_cpt(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag);
+char __kmpc_atomic_fixed1_shl_cpt(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag);
+char __kmpc_atomic_fixed1_shr_cpt(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag);
+unsigned char __kmpc_atomic_fixed1u_shr_cpt( ident_t *id_ref, int gtid, unsigned char * lhs, unsigned char rhs, int flag);
+char __kmpc_atomic_fixed1_sub_cpt(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag);
+char __kmpc_atomic_fixed1_xor_cpt(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag);
+// 2-byte
+short __kmpc_atomic_fixed2_add_cpt(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag);
+short __kmpc_atomic_fixed2_andb_cpt( ident_t *id_ref, int gtid, short * lhs, short rhs, int flag);
+short __kmpc_atomic_fixed2_div_cpt(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag);
+unsigned short __kmpc_atomic_fixed2u_div_cpt( ident_t *id_ref, int gtid, unsigned short * lhs, unsigned short rhs, int flag);
+short __kmpc_atomic_fixed2_mul_cpt(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag);
+short __kmpc_atomic_fixed2_orb_cpt(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag);
+short __kmpc_atomic_fixed2_shl_cpt(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag);
+short __kmpc_atomic_fixed2_shr_cpt(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag);
+unsigned short __kmpc_atomic_fixed2u_shr_cpt( ident_t *id_ref, int gtid, unsigned short * lhs, unsigned short rhs, int flag);
+short __kmpc_atomic_fixed2_sub_cpt(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag);
+short __kmpc_atomic_fixed2_xor_cpt(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag);
+// 4-byte add / sub fixed
+kmp_int32  __kmpc_atomic_fixed4_add_cpt(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32 rhs, int flag);
+kmp_int32  __kmpc_atomic_fixed4_sub_cpt(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32 rhs, int flag);
+// 4-byte add / sub float
+kmp_real32 __kmpc_atomic_float4_add_cpt(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs, int flag);
+kmp_real32 __kmpc_atomic_float4_sub_cpt(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs, int flag);
+// 8-byte add / sub fixed
+kmp_int64  __kmpc_atomic_fixed8_add_cpt(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64 rhs, int flag);
+kmp_int64  __kmpc_atomic_fixed8_sub_cpt(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64 rhs, int flag);
+// 8-byte add / sub float
+kmp_real64 __kmpc_atomic_float8_add_cpt(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs, int flag);
+kmp_real64 __kmpc_atomic_float8_sub_cpt(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs, int flag);
+// 4-byte fixed
+kmp_int32  __kmpc_atomic_fixed4_andb_cpt( ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32  rhs, int flag);
+kmp_int32  __kmpc_atomic_fixed4_div_cpt(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32  rhs, int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_div_cpt( ident_t *id_ref, int gtid, kmp_uint32 * lhs, kmp_uint32 rhs, int flag);
+kmp_int32  __kmpc_atomic_fixed4_mul_cpt(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32  rhs, int flag);
+kmp_int32  __kmpc_atomic_fixed4_orb_cpt(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32  rhs, int flag);
+kmp_int32  __kmpc_atomic_fixed4_shl_cpt(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32  rhs, int flag);
+kmp_int32  __kmpc_atomic_fixed4_shr_cpt(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32  rhs, int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_shr_cpt( ident_t *id_ref, int gtid, kmp_uint32 * lhs, kmp_uint32 rhs, int flag);
+kmp_int32  __kmpc_atomic_fixed4_xor_cpt(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32  rhs, int flag);
+// 8-byte fixed
+kmp_int64  __kmpc_atomic_fixed8_andb_cpt( ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64  rhs, int flag);
+kmp_int64  __kmpc_atomic_fixed8_div_cpt(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64  rhs, int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_div_cpt( ident_t *id_ref, int gtid, kmp_uint64 * lhs, kmp_uint64 rhs, int flag);
+kmp_int64  __kmpc_atomic_fixed8_mul_cpt(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64  rhs, int flag);
+kmp_int64  __kmpc_atomic_fixed8_orb_cpt(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64  rhs, int flag);
+kmp_int64  __kmpc_atomic_fixed8_shl_cpt(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64  rhs, int flag);
+kmp_int64  __kmpc_atomic_fixed8_shr_cpt(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64  rhs, int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_shr_cpt( ident_t *id_ref, int gtid, kmp_uint64 * lhs, kmp_uint64 rhs, int flag);
+kmp_int64  __kmpc_atomic_fixed8_xor_cpt(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64  rhs, int flag);
+// 4-byte float
+kmp_real32 __kmpc_atomic_float4_div_cpt(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs, int flag);
+kmp_real32 __kmpc_atomic_float4_mul_cpt(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs, int flag);
+// 8-byte float
+kmp_real64 __kmpc_atomic_float8_div_cpt(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs, int flag);
+kmp_real64 __kmpc_atomic_float8_mul_cpt(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs, int flag);
+// 1-, 2-, 4-, 8-byte logical (&&, ||)
+char      __kmpc_atomic_fixed1_andl_cpt( ident_t *id_ref, int gtid, char      * lhs, char      rhs, int flag);
+char      __kmpc_atomic_fixed1_orl_cpt(  ident_t *id_ref, int gtid, char      * lhs, char      rhs, int flag);
+short     __kmpc_atomic_fixed2_andl_cpt( ident_t *id_ref, int gtid, short     * lhs, short     rhs, int flag);
+short     __kmpc_atomic_fixed2_orl_cpt(  ident_t *id_ref, int gtid, short     * lhs, short     rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_andl_cpt( ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_orl_cpt(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_andl_cpt( ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_orl_cpt(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs, int flag);
+// MIN / MAX
+char        __kmpc_atomic_fixed1_max_cpt(  ident_t *id_ref, int gtid, char      * lhs, char      rhs, int flag);
+char        __kmpc_atomic_fixed1_min_cpt(  ident_t *id_ref, int gtid, char      * lhs, char      rhs, int flag);
+short       __kmpc_atomic_fixed2_max_cpt(  ident_t *id_ref, int gtid, short     * lhs, short     rhs, int flag);
+short       __kmpc_atomic_fixed2_min_cpt(  ident_t *id_ref, int gtid, short     * lhs, short     rhs, int flag);
+kmp_int32   __kmpc_atomic_fixed4_max_cpt(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs, int flag);
+kmp_int32   __kmpc_atomic_fixed4_min_cpt(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs, int flag);
+kmp_int64   __kmpc_atomic_fixed8_max_cpt(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs, int flag);
+kmp_int64   __kmpc_atomic_fixed8_min_cpt(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs, int flag);
+kmp_real32  __kmpc_atomic_float4_max_cpt(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs, int flag);
+kmp_real32  __kmpc_atomic_float4_min_cpt(  ident_t *id_ref, int gtid, kmp_real32 * lhs, kmp_real32 rhs, int flag);
+kmp_real64  __kmpc_atomic_float8_max_cpt(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs, int flag);
+kmp_real64  __kmpc_atomic_float8_min_cpt(  ident_t *id_ref, int gtid, kmp_real64 * lhs, kmp_real64 rhs, int flag);
+#if KMP_HAVE_QUAD
+QUAD_LEGACY __kmpc_atomic_float16_max_cpt( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs, int flag);
+QUAD_LEGACY __kmpc_atomic_float16_min_cpt( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs, int flag);
+#endif
+// .NEQV. (same as xor)
+char      __kmpc_atomic_fixed1_neqv_cpt( ident_t *id_ref, int gtid, char      * lhs, char      rhs, int flag);
+short     __kmpc_atomic_fixed2_neqv_cpt( ident_t *id_ref, int gtid, short     * lhs, short     rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_neqv_cpt( ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_neqv_cpt( ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs, int flag);
+// .EQV. (same as ~xor)
+char      __kmpc_atomic_fixed1_eqv_cpt(  ident_t *id_ref, int gtid, char      * lhs, char      rhs, int flag);
+short     __kmpc_atomic_fixed2_eqv_cpt(  ident_t *id_ref, int gtid, short     * lhs, short     rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_eqv_cpt(  ident_t *id_ref, int gtid, kmp_int32 * lhs, kmp_int32 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_eqv_cpt(  ident_t *id_ref, int gtid, kmp_int64 * lhs, kmp_int64 rhs, int flag);
+// long double type
+long double __kmpc_atomic_float10_add_cpt( ident_t *id_ref, int gtid, long double * lhs, long double rhs, int flag);
+long double __kmpc_atomic_float10_sub_cpt( ident_t *id_ref, int gtid, long double * lhs, long double rhs, int flag);
+long double __kmpc_atomic_float10_mul_cpt( ident_t *id_ref, int gtid, long double * lhs, long double rhs, int flag);
+long double __kmpc_atomic_float10_div_cpt( ident_t *id_ref, int gtid, long double * lhs, long double rhs, int flag);
+#if KMP_HAVE_QUAD
+// _Quad type
+QUAD_LEGACY __kmpc_atomic_float16_add_cpt( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs, int flag);
+QUAD_LEGACY __kmpc_atomic_float16_sub_cpt( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs, int flag);
+QUAD_LEGACY __kmpc_atomic_float16_mul_cpt( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs, int flag);
+QUAD_LEGACY __kmpc_atomic_float16_div_cpt( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs, int flag);
+#endif
+// routines for complex types
+// Workaround for cmplx4 routines - return void; captured value is returned via the argument
+void __kmpc_atomic_cmplx4_add_cpt(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs, kmp_cmplx32 * out, int flag);
+void __kmpc_atomic_cmplx4_sub_cpt(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs, kmp_cmplx32 * out, int flag);
+void __kmpc_atomic_cmplx4_mul_cpt(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs, kmp_cmplx32 * out, int flag);
+void __kmpc_atomic_cmplx4_div_cpt(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs, kmp_cmplx32 * out, int flag);
+
+kmp_cmplx64 __kmpc_atomic_cmplx8_add_cpt(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs, int flag);
+kmp_cmplx64 __kmpc_atomic_cmplx8_sub_cpt(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs, int flag);
+kmp_cmplx64 __kmpc_atomic_cmplx8_mul_cpt(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs, int flag);
+kmp_cmplx64 __kmpc_atomic_cmplx8_div_cpt(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs, int flag);
+kmp_cmplx80 __kmpc_atomic_cmplx10_add_cpt( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs, int flag);
+kmp_cmplx80 __kmpc_atomic_cmplx10_sub_cpt( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs, int flag);
+kmp_cmplx80 __kmpc_atomic_cmplx10_mul_cpt( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs, int flag);
+kmp_cmplx80 __kmpc_atomic_cmplx10_div_cpt( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs, int flag);
+#if KMP_HAVE_QUAD
+CPLX128_LEG __kmpc_atomic_cmplx16_add_cpt( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs, int flag);
+CPLX128_LEG __kmpc_atomic_cmplx16_sub_cpt( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs, int flag);
+CPLX128_LEG __kmpc_atomic_cmplx16_mul_cpt( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs, int flag);
+CPLX128_LEG __kmpc_atomic_cmplx16_div_cpt( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs, int flag);
+#if ( KMP_ARCH_X86 )
+    // Routines with 16-byte arguments aligned to 16-byte boundary
+    Quad_a16_t __kmpc_atomic_float16_add_a16_cpt( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs, int flag);
+    Quad_a16_t __kmpc_atomic_float16_sub_a16_cpt( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs, int flag);
+    Quad_a16_t __kmpc_atomic_float16_mul_a16_cpt( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs, int flag);
+    Quad_a16_t __kmpc_atomic_float16_div_a16_cpt( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs, int flag);
+    Quad_a16_t __kmpc_atomic_float16_max_a16_cpt( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs, int flag);
+    Quad_a16_t __kmpc_atomic_float16_min_a16_cpt( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs, int flag);
+    kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_add_a16_cpt( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs, int flag);
+    kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_sub_a16_cpt( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs, int flag);
+    kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_mul_a16_cpt( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs, int flag);
+    kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_div_a16_cpt( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs, int flag);
+#endif
+#endif
+
+void __kmpc_atomic_start(void);
+void __kmpc_atomic_end(void);
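+
+// Fallback sketch (illustrative): for an atomic construct that no specialized
+// entry point covers, the compiler may bracket the update with these routines,
+// serializing through the global __kmp_atomic_lock declared above:
+//
+//   __kmpc_atomic_start();
+//   /* arbitrary update of x */
+//   __kmpc_atomic_end();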
+
+#if OMP_40_ENABLED
+
+// OpenMP 4.0 capture with reversed operands, for non-commutative operations:
+//     v = x = expr binop x;
+//     { v = x; x = expr binop x; }
+//     { x = expr binop x; v = x; }
+
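+// Capture-reverse sketch (illustrative): capture combined with reversed
+// operands, with "flag" selecting the captured value as in the _cpt routines
+// above, e.g. for a double:
+//
+//   v = __kmpc_atomic_float8_div_cpt_rev( &loc, gtid, &x, e, 1 );   // x = e / x; v = x
+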
+char            __kmpc_atomic_fixed1_sub_cpt_rev(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag );
+char            __kmpc_atomic_fixed1_div_cpt_rev(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag );
+unsigned char   __kmpc_atomic_fixed1u_div_cpt_rev( ident_t *id_ref, int gtid, unsigned char * lhs, unsigned char rhs, int flag );
+char            __kmpc_atomic_fixed1_shl_cpt_rev(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag );
+char            __kmpc_atomic_fixed1_shr_cpt_rev(  ident_t *id_ref, int gtid, char * lhs, char rhs, int flag );
+unsigned char   __kmpc_atomic_fixed1u_shr_cpt_rev( ident_t *id_ref, int gtid, unsigned char * lhs, unsigned char rhs, int flag );
+short           __kmpc_atomic_fixed2_sub_cpt_rev(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag );
+short           __kmpc_atomic_fixed2_div_cpt_rev(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag );
+unsigned short  __kmpc_atomic_fixed2u_div_cpt_rev( ident_t *id_ref, int gtid, unsigned short * lhs, unsigned short rhs, int flag );
+short           __kmpc_atomic_fixed2_shl_cpt_rev(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag );
+short           __kmpc_atomic_fixed2_shr_cpt_rev(  ident_t *id_ref, int gtid, short * lhs, short rhs, int flag );
+unsigned short  __kmpc_atomic_fixed2u_shr_cpt_rev( ident_t *id_ref, int gtid, unsigned short * lhs, unsigned short rhs, int flag );
+kmp_int32       __kmpc_atomic_fixed4_sub_cpt_rev(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32  rhs, int flag );
+kmp_int32       __kmpc_atomic_fixed4_div_cpt_rev(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32  rhs, int flag );
+kmp_uint32      __kmpc_atomic_fixed4u_div_cpt_rev( ident_t *id_ref, int gtid, kmp_uint32 * lhs, kmp_uint32 rhs, int flag );
+kmp_int32       __kmpc_atomic_fixed4_shl_cpt_rev(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32  rhs, int flag );
+kmp_int32       __kmpc_atomic_fixed4_shr_cpt_rev(  ident_t *id_ref, int gtid, kmp_int32  * lhs, kmp_int32  rhs, int flag );
+kmp_uint32      __kmpc_atomic_fixed4u_shr_cpt_rev( ident_t *id_ref, int gtid, kmp_uint32 * lhs, kmp_uint32 rhs, int flag );
+kmp_int64       __kmpc_atomic_fixed8_sub_cpt_rev(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64  rhs, int flag );
+kmp_int64       __kmpc_atomic_fixed8_div_cpt_rev(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64  rhs, int flag );
+kmp_uint64      __kmpc_atomic_fixed8u_div_cpt_rev( ident_t *id_ref, int gtid, kmp_uint64 * lhs, kmp_uint64 rhs, int flag );
+kmp_int64       __kmpc_atomic_fixed8_shl_cpt_rev(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64  rhs, int flag );
+kmp_int64       __kmpc_atomic_fixed8_shr_cpt_rev(  ident_t *id_ref, int gtid, kmp_int64  * lhs, kmp_int64  rhs, int flag );
+kmp_uint64      __kmpc_atomic_fixed8u_shr_cpt_rev( ident_t *id_ref, int gtid, kmp_uint64 * lhs, kmp_uint64 rhs, int flag );
+float           __kmpc_atomic_float4_sub_cpt_rev(  ident_t *id_ref, int gtid, float * lhs, float rhs, int flag );
+float           __kmpc_atomic_float4_div_cpt_rev(  ident_t *id_ref, int gtid, float * lhs, float rhs, int flag );
+double          __kmpc_atomic_float8_sub_cpt_rev(  ident_t *id_ref, int gtid, double * lhs, double rhs, int flag );
+double          __kmpc_atomic_float8_div_cpt_rev(  ident_t *id_ref, int gtid, double * lhs, double rhs, int flag );
+long double     __kmpc_atomic_float10_sub_cpt_rev( ident_t *id_ref, int gtid, long double * lhs, long double rhs, int flag );
+long double     __kmpc_atomic_float10_div_cpt_rev( ident_t *id_ref, int gtid, long double * lhs, long double rhs, int flag );
+#if KMP_HAVE_QUAD
+QUAD_LEGACY     __kmpc_atomic_float16_sub_cpt_rev( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs, int flag );
+QUAD_LEGACY     __kmpc_atomic_float16_div_cpt_rev( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs, int flag );
+#endif
+// Workaround for cmplx4 routines - return void; captured value is returned via the argument
+void            __kmpc_atomic_cmplx4_sub_cpt_rev(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs, kmp_cmplx32 * out, int flag );
+void            __kmpc_atomic_cmplx4_div_cpt_rev(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs, kmp_cmplx32 * out, int flag );
+kmp_cmplx64     __kmpc_atomic_cmplx8_sub_cpt_rev(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs, int flag );
+kmp_cmplx64     __kmpc_atomic_cmplx8_div_cpt_rev(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs, int flag );
+kmp_cmplx80     __kmpc_atomic_cmplx10_sub_cpt_rev( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs, int flag );
+kmp_cmplx80     __kmpc_atomic_cmplx10_div_cpt_rev( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs, int flag );
+#if KMP_HAVE_QUAD
+CPLX128_LEG     __kmpc_atomic_cmplx16_sub_cpt_rev( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs, int flag );
+CPLX128_LEG     __kmpc_atomic_cmplx16_div_cpt_rev( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs, int flag );
+#if ( KMP_ARCH_X86 )
+    Quad_a16_t         __kmpc_atomic_float16_sub_a16_cpt_rev( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs, int flag );
+    Quad_a16_t         __kmpc_atomic_float16_div_a16_cpt_rev( ident_t * id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs, int flag );
+    kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_sub_a16_cpt_rev( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs, int flag );
+    kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_div_a16_cpt_rev( ident_t * id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs, int flag );
+#endif
+#endif
+
+//   OpenMP 4.0 Capture-write (swap): {v = x; x = expr;}
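+// Swap sketch (illustrative): the routine stores the new value and returns the
+// old one, e.g. {v = x; x = e;} on a 4-byte integer becomes
+//
+//   v = __kmpc_atomic_fixed4_swp( &loc, gtid, &x, e );
+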
+char            __kmpc_atomic_fixed1_swp(  ident_t *id_ref, int gtid, char        * lhs, char        rhs );
+short           __kmpc_atomic_fixed2_swp(  ident_t *id_ref, int gtid, short       * lhs, short       rhs );
+kmp_int32       __kmpc_atomic_fixed4_swp(  ident_t *id_ref, int gtid, kmp_int32   * lhs, kmp_int32   rhs );
+kmp_int64       __kmpc_atomic_fixed8_swp(  ident_t *id_ref, int gtid, kmp_int64   * lhs, kmp_int64   rhs );
+float           __kmpc_atomic_float4_swp(  ident_t *id_ref, int gtid, float       * lhs, float       rhs );
+double          __kmpc_atomic_float8_swp(  ident_t *id_ref, int gtid, double      * lhs, double      rhs );
+long double     __kmpc_atomic_float10_swp( ident_t *id_ref, int gtid, long double * lhs, long double rhs );
+#if KMP_HAVE_QUAD
+QUAD_LEGACY     __kmpc_atomic_float16_swp( ident_t *id_ref, int gtid, QUAD_LEGACY * lhs, QUAD_LEGACY rhs );
+#endif
+// cmplx4 swap uses the same workaround as the other cmplx4 routines: it returns
+// void rather than kmp_cmplx32, and the captured value comes back through the
+// extra "out" parameter.
+void            __kmpc_atomic_cmplx4_swp(  ident_t *id_ref, int gtid, kmp_cmplx32 * lhs, kmp_cmplx32 rhs, kmp_cmplx32 * out );
+
+kmp_cmplx64     __kmpc_atomic_cmplx8_swp(  ident_t *id_ref, int gtid, kmp_cmplx64 * lhs, kmp_cmplx64 rhs );
+kmp_cmplx80     __kmpc_atomic_cmplx10_swp( ident_t *id_ref, int gtid, kmp_cmplx80 * lhs, kmp_cmplx80 rhs );
+#if KMP_HAVE_QUAD
+CPLX128_LEG     __kmpc_atomic_cmplx16_swp( ident_t *id_ref, int gtid, CPLX128_LEG * lhs, CPLX128_LEG rhs );
+#if ( KMP_ARCH_X86 )
+    Quad_a16_t         __kmpc_atomic_float16_a16_swp( ident_t *id_ref, int gtid, Quad_a16_t * lhs, Quad_a16_t rhs );
+    kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_a16_swp( ident_t *id_ref, int gtid, kmp_cmplx128_a16_t * lhs, kmp_cmplx128_a16_t rhs );
+#endif
+#endif
+#endif
+#endif
+
+// End of OpenMP 4.0 capture
+
+#endif //OMP_40_ENABLED
+
+#endif //KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#ifdef __cplusplus
+    } // extern "C"
+#endif
+
+#endif /* KMP_ATOMIC_H */
+
+// end of file
diff --git a/final/runtime/src/kmp_barrier.cpp b/final/runtime/src/kmp_barrier.cpp
new file mode 100644
index 0000000..e6c4e8a
--- /dev/null
+++ b/final/runtime/src/kmp_barrier.cpp
@@ -0,0 +1,1739 @@
+/*
+ * kmp_barrier.cpp
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_wait_release.h"
+#include "kmp_stats.h"
+#include "kmp_itt.h"
+
+#if KMP_MIC
+#include <immintrin.h>
+#define USE_NGO_STORES 1
+#endif // KMP_MIC
+
+#if KMP_MIC && USE_NGO_STORES
+// ICV copying
+#define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
+#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
+#define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
+#define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
+#else
+#define ngo_load(src)            ((void)0)
+#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
+#define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
+#define ngo_sync()               ((void)0)
+#endif /* KMP_MIC && USE_NGO_STORES */
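+
+// On KMP_MIC builds the intended sequence is: ngo_load() materializes the
+// 512-bit register Vt from the source line, the two store macros stream it out
+// with non-globally-ordered stores, and ngo_sync() issues a locked instruction
+// to fence them before any ordinary store a waiting thread might consume. On
+// other targets the macros degrade to plain copies and no-ops.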
+
+void __kmp_print_structure(void); // Forward declaration
+
+// ---------------------------- Barrier Algorithms ----------------------------
+
+// Linear Barrier
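+// Sketch of the algorithm (the code below is authoritative): in the gather
+// phase each worker bumps its own b_arrived flag and may return; the master
+// spins on every worker's flag in turn, folding reduce_data if a reduction is
+// requested, then publishes the team's arrived state. The release phase is the
+// mirror image: the master bumps each worker's b_go flag while each worker
+// spins only on its own.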
+static void
+__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+                            void (*reduce)(void *, void *)
+                            USE_ITT_BUILD_ARG(void * itt_sync_obj) )
+{
+    KMP_TIME_BLOCK(KMP_linear_gather);
+    register kmp_team_t *team = this_thr->th.th_team;
+    register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
+    register kmp_info_t **other_threads = team->t.t_threads;
+
+    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    // Barrier imbalance - save arrive time to the thread
+    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
+        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
+    }
+#endif
+    // We now perform a linear reduction to signal that all of the threads have arrived.
+    if (!KMP_MASTER_TID(tid)) {
+        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
+                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
+                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
+                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
+        // Mark arrival to master thread
+        /* After performing this write, a worker thread may not assume that the team is valid
+           any more - it could be deallocated by the master thread at any time. */
+        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
+        flag.release();
+    } else {
+        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
+        register int nproc = this_thr->th.th_team_nproc;
+        register int i;
+        // No sleep bit or atomic op is needed here: only the master sets the team state
+        register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
+
+        // Collect all the worker team member threads.
+        for (i=1; i<nproc; ++i) {
+#if KMP_CACHE_MANAGE
+            // Prefetch next thread's arrived count
+            if (i+1 < nproc)
+                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
+#endif /* KMP_CACHE_MANAGE */
+            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
+                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
+                            __kmp_gtid_from_tid(i, team), team->t.t_id, i,
+                            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
+
+            // Wait for worker thread to arrive
+            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
+            flag.wait(this_thr, FALSE
+                      USE_ITT_BUILD_ARG(itt_sync_obj) );
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
+            if (__kmp_forkjoin_frames_mode == 2) {
+                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
+                                                          other_threads[i]->th.th_bar_min_time);
+            }
+#endif
+            if (reduce) {
+                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
+                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
+                (*reduce)(this_thr->th.th_local.reduce_data,
+                          other_threads[i]->th.th_local.reduce_data);
+            }
+        }
+        // No sleep bit or atomic op is needed here: only the master sets the team state
+        team_bar->b_arrived = new_state;
+        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
+                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
+    }
+    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+}
+
+static void
+__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+                             int propagate_icvs
+                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
+{
+    KMP_TIME_BLOCK(KMP_linear_release);
+    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+    register kmp_team_t *team;
+
+    if (KMP_MASTER_TID(tid)) {
+        register unsigned int i;
+        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
+        register kmp_info_t **other_threads;
+
+        team = __kmp_threads[gtid]->th.th_team;
+        KMP_DEBUG_ASSERT(team != NULL);
+        other_threads = team->t.t_threads;
+
+        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
+                      gtid, team->t.t_id, tid, bt));
+
+        if (nproc > 1) {
+#if KMP_BARRIER_ICV_PUSH
+            KMP_START_EXPLICIT_TIMER(USER_icv_copy);
+            if (propagate_icvs) {
+                ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
+                for (i=1; i<nproc; ++i) {
+                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
+                    ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
+                                   &team->t.t_implicit_task_taskdata[0].td_icvs);
+                }
+                ngo_sync();
+            }
+            KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
+#endif // KMP_BARRIER_ICV_PUSH
+
+            // Now, release all of the worker threads
+            for (i=1; i<nproc; ++i) {
+#if KMP_CACHE_MANAGE
+                // Prefetch next thread's go flag
+                if (i+1 < nproc)
+                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
+#endif /* KMP_CACHE_MANAGE */
+                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
+                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
+                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
+                              &other_threads[i]->th.th_bar[bt].bb.b_go,
+                              other_threads[i]->th.th_bar[bt].bb.b_go,
+                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
+                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
+                flag.release();
+            }
+        }
+    } else { // Wait for the MASTER thread to release us
+        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
+                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
+        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
+        flag.wait(this_thr, TRUE
+                  USE_ITT_BUILD_ARG(itt_sync_obj) );
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
+            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
+            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
+            // Cancel wait on previous parallel region...
+            __kmp_itt_task_starting(itt_sync_obj);
+
+            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+                return;
+
+            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+            if (itt_sync_obj != NULL)
+                // Call prepare as early as possible for "new" barrier
+                __kmp_itt_task_finished(itt_sync_obj);
+        } else
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+        // Early exit for reaping threads releasing forkjoin barrier
+        if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
+            return;
+        // The worker thread may now assume that the team is valid.
+#ifdef KMP_DEBUG
+        tid = __kmp_tid_from_gtid(gtid);
+        team = __kmp_threads[gtid]->th.th_team;
+#endif
+        KMP_DEBUG_ASSERT(team != NULL);
+        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
+        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
+                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
+        KMP_MB();  // Flush all pending memory write invalidates.
+    }
+    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+}
+
+// Tree barrier
+static void
+__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+                          void (*reduce)(void *, void *)
+                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
+{
+    KMP_TIME_BLOCK(KMP_tree_gather);
+    register kmp_team_t *team = this_thr->th.th_team;
+    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+    register kmp_info_t **other_threads = team->t.t_threads;
+    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
+    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
+    register kmp_uint32 branch_factor = 1 << branch_bits;
+    register kmp_uint32 child;
+    register kmp_uint32 child_tid;
+    register kmp_uint new_state;
+
+    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    // Barrier imbalance - save arrive time to the thread
+    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
+        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
+    }
+#endif
+    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
+    child_tid = (tid << branch_bits) + 1;
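+    /* Illustrative example: with branch_bits=2 (branch_factor=4), tid 0 waits on
+       tids 1-4, tid 1 on tids 5-8, and so on; the inverse mapping used below,
+       parent_tid = (tid-1) >> branch_bits, sends each child's arrival to the
+       matching parent. */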
+    if (child_tid < nproc) {
+        // Parent threads wait for all their children to arrive
+        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
+        child = 1;
+        do {
+            register kmp_info_t *child_thr = other_threads[child_tid];
+            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+#if KMP_CACHE_MANAGE
+            // Prefetch next thread's arrived count
+            if (child+1 <= branch_factor && child_tid+1 < nproc)
+                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
+#endif /* KMP_CACHE_MANAGE */
+            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
+                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
+                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
+                            &child_bar->b_arrived, new_state));
+            // Wait for child to arrive
+            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
+            flag.wait(this_thr, FALSE
+                      USE_ITT_BUILD_ARG(itt_sync_obj) );
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+            // Barrier imbalance - write min of the thread time and a child time to the thread.
+            if (__kmp_forkjoin_frames_mode == 2) {
+                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
+                                                          child_thr->th.th_bar_min_time);
+            }
+#endif
+            if (reduce) {
+                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
+                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                               team->t.t_id, child_tid));
+                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
+            }
+            child++;
+            child_tid++;
+        }
+        while (child <= branch_factor && child_tid < nproc);
+    }
+
+    if (!KMP_MASTER_TID(tid)) { // Worker threads
+        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
+
+        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
+                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
+                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
+                      &thr_bar->b_arrived, thr_bar->b_arrived,
+                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
+
+        // Mark arrival to parent thread
+        /* After performing this write, a worker thread may not assume that the team is valid
+           any more - it could be deallocated by the master thread at any time.  */
+        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
+        flag.release();
+    } else {
+        // Need to update the team arrived pointer if we are the master thread
+        if (nproc > 1) // New value was already computed above
+            team->t.t_bar[bt].b_arrived = new_state;
+        else
+            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
+        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
+                      gtid, team->t.t_id, tid, team->t.t_id,
+                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
+    }
+    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+}
+
+static void
+__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+                           int propagate_icvs
+                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
+{
+    KMP_TIME_BLOCK(KMP_tree_release);
+    register kmp_team_t *team;
+    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+    register kmp_uint32 nproc;
+    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
+    register kmp_uint32 branch_factor = 1 << branch_bits;
+    register kmp_uint32 child;
+    register kmp_uint32 child_tid;
+
+    // Perform a tree release for all of the threads that have been gathered
+    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
+        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
+                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
+        // Wait for parent thread to release us
+        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
+        flag.wait(this_thr, TRUE
+                  USE_ITT_BUILD_ARG(itt_sync_obj) );
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
+            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
+            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
+            // Cancel wait on previous parallel region...
+            __kmp_itt_task_starting(itt_sync_obj);
+
+            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+                return;
+
+            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+            if (itt_sync_obj != NULL)
+                // Call prepare as early as possible for "new" barrier
+                __kmp_itt_task_finished(itt_sync_obj);
+        } else
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+        // Early exit for reaping threads releasing forkjoin barrier
+        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+            return;
+
+        // The worker thread may now assume that the team is valid.
+        team = __kmp_threads[gtid]->th.th_team;
+        KMP_DEBUG_ASSERT(team != NULL);
+        tid = __kmp_tid_from_gtid(gtid);
+
+        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
+        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
+                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
+        KMP_MB();  // Flush all pending memory write invalidates.
+    } else {
+        team = __kmp_threads[gtid]->th.th_team;
+        KMP_DEBUG_ASSERT(team != NULL);
+        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
+                      gtid, team->t.t_id, tid, bt));
+    }
+    nproc = this_thr->th.th_team_nproc;
+    child_tid = (tid << branch_bits) + 1;
+
+    if (child_tid < nproc) {
+        register kmp_info_t **other_threads = team->t.t_threads;
+        child = 1;
+        // Parent threads release all their children
+        do {
+            register kmp_info_t *child_thr = other_threads[child_tid];
+            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+#if KMP_CACHE_MANAGE
+            // Prefetch next thread's go count
+            if (child+1 <= branch_factor && child_tid+1 < nproc)
+                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
+#endif /* KMP_CACHE_MANAGE */
+
+#if KMP_BARRIER_ICV_PUSH
+            KMP_START_EXPLICIT_TIMER(USER_icv_copy);
+            if (propagate_icvs) {
+                __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
+                                         team, child_tid, FALSE);
+                copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
+                          &team->t.t_implicit_task_taskdata[0].td_icvs);
+            }
+            KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
+#endif // KMP_BARRIER_ICV_PUSH
+            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
+                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
+                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
+                          child_tid, &child_bar->b_go, child_bar->b_go,
+                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+            // Release child from barrier
+            kmp_flag_64 flag(&child_bar->b_go, child_thr);
+            flag.release();
+            child++;
+            child_tid++;
+        }
+        while (child <= branch_factor && child_tid < nproc);
+    }
+    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+}
+
+
+// Hyper Barrier
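+/* In the hypercube-embedded gather below, at each level a thread either signals
+   its parent (if its tid has a nonzero digit at that level in base branch_factor)
+   or waits for up to branch_factor-1 children spaced (1 << level) tids apart.
+   With branch_factor 2 this is the classic hypercube pattern: odd tids signal at
+   level 0, tids that are 2 mod 4 signal at level 1, and so on. */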
+static void
+__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+                           void (*reduce)(void *, void *)
+                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
+{
+    KMP_TIME_BLOCK(KMP_hyper_gather);
+    register kmp_team_t *team = this_thr->th.th_team;
+    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+    register kmp_info_t **other_threads = team->t.t_threads;
+    register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
+    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
+    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
+    register kmp_uint32 branch_factor = 1 << branch_bits;
+    register kmp_uint32 offset;
+    register kmp_uint32 level;
+
+    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+
+    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    // Barrier imbalance - save arrive time to the thread
+    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
+        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
+    }
+#endif
+    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
+       arrived, and reduce any required data as we go.  */
+    kmp_flag_64 p_flag(&thr_bar->b_arrived);
+    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
+    {
+        register kmp_uint32 child;
+        register kmp_uint32 child_tid;
+
+        if (((tid >> level) & (branch_factor - 1)) != 0) {
+            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
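+            /* Example: with branch_factor 2, tid 5 (binary 101) has a nonzero bit
+               at level 0, so it reports to parent 5 & ~1 = 4; tid 6 (110) passes
+               level 0 and reports to parent 6 & ~3 = 4 at level 1. */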
+
+            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
+                          "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
+                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
+                          &thr_bar->b_arrived, thr_bar->b_arrived,
+                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
+            // Mark arrival to parent thread
+            /* After performing this write (in the last iteration of the enclosing for loop),
+               a worker thread may not assume that the team is valid any more - it could be
+               deallocated by the master thread at any time.  */
+            p_flag.set_waiter(other_threads[parent_tid]);
+            p_flag.release();
+            break;
+        }
+
+        // Parent threads wait for children to arrive
+        if (new_state == KMP_BARRIER_UNUSED_STATE)
+            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
+        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
+             child++, child_tid+=(1 << level))
+        {
+            register kmp_info_t *child_thr = other_threads[child_tid];
+            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+#if KMP_CACHE_MANAGE
+            register kmp_uint32 next_child_tid = child_tid + (1 << level);
+            // Prefetch next thread's arrived count
+            if (child+1 < branch_factor && next_child_tid < num_threads)
+                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
+#endif /* KMP_CACHE_MANAGE */
+            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
+                          "arrived(%p) == %u\n", gtid, team->t.t_id, tid,
+                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
+                          &child_bar->b_arrived, new_state));
+            // Wait for child to arrive
+            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
+            c_flag.wait(this_thr, FALSE
+                        USE_ITT_BUILD_ARG(itt_sync_obj) );
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+            // Barrier imbalance - write min of the thread time and a child time to the thread.
+            if (__kmp_forkjoin_frames_mode == 2) {
+                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
+                                                          child_thr->th.th_bar_min_time);
+            }
+#endif
+            if (reduce) {
+                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
+                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                               team->t.t_id, child_tid));
+                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
+            }
+        }
+    }
+
+    if (KMP_MASTER_TID(tid)) {
+        // Need to update the team arrived pointer if we are the master thread
+        if (new_state == KMP_BARRIER_UNUSED_STATE)
+            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
+        else
+            team->t.t_bar[bt].b_arrived = new_state;
+        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
+                      gtid, team->t.t_id, tid, team->t.t_id,
+                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
+    }
+    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+}
+
+// The reverse versions seem to beat the forward versions overall
+#define KMP_REVERSE_HYPER_BAR
+static void
+__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+                            int propagate_icvs
+                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
+{
+    KMP_TIME_BLOCK(KMP_hyper_release);
+    register kmp_team_t *team;
+    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+    register kmp_info_t **other_threads;
+    register kmp_uint32 num_threads;
+    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
+    register kmp_uint32 branch_factor = 1 << branch_bits;
+    register kmp_uint32 child;
+    register kmp_uint32 child_tid;
+    register kmp_uint32 offset;
+    register kmp_uint32 level;
+
+    /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
+       If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
+       order of the corresponding gather, otherwise threads are released in the same order. */
+    if (KMP_MASTER_TID(tid)) { // master
+        team = __kmp_threads[gtid]->th.th_team;
+        KMP_DEBUG_ASSERT(team != NULL);
+        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
+                      gtid, team->t.t_id, tid, bt));
+#if KMP_BARRIER_ICV_PUSH
+        if (propagate_icvs) { // master already has ICVs in final destination; copy
+            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
+        }
+#endif
+    }
+    else  { // Handle fork barrier workers who aren't part of a team yet
+        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
+                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
+        // Wait for parent thread to release us
+        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
+        flag.wait(this_thr, TRUE
+                  USE_ITT_BUILD_ARG(itt_sync_obj) );
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
+            // In fork barrier where we could not get the object reliably
+            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
+            // Cancel wait on previous parallel region...
+            __kmp_itt_task_starting(itt_sync_obj);
+
+            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+                return;
+
+            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+            if (itt_sync_obj != NULL)
+                // Call prepare as early as possible for "new" barrier
+                __kmp_itt_task_finished(itt_sync_obj);
+        } else
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+        // Early exit for reaping threads releasing forkjoin barrier
+        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+            return;
+
+        // The worker thread may now assume that the team is valid.
+        team = __kmp_threads[gtid]->th.th_team;
+        KMP_DEBUG_ASSERT(team != NULL);
+        tid = __kmp_tid_from_gtid(gtid);
+
+        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
+        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
+                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
+        KMP_MB();  // Flush all pending memory write invalidates.
+    }
+    num_threads = this_thr->th.th_team_nproc;
+    other_threads = team->t.t_threads;
+
+#ifdef KMP_REVERSE_HYPER_BAR
+    // Count up to correct level for parent
+    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
+         level+=branch_bits, offset<<=branch_bits);
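+    /* In reverse order, the most distant children are released first,
+       presumably giving them a head start; the forward variant below releases
+       the nearest children first. */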
+
+    // Now go down from there
+    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
+         level-=branch_bits, offset>>=branch_bits)
+#else
+    // Go down the tree, level by level
+    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
+#endif // KMP_REVERSE_HYPER_BAR
+    {
+#ifdef KMP_REVERSE_HYPER_BAR
+        /* Now go in reverse order through the children, highest to lowest.
+           Initial setting of child is conservative here. */
+        child = num_threads >> ((level==0)?level:level-1);
+        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
+             child>=1; child--, child_tid-=(1<<level))
+#else
+        if (((tid >> level) & (branch_factor - 1)) != 0)
+            // No need to go lower than this, since this is the level at which the parent would be notified
+            break;
+        // Iterate through children on this level of the tree
+        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
+             child++, child_tid+=(1<<level))
+#endif // KMP_REVERSE_HYPER_BAR
+        {
+            if (child_tid >= num_threads) continue;  // Child doesn't exist so keep going
+            else {
+                register kmp_info_t *child_thr = other_threads[child_tid];
+                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+#if KMP_CACHE_MANAGE
+                register kmp_uint32 next_child_tid = child_tid - (1 << level);
+                // Prefetch next thread's go count
+# ifdef KMP_REVERSE_HYPER_BAR
+                if (child-1 >= 1 && next_child_tid < num_threads)
+# else
+                if (child+1 < branch_factor && next_child_tid < num_threads)
+# endif // KMP_REVERSE_HYPER_BAR
+                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
+#endif /* KMP_CACHE_MANAGE */
+
+#if KMP_BARRIER_ICV_PUSH
+                if (propagate_icvs) // push my fixed ICVs to my child
+                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
+#endif // KMP_BARRIER_ICV_PUSH
+
+                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
+                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
+                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
+                              child_tid, &child_bar->b_go, child_bar->b_go,
+                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+                // Release child from barrier
+                kmp_flag_64 flag(&child_bar->b_go, child_thr);
+                flag.release();
+            }
+        }
+    }
+#if KMP_BARRIER_ICV_PUSH
+    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
+        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
+        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
+    }
+#endif
+    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+}
+
+// Hierarchical Barrier
+
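+/* The hierarchical barrier arranges threads according to the machine hierarchy
+   (see __kmp_get_hierarchy).  When blocktime is infinite and the barrier is not
+   nested, leaf children and their parent share a single 64-bit flag word (the
+   "oncore" barrier): each leaf owns one byte of the parent's flag, so the
+   parent can detect all leaf arrivals with one 64-bit comparison. */
+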
+// Initialize thread barrier data
+/* Initializes/re-initializes the hierarchical barrier data stored on a thread.  Performs the
+   minimum amount of initialization required based on how the team has changed.  Returns true if
+   leaf children will require both on-core and traditional wake-up mechanisms.  For example, if the
+   team size increases, threads already in the team will respond to on-core wakeup on their parent
+   thread, but threads newly added to the team will only be listening on their local b_go. */
+static bool
+__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
+                                       int gtid, int tid, kmp_team_t *team)
+{
+    // Checks to determine if (re-)initialization is needed
+    bool uninitialized = thr_bar->team == NULL;
+    bool team_changed = team != thr_bar->team;
+    bool team_sz_changed = nproc != thr_bar->nproc;
+    bool tid_changed = tid != thr_bar->old_tid;
+    bool retval = false;
+
+    if (uninitialized || team_sz_changed) {
+        __kmp_get_hierarchy(nproc, thr_bar);
+    }
+
+    if (uninitialized || team_sz_changed || tid_changed) {
+        thr_bar->my_level = thr_bar->depth-1; // default for master
+        thr_bar->parent_tid = -1; // default for master
+        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
+            kmp_uint32 d=0;
+            while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
+                kmp_uint32 rem;
+                if (d == thr_bar->depth-2) { // reached level right below the master
+                    thr_bar->parent_tid = 0;
+                    thr_bar->my_level = d;
+                    break;
+                }
+                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
+                    // thread is not a subtree root at the next level, so this is its maximum level
+                    thr_bar->parent_tid = tid - rem;
+                    thr_bar->my_level = d;
+                    break;
+                }
+                ++d;
+            }
+        }
+        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
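+        /* offset selects the byte of the parent's 64-bit flag word owned by this
+           thread: the child at 0-based rank r below its parent uses byte 7-r. */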
+        thr_bar->old_tid = tid;
+        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
+    }
+    if (uninitialized || team_changed || tid_changed) {
+        thr_bar->team = team;
+        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
+        retval = true;
+    }
+    if (uninitialized || team_sz_changed || tid_changed) {
+        thr_bar->nproc = nproc;
+        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
+        if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
+        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
+            thr_bar->leaf_kids = nproc - tid - 1;
+        thr_bar->leaf_state = 0;
+        for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
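+        /* leaf_state now has one nonzero byte per leaf child, matching the
+           children's offsets above, so a parent can wait for all of its leaves
+           with a single 64-bit compare against b_arrived | leaf_state. */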
+    }
+    return retval;
+}
+
+static void
+__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
+                                  int gtid, int tid, void (*reduce) (void *, void *)
+                                  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
+{
+    KMP_TIME_BLOCK(KMP_hier_gather);
+    register kmp_team_t *team = this_thr->th.th_team;
+    register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
+    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
+    register kmp_info_t **other_threads = team->t.t_threads;
+    register kmp_uint64 new_state;
+
+    int level = team->t.t_level;
+    if (other_threads[0]->th.th_teams_microtask)    // are we inside the teams construct?
+        if (this_thr->th.th_teams_size.nteams > 1)
+            ++level; // level was not increased in teams construct for team_of_masters
+    if (level == 1) thr_bar->use_oncore_barrier = 1;
+    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
+
+    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    // Barrier imbalance - save arrive time to the thread
+    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
+        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
+    }
+#endif
+
+    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
+
+    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
+        register kmp_int32 child_tid;
+        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
+        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
+            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
+                kmp_uint64 leaf_state = KMP_MASTER_TID(tid)
+                    ? thr_bar->b_arrived | thr_bar->leaf_state
+                    : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
+                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
+                flag.wait(this_thr, FALSE
+                          USE_ITT_BUILD_ARG(itt_sync_obj) );
+                if (reduce) {
+                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
+                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
+                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                                       team->t.t_id, child_tid));
+                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
+                    }
+                }
+                (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
+            }
+            // Next, wait for higher level children on each child's b_arrived flag
+            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
+                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
+                if (last > nproc) last = nproc;
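+                /* Illustrative example (hypothetical hierarchy): if skip_per_level
+                   were {1, 4, 16}, then at d=1 tid 0 would wait on the subtree
+                   roots at tids 4, 8 and 12, i.e. every skip-th tid below last. */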
+                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
+                    register kmp_info_t *child_thr = other_threads[child_tid];
+                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
+                                  "arrived(%p) == %u\n",
+                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
+                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
+                    flag.wait(this_thr, FALSE
+                              USE_ITT_BUILD_ARG(itt_sync_obj) );
+                    if (reduce) {
+                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
+                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                                       team->t.t_id, child_tid));
+                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
+                    }
+                }
+            }
+        }
+        else { // Blocktime is not infinite
+            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
+                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
+                if (last > nproc) last = nproc;
+                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
+                    register kmp_info_t *child_thr = other_threads[child_tid];
+                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
+                                  "arrived(%p) == %u\n",
+                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
+                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
+                    flag.wait(this_thr, FALSE
+                              USE_ITT_BUILD_ARG(itt_sync_obj) );
+                    if (reduce) {
+                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
+                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                                       team->t.t_id, child_tid));
+                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
+                    }
+                }
+            }
+        }
+    }
+    // All subordinates are gathered; now release parent if not master thread
+
+    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
+        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
+                      "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid,
+                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
+                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
+        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
+           the team is valid any more - it could be deallocated by the master thread at any time. */
+        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
+            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
+            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
+            flag.release();
+        }
+        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
+            thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
+            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
+            flag.set_waiter(other_threads[thr_bar->parent_tid]);
+            flag.release();
+        }
+    } else { // Master thread needs to update the team's b_arrived value
+        team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state;
+        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
+                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
+    }
+    // Is the team access below unsafe or just technically invalid?
+    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+}
+
+static void
+__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+                                   int propagate_icvs
+                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
+{
+    KMP_TIME_BLOCK(KMP_hier_release);
+    register kmp_team_t *team;
+    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+    register kmp_uint32 nproc;
+    bool team_change = false; // indicates on-core barrier shouldn't be used
+
+    if (KMP_MASTER_TID(tid)) {
+        team = __kmp_threads[gtid]->th.th_team;
+        KMP_DEBUG_ASSERT(team != NULL);
+        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
+                      gtid, team->t.t_id, tid, bt));
+    }
+    else { // Worker threads
+        // Wait for parent thread to release me
+        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
+            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
+            // Use traditional method of waiting on my own b_go flag
+            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
+            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
+            flag.wait(this_thr, TRUE
+                      USE_ITT_BUILD_ARG(itt_sync_obj) );
+            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
+        }
+        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
+            // Wait on my "offset" bits on parent's b_go flag
+            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
+            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
+                                 bt, this_thr
+                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
+            flag.wait(this_thr, TRUE
+                      USE_ITT_BUILD_ARG(itt_sync_obj) );
+            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
+                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
+            }
+            else { // Reset my bits on parent's b_go flag
+                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
+            }
+        }
+        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
+        // Early exit for reaping threads releasing forkjoin barrier
+        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+            return;
+        // The worker thread may now assume that the team is valid.
+        team = __kmp_threads[gtid]->th.th_team;
+        KMP_DEBUG_ASSERT(team != NULL);
+        tid = __kmp_tid_from_gtid(gtid);
+
+        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
+                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
+        KMP_MB();  // Flush all pending memory write invalidates.
+    }
+
+    int level = team->t.t_level;
+    if (team->t.t_threads[0]->th.th_teams_microtask) {    // are we inside the teams construct?
+        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
+            ++level; // level was not increased in teams construct for team_of_workers
+        if (this_thr->th.th_teams_size.nteams > 1)
+            ++level; // level was not increased in teams construct for team_of_masters
+    }
+    if (level == 1) thr_bar->use_oncore_barrier = 1;
+    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
+    nproc = this_thr->th.th_team_nproc;
+
+    // If the team size has increased, we still communicate with old leaves via oncore barrier.
+    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
+    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
+    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
+    // But if the entire team changes, we won't use oncore barrier at all
+    if (team_change) old_leaf_kids = 0;
+
+#if KMP_BARRIER_ICV_PUSH
+    if (propagate_icvs) {
+        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
+            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
+        }
+        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
+            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
+                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
+                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
+                          &thr_bar->parent_bar->th_fixed_icvs);
+            // non-leaves will get ICVs piggybacked with b_go via NGO store
+        }
+        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
+            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
+                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
+            else // leaves copy parent's fixed ICVs directly to local ICV store
+                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
+                          &thr_bar->parent_bar->th_fixed_icvs);
+        }
+    }
+#endif // KMP_BARRIER_ICV_PUSH
+
+    // Now, release my children
+    if (thr_bar->my_level) { // not a leaf
+        register kmp_int32 child_tid;
+        kmp_uint32 last;
+        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
+            if (KMP_MASTER_TID(tid)) { // do a flat release
+                // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
+                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
+                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
+                ngo_load(&thr_bar->th_fixed_icvs);
+                // This loops over all the threads, skipping only the leaf nodes in the hierarchy
+                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
+                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
+                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
+                                  " go(%p): %u => %u\n",
+                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
+                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
+                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
+                }
+                ngo_sync();
+            }
+            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
+            // Now, release leaf children
+            if (thr_bar->leaf_kids) { // if there are any
+                // We test team_change on the off-chance that the level 1 team changed.
+                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
+                    if (old_leaf_kids) { // release old leaf kids
+                        thr_bar->b_go |= old_leaf_state;
+                    }
+                    // Release new leaf kids
+                    last = tid+thr_bar->skip_per_level[1];
+                    if (last > nproc) last = nproc;
+                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
+                        register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
+                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
+                                      " T#%d(%d:%d) go(%p): %u => %u\n",
+                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
+                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+                        // Release child using child's b_go flag
+                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
+                        flag.release();
+                    }
+                }
+                else { // Release all children at once with leaf_state bits on my own b_go flag
+                    thr_bar->b_go |= thr_bar->leaf_state;
+                }
+            }
+        }
+        else { // Blocktime is not infinite; do a simple hierarchical release
+            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
+                last = tid+thr_bar->skip_per_level[d+1];
+                kmp_uint32 skip = thr_bar->skip_per_level[d];
+                if (last > nproc) last = nproc;
+                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
+                    register kmp_info_t   *child_thr = team->t.t_threads[child_tid];
+                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
+                                  " go(%p): %u => %u\n",
+                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
+                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+                    // Release child using child's b_go flag
+                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
+                    flag.release();
+                }
+            }
+        }
+#if KMP_BARRIER_ICV_PUSH
+        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
+            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
+#endif // KMP_BARRIER_ICV_PUSH
+    }
+    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+}
+
+// ---------------------------- End of Barrier Algorithms ----------------------------
+
+// Internal function to do a barrier.
+/* If is_split is true, do a split barrier; otherwise, do a plain barrier.
+   If reduce is non-NULL, do a split reduction barrier; otherwise, do a barrier with no reduction.
+   Returns 0 if master thread, 1 if worker thread.  */
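+/* Typical invocation (for illustration): a plain, non-split barrier with no
+   reduction is requested as
+   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL). */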
+int
+__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
+              void *reduce_data, void (*reduce)(void *, void *))
+{
+    KMP_TIME_BLOCK(KMP_barrier);
+    register int tid = __kmp_tid_from_gtid(gtid);
+    register kmp_info_t *this_thr = __kmp_threads[gtid];
+    register kmp_team_t *team = this_thr->th.th_team;
+    register int status = 0;
+    ident_t *loc = __kmp_threads[gtid]->th.th_ident;
+#if OMPT_SUPPORT
+    ompt_task_id_t my_task_id;
+    ompt_parallel_id_t my_parallel_id;
+#endif
+
+    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
+                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+#if OMPT_BLAME
+        if (ompt_status == ompt_status_track_callback) {
+            my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
+            my_parallel_id = team->t.ompt_team_info.parallel_id;
+
+#if OMPT_TRACE
+            if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
+                if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
+                        my_parallel_id, my_task_id);
+                }
+            }
+#endif
+            if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
+                    my_parallel_id, my_task_id);
+            }
+        } 
+#endif
+        // It is OK to report the barrier state after the barrier begin callback.
+        // According to the OMPT specification, a compliant implementation may
+        // even delay reporting this state until the barrier begins to wait.
+        this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+    }
+#endif
+
+    if (! team->t.t_serialized) {
+#if USE_ITT_BUILD
+        // This value will be used in itt notify events below.
+        void *itt_sync_obj = NULL;
+# if USE_ITT_NOTIFY
+        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+            itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
+# endif
+#endif /* USE_ITT_BUILD */
+        if (__kmp_tasking_mode == tskm_extra_barrier) {
+            __kmp_tasking_barrier(team, this_thr, gtid);
+            KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
+                          gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
+        }
+
+        /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
+           the team struct is not guaranteed to exist. */
+        // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
+        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
+            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
+        }
+
+#if USE_ITT_BUILD
+        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+            __kmp_itt_barrier_starting(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+#if USE_DEBUGGER
+        // Let the debugger know: the thread has arrived at the barrier and is waiting.
+        if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
+            team->t.t_bar[bt].b_master_arrived += 1;
+        } else {
+            this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
+        } // if
+#endif /* USE_DEBUGGER */
+        if (reduce != NULL) {
+            //KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
+            this_thr->th.th_local.reduce_data = reduce_data;
+        }
+        switch (__kmp_barrier_gather_pattern[bt]) {
+        case bp_hyper_bar: {
+            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
+            __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
+                                       USE_ITT_BUILD_ARG(itt_sync_obj) );
+            break;
+        }
+        case bp_hierarchical_bar: {
+            __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
+                                              USE_ITT_BUILD_ARG(itt_sync_obj));
+            break;
+        }
+        case bp_tree_bar: {
+            KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
+            __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
+                                      USE_ITT_BUILD_ARG(itt_sync_obj) );
+            break;
+        }
+        default: {
+            __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
+                                        USE_ITT_BUILD_ARG(itt_sync_obj) );
+        }
+        }
+
+        KMP_MB();
+
+        if (KMP_MASTER_TID(tid)) {
+            status = 0;
+            if (__kmp_tasking_mode != tskm_immediate_exec) {
+                __kmp_task_team_wait(this_thr, team
+                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
+                __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
+            }
+#if USE_DEBUGGER
+            // Let the debugger know: all threads have arrived and are starting to leave the barrier.
+            team->t.t_bar[bt].b_team_arrived += 1;
+#endif
+
+#if USE_ITT_BUILD
+            /* TODO: In case of split reduction barrier, master thread may send acquired event early,
+               before the final summation into the shared variable is done (final summation can be a
+               long operation for array reductions).  */
+            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+            // Barrier - report frame end (only if active_level == 1)
+            if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
+#if OMP_40_ENABLED
+                this_thr->th.th_teams_microtask == NULL &&
+#endif
+                team->t.t_active_level == 1)
+            {
+                kmp_uint64 cur_time = __itt_get_timestamp();
+                kmp_info_t **other_threads = team->t.t_threads;
+                int nproc = this_thr->th.th_team_nproc;
+                int i;
+                switch(__kmp_forkjoin_frames_mode) {
+                case 1:
+                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
+                    this_thr->th.th_frame_time = cur_time;
+                    break;
+                case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
+                    __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
+                    break;
+                case 3:
+                    if (__itt_metadata_add_ptr) {
+                        // Initialize with master's wait time
+                        kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
+                        for (i=1; i<nproc; ++i) {
+                            delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
+                        }
+                        __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
+                    }
+                    __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
+                    this_thr->th.th_frame_time = cur_time;
+                    break;
+                }
+            }
+#endif /* USE_ITT_BUILD */
+        } else {
+            status = 1;
+#if USE_ITT_BUILD
+            if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+                __kmp_itt_barrier_middle(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+        }
+        if (status == 1 || ! is_split) {
+            switch (__kmp_barrier_release_pattern[bt]) {
+            case bp_hyper_bar: {
+                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
+                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
+                                            USE_ITT_BUILD_ARG(itt_sync_obj) );
+                break;
+            }
+            case bp_hierarchical_bar: {
+                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
+                                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
+                break;
+            }
+            case bp_tree_bar: {
+                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
+                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
+                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
+                break;
+            }
+            default: {
+                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
+                                             USE_ITT_BUILD_ARG(itt_sync_obj) );
+            }
+            }
+            if (__kmp_tasking_mode != tskm_immediate_exec) {
+                __kmp_task_team_sync(this_thr, team);
+            }
+        }
+
+#if USE_ITT_BUILD
+        /* GEH: TODO: Move this under if-condition above and also include in
+           __kmp_end_split_barrier(). This will more accurately represent the actual release time
+           of the threads for split barriers.  */
+        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+            __kmp_itt_barrier_finished(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+    } else { // Team is serialized.
+        status = 0;
+        if (__kmp_tasking_mode != tskm_immediate_exec) {
+#if OMP_41_ENABLED
+            if ( this_thr->th.th_task_team != NULL ) {
+                void *itt_sync_obj = NULL;
+#if USE_ITT_NOTIFY
+                if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
+                    itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
+                    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
+                }
+#endif
+
+                kmp_task_team_t * task_team;
+                task_team = this_thr->th.th_task_team;
+                KMP_DEBUG_ASSERT(task_team->tt.tt_found_proxy_tasks == TRUE);
+                __kmp_task_team_wait(this_thr, team
+                                               USE_ITT_BUILD_ARG(itt_sync_obj));
+                __kmp_task_team_setup(this_thr, team, 0, 0);
+
+#if USE_ITT_BUILD
+                if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+            }
+#else
+            // The task team should be NULL for serialized code (tasks will be executed immediately)
+            KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
+            KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
+#endif
+        }
+    }
+    KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
+                  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+#if OMPT_BLAME
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
+                my_parallel_id, my_task_id);
+        }
+#endif
+        this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+    }
+#endif
+
+    return status;
+}
+
+
+void
+__kmp_end_split_barrier(enum barrier_type bt, int gtid)
+{
+    KMP_TIME_BLOCK(KMP_end_split_barrier);
+    int tid = __kmp_tid_from_gtid(gtid);
+    kmp_info_t *this_thr = __kmp_threads[gtid];
+    kmp_team_t *team = this_thr->th.th_team;
+
+    if (!team->t.t_serialized) {
+        if (KMP_MASTER_GTID(gtid)) {
+            switch (__kmp_barrier_release_pattern[bt]) {
+            case bp_hyper_bar: {
+                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
+                __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
+                                            USE_ITT_BUILD_ARG(NULL) );
+                break;
+            }
+            case bp_hierarchical_bar: {
+                __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
+                                                   USE_ITT_BUILD_ARG(NULL));
+                break;
+            }
+            case bp_tree_bar: {
+                KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
+                __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
+                                           USE_ITT_BUILD_ARG(NULL) );
+                break;
+            }
+            default: {
+                __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
+                                             USE_ITT_BUILD_ARG(NULL) );
+            }
+            }
+            if (__kmp_tasking_mode != tskm_immediate_exec) {
+                __kmp_task_team_sync(this_thr, team);
+            } // if
+        }
+    }
+}
+
+
+void
+__kmp_join_barrier(int gtid)
+{
+    KMP_TIME_BLOCK(KMP_join_barrier);
+    register kmp_info_t *this_thr = __kmp_threads[gtid];
+    register kmp_team_t *team;
+    register kmp_uint nproc;
+    kmp_info_t *master_thread;
+    int tid;
+#ifdef KMP_DEBUG
+    int team_id;
+#endif /* KMP_DEBUG */
+#if USE_ITT_BUILD
+    void *itt_sync_obj = NULL;
+# if USE_ITT_NOTIFY
+    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
+        // Get object created at fork_barrier
+        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+# endif
+#endif /* USE_ITT_BUILD */
+    KMP_MB();
+
+    // Get current info
+    team = this_thr->th.th_team;
+    nproc = this_thr->th.th_team_nproc;
+    KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
+    tid = __kmp_tid_from_gtid(gtid);
+#ifdef KMP_DEBUG
+    team_id = team->t.t_id;
+#endif /* KMP_DEBUG */
+    master_thread = this_thr->th.th_team_master;
+#ifdef KMP_DEBUG
+    if (master_thread != team->t.t_threads[0]) {
+        __kmp_print_structure();
+    }
+#endif /* KMP_DEBUG */
+    KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
+    KMP_MB();
+
+    // Verify state
+    KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
+    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
+    KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
+    KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
+    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
+
+#if OMPT_SUPPORT 
+#if OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
+        ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+#endif
+    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+#endif
+
+    if (__kmp_tasking_mode == tskm_extra_barrier) {
+        __kmp_tasking_barrier(team, this_thr, gtid);
+        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, team_id, tid));
+    }
+# ifdef KMP_DEBUG
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+        KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
+                       __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
+                       this_thr->th.th_task_team));
+        KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
+    }
+# endif /* KMP_DEBUG */
+
+    /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
+       team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
+       down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
+       since the values are not used by __kmp_wait_template() in that case. */
+    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+        this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
+        this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
+    }
+
+#if USE_ITT_BUILD
+    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+        __kmp_itt_barrier_starting(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+
+    switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
+    case bp_hyper_bar: {
+        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
+        __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
+                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
+        break;
+    }
+    case bp_hierarchical_bar: {
+        __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
+                                          USE_ITT_BUILD_ARG(itt_sync_obj) );
+        break;
+    }
+    case bp_tree_bar: {
+        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
+        __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
+                                  USE_ITT_BUILD_ARG(itt_sync_obj) );
+        break;
+    }
+    default: {
+        __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
+                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
+    }
+    }
+
+    /* From this point on, the team data structure may be deallocated at any time by the
+       master thread - it is unsafe to reference it in any of the worker threads. Any per-team
+       data items that need to be referenced before the end of the barrier should be moved to
+       the kmp_task_team_t structs.  */
+    if (KMP_MASTER_TID(tid)) {
+        if (__kmp_tasking_mode != tskm_immediate_exec) {
+            // Master shouldn't call decrease_load() and should have
+            // th_may_decrease_load == 0.  TODO: enable master threads.
+            __kmp_task_team_wait(this_thr, team
+                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
+        }
+#if USE_ITT_BUILD
+        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+
+# if USE_ITT_BUILD && USE_ITT_NOTIFY
+        // Join barrier - report frame end
+        if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
+#if OMP_40_ENABLED
+            this_thr->th.th_teams_microtask == NULL &&
+#endif
+            team->t.t_active_level == 1)
+        {
+            kmp_uint64 cur_time = __itt_get_timestamp();
+            ident_t * loc = team->t.t_ident;
+            kmp_info_t **other_threads = team->t.t_threads;
+            int nproc = this_thr->th.th_team_nproc;
+            int i;
+            switch(__kmp_forkjoin_frames_mode) {
+            case 1:
+                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
+                break;
+            case 2:
+                __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
+                break;
+            case 3:
+                if( __itt_metadata_add_ptr ) {
+                    // Initialize with master's wait time
+                    kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
+                    for (i=1; i<nproc; ++i) {
+                        delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
+                    }
+                    __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
+                }
+                __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
+                this_thr->th.th_frame_time = cur_time;
+                break;
+            }
+        }
+# endif /* USE_ITT_BUILD */
+    }
+#if USE_ITT_BUILD
+    else {
+        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+            __kmp_itt_barrier_middle(gtid, itt_sync_obj);
+    }
+#endif /* USE_ITT_BUILD */
+
+#if KMP_DEBUG
+    if (KMP_MASTER_TID(tid)) {
+        KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
+                      gtid, team_id, tid, nproc));
+    }
+#endif /* KMP_DEBUG */
+
+    // TODO now, mark worker threads as done so they may be disbanded
+    KMP_MB(); // Flush all pending memory write invalidates.
+    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
+            ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
+                team->t.ompt_team_info.parallel_id,
+                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+       }
+#endif
+
+        // return to default state
+        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+#endif
+}
+
+
+// TODO release worker threads' fork barriers as we are ready instead of all at once
+void
+__kmp_fork_barrier(int gtid, int tid)
+{
+    KMP_TIME_BLOCK(KMP_fork_barrier);
+    kmp_info_t *this_thr = __kmp_threads[gtid];
+    kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
+#if USE_ITT_BUILD
+    void * itt_sync_obj = NULL;
+#endif /* USE_ITT_BUILD */
+
+    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
+                  gtid, (team != NULL) ? team->t.t_id : -1, tid));
+
+    // th_team pointer only valid for master thread here
+    if (KMP_MASTER_TID(tid)) {
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
+            // Create itt barrier object
+            itt_sync_obj  = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
+            __kmp_itt_barrier_middle(gtid, itt_sync_obj);  // Call acquired/releasing
+        }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+
+#ifdef KMP_DEBUG
+        register kmp_info_t **other_threads = team->t.t_threads;
+        register int i;
+
+        // Verify state
+        KMP_MB();
+
+        for(i=1; i<team->t.t_nproc; ++i) {
+            KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
+                           gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
+                           team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
+                           other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
+            KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
+                              & ~(KMP_BARRIER_SLEEP_STATE))
+                             == KMP_INIT_BARRIER_STATE);
+            KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
+        }
+#endif
+
+        if (__kmp_tasking_mode != tskm_immediate_exec) {
+            __kmp_task_team_setup(this_thr, team, 1, 0);  // 1,0 indicates setup both task teams if nthreads > 1
+        }
+
+        /* The master thread may have changed its blocktime between the join barrier and the
+           fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
+           access it when the team struct is not guaranteed to exist. */
+        // See note about the corresponding code in __kmp_join_barrier() being performance-critical
+        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+            this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
+            this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
+        }
+    } // master
+
+    switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
+    case bp_hyper_bar: {
+        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
+        __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
+                                    USE_ITT_BUILD_ARG(itt_sync_obj) );
+        break;
+    }
+    case bp_hierarchical_bar: {
+        __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
+                                           USE_ITT_BUILD_ARG(itt_sync_obj) );
+        break;
+    }
+    case bp_tree_bar: {
+        KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
+        __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
+                                   USE_ITT_BUILD_ARG(itt_sync_obj) );
+        break;
+    }
+    default: {
+        __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
+                                     USE_ITT_BUILD_ARG(itt_sync_obj) );
+    }
+    }
+
+    // Early exit for reaping threads releasing forkjoin barrier
+    if (TCR_4(__kmp_global.g.g_done)) {
+        if (this_thr->th.th_task_team != NULL) {
+            if (KMP_MASTER_TID(tid)) {
+                TCW_PTR(this_thr->th.th_task_team, NULL);
+            }
+            else {
+                __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
+            }
+        }
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
+            if (!KMP_MASTER_TID(tid)) {
+                itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+                if (itt_sync_obj)
+                    __kmp_itt_barrier_finished(gtid, itt_sync_obj);
+            }
+        }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+        KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
+        return;
+    }
+
+    /* We can now assume that a valid team structure has been allocated by the master and
+       propagated to all worker threads. The current thread, however, may not be part of the
+       team, so we can't blindly assume that the team pointer is non-null.  */
+    team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
+    KMP_DEBUG_ASSERT(team != NULL);
+    tid = __kmp_tid_from_gtid(gtid);
+
+
+#if KMP_BARRIER_ICV_PULL
+    /* Master thread's copy of the ICVs was set up on the implicit taskdata in
+       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
+       this data before this function is called. We cannot modify __kmp_fork_call() to look at
+       the fixed ICVs in the master's thread struct, because it is not always the case that the
+       threads arrays have been allocated when __kmp_fork_call() is executed. */
+    KMP_START_EXPLICIT_TIMER(USER_icv_copy);
+    if (!KMP_MASTER_TID(tid)) {  // master thread already has ICVs
+        // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
+        KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
+        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
+        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
+                  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
+    }
+    KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
+#endif // KMP_BARRIER_ICV_PULL
+
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+        __kmp_task_team_sync(this_thr, team);
+    }
+
+#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
+    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
+    if (proc_bind == proc_bind_intel) {
+#endif
+#if KMP_AFFINITY_SUPPORTED
+        // Call dynamic affinity settings
+        if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
+            __kmp_balanced_affinity(tid, team->t.t_nproc);
+        }
+#endif // KMP_AFFINITY_SUPPORTED
+#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
+    }
+    else if (proc_bind != proc_bind_false) {
+        if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
+            KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
+                           __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
+        }
+        else {
+            __kmp_affinity_set_place(gtid);
+        }
+    }
+#endif
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
+        if (!KMP_MASTER_TID(tid)) {
+            // Get correct barrier object
+            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+            __kmp_itt_barrier_finished(gtid, itt_sync_obj);  // Workers call acquired
+        } // (prepare called inside barrier_release)
+    }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
+}
+
+
+void
+__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
+{
+    KMP_TIME_BLOCK(KMP_setup_icv_copy);
+
+    KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
+    KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
+
+    /* Master thread's copy of the ICVs was set up on the implicit taskdata in
+       __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
+       this data before this function is called. */
+#if KMP_BARRIER_ICV_PULL
+    /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where
+       all of the worker threads can access them and make their own copies after the barrier. */
+    KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
+    copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
+    KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
+                  0, team->t.t_threads[0], team));
+#elif KMP_BARRIER_ICV_PUSH
+    // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
+    KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
+                  0, team->t.t_threads[0], team));
+#else
+    // Copy the ICVs to each of the non-master threads.  This takes O(nthreads) time.
+    ngo_load(new_icvs);
+    KMP_DEBUG_ASSERT(team->t.t_threads[0]);  // The threads arrays should be allocated at this point
+    for (int f=1; f<new_nproc; ++f) { // Skip the master thread
+        // TODO: GEH - pass in better source location info since usually NULL here
+        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
+                      f, team->t.t_threads[f], team));
+        __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
+        ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
+        KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
+                      f, team->t.t_threads[f], team));
+    }
+    ngo_sync();
+#endif // KMP_BARRIER_ICV_PULL
+}
diff --git a/final/runtime/src/kmp_cancel.cpp b/final/runtime/src/kmp_cancel.cpp
new file mode 100644
index 0000000..e5a76d2
--- /dev/null
+++ b/final/runtime/src/kmp_cancel.cpp
@@ -0,0 +1,282 @@
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_str.h"
+
+#if OMP_40_ENABLED
+
+/*!
+@ingroup CANCELLATION
+@param loc_ref location of the original task directive
+@param gtid Global thread ID of encountering thread
+@param cncl_kind Cancellation kind (parallel, for, sections, taskgroup)
+
+@return returns true if the cancellation request has been activated and the execution thread
+needs to proceed to the end of the canceled region.
+
+Request cancellation of the binding OpenMP region.
+*/
+kmp_int32 __kmpc_cancel(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind) {
+    kmp_info_t *this_thr = __kmp_threads [ gtid ];
+    
+    KC_TRACE( 10, ("__kmpc_cancel: T#%d request %d OMP_CANCELLATION=%d\n", gtid, cncl_kind, __kmp_omp_cancellation) );
+
+    KMP_DEBUG_ASSERT(cncl_kind != cancel_noreq);
+    KMP_DEBUG_ASSERT(cncl_kind == cancel_parallel || cncl_kind == cancel_loop || 
+                     cncl_kind == cancel_sections || cncl_kind == cancel_taskgroup); 
+    KMP_DEBUG_ASSERT(__kmp_get_gtid() == gtid);
+
+    if (__kmp_omp_cancellation) {
+        switch (cncl_kind) {
+        case cancel_parallel:
+        case cancel_loop:
+        case cancel_sections:
+            // cancellation requests for parallel and worksharing constructs
+            // are handled through the team structure
+            {
+                kmp_team_t *this_team = this_thr->th.th_team;
+                KMP_DEBUG_ASSERT(this_team);
+                kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(&(this_team->t.t_cancel_request), cancel_noreq, cncl_kind);
+                if (old == cancel_noreq || old == cncl_kind) {
+                    //printf("__kmpc_cancel: this_team->t.t_cancel_request=%d @ %p\n", 
+                    //       this_team->t.t_cancel_request, &(this_team->t.t_cancel_request));
+                    // we do not have a cancellation request in this team or we do have one
+                    // that matches the current request -> cancel
+                    return 1 /* true */;
+                }
+                break;
+            }
+        case cancel_taskgroup:
+            // cancellation requests for the taskgroup construct
+            // are handled through the taskgroup structure
+            {
+                kmp_taskdata_t*  task; 
+                kmp_taskgroup_t* taskgroup;
+                
+                task = this_thr->th.th_current_task;
+                KMP_DEBUG_ASSERT( task );
+                
+                taskgroup = task->td_taskgroup;
+                if (taskgroup) {
+                    kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(&(taskgroup->cancel_request), cancel_noreq, cncl_kind);
+                    if (old == cancel_noreq || old == cncl_kind) {
+                        // we do not have a cancellation request in this taskgroup or we do have one
+                        // that matches the current request -> cancel
+                        return 1 /* true */;
+                    }
+                }
+                else {
+                    // TODO: what needs to happen here?
+                    // The specification disallows cancellation without an enclosing
+                    // taskgroup, so any behavior here would be undefined; abort for now.
+                    KMP_ASSERT( 0 /* false */);
+                }
+            }
+            break;
+        default:
+            KMP_ASSERT (0 /* false */);
+        }
+    }
+
+    // ICV OMP_CANCELLATION=false, so we ignored this cancel request
+    KMP_DEBUG_ASSERT(!__kmp_omp_cancellation);
+    return 0 /* false */;
+}
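+
+/* Illustrative sketch only (not runtime code): for a directive such as
+   "#pragma omp cancel parallel", a compiler would typically emit a guarded
+   call like the one below; "loc", "gtid" and the label are made-up names
+   for this example.
+
+       if (__kmpc_cancel(&loc, gtid, cancel_parallel)) {
+           goto end_of_parallel_region;  // proceed to the end of the canceled region
+       }
+*/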
+
+/*!
+@ingroup CANCELLATION
+@param loc_ref location of the original task directive
+@param gtid Global thread ID of encountering thread
+@param cncl_kind Cancellation kind (parallel, for, sections, taskgroup)
+
+@return returns true if a matching cancellation request has been flagged in the RTL and the
+encountering thread has to cancel.
+
+Cancellation point for the encountering thread.
+*/
+kmp_int32 __kmpc_cancellationpoint(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind) {
+    kmp_info_t *this_thr = __kmp_threads [ gtid ];
+
+    KC_TRACE( 10, ("__kmpc_cancellationpoint: T#%d request %d OMP_CANCELLATION=%d\n", gtid, cncl_kind, __kmp_omp_cancellation) );
+
+    KMP_DEBUG_ASSERT(cncl_kind != cancel_noreq);
+    KMP_DEBUG_ASSERT(cncl_kind == cancel_parallel || cncl_kind == cancel_loop || 
+                     cncl_kind == cancel_sections || cncl_kind == cancel_taskgroup); 
+    KMP_DEBUG_ASSERT(__kmp_get_gtid() == gtid);
+
+    if (__kmp_omp_cancellation) {
+        switch (cncl_kind) {
+        case cancel_parallel:
+        case cancel_loop:
+        case cancel_sections:
+            // cancellation requests for parallel and worksharing constructs
+            // are handled through the team structure
+            {
+                kmp_team_t *this_team = this_thr->th.th_team;
+                KMP_DEBUG_ASSERT(this_team);
+                if (this_team->t.t_cancel_request) {
+                    if (cncl_kind == this_team->t.t_cancel_request) {
+                        // the request in the team structure matches the type of
+                        // cancellation point so we can cancel
+                        return 1 /* true */;
+                    }
+                    KMP_ASSERT( 0 /* false */);
+                }
+                else {
+                    // we do not have a cancellation request pending, so we just
+                    // ignore this cancellation point
+                    return 0;
+                }
+                break;
+            }
+        case cancel_taskgroup:
+            // cancellation requests for the taskgroup construct
+            // are handled through the taskgroup structure
+            {
+                kmp_taskdata_t*  task; 
+                kmp_taskgroup_t* taskgroup;
+                
+                task = this_thr->th.th_current_task;
+                KMP_DEBUG_ASSERT( task );
+                
+                taskgroup = task->td_taskgroup;
+                if (taskgroup) {
+                    // return the current status of cancellation for the 
+                    // taskgroup
+                    return !!taskgroup->cancel_request;
+                }
+                else {
+                    // if a cancellation point is encountered by a task
+                    // that does not belong to a taskgroup, it is OK
+                    // to ignore it
+                    return 0 /* false */;
+                }
+            }
+        default:
+            KMP_ASSERT (0 /* false */);
+        }
+    }
+
+    // ICV OMP_CANCELLATION=false, so we ignore the cancellation point
+    KMP_DEBUG_ASSERT(!__kmp_omp_cancellation);
+    return 0 /* false */;
+}
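+
+/* Illustrative sketch only (not runtime code): a compiler lowering
+   "#pragma omp cancellation point for" inside a worksharing loop might emit
+   something like the following; "loc", "gtid" and the label are made up.
+
+       if (__kmpc_cancellationpoint(&loc, gtid, cancel_loop)) {
+           goto end_of_loop;  // a cancellation request is pending: cancel
+       }
+*/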
+
+/*!
+@ingroup CANCELLATION
+@param loc_ref location of the original task directive
+@param gtid Global thread ID of encountering thread
+
+@return returns true if a matching cancellation request has been flagged in the RTL and the
+encountering thread has to cancel.
+
+Barrier with cancellation point to send threads from the barrier to the
+end of the parallel region.  Needs a special code pattern as documented 
+in the design document for the cancellation feature.
+*/
+kmp_int32
+__kmpc_cancel_barrier(ident_t *loc, kmp_int32 gtid) {
+    int ret = 0 /* false */;
+    kmp_info_t *this_thr = __kmp_threads [ gtid ];
+    kmp_team_t *this_team = this_thr->th.th_team;
+
+    KMP_DEBUG_ASSERT(__kmp_get_gtid() == gtid);
+
+    // call into the standard barrier
+    __kmpc_barrier(loc, gtid);
+
+    // if cancellation is active, check cancellation flag
+    if (__kmp_omp_cancellation) {
+        // depending on which construct to cancel, check the flag and
+        // reset the flag
+        switch (this_team->t.t_cancel_request) {
+        case cancel_parallel:
+            ret = 1;
+            // ensure that threads have checked the flag, when
+            // leaving the above barrier
+            __kmpc_barrier(loc, gtid);
+            this_team->t.t_cancel_request = cancel_noreq;
+            // the next barrier is the fork/join barrier, which
+            // synchronizes the threads leaving here        
+            break;
+        case cancel_loop:
+        case cancel_sections:
+            ret = 1;
+            // ensure that threads have checked the flag, when
+            // leaving the above barrier
+            __kmpc_barrier(loc, gtid);
+            this_team->t.t_cancel_request = cancel_noreq;
+            // synchronize the threads again to make sure we
+            // do not have any run-away threads that cause a race
+            // on the cancellation flag
+            __kmpc_barrier(loc, gtid);
+            break;
+        case cancel_taskgroup:
+            // this case should not occur
+            KMP_ASSERT (0 /* false */ );
+            break;
+        case cancel_noreq:
+            // do nothing
+            break;
+        default:
+            KMP_ASSERT ( 0 /* false */);
+        }
+    }
+    
+    return ret;
+}
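+
+/* Illustrative sketch only (not runtime code): at the barrier that closes a
+   cancellable worksharing construct, a compiler might emit the pattern below
+   rather than a plain __kmpc_barrier(); "loc", "gtid" and the label are made
+   up.
+
+       if (__kmpc_cancel_barrier(&loc, gtid)) {
+           goto end_of_construct;  // cancellation observed: skip to the end
+       }
+*/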
+
+/*!
+@ingroup CANCELLATION
+@param loc_ref location of the original task directive
+@param gtid Global thread ID of encountering thread
+
+@return returns true if a matching cancellation request has been flagged in the RTL and the
+encountering thread has to cancel.
+
+Query function to query the current status of cancellation requests.
+Can be used to implement the following pattern:
+ 
+if (kmp_get_cancellation_status(kmp_cancel_parallel)) {
+    perform_cleanup();
+    #pragma omp cancellation point parallel      
+}
+*/
+int __kmp_get_cancellation_status(int cancel_kind) {
+    if (__kmp_omp_cancellation) {
+        kmp_info_t *this_thr = __kmp_entry_thread();
+        
+        switch (cancel_kind) {
+        case cancel_parallel:
+        case cancel_loop:
+        case cancel_sections:
+            {
+                kmp_team_t *this_team = this_thr->th.th_team;
+                return this_team->t.t_cancel_request == cancel_kind;
+            }
+        case cancel_taskgroup:
+            {
+                kmp_taskdata_t*  task; 
+                kmp_taskgroup_t* taskgroup;
+                task = this_thr->th.th_current_task;
+                taskgroup = task->td_taskgroup;
+                return taskgroup && taskgroup->cancel_request;
+            }
+        }
+    }
+
+    return 0 /* false */;
+}
+
+#endif
diff --git a/final/runtime/src/kmp_csupport.c b/final/runtime/src/kmp_csupport.c
new file mode 100644
index 0000000..70fe3dd
--- /dev/null
+++ b/final/runtime/src/kmp_csupport.c
@@ -0,0 +1,2821 @@
+/*
+ * kmp_csupport.c -- kfront linkage support for OpenMP.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "omp.h"        /* extern "C" declarations of user-visible routines */
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_error.h"
+#include "kmp_stats.h"
+
+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#include "ompt-specific.h"
+#endif
+
+#define MAX_MESSAGE 512
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/*  flags will be used in future, e.g., to implement */
+/*  openmp_strict library restrictions               */
+
+/*!
+ * @ingroup STARTUP_SHUTDOWN
+ * @param loc   in   source location information
+ * @param flags in   for future use (currently ignored)
+ *
+ * Initialize the runtime library. This call is optional; if it is not made then
+ * it will be implicitly called by attempts to use other library functions.
+ *
+ */
+void
+__kmpc_begin(ident_t *loc, kmp_int32 flags)
+{
+    // By default __kmp_ignore_mppbeg() returns TRUE.
+    if (__kmp_ignore_mppbeg() == FALSE) {
+        __kmp_internal_begin();
+
+        KC_TRACE( 10, ("__kmpc_begin: called\n" ) );
+    }
+}
+
+/*!
+ * @ingroup STARTUP_SHUTDOWN
+ * @param loc source location information
+ *
+ * Shut down the runtime library. This call is also optional; even when made it
+ * does nothing unless the `KMP_IGNORE_MPPEND` environment variable is set to zero.
+ */
+void
+__kmpc_end(ident_t *loc)
+{
+    // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end() call no-op.
+    // However, this can be overridden with KMP_IGNORE_MPPEND environment variable.
+    // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end()
+    // will unregister this root (it can cause library shut down).
+    if (__kmp_ignore_mppend() == FALSE) {
+        KC_TRACE( 10, ("__kmpc_end: called\n" ) );
+        KA_TRACE( 30, ("__kmpc_end\n" ));
+
+        __kmp_internal_end_thread( -1 );
+    }
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return The global thread index of the active thread.
+
+This function can be called in any context.
+
+If the runtime has only been entered at the outermost level from a
+single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is that
+which would be returned by omp_get_thread_num() in the outermost
+active parallel construct. (Or zero if there is no active parallel
+construct, since the master thread is necessarily thread zero).
+
+If multiple non-OpenMP threads all enter an OpenMP construct then this
+will be a unique thread identifier among all the threads created by
+the OpenMP runtime (but the value cannot be defined in terms of
+OpenMP thread ids returned by omp_get_thread_num()).
+
+*/
+kmp_int32
+__kmpc_global_thread_num(ident_t *loc)
+{
+    kmp_int32 gtid = __kmp_entry_gtid();
+
+    KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) );
+
+    return gtid;
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return The number of threads under control of the OpenMP<sup>*</sup> runtime
+
+This function can be called in any context.
+It returns the total number of threads under the control of the OpenMP runtime. That is
+not a number that can be determined by any OpenMP standard calls, since the library may be
+called from more than one non-OpenMP thread, and this reflects the total over all such calls.
+Similarly, since the runtime maintains underlying threads even when they are not active
+(because the cost of creating and destroying OS threads is high), this call counts all such
+threads, even those that are not currently waiting for work.
+*/
+kmp_int32
+__kmpc_global_num_threads(ident_t *loc)
+{
+    KC_TRACE( 10, ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_nth ) );
+
+    return TCR_4(__kmp_nth);
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return The thread number of the calling thread in the innermost active parallel construct.
+
+*/
+kmp_int32
+__kmpc_bound_thread_num(ident_t *loc)
+{
+    KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) );
+    return __kmp_tid_from_gtid( __kmp_entry_gtid() );
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return The number of threads in the innermost active parallel construct.
+*/
+kmp_int32
+__kmpc_bound_num_threads(ident_t *loc)
+{
+    KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) );
+
+    return __kmp_entry_thread() -> th.th_team -> t.t_nproc;
+}
+
+/*!
+ * @ingroup DEPRECATED
+ * @param loc location description
+ *
+ * This function need not be called. It always returns TRUE.
+ */
+kmp_int32
+__kmpc_ok_to_fork(ident_t *loc)
+{
+#ifndef KMP_DEBUG
+
+    return TRUE;
+
+#else
+
+    const char *semi2;
+    const char *semi3;
+    int line_no;
+
+    if (__kmp_par_range == 0) {
+        return TRUE;
+    }
+    semi2 = loc->psource;
+    if (semi2 == NULL) {
+        return TRUE;
+    }
+    semi2 = strchr(semi2, ';');
+    if (semi2 == NULL) {
+        return TRUE;
+    }
+    semi2 = strchr(semi2 + 1, ';');
+    if (semi2 == NULL) {
+        return TRUE;
+    }
+    if (__kmp_par_range_filename[0]) {
+        const char *name = semi2 - 1;
+        while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
+            name--;
+        }
+        if ((*name == '/') || (*name == ';')) {
+            name++;
+        }
+        if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
+            return __kmp_par_range < 0;
+        }
+    }
+    semi3 = strchr(semi2 + 1, ';');
+    if (__kmp_par_range_routine[0]) {
+        if ((semi3 != NULL) && (semi3 > semi2)
+          && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
+            return __kmp_par_range < 0;
+        }
+    }
+    if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
+        if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
+            return __kmp_par_range > 0;
+        }
+        return __kmp_par_range < 0;
+    }
+    return TRUE;
+
+#endif /* KMP_DEBUG */
+
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return 1 if this thread is executing inside an active parallel region, zero if not.
+*/
+kmp_int32
+__kmpc_in_parallel( ident_t *loc )
+{
+    return __kmp_entry_thread() -> th.th_root -> r.r_active;
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param num_threads number of threads requested for this parallel construct
+
+Set the number of threads to be used by the next fork spawned by this thread.
+This call is only required if the parallel construct has a `num_threads` clause.
+*/
+void
+__kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads )
+{
+    KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
+      global_tid, num_threads ) );
+
+    __kmp_push_num_threads( loc, global_tid, num_threads );
+}
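+
+/* Illustrative sketch only (not runtime code): for
+   "#pragma omp parallel num_threads(4)", a compiler would typically emit a
+   push immediately before the fork; "loc" and "microtask" are made up.
+
+       __kmpc_push_num_threads(&loc, __kmpc_global_thread_num(&loc), 4);
+       __kmpc_fork_call(&loc, 0, (kmpc_micro)microtask);
+*/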
+
+void
+__kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid )
+{
+    KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) );
+
+    /* the num_threads are automatically popped */
+}
+
+
+#if OMP_40_ENABLED
+
+void
+__kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind )
+{
+    KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n",
+      global_tid, proc_bind ) );
+
+    __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind );
+}
+
+#endif /* OMP_40_ENABLED */
+
+
+/*!
+@ingroup PARALLEL
+@param loc  source location information
+@param argc  total number of arguments in the ellipsis
+@param microtask  pointer to callback routine consisting of outlined parallel construct
+@param ...  pointers to shared variables that aren't global
+
+Do the actual fork and call the microtask in the relevant number of threads.
+*/
+void
+__kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
+{
+  KMP_STOP_EXPLICIT_TIMER(OMP_serial);
+  KMP_COUNT_BLOCK(OMP_PARALLEL);
+  int         gtid = __kmp_entry_gtid();
+  // maybe to save thr_state is enough here
+  {
+    va_list     ap;
+    va_start(   ap, microtask );
+
+#if OMPT_SUPPORT
+    kmp_info_t *master_th = __kmp_threads[ gtid ];
+    kmp_team_t *parent_team = master_th->th.th_team;
+    int tid = __kmp_tid_from_gtid( gtid );
+    parent_team->t.t_implicit_task_taskdata[tid].
+        ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(0);
+#endif
+
+#if INCLUDE_SSC_MARKS
+    SSC_MARK_FORKING();
+#endif
+    __kmp_fork_call( loc, gtid, fork_context_intel,
+            argc,
+#if OMPT_SUPPORT
+            VOLATILE_CAST(void *) microtask,      // "unwrapped" task
+#endif
+            VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
+            VOLATILE_CAST(launch_t)    __kmp_invoke_task_func,
+/* TODO: revert workaround for Intel(R) 64 tracker #96 */
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+            &ap
+#else
+            ap
+#endif
+            );
+#if INCLUDE_SSC_MARKS
+    SSC_MARK_JOINING();
+#endif
+    __kmp_join_call( loc, gtid );
+
+    va_end( ap );
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        parent_team->t.t_implicit_task_taskdata[tid].
+            ompt_task_info.frame.reenter_runtime_frame = 0;
+    }
+#endif
+  }
+  KMP_START_EXPLICIT_TIMER(OMP_serial);
+}
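+
+/* Illustrative sketch only (not runtime code): a compiler lowering
+
+       #pragma omp parallel shared(x)
+       { body(&x); }
+
+   would typically outline the region into a microtask and pass the shared
+   variables through the ellipsis; all names below are made up.
+
+       static void microtask(kmp_int32 *gtid, kmp_int32 *tid, int *x) {
+           body(x);
+       }
+       ...
+       __kmpc_fork_call(&loc, 1, (kmpc_micro)microtask, &x);
+*/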
+
+#if OMP_40_ENABLED
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param num_teams number of teams requested for the teams construct
+@param num_threads number of threads per team requested for the teams construct
+
+Set the number of teams to be used by the teams construct.
+This call is only required if the teams construct has a `num_teams` clause
+or a `thread_limit` clause (or both).
+*/
+void
+__kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads )
+{
+    KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
+      global_tid, num_teams, num_threads ) );
+
+    __kmp_push_num_teams( loc, global_tid, num_teams, num_threads );
+}
+
+/*!
+@ingroup PARALLEL
+@param loc  source location information
+@param argc  total number of arguments in the ellipsis
+@param microtask  pointer to callback routine consisting of outlined teams construct
+@param ...  pointers to shared variables that aren't global
+
+Do the actual fork and call the microtask in the relevant number of threads.
+*/
+void
+__kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
+{
+    int         gtid = __kmp_entry_gtid();
+    kmp_info_t *this_thr = __kmp_threads[ gtid ];
+    va_list     ap;
+    va_start(   ap, microtask );
+
+    // remember teams entry point and nesting level
+    this_thr->th.th_teams_microtask = microtask;
+    this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
+
+    // check if __kmpc_push_num_teams called, set default number of teams otherwise
+    if ( this_thr->th.th_teams_size.nteams == 0 ) {
+        __kmp_push_num_teams( loc, gtid, 0, 0 );
+    }
+    KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
+    KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
+    KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
+
+    __kmp_fork_call( loc, gtid, fork_context_intel,
+            argc,
+#if OMPT_SUPPORT
+            VOLATILE_CAST(void *) microtask,               // "unwrapped" task
+#endif
+            VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
+            VOLATILE_CAST(launch_t)    __kmp_invoke_teams_master,
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+            &ap
+#else
+            ap
+#endif
+            );
+    __kmp_join_call( loc, gtid );
+    this_thr->th.th_teams_microtask = NULL;
+    this_thr->th.th_teams_level = 0;
+    *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L;
+    va_end( ap );
+}
+#endif /* OMP_40_ENABLED */
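+
+/* Illustrative sketch only (not runtime code): for
+   "#pragma omp teams num_teams(2) thread_limit(8)", a compiler would
+   typically emit the pair of calls below; "loc", "gtid" and
+   "teams_microtask" are made up.
+
+       __kmpc_push_num_teams(&loc, gtid, 2, 8);
+       __kmpc_fork_teams(&loc, 0, (kmpc_micro)teams_microtask);
+*/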
+
+
+//
+// I don't think this function should ever have been exported.
+// The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
+// openmp code ever called it, but it's been exported from the RTL for so
+// long that I'm afraid to remove the definition.
+//
+int
+__kmpc_invoke_task_func( int gtid )
+{
+    return __kmp_invoke_task_func( gtid );
+}
+
+/*!
+@ingroup PARALLEL
+@param loc  source location information
+@param global_tid  global thread number
+
+Enter a serialized parallel construct. This interface is used to handle a
+conditional parallel region, like this,
+@code
+#pragma omp parallel if (condition)
+@endcode
+when the condition is false.
+*/
+void
+__kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
+{
+    /* The implementation is now in kmp_runtime.c so that it can share static
+       functions with kmp_fork_call, since the tasks to be done are similar in
+       each case. */
+    __kmp_serialized_parallel(loc, global_tid);
+}
+
+/*!
+@ingroup PARALLEL
+@param loc  source location information
+@param global_tid  global thread number
+
+Leave a serialized parallel construct.
+*/
+void
+__kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
+{
+    kmp_internal_control_t *top;
+    kmp_info_t *this_thr;
+    kmp_team_t *serial_team;
+
+    KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) );
+
+    /* skip all this code for autopar serialized loops since it results in
+       unacceptable overhead */
+    if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
+        return;
+
+    // Not autopar code
+    if( ! TCR_4( __kmp_init_parallel ) )
+        __kmp_parallel_initialize();
+
+    this_thr    = __kmp_threads[ global_tid ];
+    serial_team = this_thr->th.th_serial_team;
+
+#if OMP_41_ENABLED
+    kmp_task_team_t *task_team = this_thr->th.th_task_team;
+
+    // we need to wait for the proxy tasks before finishing the thread
+    if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
+        __kmp_task_team_wait(this_thr, serial_team, NULL ); // is an ITT object needed here?
+#endif
+
+    KMP_MB();
+    KMP_DEBUG_ASSERT( serial_team );
+    KMP_ASSERT(       serial_team -> t.t_serialized );
+    KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team );
+    KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team );
+    KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
+    KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
+
+    /* If necessary, pop the internal control stack values and replace the team values */
+    top = serial_team -> t.t_control_stack_top;
+    if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) {
+        copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top );
+        serial_team -> t.t_control_stack_top = top -> next;
+        __kmp_free(top);
+    }
+
+    //if( serial_team -> t.t_serialized > 1 )
+    serial_team -> t.t_level--;
+
+    /* pop dispatch buffers stack */
+    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
+    {
+        dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer;
+        serial_team->t.t_dispatch->th_disp_buffer =
+            serial_team->t.t_dispatch->th_disp_buffer->next;
+        __kmp_free( disp_buffer );
+    }
+
+    -- serial_team -> t.t_serialized;
+    if ( serial_team -> t.t_serialized == 0 ) {
+
+        /* return to the parallel section */
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+        if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) {
+            __kmp_clear_x87_fpu_status_word();
+            __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
+            __kmp_load_mxcsr( &serial_team->t.t_mxcsr );
+        }
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+        this_thr -> th.th_team           = serial_team -> t.t_parent;
+        this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid;
+
+        /* restore values cached in the thread */
+        this_thr -> th.th_team_nproc     = serial_team -> t.t_parent -> t.t_nproc;          /*  JPH */
+        this_thr -> th.th_team_master    = serial_team -> t.t_parent -> t.t_threads[0];     /* JPH */
+        this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized;
+
+        /* TODO the below shouldn't need to be adjusted for serialized teams */
+        this_thr -> th.th_dispatch       = & this_thr -> th.th_team ->
+            t.t_dispatch[ serial_team -> t.t_master_tid ];
+
+        __kmp_pop_current_task_from_thread( this_thr );
+
+        KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 );
+        this_thr -> th.th_current_task -> td_flags.executing = 1;
+
+        if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+            // Copy the task team from the new child / old parent team to the thread.
+            this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
+            KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n",
+                            global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
+        }
+    } else {
+        if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+            KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n",
+                            global_tid, serial_team, serial_team -> t.t_serialized ) );
+        }
+    }
+
+#if USE_ITT_BUILD
+    kmp_uint64 cur_time = 0;
+#if  USE_ITT_NOTIFY
+    if ( __itt_get_timestamp_ptr ) {
+        cur_time = __itt_get_timestamp();
+    }
+#endif /* USE_ITT_NOTIFY */
+    if ( this_thr->th.th_team->t.t_level == 0
+#if OMP_40_ENABLED
+        && this_thr->th.th_teams_microtask == NULL
+#endif
+    ) {
+        // Report the barrier
+        this_thr->th.th_ident = loc;
+        if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
+            ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
+        {
+            __kmp_itt_frame_submit( global_tid, this_thr->th.th_frame_time_serialized,
+                                    cur_time, 0, loc, this_thr->th.th_team_nproc, 0 );
+            if ( __kmp_forkjoin_frames_mode == 3 )
+                // Since barrier frame for serialized region is equal to the region we use the same begin timestamp as for the barrier.
+                __kmp_itt_frame_submit( global_tid, serial_team->t.t_region_time,
+                                        cur_time, 0, loc, this_thr->th.th_team_nproc, 2 );
+        } else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
+            ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
+            // Mark the end of the "parallel" region for VTune. Only use one of frame notification scheme at the moment.
+            __kmp_itt_region_joined( global_tid, 1 );
+    }
+#endif /* USE_ITT_BUILD */
+
+    if ( __kmp_env_consistency_check )
+        __kmp_pop_parallel( global_tid, NULL );
+}
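+
+/* Illustrative sketch only (not runtime code): for
+   "#pragma omp parallel if (cond)", a compiler might emit the pattern below,
+   invoking the outlined body directly on the serialized path; all names are
+   made up and the arguments passed to the microtask are simplified.
+
+       if (cond) {
+           __kmpc_fork_call(&loc, 0, (kmpc_micro)microtask);
+       } else {
+           kmp_int32 gtid = __kmpc_global_thread_num(&loc);
+           kmp_int32 zero = 0;
+           __kmpc_serialized_parallel(&loc, gtid);
+           microtask(&gtid, &zero);  // run the region body on this thread
+           __kmpc_end_serialized_parallel(&loc, gtid);
+       }
+*/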
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc  source location information.
+
+Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
+depending on the memory ordering convention obeyed by the compiler
+even that may not be necessary).
+*/
+void
+__kmpc_flush(ident_t *loc)
+{
+    KC_TRACE( 10, ("__kmpc_flush: called\n" ) );
+
+    /* need explicit __mf() here since use volatile instead in library */
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
+        #if KMP_MIC
+            // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
+            // We shouldn't need it, though, since the ABI rules require that
+            // * If the compiler generates NGO stores it also generates the fence
+            // * If users hand-code NGO stores they should insert the fence
+            // therefore no incomplete unordered stores should be visible.
+        #else
+            // C74404
+            // This is to handle non-temporal store instructions (sfence needed).
+            // The clflush instruction is handled as well (mfence needed).
+            // The non-temporal load instruction movntdqa should probably also be handled.
+            // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
+            if ( ! __kmp_cpuinfo.initialized ) {
+                __kmp_query_cpuid( & __kmp_cpuinfo );
+            }; // if
+            if ( ! __kmp_cpuinfo.sse2 ) {
+                // CPU cannot execute SSE2 instructions.
+            } else {
+                #if KMP_COMPILER_ICC || KMP_COMPILER_MSVC
+                _mm_mfence();
+                #else
+                __sync_synchronize();
+                #endif // KMP_COMPILER_ICC
+            }; // if
+        #endif // KMP_MIC
+    #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+        // Nothing to see here; move along.
+    #elif KMP_ARCH_PPC64
+        // Nothing needed here (we have a real MB above).
+        #if KMP_OS_CNK
+            // The flushing thread needs to yield here; this prevents a
+            // busy-waiting thread from saturating the pipeline. flush is
+            // often used in loops like this:
+            //     while (!flag) {
+            //         #pragma omp flush(flag)
+            //     }
+            // and adding the yield here is good for at least a 10x speedup
+            // when running >2 threads per core (on the NAS LU benchmark).
+            __kmp_yield(TRUE);
+        #endif
+    #else
+        #error Unknown or unsupported architecture
+    #endif
+
+}
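+
+/* Illustrative sketch only (not runtime code): "#pragma omp flush" lowers to
+   a single call; "loc" is a made-up source-location descriptor.
+
+       __kmpc_flush(&loc);
+*/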
+
+/* -------------------------------------------------------------------------- */
+
+/* -------------------------------------------------------------------------- */
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid thread id.
+
+Execute a barrier.
+*/
+void
+__kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
+{
+    KMP_COUNT_BLOCK(OMP_BARRIER);
+    KMP_TIME_BLOCK(OMP_barrier);
+    KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
+
+    if (! TCR_4(__kmp_init_parallel))
+        __kmp_parallel_initialize();
+
+    if ( __kmp_env_consistency_check ) {
+        if ( loc == 0 ) {
+            KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
+        }; // if
+
+        __kmp_check_barrier( global_tid, ct_barrier, loc );
+    }
+
+    __kmp_threads[ global_tid ]->th.th_ident = loc;
+    // TODO: explicit barrier_wait_id:
+    //   this function is called when the 'barrier' directive is present or
+    //   at the implicit barrier at the end of a worksharing construct.
+    // 1) better to add a per-thread barrier counter to a thread data structure
+    // 2) set to 0 when a new team is created
+    // 3) no sync is required
+
+    __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
+}
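+
+/* Illustrative sketch only (not runtime code): "#pragma omp barrier" lowers
+   to a single call; "loc" is a made-up source-location descriptor.
+
+       __kmpc_barrier(&loc, __kmpc_global_thread_num(&loc));
+*/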
+
+/* The BARRIER for a MASTER section is always explicit   */
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param global_tid  global thread number.
+@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
+*/
+kmp_int32
+__kmpc_master(ident_t *loc, kmp_int32 global_tid)
+{
+    KMP_COUNT_BLOCK(OMP_MASTER);
+    int status = 0;
+
+    KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) );
+
+    if( ! TCR_4( __kmp_init_parallel ) )
+        __kmp_parallel_initialize();
+
+    if( KMP_MASTER_GTID( global_tid ))
+        status = 1;
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (status) {
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
+            kmp_info_t  *this_thr        = __kmp_threads[ global_tid ];
+            kmp_team_t  *team            = this_thr -> th.th_team;
+
+            int  tid = __kmp_tid_from_gtid( global_tid );
+            ompt_callbacks.ompt_callback(ompt_event_master_begin)(
+                team->t.ompt_team_info.parallel_id,
+                team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+        }
+    }
+#endif
+
+    if ( __kmp_env_consistency_check ) {
+#if KMP_USE_DYNAMIC_LOCK
+        if (status)
+            __kmp_push_sync( global_tid, ct_master, loc, NULL, 0 );
+        else
+            __kmp_check_sync( global_tid, ct_master, loc, NULL, 0 );
+#else
+        if (status)
+            __kmp_push_sync( global_tid, ct_master, loc, NULL );
+        else
+            __kmp_check_sync( global_tid, ct_master, loc, NULL );
+#endif
+    }
+
+    return status;
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param global_tid  global thread number.
+
+Mark the end of a <tt>master</tt> region. This should only be called by the thread
+that executes the <tt>master</tt> region.
+*/
+void
+__kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
+{
+    KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
+
+    KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t  *this_thr        = __kmp_threads[ global_tid ];
+    kmp_team_t  *team            = this_thr -> th.th_team;
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_master_end)) {
+        int  tid = __kmp_tid_from_gtid( global_tid );
+        ompt_callbacks.ompt_callback(ompt_event_master_end)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+#endif
+
+    if ( __kmp_env_consistency_check ) {
+        if( global_tid < 0 )
+            KMP_WARNING( ThreadIdentInvalid );
+
+        if( KMP_MASTER_GTID( global_tid ))
+            __kmp_pop_sync( global_tid, ct_master, loc );
+    }
+}
+
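+/*
+  Usage sketch (illustrative only, not part of the runtime): "#pragma omp
+  master" is typically lowered to a guarded call pair, so that only the thread
+  for which __kmpc_master() returns 1 executes the block and then calls
+  __kmpc_end_master(). The loc_m ident below is hypothetical.
+
+      if (__kmpc_master(&loc_m, gtid)) {
+          // ... master-only code ...
+          __kmpc_end_master(&loc_m, gtid);
+      }
+      // note: a master region has no implied barrier at its end
+*/
+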
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param gtid  global thread number.
+
+Start execution of an <tt>ordered</tt> construct.
+*/
+void
+__kmpc_ordered( ident_t * loc, kmp_int32 gtid )
+{
+    int cid = 0;
+    kmp_info_t *th;
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid ));
+
+    if (! TCR_4(__kmp_init_parallel))
+        __kmp_parallel_initialize();
+
+#if USE_ITT_BUILD
+    __kmp_itt_ordered_prep( gtid );
+    // TODO: ordered_wait_id
+#endif /* USE_ITT_BUILD */
+
+    th = __kmp_threads[ gtid ];
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        /* OMPT state update */
+        th->th.ompt_thread_info.wait_id = (uint64_t) loc;
+        th->th.ompt_thread_info.state = ompt_state_wait_ordered;
+
+        /* OMPT event callback */
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
+            ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
+                th->th.ompt_thread_info.wait_id);
+        }
+    }
+#endif
+
+    if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
+        (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
+    else
+        __kmp_parallel_deo( & gtid, & cid, loc );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        /* OMPT state update */
+        th->th.ompt_thread_info.state = ompt_state_work_parallel;
+        th->th.ompt_thread_info.wait_id = 0;
+
+        /* OMPT event callback */
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
+            ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
+                th->th.ompt_thread_info.wait_id);
+        }
+    }
+#endif
+
+#if USE_ITT_BUILD
+    __kmp_itt_ordered_start( gtid );
+#endif /* USE_ITT_BUILD */
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param gtid  global thread number.
+
+End execution of an <tt>ordered</tt> construct.
+*/
+void
+__kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
+{
+    int cid = 0;
+    kmp_info_t *th;
+
+    KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) );
+
+#if USE_ITT_BUILD
+    __kmp_itt_ordered_end( gtid );
+    // TODO: ordered_wait_id
+#endif /* USE_ITT_BUILD */
+
+    th = __kmp_threads[ gtid ];
+
+    if ( th -> th.th_dispatch -> th_dxo_fcn != 0 )
+        (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
+    else
+        __kmp_parallel_dxo( & gtid, & cid, loc );
+
+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
+            th->th.ompt_thread_info.wait_id);
+    }
+#endif
+}
+
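+/*
+  Usage sketch (illustrative only, not part of the runtime): inside a loop
+  with an ordered clause, each "#pragma omp ordered" block is bracketed by
+  this call pair; the th_deo_fcn/th_dxo_fcn dispatch hooks used above then
+  enforce the sequential iteration order. The loc_o ident and the loop bounds
+  below are hypothetical.
+
+      for (kmp_int32 i = lower; i <= upper; i += stride) {
+          // ... unordered part of the iteration ...
+          __kmpc_ordered(&loc_o, gtid);
+          // ... code that must execute in iteration order ...
+          __kmpc_end_ordered(&loc_o, gtid);
+      }
+*/
+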
+#if KMP_USE_DYNAMIC_LOCK
+
+static __forceinline kmp_indirect_lock_t *
+__kmp_get_indirect_csptr(kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid, kmp_dyna_lockseq_t seq)
+{
+    // Code from __kmp_get_critical_section_ptr
+    // This function returns an indirect lock object instead of a user lock.
+    kmp_indirect_lock_t **lck, *ret;
+    lck = (kmp_indirect_lock_t **)crit;
+    ret = (kmp_indirect_lock_t *)TCR_PTR(*lck);
+    if (ret == NULL) {
+        void *idx;
+        kmp_indirect_locktag_t tag = DYNA_GET_I_TAG(seq);
+        kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
+        ret = ilk;
+        DYNA_I_LOCK_FUNC(ilk, init)(ilk->lock);
+        DYNA_SET_I_LOCK_LOCATION(ilk, loc);
+        DYNA_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
+        KA_TRACE(20, ("__kmp_get_indirect_csptr: initialized indirect lock #%d\n", tag));
+#if USE_ITT_BUILD
+        __kmp_itt_critical_creating(ilk->lock, loc);
+#endif
+        int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk);
+        if (status == 0) {
+#if USE_ITT_BUILD
+            __kmp_itt_critical_destroyed(ilk->lock);
+#endif
+            // Postponing destroy, to avoid costly dispatch here.
+            //DYNA_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
+            ret = (kmp_indirect_lock_t *)TCR_PTR(*lck);
+            KMP_DEBUG_ASSERT(ret != NULL);
+        }
+    }
+    return ret;
+}
+
+// Fast-path acquire tas lock
+#define DYNA_ACQUIRE_TAS_LOCK(lock, gtid) {                                                                      \
+    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                                                  \
+    if (l->lk.poll != DYNA_LOCK_FREE(tas) ||                                                                     \
+            ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas))) {    \
+        kmp_uint32 spins;                                                                                        \
+        KMP_FSYNC_PREPARE(l);                                                                                    \
+        KMP_INIT_YIELD(spins);                                                                                   \
+        if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {                            \
+            KMP_YIELD(TRUE);                                                                                     \
+        } else {                                                                                                 \
+            KMP_YIELD_SPIN(spins);                                                                               \
+        }                                                                                                        \
+        while (l->lk.poll != DYNA_LOCK_FREE(tas) ||                                                              \
+               ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas))) { \
+            if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {                        \
+                KMP_YIELD(TRUE);                                                                                 \
+            } else {                                                                                             \
+                KMP_YIELD_SPIN(spins);                                                                           \
+            }                                                                                                    \
+        }                                                                                                        \
+    }                                                                                                            \
+    KMP_FSYNC_ACQUIRED(l);                                                                                       \
+}
+
+// Fast-path test tas lock
+#define DYNA_TEST_TAS_LOCK(lock, gtid, rc) {                                                           \
+    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                                        \
+    rc = l->lk.poll == DYNA_LOCK_FREE(tas) &&                                                          \
+         KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas)); \
+}
+
+// Fast-path release tas lock
+#define DYNA_RELEASE_TAS_LOCK(lock, gtid) {                         \
+    TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, DYNA_LOCK_FREE(tas));  \
+    KMP_MB();                                                       \
+}
+
+#if DYNA_HAS_FUTEX
+
+# include <unistd.h>
+# include <sys/syscall.h>
+# ifndef FUTEX_WAIT
+#  define FUTEX_WAIT 0
+# endif
+# ifndef FUTEX_WAKE
+#  define FUTEX_WAKE 1
+# endif
+
+// Fast-path acquire futex lock
+#define DYNA_ACQUIRE_FUTEX_LOCK(lock, gtid) {                                                                       \
+    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                                                               \
+    kmp_int32 gtid_code = (gtid+1) << 1;                                                                            \
+    KMP_MB();                                                                                                       \
+    KMP_FSYNC_PREPARE(ftx);                                                                                         \
+    kmp_int32 poll_val;                                                                                             \
+    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), DYNA_LOCK_FREE(futex),                          \
+                                                   DYNA_LOCK_BUSY(gtid_code, futex))) != DYNA_LOCK_FREE(futex)) {   \
+        kmp_int32 cond = DYNA_LOCK_STRIP(poll_val) & 1;                                                             \
+        if (!cond) {                                                                                                \
+            if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, poll_val | DYNA_LOCK_BUSY(1, futex))) {     \
+                continue;                                                                                           \
+            }                                                                                                       \
+            poll_val |= DYNA_LOCK_BUSY(1, futex);                                                                   \
+        }                                                                                                           \
+        kmp_int32 rc;                                                                                               \
+        if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) {                \
+            continue;                                                                                               \
+        }                                                                                                           \
+        gtid_code |= 1;                                                                                             \
+    }                                                                                                               \
+    KMP_FSYNC_ACQUIRED(ftx);                                                                                        \
+}
+
+// Fast-path test futex lock
+#define DYNA_TEST_FUTEX_LOCK(lock, gtid, rc) {                                                                      \
+    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                                                               \
+    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), DYNA_LOCK_FREE(futex), DYNA_LOCK_BUSY(gtid+1, futex) << 1)) {  \
+        KMP_FSYNC_ACQUIRED(ftx);                                                                                    \
+        rc = TRUE;                                                                                                  \
+    } else {                                                                                                        \
+        rc = FALSE;                                                                                                 \
+    }                                                                                                               \
+}
+
+// Fast-path release futex lock
+#define DYNA_RELEASE_FUTEX_LOCK(lock, gtid) {                                                       \
+    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                                               \
+    KMP_MB();                                                                                       \
+    KMP_FSYNC_RELEASING(ftx);                                                                       \
+    kmp_int32 poll_val = KMP_XCHG_FIXED32(&(ftx->lk.poll), DYNA_LOCK_FREE(futex));                  \
+    if (DYNA_LOCK_STRIP(poll_val) & 1) {                                                            \
+        syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, DYNA_LOCK_BUSY(1, futex), NULL, NULL, 0);  \
+    }                                                                                               \
+    KMP_MB();                                                                                       \
+    KMP_YIELD(TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));              \
+}
+
+#endif // DYNA_HAS_FUTEX
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+static kmp_user_lock_p
+__kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid )
+{
+    kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
+
+    //
+    // Because of the double-check, the following load
+    // doesn't need to be volatile.
+    //
+    kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
+
+    if ( lck == NULL ) {
+        void * idx;
+
+        // Allocate & initialize the lock.
+        // Remember allocated locks in table in order to free them in __kmp_cleanup()
+        lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section );
+        __kmp_init_user_lock_with_checks( lck );
+        __kmp_set_user_lock_location( lck, loc );
+#if USE_ITT_BUILD
+        __kmp_itt_critical_creating( lck );
+            // __kmp_itt_critical_creating() should be called *before* the first use of the
+            // underlying lock. This is the only place where we can guarantee that. The lock
+            // may be destroyed without ever being used, but that is not a problem: this is
+            // not a real event seen by the user, it merely sets a name for the object (the
+            // lock). See kmp_itt.h for details.
+#endif /* USE_ITT_BUILD */
+
+        //
+        // Use a cmpxchg instruction to slam the start of the critical
+        // section with the lock pointer.  If another thread beat us
+        // to it, deallocate the lock, and use the lock that the other
+        // thread allocated.
+        //
+        int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck );
+
+        if ( status == 0 ) {
+            // Deallocate the lock and reload the value.
+#if USE_ITT_BUILD
+            __kmp_itt_critical_destroyed( lck );
+                // Let ITT know the lock is destroyed and the same memory location may be reused for
+                // another purpose.
+#endif /* USE_ITT_BUILD */
+            __kmp_destroy_user_lock_with_checks( lck );
+            __kmp_user_lock_free( &idx, gtid, lck );
+            lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
+            KMP_DEBUG_ASSERT( lck != NULL );
+        }
+    }
+    return lck;
+}
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param global_tid  global thread number.
+@param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or
+some other suitably unique value.
+
+Enter code protected by a `critical` construct.
+This function blocks until the executing thread can enter the critical section.
+*/
+void
+__kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
+    KMP_COUNT_BLOCK(OMP_CRITICAL);
+
+    kmp_user_lock_p lck;
+
+    KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
+
+#if KMP_USE_DYNAMIC_LOCK
+    // Assumption: all direct locks fit in OMP_CRITICAL_SIZE.
+    // The global sequence __kmp_user_lock_seq is used unless the compiler pushes a value.
+    if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
+        lck = (kmp_user_lock_p)crit;
+        // The thread that reaches here first needs to tag the lock word.
+        if (*((kmp_dyna_lock_t *)lck) == 0) {
+            KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)lck, 0, DYNA_GET_D_TAG(__kmp_user_lock_seq));
+        }
+        if (__kmp_env_consistency_check) {
+            __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
+        }
+# if USE_ITT_BUILD
+        __kmp_itt_critical_acquiring(lck);
+# endif
+# if DYNA_USE_FAST_TAS
+        if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
+            DYNA_ACQUIRE_TAS_LOCK(lck, global_tid);
+        } else
+# elif DYNA_USE_FAST_FUTEX
+        if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
+            DYNA_ACQUIRE_FUTEX_LOCK(lck, global_tid);
+        } else
+# endif
+        {
+            DYNA_D_LOCK_FUNC(lck, set)((kmp_dyna_lock_t *)lck, global_tid);
+        }
+    } else {
+        kmp_indirect_lock_t *ilk = __kmp_get_indirect_csptr(crit, loc, global_tid, __kmp_user_lock_seq);
+        lck = ilk->lock;
+        if (__kmp_env_consistency_check) {
+            __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
+        }
+# if USE_ITT_BUILD
+        __kmp_itt_critical_acquiring(lck);
+# endif
+        DYNA_I_LOCK_FUNC(ilk, set)(lck, global_tid);
+    }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    //TODO: add THR_OVHD_STATE
+
+    KMP_CHECK_USER_LOCK_INIT();
+
+    if ( ( __kmp_user_lock_kind == lk_tas )
+      && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
+        lck = (kmp_user_lock_p)crit;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+      && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
+        lck = (kmp_user_lock_p)crit;
+    }
+#endif
+    else { // ticket, queuing or drdpa
+        lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
+    }
+
+    if ( __kmp_env_consistency_check )
+        __kmp_push_sync( global_tid, ct_critical, loc, lck );
+
+    /* Since the critical directive binds to all threads, not just the
+     * current team, we have to check this even if we are in a
+     * serialized team. */
+    /* Also, even if we are the uber thread, we still have to acquire the lock,
+     * as we may contend with sibling threads. */
+
+#if USE_ITT_BUILD
+    __kmp_itt_critical_acquiring( lck );
+#endif /* USE_ITT_BUILD */
+    // The value of 'crit' should be suitable for use as the critical_id of the critical section directive.
+    __kmp_acquire_user_lock_with_checks( lck, global_tid );
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+#if USE_ITT_BUILD
+    __kmp_itt_critical_acquired( lck );
+#endif /* USE_ITT_BUILD */
+
+    KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
+} // __kmpc_critical
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param global_tid  global thread number.
+@param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or
+some other suitably unique value.
+
+Leave a critical section, releasing any lock that was held during its execution.
+*/
+void
+__kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
+{
+    kmp_user_lock_p lck;
+
+    KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid ));
+
+#if KMP_USE_DYNAMIC_LOCK
+    if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
+        lck = (kmp_user_lock_p)crit;
+        KMP_ASSERT(lck != NULL);
+        if (__kmp_env_consistency_check) {
+            __kmp_pop_sync(global_tid, ct_critical, loc);
+        }
+# if USE_ITT_BUILD
+        __kmp_itt_critical_releasing( lck );
+# endif
+# if DYNA_USE_FAST_TAS
+        if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
+            DYNA_RELEASE_TAS_LOCK(lck, global_tid);
+        } else
+# elif DYNA_USE_FAST_FUTEX
+        if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
+            DYNA_RELEASE_FUTEX_LOCK(lck, global_tid);
+        } else
+# endif
+        {
+            DYNA_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
+        }
+    } else {
+        kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
+        KMP_ASSERT(ilk != NULL);
+        lck = ilk->lock;
+        if (__kmp_env_consistency_check) {
+            __kmp_pop_sync(global_tid, ct_critical, loc);
+        }
+# if USE_ITT_BUILD
+        __kmp_itt_critical_releasing( lck );
+# endif
+        DYNA_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
+    }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    if ( ( __kmp_user_lock_kind == lk_tas )
+      && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
+        lck = (kmp_user_lock_p)crit;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+      && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
+        lck = (kmp_user_lock_p)crit;
+    }
+#endif
+    else { // ticket, queuing or drdpa
+        lck = (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit));
+    }
+
+    KMP_ASSERT(lck != NULL);
+
+    if ( __kmp_env_consistency_check )
+        __kmp_pop_sync( global_tid, ct_critical, loc );
+
+#if USE_ITT_BUILD
+    __kmp_itt_critical_releasing( lck );
+#endif /* USE_ITT_BUILD */
+    // The value of 'crit' should be suitable for use as the critical_id of the critical section directive.
+    __kmp_release_user_lock_with_checks( lck, global_tid );
+
+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_critical)(
+            (uint64_t) lck);
+    }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+    KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
+}
+
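+/*
+  Usage sketch (illustrative only, not part of the runtime): a named critical
+  construct is typically lowered to a call pair sharing one statically
+  allocated kmp_critical_name, which the code above lazily turns into a lock
+  on first use. The loc_c and crit_name identifiers are hypothetical.
+
+      static kmp_critical_name crit_name = { 0 };
+
+      __kmpc_critical(&loc_c, gtid, &crit_name);
+      // ... critical section body ...
+      __kmpc_end_critical(&loc_c, gtid, &crit_name);
+*/
+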
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid thread id.
+@return one if the thread should execute the master block, zero otherwise
+
+Start execution of a combined barrier and master. The barrier is executed inside this function.
+*/
+kmp_int32
+__kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid)
+{
+    int status;
+
+    KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) );
+
+    if (! TCR_4(__kmp_init_parallel))
+        __kmp_parallel_initialize();
+
+    if ( __kmp_env_consistency_check )
+        __kmp_check_barrier( global_tid, ct_barrier, loc );
+
+#if USE_ITT_NOTIFY
+    __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+    status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL );
+
+    return (status != 0) ? 0 : 1;
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid thread id.
+
+Complete the execution of a combined barrier and master. This function should
+only be called at the completion of the <tt>master</tt> code. Other threads will
+still be waiting at the barrier and this call releases them.
+*/
+void
+__kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid)
+{
+    KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid ));
+
+    __kmp_end_split_barrier ( bs_plain_barrier, global_tid );
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid thread id.
+@return one if the thread should execute the master block, zero otherwise
+
+Start execution of a combined barrier and master(nowait) construct.
+The barrier is executed inside this function.
+There is no equivalent "end" function: the cleanup that __kmpc_end_master
+would otherwise perform is done inside this function (see the consistency
+checks in its body).
+*/
+kmp_int32
+__kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid )
+{
+    kmp_int32 ret;
+
+    KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid ));
+
+    if (! TCR_4(__kmp_init_parallel))
+        __kmp_parallel_initialize();
+
+    if ( __kmp_env_consistency_check ) {
+        if ( loc == 0 ) {
+            KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
+        }
+        __kmp_check_barrier( global_tid, ct_barrier, loc );
+    }
+
+#if USE_ITT_NOTIFY
+    __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+    __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
+
+    ret = __kmpc_master (loc, global_tid);
+
+    if ( __kmp_env_consistency_check ) {
+        /*  no __kmpc_end_master is called, so the (stats)       */
+        /*  actions of __kmpc_end_master are performed here      */
+
+        if ( global_tid < 0 ) {
+            KMP_WARNING( ThreadIdentInvalid );
+        }
+        if (ret) {
+            /* only one thread should do the pop since only */
+            /* one did the push (see __kmpc_master())       */
+
+            __kmp_pop_sync( global_tid, ct_master, loc );
+        }
+    }
+
+    return (ret);
+}
+
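+/*
+  Usage sketch (illustrative only, not part of the runtime): the combined
+  barrier-and-master entry points bracket work that only one thread should
+  perform around a team barrier. A plausible lowering, with loc_bm hypothetical:
+
+      if (__kmpc_barrier_master(&loc_bm, gtid)) {
+          // ... master-only work; the other threads wait at the barrier ...
+          __kmpc_end_barrier_master(&loc_bm, gtid);  // releases the waiters
+      }
+
+      if (__kmpc_barrier_master_nowait(&loc_bm, gtid)) {
+          // ... master-only work; the other threads have already moved on,
+          // and no "end" call is needed (see the comment in the function above)
+      }
+*/
+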
+/* The BARRIER for a SINGLE process section is always explicit   */
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information
+@param global_tid  global thread number
+@return One if this thread should execute the single construct, zero otherwise.
+
+Test whether to execute a <tt>single</tt> construct.
+There are no implicit barriers in either of the two "single" calls; rather, the compiler
+must introduce an explicit barrier if one is required.
+*/
+
+kmp_int32
+__kmpc_single(ident_t *loc, kmp_int32 global_tid)
+{
+    KMP_COUNT_BLOCK(OMP_SINGLE);
+    kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t *this_thr        = __kmp_threads[ global_tid ];
+    kmp_team_t *team            = this_thr -> th.th_team;
+    int tid = __kmp_tid_from_gtid( global_tid );
+
+    if ((ompt_status == ompt_status_track_callback)) {
+        if (rc) {
+            if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
+                    team->t.ompt_team_info.parallel_id,
+                    team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
+                    team->t.ompt_team_info.microtask);
+            }
+        } else {
+            if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
+                    team->t.ompt_team_info.parallel_id,
+                    team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+            }
+            this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
+        }
+    }
+#endif
+
+    return rc;
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information
+@param global_tid  global thread number
+
+Mark the end of a <tt>single</tt> construct.  This function should
+only be called by the thread that executed the block of code protected
+by the `single` construct.
+*/
+void
+__kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
+{
+    __kmp_exit_single( global_tid );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t *this_thr        = __kmp_threads[ global_tid ];
+    kmp_team_t *team            = this_thr -> th.th_team;
+    int tid = __kmp_tid_from_gtid( global_tid );
+
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
+        ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+#endif
+}
+
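+/*
+  Usage sketch (illustrative only, not part of the runtime): since neither
+  call carries an implicit barrier, a compiler lowering "#pragma omp single"
+  would emit the barrier itself when nowait is absent. loc_s is hypothetical.
+
+      if (__kmpc_single(&loc_s, gtid)) {
+          // ... single-thread body ...
+          __kmpc_end_single(&loc_s, gtid);
+      }
+      __kmpc_barrier(&loc_s, gtid);  // omitted when nowait is specified
+*/
+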
+/*!
+@ingroup WORK_SHARING
+@param loc Source location
+@param global_tid Global thread id
+
+Mark the end of a statically scheduled loop.
+*/
+void
+__kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid )
+{
+    KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    kmp_info_t *this_thr        = __kmp_threads[ global_tid ];
+    kmp_team_t *team            = this_thr -> th.th_team;
+    int tid = __kmp_tid_from_gtid( global_tid );
+
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
+        ompt_callbacks.ompt_callback(ompt_event_loop_end)(
+            team->t.ompt_team_info.parallel_id,
+            team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+    }
+#endif
+
+    if ( __kmp_env_consistency_check )
+        __kmp_pop_workshare( global_tid, ct_pdo, loc );
+}
+
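+/*
+  Usage sketch (illustrative only, not part of the runtime): this call closes
+  the bracket opened by the static-init entry points (e.g.
+  __kmpc_for_static_init_4, defined elsewhere in the runtime). A schematic
+  lowering of a statically scheduled loop; the variable names are hypothetical.
+
+      kmp_int32 lower = 0, upper = n - 1, stride = 1, last = 0;
+      __kmpc_for_static_init_4(&loc_f, gtid, kmp_sch_static, &last,
+                               &lower, &upper, &stride, 1, 0);
+      for (kmp_int32 i = lower; i <= upper; ++i) {
+          // ... loop body ...
+      }
+      __kmpc_for_static_fini(&loc_f, gtid);
+*/
+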
+/*
+ * User routines that take C-style arguments (call by value),
+ * unlike the equivalent Fortran routines
+ */
+
+void
+ompc_set_num_threads( int arg )
+{
+// !!!!! TODO: check the per-task binding
+    __kmp_set_num_threads( arg, __kmp_entry_gtid() );
+}
+
+void
+ompc_set_dynamic( int flag )
+{
+    kmp_info_t *thread;
+
+    /* For the thread-private implementation of the internal controls */
+    thread = __kmp_entry_thread();
+
+    __kmp_save_internal_controls( thread );
+
+    set__dynamic( thread, flag ? TRUE : FALSE );
+}
+
+void
+ompc_set_nested( int flag )
+{
+    kmp_info_t *thread;
+
+    /* For the thread-private internal controls implementation */
+    thread = __kmp_entry_thread();
+
+    __kmp_save_internal_controls( thread );
+
+    set__nested( thread, flag ? TRUE : FALSE );
+}
+
+void
+ompc_set_max_active_levels( int max_active_levels )
+{
+    /* TO DO */
+    /* we want per-task implementation of this internal control */
+
+    /* For the per-thread internal controls implementation */
+    __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels );
+}
+
+void
+ompc_set_schedule( omp_sched_t kind, int modifier )
+{
+// !!!!! TODO: check the per-task binding
+    __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier );
+}
+
+int
+ompc_get_ancestor_thread_num( int level )
+{
+    return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level );
+}
+
+int
+ompc_get_team_size( int level )
+{
+    return __kmp_get_team_size( __kmp_entry_gtid(), level );
+}
+
+void
+kmpc_set_stacksize( int arg )
+{
+    // __kmp_aux_set_stacksize initializes the library if needed
+    __kmp_aux_set_stacksize( arg );
+}
+
+void
+kmpc_set_stacksize_s( size_t arg )
+{
+    // __kmp_aux_set_stacksize initializes the library if needed
+    __kmp_aux_set_stacksize( arg );
+}
+
+void
+kmpc_set_blocktime( int arg )
+{
+    int gtid, tid;
+    kmp_info_t *thread;
+
+    gtid = __kmp_entry_gtid();
+    tid = __kmp_tid_from_gtid(gtid);
+    thread = __kmp_thread_from_gtid(gtid);
+
+    __kmp_aux_set_blocktime( arg, thread, tid );
+}
+
+void
+kmpc_set_library( int arg )
+{
+    // __kmp_user_set_library initializes the library if needed
+    __kmp_user_set_library( (enum library_type)arg );
+}
+
+void
+kmpc_set_defaults( char const * str )
+{
+    // __kmp_aux_set_defaults initializes the library if needed
+    __kmp_aux_set_defaults( str, KMP_STRLEN( str ) );
+}
+
+int
+kmpc_set_affinity_mask_proc( int proc, void **mask )
+{
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+    return -1;
+#else
+    if ( ! TCR_4(__kmp_init_middle) ) {
+        __kmp_middle_initialize();
+    }
+    return __kmp_aux_set_affinity_mask_proc( proc, mask );
+#endif
+}
+
+int
+kmpc_unset_affinity_mask_proc( int proc, void **mask )
+{
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+    return -1;
+#else
+    if ( ! TCR_4(__kmp_init_middle) ) {
+        __kmp_middle_initialize();
+    }
+    return __kmp_aux_unset_affinity_mask_proc( proc, mask );
+#endif
+}
+
+int
+kmpc_get_affinity_mask_proc( int proc, void **mask )
+{
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+    return -1;
+#else
+    if ( ! TCR_4(__kmp_init_middle) ) {
+        __kmp_middle_initialize();
+    }
+    return __kmp_aux_get_affinity_mask_proc( proc, mask );
+#endif
+}
+
+
+/* -------------------------------------------------------------------------- */
+/*!
+@ingroup THREADPRIVATE
+@param loc       source location information
+@param gtid      global thread number
+@param cpy_size  size of the cpy_data buffer
+@param cpy_data  pointer to data to be copied
+@param cpy_func  helper function to call for copying data
+@param didit     flag variable: 1=single thread; 0=not single thread
+
+__kmpc_copyprivate implements the interface for the private data broadcast needed for
+the copyprivate clause associated with a single region in an OpenMP<sup>*</sup> program (both C and Fortran).
+All threads participating in the parallel region call this routine.
+One of the threads (called the single thread) should have the <tt>didit</tt> variable set to 1
+and all other threads should have that variable set to 0.
+All threads pass a pointer to a data buffer (cpy_data) that they have built.
+
+The OpenMP specification forbids the use of nowait on the single region when a copyprivate
+clause is present. However, @ref __kmpc_copyprivate implements a barrier internally to avoid
+race conditions, so the code generation for the single region should avoid generating a barrier
+after the call to @ref __kmpc_copyprivate.
+
+The <tt>gtid</tt> parameter is the global thread id for the current thread.
+The <tt>loc</tt> parameter is a pointer to source location information.
+
+Internal implementation: The single thread will first copy its descriptor address (cpy_data)
+to a team-private location. Each of the other threads then calls the function pointed to by
+the parameter cpy_func, which copies the data using the cpy_data buffers.
+
+The cpy_func routine used for the copy and the contents of the data area defined by cpy_data
+and cpy_size may be built in any fashion that will allow the copy to be done. For instance,
+the cpy_data buffer can hold the actual data to be copied or it may hold a list of pointers
+to the data. The cpy_func routine must interpret the cpy_data buffer appropriately.
+
+The interface to cpy_func is as follows:
+@code
+void cpy_func( void *destination, void *source )
+@endcode
+where void *destination is the cpy_data pointer for the thread being copied to
+and void *source is the cpy_data pointer for the thread being copied from.
+*/
+void
+__kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit )
+{
+    void **data_ptr;
+
+    KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid ));
+
+    KMP_MB();
+
+    data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data;
+
+    if ( __kmp_env_consistency_check ) {
+        if ( loc == 0 ) {
+            KMP_WARNING( ConstructIdentInvalid );
+        }
+    }
+
+    /* ToDo: Optimize the following two barriers into some kind of split barrier */
+
+    if (didit) *data_ptr = cpy_data;
+
+    /* This barrier is not a barrier region boundary */
+#if USE_ITT_NOTIFY
+    __kmp_threads[gtid]->th.th_ident = loc;
+#endif
+    __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
+
+    if (! didit) (*cpy_func)( cpy_data, *data_ptr );
+
+    /* Consider next barrier the user-visible barrier for barrier region boundaries */
+    /* Nesting checks are already handled by the single construct checks */
+
+#if USE_ITT_NOTIFY
+    __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. tasks can overwrite the location)
+#endif
+    __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
+}
+
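+/*
+  Illustrative sketch (not part of the runtime): for a copyprivate of a single
+  double, cpy_data can point at the variable itself and cpy_func reduces to a
+  plain assignment. The names below are hypothetical.
+
+      static void copy_double(void *dst, void *src) {
+          *(double *)dst = *(double *)src;
+      }
+
+      // on each thread: x is its private copy; didit is 1 only on the thread
+      // that executed the single block
+      __kmpc_copyprivate(&loc_cp, gtid, sizeof(double), &x, copy_double, didit);
+*/
+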
+/* -------------------------------------------------------------------------- */
+
+#define INIT_LOCK                 __kmp_init_user_lock_with_checks
+#define INIT_NESTED_LOCK          __kmp_init_nested_user_lock_with_checks
+#define ACQUIRE_LOCK              __kmp_acquire_user_lock_with_checks
+#define ACQUIRE_LOCK_TIMED        __kmp_acquire_user_lock_with_checks_timed
+#define ACQUIRE_NESTED_LOCK       __kmp_acquire_nested_user_lock_with_checks
+#define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed
+#define RELEASE_LOCK              __kmp_release_user_lock_with_checks
+#define RELEASE_NESTED_LOCK       __kmp_release_nested_user_lock_with_checks
+#define TEST_LOCK                 __kmp_test_user_lock_with_checks
+#define TEST_NESTED_LOCK          __kmp_test_nested_user_lock_with_checks
+#define DESTROY_LOCK              __kmp_destroy_user_lock_with_checks
+#define DESTROY_NESTED_LOCK       __kmp_destroy_nested_user_lock_with_checks
+
+
+/*
+ * TODO: Make check abort messages use location info & pass it
+ * into with_checks routines
+ */
+
+/* initialize the lock */
+void
+__kmpc_init_lock( ident_t * loc, kmp_int32 gtid,  void ** user_lock ) {
+#if KMP_USE_DYNAMIC_LOCK
+    KMP_DEBUG_ASSERT(__kmp_init_serial);
+    if (__kmp_env_consistency_check && user_lock == NULL) {
+        KMP_FATAL(LockIsUninitialized, "omp_init_lock");
+    }
+    if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
+        DYNA_INIT_D_LOCK(user_lock, __kmp_user_lock_seq);
+# if USE_ITT_BUILD
+        __kmp_itt_lock_creating((kmp_user_lock_p)user_lock, NULL);
+# endif
+    } else {
+        DYNA_INIT_I_LOCK(user_lock, __kmp_user_lock_seq);
+        kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(user_lock);
+        DYNA_SET_I_LOCK_LOCATION(ilk, loc);
+# if USE_ITT_BUILD
+        __kmp_itt_lock_creating(ilk->lock, loc);
+# endif
+    }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    static char const * const func = "omp_init_lock";
+    kmp_user_lock_p lck;
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    if ( __kmp_env_consistency_check ) {
+        if ( user_lock == NULL ) {
+            KMP_FATAL( LockIsUninitialized, func );
+        }
+    }
+
+    KMP_CHECK_USER_LOCK_INIT();
+
+    if ( ( __kmp_user_lock_kind == lk_tas )
+      && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+      && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#endif
+    else {
+        lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
+    }
+    INIT_LOCK( lck );
+    __kmp_set_user_lock_location( lck, loc );
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_creating( lck );
+#endif /* USE_ITT_BUILD */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_init_lock
+
+/* initialize the lock */
+void
+__kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+#if KMP_USE_DYNAMIC_LOCK
+
+    KMP_DEBUG_ASSERT(__kmp_init_serial);
+    if (__kmp_env_consistency_check && user_lock == NULL) {
+        KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
+    }
+    // Invoke init function after converting to nested version.
+    kmp_dyna_lockseq_t nested_seq;
+    switch (__kmp_user_lock_seq) {
+        case lockseq_tas:       nested_seq = lockseq_nested_tas;        break;
+#if DYNA_HAS_FUTEX
+        case lockseq_futex:     nested_seq = lockseq_nested_futex;      break;
+#endif
+        case lockseq_ticket:    nested_seq = lockseq_nested_ticket;     break;
+        case lockseq_queuing:   nested_seq = lockseq_nested_queuing;    break;
+        case lockseq_drdpa:     nested_seq = lockseq_nested_drdpa;      break;
+        default:                nested_seq = lockseq_nested_queuing;    break;
+                                // Use nested queuing lock for lock kinds without "nested" implementation.
+    }
+    DYNA_INIT_I_LOCK(user_lock, nested_seq);
+    // All nested locks are indirect locks.
+    kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(user_lock);
+    DYNA_SET_I_LOCK_LOCATION(ilk, loc);
+# if USE_ITT_BUILD
+    __kmp_itt_lock_creating(ilk->lock, loc);
+# endif
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    static char const * const func = "omp_init_nest_lock";
+    kmp_user_lock_p lck;
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    if ( __kmp_env_consistency_check ) {
+        if ( user_lock == NULL ) {
+            KMP_FATAL( LockIsUninitialized, func );
+        }
+    }
+
+    KMP_CHECK_USER_LOCK_INIT();
+
+    if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+      + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+     && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+     <= OMP_NEST_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#endif
+    else {
+        lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
+    }
+
+    INIT_NESTED_LOCK( lck );
+    __kmp_set_user_lock_location( lck, loc );
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_creating( lck );
+#endif /* USE_ITT_BUILD */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_init_nest_lock
+
+void
+__kmpc_destroy_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+#if KMP_USE_DYNAMIC_LOCK
+
+# if USE_ITT_BUILD
+    kmp_user_lock_p lck;
+    if (DYNA_EXTRACT_D_TAG(user_lock) == 0) {
+        lck = ((kmp_indirect_lock_t *)DYNA_LOOKUP_I_LOCK(user_lock))->lock;
+    } else {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+    __kmp_itt_lock_destroyed(lck);
+# endif
+    DYNA_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
+#else
+    kmp_user_lock_p lck;
+
+    if ( ( __kmp_user_lock_kind == lk_tas )
+      && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+      && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#endif
+    else {
+        lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" );
+    }
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_destroyed( lck );
+#endif /* USE_ITT_BUILD */
+    DESTROY_LOCK( lck );
+
+    if ( ( __kmp_user_lock_kind == lk_tas )
+      && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+        ;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+      && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+        ;
+    }
+#endif
+    else {
+        __kmp_user_lock_free( user_lock, gtid, lck );
+    }
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_destroy_lock
+
+/* destroy the lock */
+void
+__kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+#if KMP_USE_DYNAMIC_LOCK
+
+# if USE_ITT_BUILD
+    kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(user_lock);
+    __kmp_itt_lock_destroyed(ilk->lock);
+# endif
+    DYNA_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    kmp_user_lock_p lck;
+
+    if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+      + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+     && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+     <= OMP_NEST_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#endif
+    else {
+        lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" );
+    }
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_destroyed( lck );
+#endif /* USE_ITT_BUILD */
+
+    DESTROY_NESTED_LOCK( lck );
+
+    if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+     + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+        ;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+     && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+     <= OMP_NEST_LOCK_T_SIZE ) ) {
+        ;
+    }
+#endif
+    else {
+        __kmp_user_lock_free( user_lock, gtid, lck );
+    }
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_destroy_nest_lock
+
+void
+__kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+    KMP_COUNT_BLOCK(OMP_set_lock);
+#if KMP_USE_DYNAMIC_LOCK
+    int tag = DYNA_EXTRACT_D_TAG(user_lock);
+# if USE_ITT_BUILD
+    __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); // the itt function will resolve to the right lock object.
+# endif
+# if DYNA_USE_FAST_TAS
+    if (tag == locktag_tas && !__kmp_env_consistency_check) {
+        DYNA_ACQUIRE_TAS_LOCK(user_lock, gtid);
+    } else
+# elif DYNA_USE_FAST_FUTEX
+    if (tag == locktag_futex && !__kmp_env_consistency_check) {
+        DYNA_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
+    } else
+# endif
+    {
+        __kmp_direct_set_ops[tag]((kmp_dyna_lock_t *)user_lock, gtid);
+    }
+# if USE_ITT_BUILD
+    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+# endif
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    kmp_user_lock_p lck;
+
+    if ( ( __kmp_user_lock_kind == lk_tas )
+      && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+      && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#endif
+    else {
+        lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" );
+    }
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_acquiring( lck );
+#endif /* USE_ITT_BUILD */
+
+    ACQUIRE_LOCK( lck, gtid );
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_acquired( lck );
+#endif /* USE_ITT_BUILD */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+void
+__kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+#if KMP_USE_DYNAMIC_LOCK
+
+# if USE_ITT_BUILD
+    __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
+# endif
+    DYNA_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
+# if USE_ITT_BUILD
+    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+#endif
+
+#else // KMP_USE_DYNAMIC_LOCK
+    kmp_user_lock_p lck;
+
+    if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+      + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+     && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+     <= OMP_NEST_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#endif
+    else {
+        lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" );
+    }
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_acquiring( lck );
+#endif /* USE_ITT_BUILD */
+
+    ACQUIRE_NESTED_LOCK( lck, gtid );
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_acquired( lck );
+#endif /* USE_ITT_BUILD */
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+void
+__kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
+{
+#if KMP_USE_DYNAMIC_LOCK
+
+    int tag = DYNA_EXTRACT_D_TAG(user_lock);
+# if USE_ITT_BUILD
+    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
+# endif
+# if DYNA_USE_FAST_TAS
+    if (tag == locktag_tas && !__kmp_env_consistency_check) {
+        DYNA_RELEASE_TAS_LOCK(user_lock, gtid);
+    } else
+# elif DYNA_USE_FAST_FUTEX
+    if (tag == locktag_futex && !__kmp_env_consistency_check) {
+        DYNA_RELEASE_FUTEX_LOCK(user_lock, gtid);
+    } else
+# endif
+    {
+        __kmp_direct_unset_ops[tag]((kmp_dyna_lock_t *)user_lock, gtid);
+    }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    kmp_user_lock_p lck;
+
+    /* Can't use serial interval since not block structured */
+    /* release the lock */
+
+    if ( ( __kmp_user_lock_kind == lk_tas )
+      && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+        // "fast" path implemented to fix customer performance issue
+#if USE_ITT_BUILD
+        __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
+#endif /* USE_ITT_BUILD */
+        TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
+        KMP_MB();
+        return;
+#else
+        lck = (kmp_user_lock_p)user_lock;
+#endif
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+      && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#endif
+    else {
+        lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" );
+    }
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_releasing( lck );
+#endif /* USE_ITT_BUILD */
+
+    RELEASE_LOCK( lck, gtid );
+
+#if OMPT_SUPPORT && OMPT_BLAME
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
+        ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck);
+    }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+/* release the lock */
+void
+__kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
+{
+#if KMP_USE_DYNAMIC_LOCK
+
+# if USE_ITT_BUILD
+    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
+# endif
+    DYNA_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    kmp_user_lock_p lck;
+
+    /* Can't use serial interval since not block structured */
+
+    if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+      + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+        // "fast" path implemented to fix customer performance issue
+        kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock;
+#if USE_ITT_BUILD
+        __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
+#endif /* USE_ITT_BUILD */
+        if ( --(tl->lk.depth_locked) == 0 ) {
+            TCW_4(tl->lk.poll, 0);
+        }
+        KMP_MB();
+        return;
+#else
+        lck = (kmp_user_lock_p)user_lock;
+#endif
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+     && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+     <= OMP_NEST_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#endif
+    else {
+        lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" );
+    }
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_releasing( lck );
+#endif /* USE_ITT_BUILD */
+
+    int release_status;
+    release_status = RELEASE_NESTED_LOCK( lck, gtid );
+#if OMPT_SUPPORT && OMPT_BLAME
+    if (ompt_status == ompt_status_track_callback) {
+        if (release_status == KMP_LOCK_RELEASED) {
+            if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
+                ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
+                    (uint64_t) lck);
+            }
+        } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) {
+            ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
+                (uint64_t) lck);
+        }
+    }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+/* try to acquire the lock */
+int
+__kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
+{
+    KMP_COUNT_BLOCK(OMP_test_lock);
+    KMP_TIME_BLOCK(OMP_test_lock);
+
+#if KMP_USE_DYNAMIC_LOCK
+    int rc;
+    int tag = DYNA_EXTRACT_D_TAG(user_lock);
+# if USE_ITT_BUILD
+    __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
+# endif
+# if DYNA_USE_FAST_TAS
+    if (tag == locktag_tas && !__kmp_env_consistency_check) {
+        DYNA_TEST_TAS_LOCK(user_lock, gtid, rc);
+    } else
+# elif DYNA_USE_FAST_FUTEX
+    if (tag == locktag_futex && !__kmp_env_consistency_check) {
+        DYNA_TEST_FUTEX_LOCK(user_lock, gtid, rc);
+    } else
+# endif
+    {
+        rc = __kmp_direct_test_ops[tag]((kmp_dyna_lock_t *)user_lock, gtid);
+    }
+    if (rc) {
+# if USE_ITT_BUILD
+        __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+# endif
+        return FTN_TRUE;
+    } else {
+# if USE_ITT_BUILD
+        __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
+# endif
+        return FTN_FALSE;
+    }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    kmp_user_lock_p lck;
+    int          rc;
+
+    if ( ( __kmp_user_lock_kind == lk_tas )
+      && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+      && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#endif
+    else {
+        lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" );
+    }
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_acquiring( lck );
+#endif /* USE_ITT_BUILD */
+
+    rc = TEST_LOCK( lck, gtid );
+#if USE_ITT_BUILD
+    if ( rc ) {
+        __kmp_itt_lock_acquired( lck );
+    } else {
+        __kmp_itt_lock_cancelled( lck );
+    }
+#endif /* USE_ITT_BUILD */
+    return ( rc ? FTN_TRUE : FTN_FALSE );
+
+    /* Can't use serial interval since not block structured */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+/* try to acquire the lock */
+int
+__kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
+{
+#if KMP_USE_DYNAMIC_LOCK
+    int rc;
+# if USE_ITT_BUILD
+    __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
+# endif
+    rc = DYNA_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
+# if USE_ITT_BUILD
+    if (rc) {
+        __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+    } else {
+        __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
+    }
+# endif
+    return rc;
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    kmp_user_lock_p lck;
+    int          rc;
+
+    if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+      + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    else if ( ( __kmp_user_lock_kind == lk_futex )
+     && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+     <= OMP_NEST_LOCK_T_SIZE ) ) {
+        lck = (kmp_user_lock_p)user_lock;
+    }
+#endif
+    else {
+        lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" );
+    }
+
+#if USE_ITT_BUILD
+    __kmp_itt_lock_acquiring( lck );
+#endif /* USE_ITT_BUILD */
+
+    rc = TEST_NESTED_LOCK( lck, gtid );
+#if USE_ITT_BUILD
+    if ( rc ) {
+        __kmp_itt_lock_acquired( lck );
+    } else {
+        __kmp_itt_lock_cancelled( lck );
+    }
+#endif /* USE_ITT_BUILD */
+    return rc;
+
+    /* Can't use serial interval since not block structured */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
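+/*
+  Usage sketch (illustrative only, not part of the runtime): the omp_*_lock
+  user API is served by these entry points, with the address of the user's
+  lock variable passed as user_lock. A schematic lifecycle, loc_l hypothetical:
+
+      void *lock_storage = NULL;  // stands in for the user's omp_lock_t
+      __kmpc_init_lock(&loc_l, gtid, &lock_storage);
+      __kmpc_set_lock(&loc_l, gtid, &lock_storage);
+      // ... protected code ...
+      __kmpc_unset_lock(&loc_l, gtid, &lock_storage);
+      if (__kmpc_test_lock(&loc_l, gtid, &lock_storage)) {  // non-blocking
+          __kmpc_unset_lock(&loc_l, gtid, &lock_storage);
+      }
+      __kmpc_destroy_lock(&loc_l, gtid, &lock_storage);
+*/
+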
+
+/*--------------------------------------------------------------------------------------------------------------------*/
+
+/*
+ * Interface to fast scalable reduce methods routines
+ */
+
+// Keep the selected method in a thread-local structure for cross-function use: it will be
+// read in the __kmpc_end_reduce* functions. An alternative would be to re-determine the
+// method once more in the __kmpc_end_reduce* functions (a new prototype would be required).
+// AT: which solution is better?
+#define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \
+                   ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) )
+
+#define __KMP_GET_REDUCTION_METHOD(gtid) \
+                   ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method )
+
+// description of the packed_reduction_method variable: look at the macros in kmp.h
+
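+/*
+  Illustrative sketch (not part of the runtime): the intended pairing is that
+  a __kmpc_reduce* entry records the chosen method and the matching
+  __kmpc_end_reduce* entry retrieves it, avoiding a second determination pass:
+
+      __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
+      // ... later, inside the matching __kmpc_end_reduce* call ...
+      packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
+*/
+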
+
+// used in a critical section reduce block
+static __forceinline void
+__kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
+
+    // This lock was visible to customers and to the thread profiler as a serial-overhead
+    // span (although it is used only for an internal purpose).
+    //     Why was it visible in the previous implementation?
+    //     Should we keep it visible in the new reduce block?
+    kmp_user_lock_p lck;
+
+#if KMP_USE_DYNAMIC_LOCK
+
+    if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
+        lck = (kmp_user_lock_p)crit;
+        if (*((kmp_dyna_lock_t *)lck) == 0) {
+            KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)lck, 0, DYNA_GET_D_TAG(__kmp_user_lock_seq));
+        }
+        KMP_DEBUG_ASSERT(lck != NULL);
+        if (__kmp_env_consistency_check) {
+            __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
+        }
+        DYNA_D_LOCK_FUNC(lck, set)((kmp_dyna_lock_t *)lck, global_tid);
+    } else {
+        kmp_indirect_lock_t *ilk = __kmp_get_indirect_csptr(crit, loc, global_tid, __kmp_user_lock_seq);
+        KMP_DEBUG_ASSERT(ilk != NULL);
+        if (__kmp_env_consistency_check) {
+            __kmp_push_sync(global_tid, ct_critical, loc, ilk->lock, __kmp_user_lock_seq);
+        }
+        DYNA_I_LOCK_FUNC(ilk, set)(ilk->lock, global_tid);
+    }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    // We know that the fast reduction code is only emitted by Intel compilers
+    // with 32 byte critical sections. If there isn't enough space, then we
+    // have to use a pointer.
+    if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) {
+        lck = (kmp_user_lock_p)crit;
+    }
+    else {
+        lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
+    }
+    KMP_DEBUG_ASSERT( lck != NULL );
+
+    if ( __kmp_env_consistency_check )
+        __kmp_push_sync( global_tid, ct_critical, loc, lck );
+
+    __kmp_acquire_user_lock_with_checks( lck, global_tid );
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+// used in a critical section reduce block
+static __forceinline void
+__kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
+
+    kmp_user_lock_p lck;
+
+#if KMP_USE_DYNAMIC_LOCK
+
+    if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
+        lck = (kmp_user_lock_p)crit;
+        if (__kmp_env_consistency_check)
+            __kmp_pop_sync(global_tid, ct_critical, loc);
+        DYNA_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
+    } else {
+        kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
+        if (__kmp_env_consistency_check)
+            __kmp_pop_sync(global_tid, ct_critical, loc);
+        DYNA_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
+    }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    // We know that the fast reduction code is only emitted by Intel compilers with 32 byte critical
+    // sections. If there isn't enough space, then we have to use a pointer.
+    if ( __kmp_base_user_lock_size > 32 ) {
+        lck = *( (kmp_user_lock_p *) crit );
+        KMP_ASSERT( lck != NULL );
+    } else {
+        lck = (kmp_user_lock_p) crit;
+    }
+
+    if ( __kmp_env_consistency_check )
+        __kmp_pop_sync( global_tid, ct_critical, loc );
+
+    __kmp_release_user_lock_with_checks( lck, global_tid );
+
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmp_end_critical_section_reduce_block
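+
+/* Illustrative sketch: the two helpers above always appear as a bracketing pair
+   around the combining step of a critical-section reduction, as the
+   __kmpc_reduce_nowait() / __kmpc_end_reduce_nowait() pair below does:
+
+       __kmp_enter_critical_section_reduce_block( loc, global_tid, crit );
+       // ... combine this thread's partial results into the shared copy ...
+       __kmp_end_critical_section_reduce_block( loc, global_tid, crit );
+*/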
+
+
+/* 2.a.i. Reduce Block without a terminating barrier */
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread number
+@param num_vars number of items (variables) to be reduced
+@param reduce_size size of data in bytes to be reduced
+@param reduce_data pointer to data to be reduced
+@param reduce_func callback function that performs the reduction of two operands, returning the result in lhs_data
+@param lck pointer to the unique lock data structure
+@result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed
+
+The nowait version is used for a reduce clause with the nowait argument.
+*/
+kmp_int32
+__kmpc_reduce_nowait(
+    ident_t *loc, kmp_int32 global_tid,
+    kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+    kmp_critical_name *lck ) {
+
+    KMP_COUNT_BLOCK(REDUCE_nowait);
+    int retval = 0;
+    PACKED_REDUCTION_METHOD_T packed_reduction_method;
+#if OMP_40_ENABLED
+    kmp_team_t *team;
+    kmp_info_t *th;
+    int teams_swapped = 0, task_state;
+#endif
+    KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) );
+
+    // Why do we need this initialization here at all?
+    // A reduction clause cannot be used as a stand-alone directive.
+
+    // Do not call __kmp_serial_initialize(); it will be called by __kmp_parallel_initialize() if needed.
+    // Possible detection of a false-positive race by the thread checker?
+    if( ! TCR_4( __kmp_init_parallel ) )
+        __kmp_parallel_initialize();
+
+    // check correctness of reduce block nesting
+#if KMP_USE_DYNAMIC_LOCK
+    if ( __kmp_env_consistency_check )
+        __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
+#else
+    if ( __kmp_env_consistency_check )
+        __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
+#endif
+
+#if OMP_40_ENABLED
+    th = __kmp_thread_from_gtid(global_tid);
+    if( th->th.th_teams_microtask ) {   // AC: check if we are inside the teams construct?
+        team = th->th.th_team;
+        if( team->t.t_level == th->th.th_teams_level ) {
+            // this is reduction at teams construct
+            KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid);  // AC: check that tid == 0
+            // Let's swap teams temporarily for the reduction barrier
+            teams_swapped = 1;
+            th->th.th_info.ds.ds_tid = team->t.t_master_tid;
+            th->th.th_team = team->t.t_parent;
+            th->th.th_team_nproc = th->th.th_team->t.t_nproc;
+            th->th.th_task_team = th->th.th_team->t.t_task_team[0];
+            task_state = th->th.th_task_state;
+            th->th.th_task_state = 0;
+        }
+    }
+#endif // OMP_40_ENABLED
+
+    // The packed_reduction_method value will be reused by the __kmp_end_reduce* functions, so the value should be kept in a variable.
+    // The variable should be a construct-specific or thread-specific property, not a team-specific one
+    //     (a thread can reach the next reduce block on the next construct, and the reduce method may differ there).
+    // An ident_t "loc" parameter could be used as a construct-specific property (but what if loc == 0?)
+    //     (both construct-specific and team-specific variables would be shared, so unnecessary extra syncs would be needed).
+    // A thread-specific variable is better with regard to the two issues above (next construct and extra syncs);
+    // the thread-specific "th_local.reduction_method" variable is what is used currently.
+    // Each thread executes the 'determine' and 'set' lines (there is no need to restrict them to one thread; that would only add unnecessary extra syncs).
+
+    packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
+    __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
+
+    if( packed_reduction_method == critical_reduce_block ) {
+
+        __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
+        retval = 1;
+
+    } else if( packed_reduction_method == empty_reduce_block ) {
+
+        // usage: if team size == 1, no synchronization is required ( Intel platforms only )
+        retval = 1;
+
+    } else if( packed_reduction_method == atomic_reduce_block ) {
+
+        retval = 2;
+
+        // All threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen).
+        //     (This is not quite right, because the checking block has been closed by this 'pop'
+        //      while the atomic operation has not been executed yet; it executes slightly later, literally on the next instruction.)
+        if ( __kmp_env_consistency_check )
+            __kmp_pop_sync( global_tid, ct_reduce, loc );
+
+    } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
+
+        //AT: performance issue: a real barrier here
+        //AT:     (if the master is slow, other threads block here waiting for the master to arrive and release them)
+        //AT:     (this is not what a customer might expect when specifying the NOWAIT clause)
+        //AT:     (specifying NOWAIT won't improve performance, which will be confusing to a customer)
+        //AT: another implementation of *barrier_gather*nowait() (or some other design) might run faster
+        //        and be more in line with the intent of NOWAIT
+        //AT: TO DO: run the EPCC benchmark and compare times
+
+        // this barrier should be invisible to a customer and to the thread profiler
+        //              (it's neither a terminating barrier nor customer's code, it's used for an internal purpose)
+#if USE_ITT_NOTIFY
+        __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+        retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func );
+        retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
+
+        // All workers except the master should do this pop here
+        //     (none of the workers will get to __kmpc_end_reduce_nowait()).
+        if ( __kmp_env_consistency_check ) {
+            if( retval == 0 ) {
+                __kmp_pop_sync( global_tid, ct_reduce, loc );
+            }
+        }
+
+    } else {
+
+        // should never reach this block
+        KMP_ASSERT( 0 ); // "unexpected method"
+
+    }
+#if OMP_40_ENABLED
+    if( teams_swapped ) {
+        // Restore thread structure
+        th->th.th_info.ds.ds_tid = 0;
+        th->th.th_team = team;
+        th->th.th_team_nproc = team->t.t_nproc;
+        th->th.th_task_team = team->t.t_task_team[task_state];
+        th->th.th_task_state = task_state;
+    }
+#endif
+    KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
+
+    return retval;
+}
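+
+/* Illustrative sketch of a caller, assuming hypothetical code-gen names (n, size,
+   data, reduce_func, crit): per the result codes documented above, 1 means reduce
+   and call the matching end function, 2 means update the result with atomics, and
+   0 means this worker has nothing left to do:
+
+       switch ( __kmpc_reduce_nowait( loc, gtid, n, size, data, reduce_func, &crit ) ) {
+           case 1:                                      // reduce the data, then close
+               reduce_func( lhs_data, rhs_data );
+               __kmpc_end_reduce_nowait( loc, gtid, &crit );
+               break;
+           case 2:                                      // atomic path: every thread
+               // ... atomic update of the reduction variable ...
+               break;
+           default:                                     // 0: worker, already done
+               break;
+       }
+*/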
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread id.
+@param lck pointer to the unique lock data structure
+
+Finish the execution of a reduce nowait.
+*/
+void
+__kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
+
+    PACKED_REDUCTION_METHOD_T packed_reduction_method;
+
+    KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) );
+
+    packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
+
+    if( packed_reduction_method == critical_reduce_block ) {
+
+        __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
+
+    } else if( packed_reduction_method == empty_reduce_block ) {
+
+        // usage: if team size == 1, no synchronization is required ( on Intel platforms only )
+
+    } else if( packed_reduction_method == atomic_reduce_block ) {
+
+        // Neither the master nor the other workers should get here
+        //     (code gen does not generate this call in case 2: atomic reduce block).
+        // Actually it would be better to remove this else-if entirely;
+        // after removal this value would be checked by the 'else' branch and would assert.
+
+    } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
+
+        // only master gets here
+
+    } else {
+
+        // should never reach this block
+        KMP_ASSERT( 0 ); // "unexpected method"
+
+    }
+
+    if ( __kmp_env_consistency_check )
+        __kmp_pop_sync( global_tid, ct_reduce, loc );
+
+    KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
+
+    return;
+}
+
+/* 2.a.ii. Reduce Block with a terminating barrier */
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread number
+@param num_vars number of items (variables) to be reduced
+@param reduce_size size of data in bytes to be reduced
+@param reduce_data pointer to data to be reduced
+@param reduce_func callback function that performs the reduction of two operands, returning the result in lhs_data
+@param lck pointer to the unique lock data structure
+@result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed
+
+A blocking reduce that includes an implicit barrier.
+*/
+kmp_int32
+__kmpc_reduce(
+    ident_t *loc, kmp_int32 global_tid,
+    kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
+    void (*reduce_func)(void *lhs_data, void *rhs_data),
+    kmp_critical_name *lck )
+{
+    KMP_COUNT_BLOCK(REDUCE_wait);
+    int retval = 0;
+    PACKED_REDUCTION_METHOD_T packed_reduction_method;
+
+    KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) );
+
+    // Why do we need this initialization here at all?
+    // A reduction clause cannot be a stand-alone directive.
+
+    // Do not call __kmp_serial_initialize(); it will be called by __kmp_parallel_initialize() if needed.
+    // Possible detection of a false-positive race by the thread checker?
+    if( ! TCR_4( __kmp_init_parallel ) )
+        __kmp_parallel_initialize();
+
+    // check correctness of reduce block nesting
+#if KMP_USE_DYNAMIC_LOCK
+    if ( __kmp_env_consistency_check )
+        __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
+#else
+    if ( __kmp_env_consistency_check )
+        __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
+#endif
+
+    packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
+    __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
+
+    if( packed_reduction_method == critical_reduce_block ) {
+
+        __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
+        retval = 1;
+
+    } else if( packed_reduction_method == empty_reduce_block ) {
+
+        // usage: if team size == 1, no synchronization is required ( Intel platforms only )
+        retval = 1;
+
+    } else if( packed_reduction_method == atomic_reduce_block ) {
+
+        retval = 2;
+
+    } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
+
+        //case tree_reduce_block:
+        // this barrier should be visible to a customer and to the thread profiler
+        //              (it's a terminating barrier on constructs if NOWAIT not specified)
+#if USE_ITT_NOTIFY
+        __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames
+#endif
+        retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func );
+        retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
+
+        // All workers except the master should do this pop here
+        //     (no worker other than the master will enter __kmpc_end_reduce()).
+        if ( __kmp_env_consistency_check ) {
+            if( retval == 0 ) { // 0: all other workers; 1: master
+                __kmp_pop_sync( global_tid, ct_reduce, loc );
+            }
+        }
+
+    } else {
+
+        // should never reach this block
+        KMP_ASSERT( 0 ); // "unexpected method"
+
+    }
+
+    KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
+
+    return retval;
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread id.
+@param lck pointer to the unique lock data structure
+
+Finish the execution of a blocking reduce.
+The <tt>lck</tt> pointer must be the same as that used in the corresponding start function.
+*/
+void
+__kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
+
+    PACKED_REDUCTION_METHOD_T packed_reduction_method;
+
+    KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) );
+
+    packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
+
+    // this barrier should be visible to a customer and to the thread profiler
+    //              (it's a terminating barrier on constructs if NOWAIT not specified)
+
+    if( packed_reduction_method == critical_reduce_block ) {
+
+        __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
+
+        // TODO: implicit barrier: should be exposed
+#if USE_ITT_NOTIFY
+        __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+        __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
+
+    } else if( packed_reduction_method == empty_reduce_block ) {
+
+        // usage: if team size == 1, no synchronization is required ( Intel platforms only )
+
+        // TODO: implicit barrier: should be exposed
+#if USE_ITT_NOTIFY
+        __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+        __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
+
+    } else if( packed_reduction_method == atomic_reduce_block ) {
+
+        // TODO: implicit barrier: should be exposed
+#if USE_ITT_NOTIFY
+        __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+        __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
+
+    } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
+
+        // only master executes here (master releases all other workers)
+        __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid );
+
+    } else {
+
+        // should never reach this block
+        KMP_ASSERT( 0 ); // "unexpected method"
+
+    }
+
+    if ( __kmp_env_consistency_check )
+        __kmp_pop_sync( global_tid, ct_reduce, loc );
+
+    KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
+
+    return;
+}
+
+#undef __KMP_GET_REDUCTION_METHOD
+#undef __KMP_SET_REDUCTION_METHOD
+
+/*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/
+
+kmp_uint64
+__kmpc_get_taskid() {
+
+    kmp_int32    gtid;
+    kmp_info_t * thread;
+
+    gtid = __kmp_get_gtid();
+    if ( gtid < 0 ) {
+        return 0;
+    }; // if
+    thread = __kmp_thread_from_gtid( gtid );
+    return thread->th.th_current_task->td_task_id;
+
+} // __kmpc_get_taskid
+
+
+kmp_uint64
+__kmpc_get_parent_taskid() {
+
+    kmp_int32        gtid;
+    kmp_info_t *     thread;
+    kmp_taskdata_t * parent_task;
+
+    gtid = __kmp_get_gtid();
+    if ( gtid < 0 ) {
+        return 0;
+    }; // if
+    thread      = __kmp_thread_from_gtid( gtid );
+    parent_task = thread->th.th_current_task->td_parent;
+    return ( parent_task == NULL ? 0 : parent_task->td_task_id );
+
+} // __kmpc_get_parent_taskid
+
+void __kmpc_place_threads(int nC, int nT, int nO)
+{
+    if ( ! __kmp_init_serial ) {
+        __kmp_serial_initialize();
+    }
+    __kmp_place_num_cores = nC;
+    __kmp_place_num_threads_per_core = nT;
+    __kmp_place_core_offset = nO;
+}
+
+// end of file //
+
diff --git a/final/runtime/src/kmp_debug.c b/final/runtime/src/kmp_debug.c
new file mode 100644
index 0000000..3bbffa5
--- /dev/null
+++ b/final/runtime/src/kmp_debug.c
@@ -0,0 +1,142 @@
+/*
+ * kmp_debug.c -- debug utilities for the Guide library
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_debug.h" /* really necessary? */
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+
+#ifdef KMP_DEBUG
+void
+__kmp_debug_printf_stdout( char const * format, ... )
+{
+    va_list ap;
+    va_start( ap, format );
+
+    __kmp_vprintf( kmp_out, format, ap );
+
+    va_end(ap);
+}
+#endif
+
+void
+__kmp_debug_printf( char const * format, ... )
+{
+    va_list ap;
+    va_start( ap, format );
+
+    __kmp_vprintf( kmp_err, format, ap );
+
+    va_end( ap );
+}
+
+#ifdef KMP_USE_ASSERT
+    int
+    __kmp_debug_assert(
+        char const *  msg,
+        char const *  file,
+        int           line
+    ) {
+
+        if ( file == NULL ) {
+            file = KMP_I18N_STR( UnknownFile );
+        } else {
+            // Remove directories from the path, leaving only the file name. The file name is enough;
+            // there is no need to bother developers and customers with full paths.
+            char const * slash = strrchr( file, '/' );
+            if ( slash != NULL ) {
+                file = slash + 1;
+            }; // if
+        }; // if
+
+        #ifdef KMP_DEBUG
+            __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
+            __kmp_debug_printf( "Assertion failure at %s(%d): %s.\n", file, line, msg );
+            __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
+            #ifdef USE_ASSERT_BREAK
+                #if KMP_OS_WINDOWS
+                    DebugBreak();
+                #endif
+            #endif // USE_ASSERT_BREAK
+            #ifdef USE_ASSERT_STALL
+                /*    __kmp_infinite_loop(); */
+                for(;;);
+            #endif // USE_ASSERT_STALL
+            #ifdef USE_ASSERT_SEG
+                {
+                    int volatile * ZERO = (int*) 0;
+                    ++ (*ZERO);
+                }
+            #endif // USE_ASSERT_SEG
+        #endif
+
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( AssertionFailure, file, line ),
+            KMP_HNT( SubmitBugReport ),
+            __kmp_msg_null
+        );
+
+        return 0;
+
+    } // __kmp_debug_assert
+
+#endif // KMP_USE_ASSERT
+
+/* Dump debugging buffer to stderr */
+void
+__kmp_dump_debug_buffer( void )
+{
+    if ( __kmp_debug_buffer != NULL ) {
+        int i;
+        int dc = __kmp_debug_count;
+        char *db = & __kmp_debug_buffer[ (dc % __kmp_debug_buf_lines) * __kmp_debug_buf_chars ];
+        char *db_end = & __kmp_debug_buffer[ __kmp_debug_buf_lines * __kmp_debug_buf_chars ];
+        char *db2;
+
+        __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
+        __kmp_printf_no_lock( "\nStart dump of debugging buffer (entry=%d):\n",
+                 dc % __kmp_debug_buf_lines );
+
+        for ( i = 0; i < __kmp_debug_buf_lines; i++ ) {
+
+            if ( *db != '\0' ) {
+                /* Fix up the case where there is no newline before the string termination char */
+                for ( db2 = db + 1; db2 < db + __kmp_debug_buf_chars - 1; db2 ++) {
+                    if ( *db2 == '\0' ) {
+                        if ( *(db2-1) != '\n' ) { *db2 = '\n'; *(db2+1) = '\0'; }
+                        break;
+                    }
+                }
+                /* Handle case at end by shortening the printed message by one char if necessary */
+                if ( db2 == db + __kmp_debug_buf_chars - 1 &&
+                     *db2 == '\0' && *(db2-1) != '\n' ) {
+                    *(db2-1) = '\n';
+                }
+
+                __kmp_printf_no_lock( "%4d: %.*s", i, __kmp_debug_buf_chars, db );
+                *db = '\0'; /* only let it print once! */
+            }
+
+            db += __kmp_debug_buf_chars;
+            if ( db >= db_end )
+                db = __kmp_debug_buffer;
+        }
+
+        __kmp_printf_no_lock( "End dump of debugging buffer (entry=%d).\n\n",
+                 ( dc+i-1 ) % __kmp_debug_buf_lines );
+        __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
+    }
+}
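+
+/* Worked example, with assumed values: if __kmp_debug_buf_lines == 4 and
+   __kmp_debug_count == 6, the oldest live entry is slot 6 % 4 == 2, so the dump
+   starts at &__kmp_debug_buffer[ 2 * __kmp_debug_buf_chars ], wraps past db_end
+   back to slot 0, and prints the 4 entries in chronological order. */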
diff --git a/final/runtime/src/kmp_debug.h b/final/runtime/src/kmp_debug.h
new file mode 100644
index 0000000..abc923e
--- /dev/null
+++ b/final/runtime/src/kmp_debug.h
@@ -0,0 +1,131 @@
+/*
+ * kmp_debug.h -- debug / assertion code for Assure library
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_DEBUG_H
+#define KMP_DEBUG_H
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+    extern "C" {
+#endif // __cplusplus
+
+// -------------------------------------------------------------------------------------------------
+// Build-time assertion.
+// -------------------------------------------------------------------------------------------------
+
+/*
+    Build-time assertion can do compile-time checking of data structure sizes, etc. This works by
+    declaring a negative-length array type if the conditional expression evaluates to false.  In that
+    case, the compiler issues an error and stops the compilation. If the expression is
+    true, we merely get an extraneous typedef of a single-character array type in the scope of the macro.
+
+    Usage:
+
+        KMP_BUILD_ASSERT( sizeof( some_t ) <= 32 );
+        KMP_BUILD_ASSERT( offsetof( some_t, field ) % 8 == 0 );
+
+    Do not use _KMP_BUILD_ASSERT or __KMP_BUILD_ASSERT directly; they are internal implementation details.
+*/
+
+#define __KMP_BUILD_ASSERT( expr, suffix )  typedef char __kmp_build_check_##suffix[ (expr) ? 1 : -1 ]
+#define _KMP_BUILD_ASSERT( expr, suffix )   __KMP_BUILD_ASSERT( (expr), suffix )
+#ifdef KMP_USE_ASSERT 
+    #define KMP_BUILD_ASSERT( expr )            _KMP_BUILD_ASSERT( (expr), __LINE__ )
+#else
+    #define KMP_BUILD_ASSERT( expr )            /* nothing to do */
+#endif
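+
+/* For example, KMP_BUILD_ASSERT( sizeof( kmp_int32 ) == 4 ) on line 100 expands,
+   after the _KMP_BUILD_ASSERT indirection pastes __LINE__, to in essence:
+
+       typedef char __kmp_build_check_100[ ( sizeof( kmp_int32 ) == 4 ) ? 1 : -1 ];
+
+   which compiles to nothing when the condition holds and fails with a
+   negative array size when it does not. */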
+
+// -------------------------------------------------------------------------------------------------
+// Run-time assertions.
+// -------------------------------------------------------------------------------------------------
+
+extern void __kmp_dump_debug_buffer( void );
+
+#ifdef KMP_USE_ASSERT
+    extern int __kmp_debug_assert( char const * expr, char const * file, int line );
+    #ifdef KMP_DEBUG
+        #define KMP_ASSERT( cond )             ( (cond) ? 0 : __kmp_debug_assert( #cond, __FILE__, __LINE__ ) )
+        #define KMP_ASSERT2( cond, msg )       ( (cond) ? 0 : __kmp_debug_assert( (msg), __FILE__, __LINE__ ) )
+        #define KMP_DEBUG_ASSERT( cond )       KMP_ASSERT( cond )
+        #define KMP_DEBUG_ASSERT2( cond, msg ) KMP_ASSERT2( cond, msg )
+    #else
+        // Do not expose condition in release build. Use "assertion failure".
+        #define KMP_ASSERT( cond )             ( (cond) ? 0 : __kmp_debug_assert( "assertion failure", __FILE__, __LINE__ ) )
+        #define KMP_ASSERT2( cond, msg )       KMP_ASSERT( cond )
+        #define KMP_DEBUG_ASSERT( cond )       0
+        #define KMP_DEBUG_ASSERT2( cond, msg ) 0
+    #endif // KMP_DEBUG
+#else
+    #define KMP_ASSERT( cond )             0
+    #define KMP_ASSERT2( cond, msg )       0
+    #define KMP_DEBUG_ASSERT( cond )       0
+    #define KMP_DEBUG_ASSERT2( cond, msg ) 0
+#endif // KMP_USE_ASSERT
+
+#ifdef KMP_DEBUG
+    extern void __kmp_debug_printf_stdout( char const * format, ... );
+#endif
+extern void __kmp_debug_printf( char const * format, ... );
+
+#ifdef KMP_DEBUG
+
+    extern int kmp_a_debug;
+    extern int kmp_b_debug;
+    extern int kmp_c_debug;
+    extern int kmp_d_debug;
+    extern int kmp_e_debug;
+    extern int kmp_f_debug;
+    extern int kmp_diag;
+
+    #define KA_TRACE(d,x)     if (kmp_a_debug >= d) { __kmp_debug_printf x ; }
+    #define KB_TRACE(d,x)     if (kmp_b_debug >= d) { __kmp_debug_printf x ; }
+    #define KC_TRACE(d,x)     if (kmp_c_debug >= d) { __kmp_debug_printf x ; }
+    #define KD_TRACE(d,x)     if (kmp_d_debug >= d) { __kmp_debug_printf x ; }
+    #define KE_TRACE(d,x)     if (kmp_e_debug >= d) { __kmp_debug_printf x ; }
+    #define KF_TRACE(d,x)     if (kmp_f_debug >= d) { __kmp_debug_printf x ; }
+    #define K_DIAG(d,x)       {if (kmp_diag == d) { __kmp_debug_printf_stdout x ; } }
+
+    #define KA_DUMP(d,x)     if (kmp_a_debug >= d) { int ks; __kmp_disable(&ks); (x) ; __kmp_enable(ks); }
+    #define KB_DUMP(d,x)     if (kmp_b_debug >= d) { int ks; __kmp_disable(&ks); (x) ; __kmp_enable(ks); }
+    #define KC_DUMP(d,x)     if (kmp_c_debug >= d) { int ks; __kmp_disable(&ks); (x) ; __kmp_enable(ks); }
+    #define KD_DUMP(d,x)     if (kmp_d_debug >= d) { int ks; __kmp_disable(&ks); (x) ; __kmp_enable(ks); }
+    #define KE_DUMP(d,x)     if (kmp_e_debug >= d) { int ks; __kmp_disable(&ks); (x) ; __kmp_enable(ks); }
+    #define KF_DUMP(d,x)     if (kmp_f_debug >= d) { int ks; __kmp_disable(&ks); (x) ; __kmp_enable(ks); }
+
+#else
+
+    #define KA_TRACE(d,x)     /* nothing to do */
+    #define KB_TRACE(d,x)     /* nothing to do */
+    #define KC_TRACE(d,x)     /* nothing to do */
+    #define KD_TRACE(d,x)     /* nothing to do */
+    #define KE_TRACE(d,x)     /* nothing to do */
+    #define KF_TRACE(d,x)     /* nothing to do */
+    #define K_DIAG(d,x)       {}/* nothing to do */
+
+    #define KA_DUMP(d,x)     /* nothing to do */
+    #define KB_DUMP(d,x)     /* nothing to do */
+    #define KC_DUMP(d,x)     /* nothing to do */
+    #define KD_DUMP(d,x)     /* nothing to do */
+    #define KE_DUMP(d,x)     /* nothing to do */
+    #define KF_DUMP(d,x)     /* nothing to do */
+
+#endif // KMP_DEBUG
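+
+/* Usage note: the trace macros take the whole printf-style argument list as one
+   parenthesized macro argument, so call sites look like
+
+       KA_TRACE( 10, ( "__kmp_foo: T#%d called\n", gtid ) );
+
+   which, in a KMP_DEBUG build with kmp_a_debug >= 10, expands to a call of
+   __kmp_debug_printf( "__kmp_foo: T#%d called\n", gtid ). */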
+
+#ifdef __cplusplus
+    } // extern "C"
+#endif // __cplusplus
+
+#endif /* KMP_DEBUG_H */
diff --git a/final/runtime/src/kmp_debugger.c b/final/runtime/src/kmp_debugger.c
new file mode 100644
index 0000000..d46c8a9
--- /dev/null
+++ b/final/runtime/src/kmp_debugger.c
@@ -0,0 +1,308 @@
+#if USE_DEBUGGER
+/*
+ * kmp_debugger.c -- debugger support.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_lock.h"
+#include "kmp_omp.h"
+#include "kmp_str.h"
+
+/*
+    NOTE: All variable names are known to the debugger, do not change!
+*/
+
+#ifdef __cplusplus
+    extern "C" {
+        extern kmp_omp_struct_info_t __kmp_omp_debug_struct_info;
+    } // extern "C"
+#endif // __cplusplus
+
+int __kmp_debugging          = FALSE;    // Boolean whether currently debugging OpenMP RTL.
+
+#define offset_and_size_of( structure, field )     \
+    {                                              \
+        offsetof( structure,           field ),    \
+        sizeof( ( (structure *) NULL)->field )     \
+    }
+
+#define offset_and_size_not_available \
+    { -1, -1 }
+
+#define addr_and_size_of( var )                    \
+    {                                              \
+        (kmp_uint64)( & var ),                     \
+        sizeof( var )                              \
+    }
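+
+/* For example, offset_and_size_of( kmp_base_info_t, th_team ) expands to
+   { offsetof( kmp_base_info_t, th_team ), sizeof( ((kmp_base_info_t *) NULL)->th_team ) },
+   i.e. an { offset, size } pair the debugger can use to locate the field without
+   needing the runtime's headers. */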
+
+#define nthr_buffer_size 1024
+static kmp_int32
+kmp_omp_nthr_info_buffer[ nthr_buffer_size ] =
+    { nthr_buffer_size * sizeof( kmp_int32 ) };
+
+/* TODO: Check punctuation for various platforms here */
+static char func_microtask[]    = "__kmp_invoke_microtask";
+static char func_fork[]         = "__kmpc_fork_call";
+static char func_fork_teams[]   = "__kmpc_fork_teams";
+
+
+// Various info about runtime structures: addresses, field offsets, sizes, etc.
+kmp_omp_struct_info_t
+__kmp_omp_debug_struct_info = {
+
+    /* Change this only if you make a fundamental data structure change here */
+    KMP_OMP_VERSION,
+
+    /* Sanity check.  Should only be checked if the versions are identical.
+     * This is also used for backward compatibility to get the runtime
+     * structure size if the runtime is older than the interface. */
+    sizeof( kmp_omp_struct_info_t ),
+
+    /* OpenMP RTL version info. */
+    addr_and_size_of( __kmp_version_major ),
+    addr_and_size_of( __kmp_version_minor ),
+    addr_and_size_of( __kmp_version_build ),
+    addr_and_size_of( __kmp_openmp_version ),
+    { (kmp_uint64)( __kmp_copyright ) + KMP_VERSION_MAGIC_LEN, 0 },        // Skip magic prefix.
+
+    /* Various globals. */
+    addr_and_size_of( __kmp_threads ),
+    addr_and_size_of( __kmp_root ),
+    addr_and_size_of( __kmp_threads_capacity ),
+    addr_and_size_of( __kmp_monitor ),
+#if ! KMP_USE_DYNAMIC_LOCK
+    addr_and_size_of( __kmp_user_lock_table ),
+#endif
+    addr_and_size_of( func_microtask ),
+    addr_and_size_of( func_fork ),
+    addr_and_size_of( func_fork_teams ),
+    addr_and_size_of( __kmp_team_counter ),
+    addr_and_size_of( __kmp_task_counter ),
+    addr_and_size_of( kmp_omp_nthr_info_buffer ),
+    sizeof( void * ),
+    OMP_LOCK_T_SIZE < sizeof(void *),
+    bs_last_barrier,
+    TASK_DEQUE_SIZE,
+
+    // thread structure information
+    sizeof( kmp_base_info_t ),
+    offset_and_size_of( kmp_base_info_t, th_info ),
+    offset_and_size_of( kmp_base_info_t, th_team ),
+    offset_and_size_of( kmp_base_info_t, th_root ),
+    offset_and_size_of( kmp_base_info_t, th_serial_team ),
+    offset_and_size_of( kmp_base_info_t, th_ident ),
+    offset_and_size_of( kmp_base_info_t, th_spin_here    ),
+    offset_and_size_of( kmp_base_info_t, th_next_waiting ),
+    offset_and_size_of( kmp_base_info_t, th_task_team    ),
+    offset_and_size_of( kmp_base_info_t, th_current_task ),
+    offset_and_size_of( kmp_base_info_t, th_task_state   ),
+    offset_and_size_of( kmp_base_info_t,   th_bar ),
+    offset_and_size_of( kmp_bstate_t,      b_worker_arrived ),
+
+    // teams information
+    offset_and_size_of( kmp_base_info_t, th_teams_microtask),
+    offset_and_size_of( kmp_base_info_t, th_teams_level),
+    offset_and_size_of( kmp_teams_size_t, nteams ),
+    offset_and_size_of( kmp_teams_size_t, nth ),
+
+    // kmp_desc structure (for info field above)
+    sizeof( kmp_desc_base_t ),
+    offset_and_size_of( kmp_desc_base_t, ds_tid    ),
+    offset_and_size_of( kmp_desc_base_t, ds_gtid   ),
+    // On Windows* OS, ds_thread contains a thread /handle/, which is not usable, while thread /id/
+    // is in ds_thread_id.
+    #if KMP_OS_WINDOWS
+    offset_and_size_of( kmp_desc_base_t, ds_thread_id),
+    #else
+    offset_and_size_of( kmp_desc_base_t, ds_thread),
+    #endif
+
+    // team structure information
+    sizeof( kmp_base_team_t ),
+    offset_and_size_of( kmp_base_team_t,   t_master_tid ),
+    offset_and_size_of( kmp_base_team_t,   t_ident      ),
+    offset_and_size_of( kmp_base_team_t,   t_parent     ),
+    offset_and_size_of( kmp_base_team_t,   t_nproc      ),
+    offset_and_size_of( kmp_base_team_t,   t_threads    ),
+    offset_and_size_of( kmp_base_team_t,   t_serialized ),
+    offset_and_size_of( kmp_base_team_t,   t_id         ),
+    offset_and_size_of( kmp_base_team_t,   t_pkfn       ),
+    offset_and_size_of( kmp_base_team_t,   t_task_team ),
+    offset_and_size_of( kmp_base_team_t,   t_implicit_task_taskdata ),
+    offset_and_size_of( kmp_base_team_t,   t_cancel_request ),
+    offset_and_size_of( kmp_base_team_t,   t_bar ),
+    offset_and_size_of( kmp_balign_team_t, b_master_arrived ),
+    offset_and_size_of( kmp_balign_team_t, b_team_arrived ),
+
+    // root structure information
+    sizeof( kmp_base_root_t ),
+    offset_and_size_of( kmp_base_root_t, r_root_team   ),
+    offset_and_size_of( kmp_base_root_t, r_hot_team    ),
+    offset_and_size_of( kmp_base_root_t, r_uber_thread ),
+    offset_and_size_not_available,
+
+    // ident structure information
+    sizeof( ident_t ),
+    offset_and_size_of( ident_t, psource ),
+    offset_and_size_of( ident_t, flags   ),
+
+    // lock structure information
+    sizeof( kmp_base_queuing_lock_t ),
+    offset_and_size_of( kmp_base_queuing_lock_t, initialized  ),
+    offset_and_size_of( kmp_base_queuing_lock_t, location ),
+    offset_and_size_of( kmp_base_queuing_lock_t, tail_id  ),
+    offset_and_size_of( kmp_base_queuing_lock_t, head_id  ),
+    offset_and_size_of( kmp_base_queuing_lock_t, next_ticket  ),
+    offset_and_size_of( kmp_base_queuing_lock_t, now_serving  ),
+    offset_and_size_of( kmp_base_queuing_lock_t, owner_id     ),
+    offset_and_size_of( kmp_base_queuing_lock_t, depth_locked ),
+    offset_and_size_of( kmp_base_queuing_lock_t, flags ),
+
+#if ! KMP_USE_DYNAMIC_LOCK
+    /* Lock table. */
+    sizeof( kmp_lock_table_t ),
+    offset_and_size_of( kmp_lock_table_t, used       ),
+    offset_and_size_of( kmp_lock_table_t, allocated  ),
+    offset_and_size_of( kmp_lock_table_t, table      ),
+#endif
+
+    // Task team structure information.
+    sizeof( kmp_base_task_team_t ),
+    offset_and_size_of( kmp_base_task_team_t, tt_threads_data       ),
+    offset_and_size_of( kmp_base_task_team_t, tt_found_tasks        ),
+    offset_and_size_of( kmp_base_task_team_t, tt_nproc              ),
+    offset_and_size_of( kmp_base_task_team_t, tt_unfinished_threads ),
+    offset_and_size_of( kmp_base_task_team_t, tt_active             ),
+
+    // task_data_t.
+    sizeof( kmp_taskdata_t ),
+    offset_and_size_of( kmp_taskdata_t, td_task_id                ),
+    offset_and_size_of( kmp_taskdata_t, td_flags                  ),
+    offset_and_size_of( kmp_taskdata_t, td_team                   ),
+    offset_and_size_of( kmp_taskdata_t, td_parent                 ),
+    offset_and_size_of( kmp_taskdata_t, td_level                  ),
+    offset_and_size_of( kmp_taskdata_t, td_ident                  ),
+    offset_and_size_of( kmp_taskdata_t, td_allocated_child_tasks  ),
+    offset_and_size_of( kmp_taskdata_t, td_incomplete_child_tasks ),
+
+    offset_and_size_of( kmp_taskdata_t, td_taskwait_ident   ),
+    offset_and_size_of( kmp_taskdata_t, td_taskwait_counter ),
+    offset_and_size_of( kmp_taskdata_t, td_taskwait_thread  ),
+
+    offset_and_size_of( kmp_taskdata_t, td_taskgroup        ),
+    offset_and_size_of( kmp_taskgroup_t, count              ),
+    offset_and_size_of( kmp_taskgroup_t, cancel_request     ),
+
+    offset_and_size_of( kmp_taskdata_t, td_depnode          ),
+    offset_and_size_of( kmp_depnode_list_t, node            ),
+    offset_and_size_of( kmp_depnode_list_t, next            ),
+    offset_and_size_of( kmp_base_depnode_t, successors      ),
+    offset_and_size_of( kmp_base_depnode_t, task            ),
+    offset_and_size_of( kmp_base_depnode_t, npredecessors   ),
+    offset_and_size_of( kmp_base_depnode_t, nrefs           ),
+    offset_and_size_of( kmp_task_t, routine                 ),
+
+    // thread_data_t.
+    sizeof( kmp_thread_data_t ),
+    offset_and_size_of( kmp_base_thread_data_t, td_deque             ),
+    offset_and_size_of( kmp_base_thread_data_t, td_deque_head        ),
+    offset_and_size_of( kmp_base_thread_data_t, td_deque_tail        ),
+    offset_and_size_of( kmp_base_thread_data_t, td_deque_ntasks      ),
+    offset_and_size_of( kmp_base_thread_data_t, td_deque_last_stolen ),
+
+    // The last field.
+    KMP_OMP_VERSION,
+
+}; // __kmp_omp_debug_struct_info
+
+#undef offset_and_size_of
+#undef addr_and_size_of
+
+/*
+  The Intel compiler on the IA-32 architecture issues the warning "conversion
+  from "unsigned long long" to "char *" may lose significant bits"
+  when a 64-bit value is assigned to a 32-bit pointer. Use this function
+  to suppress the warning.
+*/
+static inline
+void *
+__kmp_convert_to_ptr(
+    kmp_uint64    addr
+) {
+    #if KMP_COMPILER_ICC
+        #pragma warning( push )
+        #pragma warning( disable:  810 ) // conversion from "unsigned long long" to "char *" may lose significant bits
+        #pragma warning( disable: 1195 ) // conversion from integer to smaller pointer
+    #endif // KMP_COMPILER_ICC
+    return (void *) addr;
+    #if KMP_COMPILER_ICC
+        #pragma warning( pop )
+    #endif // KMP_COMPILER_ICC
+} // __kmp_convert_to_ptr
+
+
+static int
+kmp_location_match(
+    kmp_str_loc_t *        loc,
+    kmp_omp_nthr_item_t *  item
+) {
+
+    int file_match = 0;
+    int func_match = 0;
+    int line_match = 0;
+
+    char * file = (char *) __kmp_convert_to_ptr( item->file );
+    char * func = (char *) __kmp_convert_to_ptr( item->func );
+    file_match = __kmp_str_fname_match( & loc->fname, file );
+    func_match =
+        item->func == 0  // If item->func is NULL, it allows any func name.
+        ||
+        strcmp( func, "*" ) == 0
+        ||
+        ( loc->func != NULL && strcmp( loc->func, func ) == 0 );
+    line_match =
+        item->begin <= loc->line
+        &&
+        ( item->end <= 0 || loc->line <= item->end ); // if item->end <= 0, it means "end of file".
+
+    return ( file_match && func_match && line_match );
+
+} // kmp_location_match
+
+
+int
+__kmp_omp_num_threads(
+    ident_t const * ident
+) {
+
+    int num_threads = 0;
+
+    kmp_omp_nthr_info_t * info =
+        (kmp_omp_nthr_info_t *) __kmp_convert_to_ptr(  __kmp_omp_debug_struct_info.nthr_info.addr );
+    if ( info->num > 0 && info->array != 0 ) {
+        kmp_omp_nthr_item_t * items = (kmp_omp_nthr_item_t *) __kmp_convert_to_ptr( info->array );
+        kmp_str_loc_t         loc   = __kmp_str_loc_init( ident->psource, 1 );
+        int i;
+        for ( i = 0; i < info->num; ++ i ) {
+            if ( kmp_location_match( & loc, & items[ i ] ) ) {
+                num_threads = items[ i ].num_threads;
+            }; // if
+        }; // for
+        __kmp_str_loc_free( & loc );
+    }; // if
+
+    return num_threads;
+
+} // __kmp_omp_num_threads
+#endif /* USE_DEBUGGER */
diff --git a/final/runtime/src/kmp_debugger.h b/final/runtime/src/kmp_debugger.h
new file mode 100644
index 0000000..29f4134
--- /dev/null
+++ b/final/runtime/src/kmp_debugger.h
@@ -0,0 +1,51 @@
+#if USE_DEBUGGER
+/*
+ * kmp_debugger.h -- debugger support.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_DEBUGGER_H
+#define KMP_DEBUGGER_H
+
+#ifdef __cplusplus
+    extern "C" {
+#endif // __cplusplus
+
+/* * This external variable can be set by any debugger to flag to the runtime that we
+   are currently executing inside a debugger.  This allows the debugger to override
+   the number of threads spawned in a parallel region by using __kmp_omp_num_threads() (below).
+   * When __kmp_debugging is TRUE, each team and each task gets a unique integer identifier
+   that the debugger can use to conveniently identify teams and tasks.
+   * The debugger has access to __kmp_omp_debug_struct_info, which contains information
+   about the OpenMP library's important internal structures.  This access allows the debugger
+   to read detailed information about the typical OpenMP constructs (teams, threads, tasking, etc.)
+   during a debugging session and to offer detailed and useful information that the user can
+   probe about the OpenMP portion of their code.
+   */
+extern int __kmp_debugging;             /* Boolean whether currently debugging OpenMP RTL */
+// Return number of threads specified by the debugger for given parallel region.
+/* The ident field, which represents a source file location, is used to check whether the
+   debugger has changed the number of threads for the parallel region at source file
+   location ident.  This way, the number of threads for specific parallel regions can be
+   changed at the debugger's request.
+ */
+int __kmp_omp_num_threads( ident_t const * ident );
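+
+/* Illustrative sketch of a hypothetical call site (num_threads is an assumed local;
+   a return value of 0 means no debugger override matched this location):
+
+       if ( __kmp_debugging ) {
+           int nth = __kmp_omp_num_threads( loc );
+           if ( nth > 0 ) {
+               num_threads = nth;   // debugger-requested team size for this region
+           }
+       }
+*/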
+
+#ifdef __cplusplus
+    } // extern "C"
+#endif // __cplusplus
+
+
+#endif // KMP_DEBUGGER_H
+
+#endif // USE_DEBUGGER
diff --git a/final/runtime/src/kmp_dispatch.cpp b/final/runtime/src/kmp_dispatch.cpp
new file mode 100644
index 0000000..65abcf7
--- /dev/null
+++ b/final/runtime/src/kmp_dispatch.cpp
@@ -0,0 +1,2658 @@
+/*
+ * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/*
+ * Dynamic scheduling initialization and dispatch.
+ *
+ * NOTE: __kmp_nth is a constant inside of any dispatch loop; however,
+ *       it may change values between parallel regions.  __kmp_max_nth
+ *       is the largest value __kmp_nth may take; 1 is the smallest.
+ *
+ */
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_str.h"
+#include "kmp_error.h"
+#include "kmp_stats.h"
+#if KMP_OS_WINDOWS && KMP_ARCH_X86
+    #include <float.h>
+#endif
+
+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#include "ompt-specific.h"
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+// template for type limits
+template< typename T >
+struct i_maxmin {
+    static const T mx;
+    static const T mn;
+};
+template<>
+struct i_maxmin< int > {
+    static const int mx = 0x7fffffff;
+    static const int mn = 0x80000000;
+};
+template<>
+struct i_maxmin< unsigned int > {
+    static const unsigned int mx = 0xffffffff;
+    static const unsigned int mn = 0x00000000;
+};
+template<>
+struct i_maxmin< long long > {
+    static const long long mx = 0x7fffffffffffffffLL;
+    static const long long mn = 0x8000000000000000LL;
+};
+template<>
+struct i_maxmin< unsigned long long > {
+    static const unsigned long long mx = 0xffffffffffffffffULL;
+    static const unsigned long long mn = 0x0000000000000000ULL;
+};
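+
+/* Usage sketch: scheduling code can ask for type limits generically, e.g. in a
+   template over the iteration type T:
+
+       T upper_bound = i_maxmin< T >::mx;   // largest representable value of T
+       T lower_bound = i_maxmin< T >::mn;   // smallest (most negative, or zero)
+*/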
+//-------------------------------------------------------------------------
+
+#ifdef KMP_STATIC_STEAL_ENABLED
+
+    // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
+    template< typename T >
+    struct dispatch_private_infoXX_template {
+        typedef typename traits_t< T >::unsigned_t  UT;
+        typedef typename traits_t< T >::signed_t    ST;
+        UT count;                // unsigned
+        T  ub;
+        /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
+        T  lb;
+        ST st;                   // signed
+        UT tc;                   // unsigned
+        T  static_steal_counter; // for static_steal only; maybe better to put after ub
+
+        /* parm[1-4] are used in different ways by different scheduling algorithms */
+
+        // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+        //    a) parm3 is properly aligned and
+        //    b) all parm1-4 are in the same cache line.
+        // Because parm1-4 are used together, performance seems to be better
+        // if they are in the same cache line (not measured, though).
+
+        struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
+            T  parm1;
+            T  parm2;
+            T  parm3;
+            T  parm4;
+        };
+
+        UT ordered_lower; // unsigned
+        UT ordered_upper; // unsigned
+        #if KMP_OS_WINDOWS
+        T  last_upper;
+        #endif /* KMP_OS_WINDOWS */
+    };
+
+#else /* KMP_STATIC_STEAL_ENABLED */
+
+    // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
+    template< typename T >
+    struct dispatch_private_infoXX_template {
+        typedef typename traits_t< T >::unsigned_t  UT;
+        typedef typename traits_t< T >::signed_t    ST;
+        T  lb;
+        T  ub;
+        ST st;            // signed
+        UT tc;            // unsigned
+
+        T  parm1;
+        T  parm2;
+        T  parm3;
+        T  parm4;
+
+        UT count;         // unsigned
+
+        UT ordered_lower; // unsigned
+        UT ordered_upper; // unsigned
+        #if KMP_OS_WINDOWS
+	T  last_upper;
+        #endif /* KMP_OS_WINDOWS */
+    };
+
+#endif /* KMP_STATIC_STEAL_ENABLED */
+
+// replaces dispatch_private_info structure and dispatch_private_info_t type
+template< typename T >
+struct KMP_ALIGN_CACHE dispatch_private_info_template {
+    // duplicate alignment here, otherwise size of structure is not correct in our compiler
+    union KMP_ALIGN_CACHE private_info_tmpl {
+        dispatch_private_infoXX_template< T > p;
+        dispatch_private_info64_t             p64;
+    } u;
+    enum sched_type schedule;  /* scheduling algorithm */
+    kmp_uint32      ordered;   /* ordered clause specified */
+    kmp_uint32      ordered_bumped;
+    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size
+    dispatch_private_info * next; /* stack of buffers for nest of serial regions */
+    kmp_uint32      nomerge;   /* don't merge iters if serialized */
+    kmp_uint32      type_size;
+    enum cons_type  pushed_ws;
+};
+
+
+// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
+template< typename UT >
+struct dispatch_shared_infoXX_template {
+    /* chunk index under dynamic, number of idle threads under static-steal;
+       iteration index otherwise */
+    volatile UT     iteration;
+    volatile UT     num_done;
+    volatile UT     ordered_iteration;
+    UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
+};
+
+// replaces dispatch_shared_info structure and dispatch_shared_info_t type
+template< typename UT >
+struct dispatch_shared_info_template {
+    // we need union here to keep the structure size
+    union shared_info_tmpl {
+        dispatch_shared_infoXX_template< UT >  s;
+        dispatch_shared_info64_t               s64;
+    } u;
+    volatile kmp_uint32     buffer_index;
+};
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#undef USE_TEST_LOCKS
+
+// test_then_add template (general template should NOT be used)
+template< typename T >
+static __forceinline T
+test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
+
+template<>
+__forceinline kmp_int32
+test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
+{
+    kmp_int32 r;
+    r = KMP_TEST_THEN_ADD32( p, d );
+    return r;
+}
+
+template<>
+__forceinline kmp_int64
+test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
+{
+    kmp_int64 r;
+    r = KMP_TEST_THEN_ADD64( p, d );
+    return r;
+}
+
+// test_then_inc_acq template (general template should NOT be used)
+template< typename T >
+static __forceinline T
+test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
+
+template<>
+__forceinline kmp_int32
+test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
+{
+    kmp_int32 r;
+    r = KMP_TEST_THEN_INC_ACQ32( p );
+    return r;
+}
+
+template<>
+__forceinline kmp_int64
+test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
+{
+    kmp_int64 r;
+    r = KMP_TEST_THEN_INC_ACQ64( p );
+    return r;
+}
+
+// test_then_inc template (general template should NOT be used)
+template< typename T >
+static __forceinline T
+test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
+
+template<>
+__forceinline kmp_int32
+test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
+{
+    kmp_int32 r;
+    r = KMP_TEST_THEN_INC32( p );
+    return r;
+}
+
+template<>
+__forceinline kmp_int64
+test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
+{
+    kmp_int64 r;
+    r = KMP_TEST_THEN_INC64( p );
+    return r;
+}
+
+// compare_and_swap template (general template should NOT be used)
+template< typename T >
+static __forceinline kmp_int32
+compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
+
+template<>
+__forceinline kmp_int32
+compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
+{
+    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
+}
+
+template<>
+__forceinline kmp_int32
+compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
+{
+    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
+}
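+
+/* Illustrative sketch, with assumed variables counter, chunk, and flag, of how the
+   typed wrappers above are used:
+
+       kmp_int32 old = test_then_add< kmp_int32 >( &counter, chunk );   // fetch-and-add
+       if ( compare_and_swap< kmp_int32 >( &flag, 0, 1 ) ) {
+           // this thread atomically changed flag from 0 to 1
+       }
+*/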
+
+/*
+    Spin wait loop that first pauses, then yields.
+    Waits until the predicate function returns non-zero when called with *spinner and check.
+    Does NOT put threads to sleep.
+#if USE_ITT_BUILD
+    Arguments:
+        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
+            locks consistently. For example, if a lock is acquired immediately, its address is
+            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
+            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
+            address, not the address of the low-level spinner.
+#endif // USE_ITT_BUILD
+*/
+template< typename UT >
+// ToDo: make inline function (move to header file for icl)
+static UT  // unsigned 4- or 8-byte type
+__kmp_wait_yield( volatile UT * spinner,
+                  UT            checker,
+                  kmp_uint32 (* pred)( UT, UT )
+                  USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
+                  )
+{
+    // note: we may not belong to a team at this point
+    register volatile UT         * spin          = spinner;
+    register          UT           check         = checker;
+    register          kmp_uint32   spins;
+    register          kmp_uint32 (*f) ( UT, UT ) = pred;
+    register          UT           r;
+
+    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
+    KMP_INIT_YIELD( spins );
+    // main wait spin loop
+    while(!f(r = *spin, check))
+    {
+        KMP_FSYNC_SPIN_PREPARE( obj );
+        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
+           It causes problems with infinite recursion because of exit lock */
+        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
+            __kmp_abort_thread(); */
+
+        // If we are oversubscribed,
+        // or have waited a bit (and KMP_LIBRARY=throughput), then yield.
+        // The pause is in the following code.
+        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
+        KMP_YIELD_SPIN( spins );
+    }
+    KMP_FSYNC_SPIN_ACQUIRED( obj );
+    return r;
+}
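+
+/* A typical call site waits until a shared counter reaches a bound, as the
+   ordered-section code below does:
+
+       __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
+                               USE_ITT_BUILD_ARG( NULL ) );
+*/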
+
+template< typename UT >
+static kmp_uint32 __kmp_eq( UT value, UT checker) {
+    return value == checker;
+}
+
+template< typename UT >
+static kmp_uint32 __kmp_neq( UT value, UT checker) {
+    return value != checker;
+}
+
+template< typename UT >
+static kmp_uint32 __kmp_lt( UT value, UT checker) {
+    return value < checker;
+}
+
+template< typename UT >
+static kmp_uint32 __kmp_ge( UT value, UT checker) {
+    return value >= checker;
+}
+
+template< typename UT >
+static kmp_uint32 __kmp_le( UT value, UT checker) {
+    return value <= checker;
+}
+
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+static void
+__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
+{
+    kmp_info_t *th;
+
+    KMP_DEBUG_ASSERT( gtid_ref );
+
+    if ( __kmp_env_consistency_check ) {
+        th = __kmp_threads[*gtid_ref];
+        if ( th -> th.th_root -> r.r_active
+          && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
+#if KMP_USE_DYNAMIC_LOCK
+            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
+#else
+            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
+#endif
+        }
+    }
+}
+
+template< typename UT >
+static void
+__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
+{
+    typedef typename traits_t< UT >::signed_t    ST;
+    dispatch_private_info_template< UT > * pr;
+
+    int gtid = *gtid_ref;
+//    int  cid = *cid_ref;
+    kmp_info_t *th = __kmp_threads[ gtid ];
+    KMP_DEBUG_ASSERT( th -> th.th_dispatch );
+
+    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
+    if ( __kmp_env_consistency_check ) {
+        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
+            ( th -> th.th_dispatch -> th_dispatch_pr_current );
+        if ( pr -> pushed_ws != ct_none ) {
+#if KMP_USE_DYNAMIC_LOCK
+            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
+#else
+            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
+#endif
+        }
+    }
+
+    if ( ! th -> th.th_team -> t.t_serialized ) {
+        dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
+            ( th -> th.th_dispatch -> th_dispatch_sh_current );
+        UT  lower;
+
+        if ( ! __kmp_env_consistency_check ) {
+                pr = reinterpret_cast< dispatch_private_info_template< UT >* >
+                    ( th -> th.th_dispatch -> th_dispatch_pr_current );
+        }
+        lower = pr->u.p.ordered_lower;
+
+        #if ! defined( KMP_GOMP_COMPAT )
+            if ( __kmp_env_consistency_check ) {
+                if ( pr->ordered_bumped ) {
+                    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
+                    __kmp_error_construct2(
+                        kmp_i18n_msg_CnsMultipleNesting,
+                        ct_ordered_in_pdo, loc_ref,
+                        & p->stack_data[ p->w_top ]
+                    );
+                }
+            }
+        #endif /* !defined(KMP_GOMP_COMPAT) */
+
+        KMP_MB();
+        #ifdef KMP_DEBUG
+        {
+            const char * buff;
+            // create format specifiers before the debug output
+            buff = __kmp_str_format(
+                "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
+                traits_t< UT >::spec, traits_t< UT >::spec );
+            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
+            __kmp_str_free( &buff );
+        }
+        #endif
+
+        __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
+                                USE_ITT_BUILD_ARG( NULL )
+                                );
+        KMP_MB();  /* is this necessary? */
+        #ifdef KMP_DEBUG
+        {
+            const char * buff;
+            // create format specifiers before the debug output
+            buff = __kmp_str_format(
+                "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
+                traits_t< UT >::spec, traits_t< UT >::spec );
+            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
+            __kmp_str_free( &buff );
+        }
+        #endif
+    }
+    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
+}
+
+static void
+__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
+{
+    kmp_info_t *th;
+
+    if ( __kmp_env_consistency_check ) {
+        th = __kmp_threads[*gtid_ref];
+        if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
+            __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
+        }
+    }
+}
+
+template< typename UT >
+static void
+__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
+{
+    typedef typename traits_t< UT >::signed_t    ST;
+    dispatch_private_info_template< UT > * pr;
+
+    int gtid = *gtid_ref;
+//    int  cid = *cid_ref;
+    kmp_info_t *th = __kmp_threads[ gtid ];
+    KMP_DEBUG_ASSERT( th -> th.th_dispatch );
+
+    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
+    if ( __kmp_env_consistency_check ) {
+        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
+            ( th -> th.th_dispatch -> th_dispatch_pr_current );
+        if ( pr -> pushed_ws != ct_none ) {
+            __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
+        }
+    }
+
+    if ( ! th -> th.th_team -> t.t_serialized ) {
+        dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
+            ( th -> th.th_dispatch -> th_dispatch_sh_current );
+
+        if ( ! __kmp_env_consistency_check ) {
+            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
+                ( th -> th.th_dispatch -> th_dispatch_pr_current );
+        }
+
+        KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
+        #if ! defined( KMP_GOMP_COMPAT )
+            if ( __kmp_env_consistency_check ) {
+                if ( pr->ordered_bumped != 0 ) {
+                    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
+                    /* How to test it? - OM */
+                    __kmp_error_construct2(
+                        kmp_i18n_msg_CnsMultipleNesting,
+                        ct_ordered_in_pdo, loc_ref,
+                        & p->stack_data[ p->w_top ]
+                    );
+                }
+            }
+        #endif /* !defined(KMP_GOMP_COMPAT) */
+
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+        pr->ordered_bumped += 1;
+
+        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
+                        gtid, pr->ordered_bumped ) );
+
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+        /* TODO use general release procedure? */
+        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
+
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+    }
+    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
+}
+
+/* Computes and returns x to the power of y, where y must be a non-negative integer */
+template< typename UT >
+static __forceinline long double
+__kmp_pow(long double x, UT y) {
+    long double s=1.0L;
+
+    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
+    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
+    while(y) {
+        if ( y & 1 )
+            s *= x;
+        x *= x;
+        y >>= 1;
+    }
+    return s;
+}
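+
+// Worked example: for y = 13 (binary 1101) the loop multiplies s by x at the
+// set bits while squaring x each round, i.e. s = x^1 * x^4 * x^8 = x^13,
+// using O(log y) multiplications instead of y - 1.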
+
+/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
+   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
+   __forceinline appears to be broken here: if this function is marked __forceinline,
+   the behavior is wrong (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
+*/
+template< typename T >
+static __inline typename traits_t< T >::unsigned_t
+__kmp_dispatch_guided_remaining(
+    T                                  tc,
+    typename traits_t< T >::floating_t base,
+    typename traits_t< T >::unsigned_t idx
+) {
+    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
+       least for ICL 8.1, long double arithmetic may not really have
+       long double precision, even with /Qlong_double.  Currently, we
+       workaround that in the caller code, by manipulating the FPCW for
+       Windows* OS on IA-32 architecture.  The lack of precision is not
+       expected to be a correctness issue, though.
+    */
+    typedef typename traits_t< T >::unsigned_t  UT;
+
+    long double x = tc * __kmp_pow< UT >(base, idx);
+    UT r = (UT) x;
+    if ( x == r )
+        return r;
+    return r + 1;
+}
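+
+// Worked example (hypothetical values): with tc = 1000 and nproc = 4, the
+// base is 1 - 0.5/4 = 0.875, so after idx = 8 chunks have been assigned the
+// remaining iterations are ceil( 1000 * 0.875^8 ) = ceil( 343.6 ) = 344.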
+
+// Parameters of the guided-iterative algorithm:
+//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
+//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
+// By default n = 2. For example, with n = 3 the chunk distribution will be
+// flatter; with n = 1 the first chunk is the same as for a static schedule,
+// i.e. trip / nproc.
+static int guided_int_param = 2;
+static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
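+
+// Worked example (hypothetical values): with n = 2, nproc = 4 and chunk = 7,
+// the switch point is p2 = 2 * 4 * (7 + 1) = 64 remaining iterations, and
+// before that point each successive chunk is roughly
+// remaining * p3 = remaining * (0.5 / 4) = remaining / 8 iterations.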
+
+// UT - unsigned flavor of T, ST - signed flavor of T,
+// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
+template< typename T >
+static void
+__kmp_dispatch_init(
+    ident_t                        * loc,
+    int                              gtid,
+    enum sched_type                  schedule,
+    T                                lb,
+    T                                ub,
+    typename traits_t< T >::signed_t st,
+    typename traits_t< T >::signed_t chunk,
+    int                              push_ws
+) {
+    typedef typename traits_t< T >::unsigned_t  UT;
+    typedef typename traits_t< T >::signed_t    ST;
+    typedef typename traits_t< T >::floating_t  DBL;
+    static const int ___kmp_size_type = sizeof( UT );
+
+    int                                            active;
+    T                                              tc;
+    kmp_info_t *                                   th;
+    kmp_team_t *                                   team;
+    kmp_uint32                                     my_buffer_index;
+    dispatch_private_info_template< T >          * pr;
+    dispatch_shared_info_template< UT > volatile * sh;
+
+    KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
+    KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
+
+    if ( ! TCR_4( __kmp_init_parallel ) )
+        __kmp_parallel_initialize();
+
+#if INCLUDE_SSC_MARKS
+    SSC_MARK_DISPATCH_INIT();
+#endif
+    #ifdef KMP_DEBUG
+    {
+        const char * buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
+            traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
+        KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
+        __kmp_str_free( &buff );
+    }
+    #endif
+    /* setup data */
+    th     = __kmp_threads[ gtid ];
+    team   = th -> th.th_team;
+    active = ! team -> t.t_serialized;
+    th->th.th_ident = loc;
+
+#if USE_ITT_BUILD
+    kmp_uint64 cur_chunk = chunk;
+    int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
+        KMP_MASTER_GTID(gtid) &&
+#if OMP_40_ENABLED
+        th->th.th_teams_microtask == NULL &&
+#endif
+        team->t.t_active_level == 1;
+#endif
+    if ( ! active ) {
+        pr = reinterpret_cast< dispatch_private_info_template< T >* >
+            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
+    } else {
+        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
+                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
+
+        my_buffer_index = th->th.th_dispatch->th_disp_index ++;
+
+        /* What happens when the number of threads changes? Do we need to resize the buffer? */
+        pr = reinterpret_cast< dispatch_private_info_template< T >  * >
+            ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
+        sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
+            ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
+    }
+
+    /* Pick up the nomerge/ordered bits from the scheduling type */
+    if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
+        pr->nomerge = TRUE;
+        schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
+    } else {
+        pr->nomerge = FALSE;
+    }
+    pr->type_size = ___kmp_size_type; // remember the size of variables
+    if ( kmp_ord_lower & schedule ) {
+        pr->ordered = TRUE;
+        schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
+    } else {
+        pr->ordered = FALSE;
+    }
+    if ( schedule == kmp_sch_static ) {
+        schedule = __kmp_static;
+    } else {
+        if ( schedule == kmp_sch_runtime ) {
+            // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
+            schedule = team -> t.t_sched.r_sched_type;
+            // Detail the schedule if needed (global controls are differentiated appropriately)
+            if ( schedule == kmp_sch_guided_chunked ) {
+                schedule = __kmp_guided;
+            } else if ( schedule == kmp_sch_static ) {
+                schedule = __kmp_static;
+            }
+            // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
+            chunk = team -> t.t_sched.chunk;
+
+            #ifdef KMP_DEBUG
+            {
+                const char * buff;
+                // create format specifiers before the debug output
+                buff = __kmp_str_format(
+                    "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
+                    traits_t< ST >::spec );
+                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
+                __kmp_str_free( &buff );
+            }
+            #endif
+        } else {
+            if ( schedule == kmp_sch_guided_chunked ) {
+                schedule = __kmp_guided;
+            }
+            if ( chunk <= 0 ) {
+                chunk = KMP_DEFAULT_CHUNK;
+            }
+        }
+
+        if ( schedule == kmp_sch_auto ) {
+            // mapping and differentiation are done in __kmp_do_serial_initialize()
+            schedule = __kmp_auto;
+            #ifdef KMP_DEBUG
+            {
+                const char * buff;
+                // create format specifiers before the debug output
+                buff = __kmp_str_format(
+                    "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
+                    traits_t< ST >::spec );
+                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
+                __kmp_str_free( &buff );
+            }
+            #endif
+        }
+
+        /* guided analytical not safe for too many threads */
+        if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
+            schedule = kmp_sch_guided_iterative_chunked;
+            KMP_WARNING( DispatchManyThreads );
+        }
+        pr->u.p.parm1 = chunk;
+    }
+    KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
+                "unknown scheduling type" );
+
+    pr->u.p.count = 0;
+
+    if ( __kmp_env_consistency_check ) {
+        if ( st == 0 ) {
+            __kmp_error_construct(
+                kmp_i18n_msg_CnsLoopIncrZeroProhibited,
+                ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
+            );
+        }
+    }
+
+    tc = ( ub - lb + st );
+    if ( st != 1 ) {
+        if ( st < 0 ) {
+            if ( lb < ub ) {
+                tc = 0;            // zero-trip
+            } else {   // lb >= ub
+                tc = (ST)tc / st;  // convert to signed division
+            }
+        } else {       // st > 0
+            if ( ub < lb ) {
+                tc = 0;            // zero-trip
+            } else {   // lb <= ub
+                tc /= st;
+            }
+        }
+    } else if ( ub < lb ) {        // st == 1
+        tc = 0;                    // zero-trip
+    }
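+    // Worked examples (hypothetical values): lb=0, ub=9, st=2 gives
+    // tc = (9 - 0 + 2) / 2 = 5 (iterations 0,2,4,6,8), while lb=10, ub=1,
+    // st=-3 gives tc = (ST)(1 - 10 - 3) / -3 = 4 (iterations 10,7,4,1).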
+
+    pr->u.p.lb = lb;
+    pr->u.p.ub = ub;
+    pr->u.p.st = st;
+    pr->u.p.tc = tc;
+
+    #if KMP_OS_WINDOWS
+    pr->u.p.last_upper = ub + st;
+    #endif /* KMP_OS_WINDOWS */
+
+    /* NOTE: only the active parallel region(s) have active ordered sections */
+
+    if ( active ) {
+        if ( pr->ordered == 0 ) {
+            th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
+            th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
+        } else {
+            pr->ordered_bumped = 0;
+
+            pr->u.p.ordered_lower = 1;
+            pr->u.p.ordered_upper = 0;
+
+            th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
+            th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
+        }
+    }
+
+    if ( __kmp_env_consistency_check ) {
+        enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
+        if ( push_ws ) {
+            __kmp_push_workshare( gtid, ws, loc );
+            pr->pushed_ws = ws;
+        } else {
+            __kmp_check_workshare( gtid, ws, loc );
+            pr->pushed_ws = ct_none;
+        }
+    }
+
+    switch ( schedule ) {
+    #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
+    case kmp_sch_static_steal:
+        {
+            T nproc = team->t.t_nproc;
+            T ntc, init;
+
+            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
+
+            ntc = (tc % chunk ? 1 : 0) + tc / chunk;
+            if ( nproc > 1 && ntc >= nproc ) {
+                T id = __kmp_tid_from_gtid(gtid);
+                T small_chunk, extras;
+
+                small_chunk = ntc / nproc;
+                extras = ntc % nproc;
+
+                init = id * small_chunk + ( id < extras ? id : extras );
+                pr->u.p.count = init;
+                pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
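+                // Worked example (hypothetical values): tc=100, chunk=8 gives
+                // ntc = 13 chunks; with nproc=4, small_chunk=3 and extras=1,
+                // thread 0 owns chunks [0,4) while threads 1..3 own
+                // [4,7), [7,10) and [10,13) respectively.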
+
+                pr->u.p.parm2 = lb;
+                //pr->pfields.parm3 = 0; // it's not used in static_steal
+                pr->u.p.parm4 = id;
+                pr->u.p.st = st;
+                break;
+            } else {
+                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
+                               gtid ) );
+                schedule = kmp_sch_static_balanced;
+                /* too few iterations: fall-through to kmp_sch_static_balanced */
+            } // if
+            /* FALL-THROUGH to static balanced */
+        } // case
+    #endif
+    case kmp_sch_static_balanced:
+        {
+            T nproc = team->t.t_nproc;
+            T init, limit;
+
+            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
+                            gtid ) );
+
+            if ( nproc > 1 ) {
+                T id = __kmp_tid_from_gtid(gtid);
+
+                if ( tc < nproc ) {
+                    if ( id < tc ) {
+                        init = id;
+                        limit = id;
+                        pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
+                    } else {
+                        pr->u.p.count = 1;  /* means no more chunks to execute */
+                        pr->u.p.parm1 = FALSE;
+                        break;
+                    }
+                } else {
+                    T small_chunk = tc / nproc;
+                    T extras = tc % nproc;
+                    init = id * small_chunk + (id < extras ? id : extras);
+                    limit = init + small_chunk - (id < extras ? 0 : 1);
+                    pr->u.p.parm1 = (id == nproc - 1);
+                }
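+                // Worked example (hypothetical values): tc=10, nproc=4 gives
+                // small_chunk=2 and extras=2, so threads 0..3 get the offset
+                // ranges [0,2], [3,5], [6,7] and [8,9] (3+3+2+2 = 10 iterations).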
+            } else {
+                if ( tc > 0 ) {
+                    init = 0;
+                    limit = tc - 1;
+                    pr->u.p.parm1 = TRUE;
+                } else {
+                    // zero trip count
+                    pr->u.p.count = 1;  /* means no more chunks to execute */
+                    pr->u.p.parm1 = FALSE;
+                    break;
+                }
+            }
+#if USE_ITT_BUILD
+            // Calculate chunk for metadata report
+            if ( itt_need_metadata_reporting )
+                cur_chunk = limit - init + 1;
+#endif
+            if ( st == 1 ) {
+                pr->u.p.lb = lb + init;
+                pr->u.p.ub = lb + limit;
+            } else {
+                T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
+                pr->u.p.lb = lb + init * st;
+                // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
+                if ( st > 0 ) {
+                    pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
+                } else {
+                    pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
+                }
+            }
+            if ( pr->ordered ) {
+                pr->u.p.ordered_lower = init;
+                pr->u.p.ordered_upper = limit;
+            }
+            break;
+        } // case
+    case kmp_sch_guided_iterative_chunked :
+        {
+            T nproc = team->t.t_nproc;
+            KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
+
+            if ( nproc > 1 ) {
+                if ( (2L * chunk + 1 ) * nproc >= tc ) {
+                    /* chunk size too large, switch to dynamic */
+                    schedule = kmp_sch_dynamic_chunked;
+                } else {
+                    // when the remaining iterations become less than parm2, switch to dynamic
+                    pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
+                    *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
+                }
+            } else {
+                KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
+                schedule = kmp_sch_static_greedy;
+                /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
+                KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
+                pr->u.p.parm1 = tc;
+            } // if
+        } // case
+        break;
+    case kmp_sch_guided_analytical_chunked:
+        {
+            T nproc = team->t.t_nproc;
+            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
+
+            if ( nproc > 1 ) {
+                if ( (2L * chunk + 1 ) * nproc >= tc ) {
+                    /* chunk size too large, switch to dynamic */
+                    schedule = kmp_sch_dynamic_chunked;
+                } else {
+                    /* commonly used term: (2 nproc - 1)/(2 nproc) */
+                    DBL x;
+
+                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
+                    /* Linux* OS already has 64-bit computation by default for
+                       long double, and on Windows* OS on Intel(R) 64,
+                       /Qlong_double doesn't work.  On Windows* OS
+                       on IA-32 architecture, we need to set precision to
+                       64-bit instead of the default 53-bit. Even though long
+                       double doesn't work on Windows* OS on Intel(R) 64, the
+                       resulting lack of precision is not expected to impact
+                       the correctness of the algorithm, but this has not been
+                       mathematically proven.
+                    */
+                    // save original FPCW and set precision to 64-bit, as
+                    // Windows* OS on IA-32 architecture defaults to 53-bit
+                    unsigned int oldFpcw = _control87(0,0);
+                    _control87(_PC_64,_MCW_PC); // 0,0x30000
+                    #endif
+                    /* value used for comparison in solver for cross-over point */
+                    long double target = ((long double)chunk * 2 + 1) * nproc / tc;
+
+                    /* crossover point -- chunk indexes equal to or greater than
+                       this point switch to dynamic-style scheduling */
+                    UT   cross;
+
+                    /* commonly used term: (2 nproc - 1)/(2 nproc) */
+                    x = (long double)1.0 - (long double)0.5 / nproc;
+
+                    #ifdef KMP_DEBUG
+                    { // test natural alignment
+                        struct _test_a {
+                            char a;
+                            union {
+                                char b;
+                                DBL  d;
+                            };
+                        } t;
+                        ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
+                        //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
+                        KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
+                    }
+                    #endif // KMP_DEBUG
+
+                    /* save the term in thread private dispatch structure */
+                    *(DBL*)&pr->u.p.parm3 = x;
+
+                    /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
+                    {
+                        UT          left, right, mid;
+                        long double p;
+
+                        /* estimate initial upper and lower bound */
+
+                        /* It doesn't matter what value 'right' starts at, as long
+                           as it is positive; it only affects the performance of
+                           the solver.
+                        */
+                        right = 229;
+                        p = __kmp_pow< UT >(x,right);
+                        if ( p > target ) {
+                            do{
+                                p *= p;
+                                right <<= 1;
+                            } while(p>target && right < (1<<27));
+                            left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
+                        } else {
+                            left = 0;
+                        }
+
+                        /* bisection root-finding method */
+                        while ( left + 1 < right ) {
+                            mid = (left + right) / 2;
+                            if ( __kmp_pow< UT >(x,mid) > target ) {
+                                left = mid;
+                            } else {
+                                right = mid;
+                            }
+                        } // while
+                        cross = right;
+                    }
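+                    // Worked example (hypothetical values): chunk=1, nproc=4,
+                    // tc=1000 gives target = 3*4/1000 = 0.012 and x = 0.875;
+                    // the bisection yields cross = 34, since 0.875^33 > 0.012
+                    // while 0.875^34 <= 0.012.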
+                    /* assert sanity of computed crossover point */
+                    KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
+
+                    /* save the crossover point in thread private dispatch structure */
+                    pr->u.p.parm2 = cross;
+
+                    // C75803
+                    #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
+                        #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
+                    #else
+                        #define GUIDED_ANALYTICAL_WORKAROUND (x)
+                    #endif
+                    /* dynamic-style scheduling offset */
+                    pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
+                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
+                        // restore FPCW
+                        _control87(oldFpcw,_MCW_PC);
+                    #endif
+                } // if
+            } else {
+                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
+                               gtid ) );
+                schedule = kmp_sch_static_greedy;
+                /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
+                pr->u.p.parm1 = tc;
+            } // if
+        } // case
+        break;
+    case kmp_sch_static_greedy:
+        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
+        pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
+            ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
+            tc;
+        break;
+    case kmp_sch_static_chunked :
+    case kmp_sch_dynamic_chunked :
+        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
+        break;
+    case kmp_sch_trapezoidal :
+        {
+            /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
+
+            T parm1, parm2, parm3, parm4;
+            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
+
+            parm1 = chunk;
+
+            /* F : size of the first cycle */
+            parm2 = ( tc / (2 * team->t.t_nproc) );
+
+            if ( parm2 < 1 ) {
+                parm2 = 1;
+            }
+
+            /* L : size of the last cycle.  Make sure the last cycle
+             *     is not larger than the first cycle.
+             */
+            if ( parm1 < 1 ) {
+                parm1 = 1;
+            } else if ( parm1 > parm2 ) {
+                parm1 = parm2;
+            }
+
+            /* N : number of cycles */
+            parm3 = ( parm2 + parm1 );
+            parm3 = ( 2 * tc + parm3 - 1) / parm3;
+
+            if ( parm3 < 2 ) {
+                parm3 = 2;
+            }
+
+            /* sigma : decreasing incr of the trapezoid */
+            parm4 = ( parm3 - 1 );
+            parm4 = ( parm2 - parm1 ) / parm4;
+
+            // pointless check, because parm4 >= 0 always
+            //if ( parm4 < 0 ) {
+            //    parm4 = 0;
+            //}
+
+            pr->u.p.parm1 = parm1;
+            pr->u.p.parm2 = parm2;
+            pr->u.p.parm3 = parm3;
+            pr->u.p.parm4 = parm4;
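+
+            // Worked example (hypothetical values): tc=100, nproc=2, chunk=1
+            // gives parm1=1, parm2=25, parm3=(2*100+26-1)/26=8 cycles and
+            // parm4=(25-1)/(8-1)=3, i.e. chunk sizes 25,22,19,16,13,10,7,4
+            // (sum 116 >= tc; the final chunk is clipped to the trip count).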
+        } // case
+        break;
+
+    default:
+        {
+            __kmp_msg(
+                kmp_ms_fatal,                        // Severity
+                KMP_MSG( UnknownSchedTypeDetected ), // Primary message
+                KMP_HNT( GetNewerLibrary ),          // Hint
+                __kmp_msg_null                       // Variadic argument list terminator
+            );
+        }
+        break;
+    } // switch
+    pr->schedule = schedule;
+    if ( active ) {
+        /* This buffer is free to use once sh->buffer_index reaches my_buffer_index */
+
+        KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
+                        gtid, my_buffer_index, sh->buffer_index) );
+        __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
+                                        USE_ITT_BUILD_ARG( NULL )
+                                        );
+            // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and my_buffer_index are
+            //       *always* 32-bit integers.
+        KMP_MB();  /* is this necessary? */
+        KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
+                        gtid, my_buffer_index, sh->buffer_index) );
+
+        th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
+        th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
+#if USE_ITT_BUILD
+        if ( pr->ordered ) {
+            __kmp_itt_ordered_init( gtid );
+        }; // if
+        // Report loop metadata
+        if ( itt_need_metadata_reporting ) {
+            // Only report metadata by master of active team at level 1
+            kmp_uint64 schedtype = 0;
+            switch ( schedule ) {
+            case kmp_sch_static_chunked:
+            case kmp_sch_static_balanced:// Chunk is calculated in the switch above
+                break;
+            case kmp_sch_static_greedy:
+                cur_chunk = pr->u.p.parm1;
+                break;
+            case kmp_sch_dynamic_chunked:
+                schedtype = 1;
+                break;
+            case kmp_sch_guided_iterative_chunked:
+            case kmp_sch_guided_analytical_chunked:
+                schedtype = 2;
+                break;
+            default:
+//            Should we put this case under "static"?
+//            case kmp_sch_static_steal:
+                schedtype = 3;
+                break;
+            }
+            __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
+        }
+#endif /* USE_ITT_BUILD */
+    }; // if
+
+    #ifdef KMP_DEBUG
+    {
+        const char * buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
+            " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
+            " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
+            traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
+            traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
+            traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
+            traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
+        KD_TRACE(10, ( buff,
+            gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
+            pr->u.p.st, pr->u.p.tc, pr->u.p.count,
+            pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
+            pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
+        __kmp_str_free( &buff );
+    }
+    #endif
+    #if ( KMP_STATIC_STEAL_ENABLED )
+    if ( ___kmp_size_type < 8 ) {
+      // It cannot be guaranteed that, after execution of a loop with some other
+      // schedule kind, all the parm3 variables will contain the same value.
+      // Even if all parm3 values were the same, a bad case could still arise,
+      // e.g. using 0 and 1 alternately rather than a program-lifetime increment.
+      // So a dedicated variable is required; 'static_steal_counter' is used.
+      if( schedule == kmp_sch_static_steal ) {
+        // Other threads will inspect this variable when searching for a victim.
+        // It is a flag showing that, from now on, other threads may steal from this thread.
+        volatile T * p = &pr->u.p.static_steal_counter;
+        *p = *p + 1;
+      }
+    }
+    #endif // ( KMP_STATIC_STEAL_ENABLED )
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+        ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+            team_info->parallel_id, task_info->task_id, team_info->microtask);
+    }
+#endif
+}
+
+/*
+ * For ordered loops, either __kmp_dispatch_finish() should be called after
+ * every iteration, or __kmp_dispatch_finish_chunk() should be called after
+ * every chunk of iterations.  If the ordered section(s) were not executed
+ * for this iteration (or every iteration in this chunk), we need to set the
+ * ordered iteration counters so that the next thread can proceed.
+ */
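+//
+// Usage sketch (illustrative): after a chunk in which the ordered region was
+// not entered, the finish routine is called so waiting threads can proceed:
+//     while ( __kmp_dispatch_next< T >( loc, gtid, &last, &lb, &ub, &st ) ) {
+//         for ( T i = lb; i <= ub; i += st ) { /* ordered region skipped */ }
+//         __kmp_dispatch_finish_chunk< UT >( gtid, loc );
+//     }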
+template< typename UT >
+static void
+__kmp_dispatch_finish( int gtid, ident_t *loc )
+{
+    typedef typename traits_t< UT >::signed_t ST;
+    kmp_info_t *th = __kmp_threads[ gtid ];
+
+    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
+    if ( ! th -> th.th_team -> t.t_serialized ) {
+
+        dispatch_private_info_template< UT > * pr =
+            reinterpret_cast< dispatch_private_info_template< UT >* >
+            ( th->th.th_dispatch->th_dispatch_pr_current );
+        dispatch_shared_info_template< UT > volatile * sh =
+            reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
+            ( th->th.th_dispatch->th_dispatch_sh_current );
+        KMP_DEBUG_ASSERT( pr );
+        KMP_DEBUG_ASSERT( sh );
+        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
+                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
+
+        if ( pr->ordered_bumped ) {
+            KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
+                            gtid ) );
+            pr->ordered_bumped = 0;
+        } else {
+            UT lower = pr->u.p.ordered_lower;
+
+            #ifdef KMP_DEBUG
+            {
+                const char * buff;
+                // create format specifiers before the debug output
+                buff = __kmp_str_format(
+                    "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
+                    traits_t< UT >::spec, traits_t< UT >::spec );
+                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
+                __kmp_str_free( &buff );
+            }
+            #endif
+
+            __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
+                                   USE_ITT_BUILD_ARG(NULL)
+                                   );
+            KMP_MB();  /* is this necessary? */
+            #ifdef KMP_DEBUG
+            {
+                const char * buff;
+                // create format specifiers before the debug output
+                buff = __kmp_str_format(
+                    "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
+                    traits_t< UT >::spec, traits_t< UT >::spec );
+                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
+                __kmp_str_free( &buff );
+            }
+            #endif
+
+            test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
+        } // if
+    } // if
+    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
+}
+
+#ifdef KMP_GOMP_COMPAT
+
+template< typename UT >
+static void
+__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
+{
+    typedef typename traits_t< UT >::signed_t ST;
+    kmp_info_t *th = __kmp_threads[ gtid ];
+
+    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
+    if ( ! th -> th.th_team -> t.t_serialized ) {
+//        int cid;
+        dispatch_private_info_template< UT > * pr =
+            reinterpret_cast< dispatch_private_info_template< UT >* >
+            ( th->th.th_dispatch->th_dispatch_pr_current );
+        dispatch_shared_info_template< UT > volatile * sh =
+            reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
+            ( th->th.th_dispatch->th_dispatch_sh_current );
+        KMP_DEBUG_ASSERT( pr );
+        KMP_DEBUG_ASSERT( sh );
+        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
+                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
+
+//        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
+            UT lower = pr->u.p.ordered_lower;
+            UT upper = pr->u.p.ordered_upper;
+            UT inc = upper - lower + 1;
+
+            if ( pr->ordered_bumped == inc ) {
+                KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
+                  gtid ) );
+                pr->ordered_bumped = 0;
+            } else {
+                inc -= pr->ordered_bumped;
+
+                #ifdef KMP_DEBUG
+                {
+                    const char * buff;
+                    // create format specifiers before the debug output
+                    buff = __kmp_str_format(
+                        "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
+                        "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
+                        traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
+                    KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
+                    __kmp_str_free( &buff );
+                }
+                #endif
+
+                __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
+                                       USE_ITT_BUILD_ARG(NULL)
+                                       );
+
+                KMP_MB();  /* is this necessary? */
+                KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
+                  gtid ) );
+                pr->ordered_bumped = 0;
+                // TODO: check whether inc should be unsigned or signed.
+                #ifdef KMP_DEBUG
+                {
+                    const char * buff;
+                    // create format specifiers before the debug output
+                    buff = __kmp_str_format(
+                        "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
+                        "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
+                        traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
+                    KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
+                    __kmp_str_free( &buff );
+                }
+                #endif
+
+                test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
+            }
+//        }
+    }
+    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
+}
+
+#endif /* KMP_GOMP_COMPAT */
+
+/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
+ * (no more work), then tell OMPT the loop is over. In some cases
+ * kmp_dispatch_fini() is not called. */
+#if OMPT_SUPPORT && OMPT_TRACE
+#define OMPT_LOOP_END                                                          \
+    if (status == 0) {                                                         \
+        if ((ompt_status == ompt_status_track_callback) &&                     \
+            ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
+            ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
+            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
+            ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
+                team_info->parallel_id, task_info->task_id);                   \
+        }                                                                      \
+    }
+#else
+#define OMPT_LOOP_END // no-op
+#endif
+
+template< typename T >
+static int
+__kmp_dispatch_next(
+    ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
+) {
+
+    typedef typename traits_t< T >::unsigned_t  UT;
+    typedef typename traits_t< T >::signed_t    ST;
+    typedef typename traits_t< T >::floating_t  DBL;
+#if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
+    static const int ___kmp_size_type = sizeof( UT );
+#endif
+
+    int                                   status;
+    dispatch_private_info_template< T > * pr;
+    kmp_info_t                          * th   = __kmp_threads[ gtid ];
+    kmp_team_t                          * team = th -> th.th_team;
+
+    KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
+    #ifdef KMP_DEBUG
+    {
+        const char * buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
+            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
+        KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
+        __kmp_str_free( &buff );
+    }
+    #endif
+
+    if ( team -> t.t_serialized ) {
+        /* NOTE: serialize this dispatch because we are not at the active level */
+        pr = reinterpret_cast< dispatch_private_info_template< T >* >
+            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
+        KMP_DEBUG_ASSERT( pr );
+
+        if ( (status = (pr->u.p.tc != 0)) == 0 ) {
+            *p_lb = 0;
+            *p_ub = 0;
+//            if ( p_last != NULL )
+//                *p_last = 0;
+            if ( p_st != NULL )
+                *p_st = 0;
+            if ( __kmp_env_consistency_check ) {
+                if ( pr->pushed_ws != ct_none ) {
+                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
+                }
+            }
+        } else if ( pr->nomerge ) {
+            kmp_int32 last;
+            T         start;
+            UT        limit, trip, init;
+            ST        incr;
+            T         chunk = pr->u.p.parm1;
+
+            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
+
+            init = chunk * pr->u.p.count++;
+            trip = pr->u.p.tc - 1;
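+            // Worked example (hypothetical values): chunk=4, tc=10 gives
+            // trip=9; successive calls claim init=0 (iterations 0-3), init=4
+            // (4-7) and init=8 (8-9, clipped to trip), and init=12 > trip
+            // ends the loop.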
+
+            if ( (status = (init <= trip)) == 0 ) {
+                *p_lb = 0;
+                *p_ub = 0;
+//                if ( p_last != NULL )
+//                    *p_last = 0;
+                if ( p_st != NULL )
+                    *p_st = 0;
+                if ( __kmp_env_consistency_check ) {
+                    if ( pr->pushed_ws != ct_none ) {
+                        pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
+                    }
+                }
+            } else {
+                start = pr->u.p.lb;
+                limit = chunk + init - 1;
+                incr  = pr->u.p.st;
+
+                if ( (last = (limit >= trip)) != 0 ) {
+                    limit = trip;
+                    #if KMP_OS_WINDOWS
+                    pr->u.p.last_upper = pr->u.p.ub;
+                    #endif /* KMP_OS_WINDOWS */
+                }
+                if ( p_last != NULL )
+                    *p_last = last;
+                if ( p_st != NULL )
+                    *p_st = incr;
+                if ( incr == 1 ) {
+                    *p_lb = start + init;
+                    *p_ub = start + limit;
+                } else {
+                    *p_lb = start + init * incr;
+                    *p_ub = start + limit * incr;
+                }
+
+                if ( pr->ordered ) {
+                    pr->u.p.ordered_lower = init;
+                    pr->u.p.ordered_upper = limit;
+                    #ifdef KMP_DEBUG
+                    {
+                        const char * buff;
+                        // create format specifiers before the debug output
+                        buff = __kmp_str_format(
+                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
+                            traits_t< UT >::spec, traits_t< UT >::spec );
+                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
+                        __kmp_str_free( &buff );
+                    }
+                    #endif
+                } // if
+            } // if
+        } else {
+            pr->u.p.tc = 0;
+            *p_lb = pr->u.p.lb;
+            *p_ub = pr->u.p.ub;
+            #if KMP_OS_WINDOWS
+            pr->u.p.last_upper = *p_ub;
+            #endif /* KMP_OS_WINDOWS */
+            if ( p_last != NULL )
+                *p_last = TRUE;
+            if ( p_st != NULL )
+                *p_st = pr->u.p.st;
+        } // if
+        #ifdef KMP_DEBUG
+        {
+            const char * buff;
+            // create format specifiers before the debug output
+            buff = __kmp_str_format(
+                "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
+                "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
+                traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
+            KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
+            __kmp_str_free( &buff );
+        }
+        #endif
+#if INCLUDE_SSC_MARKS
+        SSC_MARK_DISPATCH_NEXT();
+#endif
+        OMPT_LOOP_END;
+        return status;
+    } else {
+        kmp_int32 last = 0;
+        dispatch_shared_info_template< UT > *sh;
+        T         start;
+        ST        incr;
+        UT        limit, trip, init;
+
+        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
+                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
+
+        pr = reinterpret_cast< dispatch_private_info_template< T >* >
+            ( th->th.th_dispatch->th_dispatch_pr_current );
+        KMP_DEBUG_ASSERT( pr );
+        sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
+            ( th->th.th_dispatch->th_dispatch_sh_current );
+        KMP_DEBUG_ASSERT( sh );
+
+        if ( pr->u.p.tc == 0 ) {
+            // zero trip count
+            status = 0;
+        } else {
+            switch (pr->schedule) {
+            #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
+            case kmp_sch_static_steal:
+                {
+                    T chunk = pr->u.p.parm1;
+
+                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
+
+                    trip = pr->u.p.tc - 1;
+
+                    if ( ___kmp_size_type > 4 ) {
+                        // Other threads do not look at this thread's data,
+                        // so no volatile cast is necessary.
+                        init   = ( pr->u.p.count )++;
+                        status = ( init < (UT)pr->u.p.ub );
+                    } else {
+                        typedef union {
+                            struct {
+                                UT count;
+                                T  ub;
+                            } p;
+                            kmp_int64 b;
+                        } union_i4;
+                        // All operations on 'count' and 'ub' must be performed
+                        // atomically as a single combined update.
+                        // Stealing is implemented only for 4-byte indexes.
+                        {
+                            union_i4 vold, vnew;
+                            vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
+                            vnew = vold;
+                            vnew.p.count++;
+                            while( ! KMP_COMPARE_AND_STORE_ACQ64(
+                                        ( volatile kmp_int64* )&pr->u.p.count,
+                                        *VOLATILE_CAST(kmp_int64 *)&vold.b,
+                                        *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
+                                KMP_CPU_PAUSE();
+                                vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
+                                vnew = vold;
+                                vnew.p.count++;
+                            }
+                            vnew = vold;
+                            init   = vnew.p.count;
+                            status = ( init < (UT)vnew.p.ub ) ;
+                        }
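+                        // The union packs the 4-byte 'count' and 'ub' fields
+                        // into one 64-bit word, so a single compare-and-store
+                        // updates both together: a thief shrinking 'ub' can
+                        // never interleave with this increment of 'count'.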
+
+                        if( !status ) {
+                            kmp_info_t   **other_threads = team->t.t_threads;
+                            int          while_limit = 10;
+                            int          while_index = 0;
+
+                            // TODO: the algorithm for searching for a victim
+                            // should be cleaned up and measured
+                            while ( ( !status ) && ( while_limit != ++while_index ) ) {
+                                union_i4  vold, vnew;
+                                kmp_int32 remaining; // kmp_int32 because KMP_I4 only
+                                T         victimIdx    = pr->u.p.parm4;
+                                T         oldVictimIdx = victimIdx;
+                                dispatch_private_info_template< T > * victim;
+
+                                do {
+                                    if( !victimIdx ) {
+                                        victimIdx = team->t.t_nproc - 1;
+                                    } else {
+                                        --victimIdx;
+                                    }
+                                    victim = reinterpret_cast< dispatch_private_info_template< T >* >
+                                        ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
+                                } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
+                                // TODO: think about a proper place for this test
+                                if ( ( !victim ) ||
+                                   ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
+                                     (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
+                                    // The victim is not yet ready to participate in stealing
+                                    // because it is still in kmp_init_dispatch.
+                                    // TODO: a delay would be nice here.
+                                    continue;
+                                }
+                                if ( oldVictimIdx == victimIdx ) {
+                                    break;
+                                }
+                                pr->u.p.parm4 = victimIdx;
+
+                                while( 1 ) {
+                                    vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
+                                    vnew = vold;
+
+                                    KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
+                                    if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
+                                        break;
+                                    }
+                                    vnew.p.ub -= (remaining >> 2);
+                                    KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
+                                    #pragma warning( push )
+                                    // disable warning on pointless comparison of unsigned with 0
+                                    #pragma warning( disable: 186 )
+                                        KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
+                                    #pragma warning( pop )
+                                    // TODO: Should this be acquire or release?
+                                    if ( KMP_COMPARE_AND_STORE_ACQ64(
+                                            ( volatile kmp_int64 * )&victim->u.p.count,
+                                            *VOLATILE_CAST(kmp_int64 *)&vold.b,
+                                            *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
+                                        status = 1;
+                                        while_index = 0;
+                                        // now update own count and ub
+                                        #if KMP_ARCH_X86
+                                            // Stealing is executed on non-KMP_ARCH_X86 builds only;
+                                            // an atomic 64-bit write is unavailable on IA-32,
+                                            // so we do this in steps.
+                                            // This code is not tested.
+                                            init = vold.p.count;
+                                            pr->u.p.ub = 0;
+                                            pr->u.p.count = init + 1;
+                                            pr->u.p.ub = vnew.p.count;
+                                        #else
+                                            init = vnew.p.ub;
+                                            vold.p.count = init + 1;
+                                            // TODO: is this safe and sufficient?
+                                            *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
+                                        #endif // KMP_ARCH_X86
+                                        break;
+                                    } // if
+                                    KMP_CPU_PAUSE();
+                                } // while (1)
+                            } // while
+                        } // if
+                    } // if
+                    if ( !status ) {
+                        *p_lb = 0;
+                        *p_ub = 0;
+                        if ( p_st != NULL ) *p_st = 0;
+                    } else {
+                        start = pr->u.p.parm2;
+                        init *= chunk;
+                        limit = chunk + init - 1;
+                        incr  = pr->u.p.st;
+
+                        KMP_DEBUG_ASSERT(init <= trip);
+                        if ( (last = (limit >= trip)) != 0 )
+                            limit = trip;
+                        if ( p_st != NULL ) *p_st = incr;
+
+                        if ( incr == 1 ) {
+                            *p_lb = start + init;
+                            *p_ub = start + limit;
+                        } else {
+                            *p_lb = start + init * incr;
+                            *p_ub = start + limit * incr;
+                        }
+
+                        if ( pr->ordered ) {
+                            pr->u.p.ordered_lower = init;
+                            pr->u.p.ordered_upper = limit;
+                            #ifdef KMP_DEBUG
+                            {
+                                const char * buff;
+                                // create format specifiers before the debug output
+                                buff = __kmp_str_format(
+                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
+                                    traits_t< UT >::spec, traits_t< UT >::spec );
+                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
+                                __kmp_str_free( &buff );
+                            }
+                            #endif
+                        } // if
+                    } // if
+                    break;
+                } // case
+            #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
+            case kmp_sch_static_balanced:
+                {
+                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
+                    if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
+                        pr->u.p.count = 1;
+                        *p_lb = pr->u.p.lb;
+                        *p_ub = pr->u.p.ub;
+                        last = pr->u.p.parm1;
+                        if ( p_st != NULL )
+                            *p_st = pr->u.p.st;
+                    } else {  /* no iterations to do */
+                        pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
+                    }
+                    if ( pr->ordered ) {
+                        #ifdef KMP_DEBUG
+                        {
+                            const char * buff;
+                            // create format specifiers before the debug output
+                            buff = __kmp_str_format(
+                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
+                                traits_t< UT >::spec, traits_t< UT >::spec );
+                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
+                            __kmp_str_free( &buff );
+                        }
+                        #endif
+                    } // if
+                } // case
+                break;
+            case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
+            case kmp_sch_static_chunked:
+                {
+                    T parm1;
+
+                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
+                                   gtid ) );
+                    parm1 = pr->u.p.parm1;
+
+                    trip  = pr->u.p.tc - 1;
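+                    // Chunks of parm1 iterations are handed out round-robin: thread 'tid'
+                    // takes chunk numbers tid, tid+nproc, tid+2*nproc, ... (count is advanced
+                    // by nproc below), so its next chunk starts at iteration parm1*(count+tid).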
+                    init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
+
+                    if ( (status = (init <= trip)) != 0 ) {
+                        start = pr->u.p.lb;
+                        incr  = pr->u.p.st;
+                        limit = parm1 + init - 1;
+
+                        if ( (last = (limit >= trip)) != 0 )
+                            limit = trip;
+
+                        if ( p_st != NULL ) *p_st = incr;
+
+                        pr->u.p.count += team->t.t_nproc;
+
+                        if ( incr == 1 ) {
+                            *p_lb = start + init;
+                            *p_ub = start + limit;
+                        }
+                        else {
+                            *p_lb = start + init * incr;
+                            *p_ub = start + limit * incr;
+                        }
+
+                        if ( pr->ordered ) {
+                            pr->u.p.ordered_lower = init;
+                            pr->u.p.ordered_upper = limit;
+                            #ifdef KMP_DEBUG
+                            {
+                                const char * buff;
+                                // create format specifiers before the debug output
+                                buff = __kmp_str_format(
+                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
+                                    traits_t< UT >::spec, traits_t< UT >::spec );
+                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
+                                __kmp_str_free( &buff );
+                            }
+                            #endif
+                        } // if
+                    } // if
+                } // case
+                break;
+
+            case kmp_sch_dynamic_chunked:
+                {
+                    T chunk = pr->u.p.parm1;
+
+                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
+                                   gtid ) );
+
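+                    // sh->u.s.iteration counts chunks here: each thread atomically claims the
+                    // next chunk number, so this chunk starts at iteration chunk_number*chunk
+                    // (iterations are numbered 0 .. tc-1).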
+                    init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
+                    trip = pr->u.p.tc - 1;
+
+                    if ( (status = (init <= trip)) == 0 ) {
+                        *p_lb = 0;
+                        *p_ub = 0;
+                        if ( p_st != NULL ) *p_st = 0;
+                    } else {
+                        start = pr->u.p.lb;
+                        limit = chunk + init - 1;
+                        incr  = pr->u.p.st;
+
+                        if ( (last = (limit >= trip)) != 0 )
+                            limit = trip;
+
+                        if ( p_st != NULL ) *p_st = incr;
+
+                        if ( incr == 1 ) {
+                            *p_lb = start + init;
+                            *p_ub = start + limit;
+                        } else {
+                            *p_lb = start + init * incr;
+                            *p_ub = start + limit * incr;
+                        }
+
+                        if ( pr->ordered ) {
+                            pr->u.p.ordered_lower = init;
+                            pr->u.p.ordered_upper = limit;
+                            #ifdef KMP_DEBUG
+                            {
+                                const char * buff;
+                                // create format specifiers before the debug output
+                                buff = __kmp_str_format(
+                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
+                                    traits_t< UT >::spec, traits_t< UT >::spec );
+                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
+                                __kmp_str_free( &buff );
+                            }
+                            #endif
+                        } // if
+                    } // if
+                } // case
+                break;
+
+            case kmp_sch_guided_iterative_chunked:
+                {
+                    T  chunkspec = pr->u.p.parm1;
+                    KD_TRACE(100,
+                        ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
+                    trip  = pr->u.p.tc;
+                    // Start atomic part of calculations
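+                    // Note: parm2 holds the dynamic/guided switch-over threshold
+                    // (~ K*nproc*(chunk+1), K=2 by default, per the comment below), and parm3
+                    // holds a double scale factor bit-copied into it, applied below as
+                    // 'remaining * parm3' to take roughly remaining/(K*nproc) iterations.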
+                    while(1) {
+                        ST  remaining;             // signed, because can be < 0
+                        init = sh->u.s.iteration;  // shared value
+                        remaining = trip - init;
+                        if ( remaining <= 0 ) {    // AC: need to compare with 0 first
+                            // nothing to do, don't try atomic op
+                            status = 0;
+                            break;
+                        }
+                        if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
+                            // use dynamic-style schedule
+                            // atomically increment iterations, get old value
+                            init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
+                            remaining = trip - init;
+                            if (remaining <= 0) {
+                                status = 0;    // all iterations got by other threads
+                            } else {
+                                // got some iterations to work on
+                                status = 1;
+                                if ( (T)remaining > chunkspec ) {
+                                    limit = init + chunkspec - 1;
+                                } else {
+                                    last = 1;   // the last chunk
+                                    limit = init + remaining - 1;
+                                } // if
+                            } // if
+                            break;
+                        } // if
+                        limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
+                        if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
+                            // CAS was successful, chunk obtained
+                            status = 1;
+                            --limit;
+                            break;
+                        } // if
+                    } // while
+                    if ( status != 0 ) {
+                        start = pr->u.p.lb;
+                        incr = pr->u.p.st;
+                        if ( p_st != NULL )
+                            *p_st = incr;
+                        *p_lb = start + init * incr;
+                        *p_ub = start + limit * incr;
+                        if ( pr->ordered ) {
+                            pr->u.p.ordered_lower = init;
+                            pr->u.p.ordered_upper = limit;
+                            #ifdef KMP_DEBUG
+                            {
+                                const char * buff;
+                                // create format specifiers before the debug output
+                                buff = __kmp_str_format(
+                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
+                                    traits_t< UT >::spec, traits_t< UT >::spec );
+                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
+                                __kmp_str_free( &buff );
+                            }
+                            #endif
+                        } // if
+                    } else {
+                        *p_lb = 0;
+                        *p_ub = 0;
+                        if ( p_st != NULL )
+                            *p_st = 0;
+                    } // if
+                } // case
+                break;
+
+            case kmp_sch_guided_analytical_chunked:
+                {
+                    T   chunkspec = pr->u.p.parm1;
+                    UT chunkIdx;
+    #if KMP_OS_WINDOWS && KMP_ARCH_X86
+                    /* for storing original FPCW value for Windows* OS on
+                       IA-32 architecture (8-byte version) */
+                    unsigned int oldFpcw;
+                    unsigned int fpcwSet = 0;
+    #endif
+                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
+                                   gtid ) );
+
+                    trip  = pr->u.p.tc;
+
+                    KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
+                    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
+
+                    while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
+                        chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
+                        if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
+                            --trip;
+                            /* use dynamic-style scheduling */
+                            init = chunkIdx * chunkspec + pr->u.p.count;
+                            /* need to verify init > 0 in case of overflow in the above calculation */
+                            if ( (status = (init > 0 && init <= trip)) != 0 ) {
+                                limit = init + chunkspec -1;
+
+                                if ( (last = (limit >= trip)) != 0 )
+                                    limit = trip;
+                            }
+                            break;
+                        } else {
+                            /* use exponential-style scheduling */
+                            /* The following check is to work around the lack of long double precision on Windows* OS.
+                               This check works around the possible effect that init != 0 for chunkIdx == 0.
+                             */
+    #if KMP_OS_WINDOWS && KMP_ARCH_X86
+                            /* If we haven't already done so, save original
+                               FPCW and set precision to 64-bit, as Windows* OS
+                               on IA-32 architecture defaults to 53-bit */
+                            if ( !fpcwSet ) {
+                                oldFpcw = _control87(0,0);
+                                _control87(_PC_64,_MCW_PC);
+                                fpcwSet = 0x30000;
+                            }
+    #endif
+                            if ( chunkIdx ) {
+                                init = __kmp_dispatch_guided_remaining< T >(
+                                           trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
+                                KMP_DEBUG_ASSERT(init);
+                                init = trip - init;
+                            } else
+                                init = 0;
+                            limit = trip - __kmp_dispatch_guided_remaining< T >(
+                                               trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
+                            KMP_ASSERT(init <= limit);
+                            if ( init < limit ) {
+                                KMP_DEBUG_ASSERT(limit <= trip);
+                                --limit;
+                                status = 1;
+                                break;
+                            } // if
+                        } // if
+                    } // while (1)
+    #if KMP_OS_WINDOWS && KMP_ARCH_X86
+                    /* restore FPCW if necessary
+                       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
+                    */
+                    if ( fpcwSet && ( oldFpcw & fpcwSet ) )
+                        _control87(oldFpcw,_MCW_PC);
+    #endif
+                    if ( status != 0 ) {
+                        start = pr->u.p.lb;
+                        incr = pr->u.p.st;
+                        if ( p_st != NULL )
+                            *p_st = incr;
+                        *p_lb = start + init * incr;
+                        *p_ub = start + limit * incr;
+                        if ( pr->ordered ) {
+                            pr->u.p.ordered_lower = init;
+                            pr->u.p.ordered_upper = limit;
+                            #ifdef KMP_DEBUG
+                            {
+                                const char * buff;
+                                // create format specifiers before the debug output
+                                buff = __kmp_str_format(
+                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
+                                    traits_t< UT >::spec, traits_t< UT >::spec );
+                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
+                                __kmp_str_free( &buff );
+                            }
+                            #endif
+                        }
+                    } else {
+                        *p_lb = 0;
+                        *p_ub = 0;
+                        if ( p_st != NULL )
+                            *p_st = 0;
+                    }
+                } // case
+                break;
+
+            case kmp_sch_trapezoidal:
+                {
+                    UT   index;
+                    T    parm2 = pr->u.p.parm2;
+                    T    parm3 = pr->u.p.parm3;
+                    T    parm4 = pr->u.p.parm4;
+                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
+                                   gtid ) );
+
+                    index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
+
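+                    // Chunk sizes shrink linearly: chunk k has parm2 - k*parm4 iterations and
+                    // parm3 is the chunk count, so the first 'index' chunks cover
+                    //     index*parm2 - parm4*(0+1+...+(index-1))
+                    //         = ( index * ( 2*parm2 - (index-1)*parm4 ) ) / 2
+                    // iterations -- the starting iteration of this chunk, computed below.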
+                    init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
+                    trip = pr->u.p.tc - 1;
+
+                    if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
+                        *p_lb = 0;
+                        *p_ub = 0;
+                        if ( p_st != NULL ) *p_st = 0;
+                    } else {
+                        start = pr->u.p.lb;
+                        limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
+                        incr  = pr->u.p.st;
+
+                        if ( (last = (limit >= trip)) != 0 )
+                            limit = trip;
+
+                        if ( p_st != NULL ) *p_st = incr;
+
+                        if ( incr == 1 ) {
+                            *p_lb = start + init;
+                            *p_ub = start + limit;
+                        } else {
+                            *p_lb = start + init * incr;
+                            *p_ub = start + limit * incr;
+                        }
+
+                        if ( pr->ordered ) {
+                            pr->u.p.ordered_lower = init;
+                            pr->u.p.ordered_upper = limit;
+                            #ifdef KMP_DEBUG
+                            {
+                                const char * buff;
+                                // create format specifiers before the debug output
+                                buff = __kmp_str_format(
+                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
+                                    traits_t< UT >::spec, traits_t< UT >::spec );
+                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
+                                __kmp_str_free( &buff );
+                            }
+                            #endif
+                        } // if
+                    } // if
+                } // case
+                break;
+            default:
+                {
+                    status = 0; // to avoid complaints on uninitialized variable use
+                    __kmp_msg(
+                        kmp_ms_fatal,                        // Severity
+                        KMP_MSG( UnknownSchedTypeDetected ), // Primary message
+                        KMP_HNT( GetNewerLibrary ),          // Hint
+                        __kmp_msg_null                       // Variadic argument list terminator
+                    );
+                }
+                break;
+            } // switch
+        } // if tc == 0;
+
+        if ( status == 0 ) {
+            UT   num_done;
+
+            num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
+            #ifdef KMP_DEBUG
+            {
+                const char * buff;
+                // create format specifiers before the debug output
+                buff = __kmp_str_format(
+                    "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
+                    traits_t< UT >::spec );
+                KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
+                __kmp_str_free( &buff );
+            }
+            #endif
+
+            if ( (ST)num_done == team->t.t_nproc-1 ) {
+                /* NOTE: release this buffer to be reused */
+
+                KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+                sh->u.s.num_done = 0;
+                sh->u.s.iteration = 0;
+
+                /* TODO replace with general release procedure? */
+                if ( pr->ordered ) {
+                    sh->u.s.ordered_iteration = 0;
+                }
+
+                KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+                sh -> buffer_index += KMP_MAX_DISP_BUF;
+                KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
+                                gtid, sh->buffer_index) );
+
+                KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+            } // if
+            if ( __kmp_env_consistency_check ) {
+                if ( pr->pushed_ws != ct_none ) {
+                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
+                }
+            }
+
+            th -> th.th_dispatch -> th_deo_fcn = NULL;
+            th -> th.th_dispatch -> th_dxo_fcn = NULL;
+            th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
+            th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
+        } // if (status == 0)
+#if KMP_OS_WINDOWS
+        else if ( last ) {
+            pr->u.p.last_upper = pr->u.p.ub;
+        }
+#endif /* KMP_OS_WINDOWS */
+        if ( p_last != NULL && status != 0 )
+            *p_last = last;
+    } // if
+
+    #ifdef KMP_DEBUG
+    {
+        const char * buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmp_dispatch_next: T#%%d normal case: " \
+            "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
+            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
+        KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
+        __kmp_str_free( &buff );
+    }
+    #endif
+#if INCLUDE_SSC_MARKS
+    SSC_MARK_DISPATCH_NEXT();
+#endif
+    OMPT_LOOP_END;
+    return status;
+}
+
+template< typename T >
+static void
+__kmp_dist_get_bounds(
+    ident_t                          *loc,
+    kmp_int32                         gtid,
+    kmp_int32                        *plastiter,
+    T                                *plower,
+    T                                *pupper,
+    typename traits_t< T >::signed_t  incr
+) {
+    KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
+    typedef typename traits_t< T >::unsigned_t  UT;
+    typedef typename traits_t< T >::signed_t    ST;
+    register kmp_uint32  team_id;
+    register kmp_uint32  nteams;
+    register UT          trip_count;
+    register kmp_team_t *team;
+    kmp_info_t * th;
+
+    KMP_DEBUG_ASSERT( plastiter && plower && pupper );
+    KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
+    #ifdef KMP_DEBUG
+    {
+        const char * buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
+            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
+            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
+            traits_t< T >::spec );
+        KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
+        __kmp_str_free( &buff );
+    }
+    #endif
+
+    if( __kmp_env_consistency_check ) {
+        if( incr == 0 ) {
+            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
+        }
+        if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
+            // The loop is illegal.
+            // Some zero-trip loops maintained by compiler, e.g.:
+            //   for(i=10;i<0;++i) // lower >= upper - run-time check
+            //   for(i=0;i>10;--i) // lower <= upper - run-time check
+            //   for(i=0;i>10;++i) // incr > 0       - compile-time check
+            //   for(i=10;i<0;--i) // incr < 0       - compile-time check
+            // Compiler does not check the following illegal loops:
+            //   for(i=0;i<10;i+=incr) // where incr<0
+            //   for(i=10;i>0;i-=incr) // where incr<0
+            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
+        }
+    }
+    th = __kmp_threads[gtid];
+    KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
+    team = th->th.th_team;
+    #if OMP_40_ENABLED
+    nteams = th->th.th_teams_size.nteams;
+    #endif
+    team_id = team->t.t_master_tid;
+    KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
+
+    // compute global trip count
+    if( incr == 1 ) {
+        trip_count = *pupper - *plower + 1;
+    } else if(incr == -1) {
+        trip_count = *plower - *pupper + 1;
+    } else {
+        trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
+    }
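+    // e.g. *plower = 0, *pupper = 9, incr = 2 gives trip_count = 9/2 + 1 = 5 (i = 0,2,4,6,8).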
+    if( trip_count <= nteams ) {
+        KMP_DEBUG_ASSERT(
+            __kmp_static == kmp_sch_static_greedy || \
+            __kmp_static == kmp_sch_static_balanced
+        ); // Unknown static scheduling type.
+        // only some teams get single iteration, others get nothing
+        if( team_id < trip_count ) {
+            *pupper = *plower = *plower + team_id * incr;
+        } else {
+            *plower = *pupper + incr; // zero-trip loop
+        }
+        if( plastiter != NULL )
+            *plastiter = ( team_id == trip_count - 1 );
+    } else {
+        if( __kmp_static == kmp_sch_static_balanced ) {
+            register UT chunk = trip_count / nteams;
+            register UT extras = trip_count % nteams;
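+            // e.g. trip_count = 10, nteams = 4: chunk = 2, extras = 2, so the first two
+            // teams get 3 iterations each and the last two get 2.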
+            *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
+            *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
+            if( plastiter != NULL )
+                *plastiter = ( team_id == nteams - 1 );
+        } else {
+            register T chunk_inc_count =
+                ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
+            register T upper = *pupper;
+            KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
+                // Unknown static scheduling type.
+            *plower += team_id * chunk_inc_count;
+            *pupper = *plower + chunk_inc_count - incr;
+            // Check/correct bounds if needed
+            if( incr > 0 ) {
+                if( *pupper < *plower )
+                    *pupper = i_maxmin< T >::mx;
+                if( plastiter != NULL )
+                    *plastiter = *plower <= upper && *pupper > upper - incr;
+                if( *pupper > upper )
+                    *pupper = upper; // tracker C73258
+            } else {
+                if( *pupper > *plower )
+                    *pupper = i_maxmin< T >::mn;
+                if( plastiter != NULL )
+                    *plastiter = *plower >= upper && *pupper < upper - incr;
+                if( *pupper < upper )
+                    *pupper = upper; // tracker C73258
+            }
+        }
+    }
+}
+
+//-----------------------------------------------------------------------------------------
+// Dispatch routines
+//    Transfer call to template< type T >
+//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
+//                         T lb, T ub, ST st, ST chunk )
+extern "C" {
+
+/*!
+@ingroup WORK_SHARING
+@{
+@param loc Source location
+@param gtid Global thread id
+@param schedule Schedule type
+@param lb  Lower bound
+@param ub  Upper bound
+@param st  Step (or increment if you prefer)
+@param chunk The chunk size to block with
+
+This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
+These functions are all identical apart from the types of the arguments.
+*/
+
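+// Illustrative sketch (not the output of any particular compiler) of how a dynamically
+// scheduled loop maps onto these entry points; 'n' and 'body' are placeholders:
+//
+//     // #pragma omp for schedule(dynamic, 4)
+//     kmp_int32 lb, ub, st, last;
+//     __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4 );
+//     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
+//         for ( kmp_int32 i = lb; i <= ub; i += st )
+//             body( i );
+//     }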
+void
+__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+                        kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
+{
+    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
+}
+/*!
+See @ref __kmpc_dispatch_init_4
+*/
+void
+__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+                        kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
+{
+    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
+}
+
+/*!
+See @ref __kmpc_dispatch_init_4
+*/
+void
+__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+                        kmp_int64 lb, kmp_int64 ub,
+                        kmp_int64 st, kmp_int64 chunk )
+{
+    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
+}
+
+/*!
+See @ref __kmpc_dispatch_init_4
+*/
+void
+__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+                         kmp_uint64 lb, kmp_uint64 ub,
+                         kmp_int64 st, kmp_int64 chunk )
+{
+    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
+}
+
+/*!
+See @ref __kmpc_dispatch_init_4
+
+These functions differ from the __kmpc_dispatch_init set in that they are called
+for the composite distribute parallel for construct, so before the regular
+iteration dispatching begins the per-team iteration space must be computed.
+
+These functions are all identical apart from the types of the arguments.
+*/
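+// Illustrative sketch: each of these entry points behaves like __kmp_dist_get_bounds()
+// on (lb, ub) followed by the matching __kmpc_dispatch_init_* call, i.e. the bounds are
+// first narrowed to this team's sub-range and then dispatched as usual.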
+void
+__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+    kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
+{
+    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
+    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
+}
+
+void
+__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+    kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
+{
+    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
+    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
+}
+
+void
+__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+    kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
+{
+    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
+    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
+}
+
+void
+__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+    kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
+{
+    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
+    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
+}
+
+/*!
+@param loc Source code location
+@param gtid Global thread id
+@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
+@param p_lb   Pointer to the lower bound for the next chunk of work
+@param p_ub   Pointer to the upper bound for the next chunk of work
+@param p_st   Pointer to the stride for the next chunk of work
+@return one if there is work to be done, zero otherwise
+
+Get the next dynamically allocated chunk of work for this thread.
+If there is no more work, then lb, ub and stride need not be modified.
+*/
+int
+__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
+{
+    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
+}
+
+/*!
+See @ref __kmpc_dispatch_next_4
+*/
+int
+__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                        kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
+{
+    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
+}
+
+/*!
+See @ref __kmpc_dispatch_next_4
+*/
+int
+__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
+{
+    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
+}
+
+/*!
+See @ref __kmpc_dispatch_next_4
+*/
+int
+__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                        kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
+{
+    return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
+}
+
+/*!
+@param loc Source code location
+@param gtid Global thread id
+
+Mark the end of a dynamic loop.
+*/
+void
+__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
+{
+    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
+}
+
+/*!
+See @ref __kmpc_dispatch_fini_4
+*/
+void
+__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
+{
+    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
+}
+
+/*!
+See @ref __kmpc_dispatch_fini_4
+*/
+void
+__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
+{
+    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
+}
+
+/*!
+See @ref __kmpc_dispatch_fini_4
+*/
+void
+__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
+{
+    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
+}
+/*! @} */
+
+//-----------------------------------------------------------------------------------------
+//Non-template routines from kmp_dispatch.c used in other sources
+
+kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
+    return value == checker;
+}
+
+kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
+    return value != checker;
+}
+
+kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
+    return value < checker;
+}
+
+kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
+    return value >= checker;
+}
+
+kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
+    return value <= checker;
+}
+kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
+    return value == checker;
+}
+
+kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
+    return value != checker;
+}
+
+kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
+    return value < checker;
+}
+
+kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
+    return value >= checker;
+}
+
+kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
+    return value <= checker;
+}
+
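+// Example (illustrative; 'flag' is a placeholder variable): spin until *spinner == 1:
+//     kmp_uint32 observed = __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );
+// The return value is the observed spinner value that satisfied the predicate.
+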
+kmp_uint32
+__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
+                   kmp_uint32            checker,
+                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
+                   , void        * obj    // Higher-level synchronization object, or NULL.
+                   )
+{
+    // note: we may not belong to a team at this point
+    register volatile kmp_uint32         * spin          = spinner;
+    register          kmp_uint32           check         = checker;
+    register          kmp_uint32   spins;
+    register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
+    register          kmp_uint32           r;
+
+    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
+    KMP_INIT_YIELD( spins );
+    // main wait spin loop
+    while(!f(r = TCR_4(*spin), check)) {
+        KMP_FSYNC_SPIN_PREPARE( obj );
+        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
+           It causes problems with infinite recursion because of exit lock */
+        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
+            __kmp_abort_thread(); */
+
+        /* if we have waited a bit, or are oversubscribed, yield */
+        /* pause is in the following code */
+        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
+        KMP_YIELD_SPIN( spins );
+    }
+    KMP_FSYNC_SPIN_ACQUIRED( obj );
+    return r;
+}
+
+kmp_uint64
+__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
+                    kmp_uint64            checker,
+                    kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
+                    , void        * obj    // Higher-level synchronization object, or NULL.
+                    )
+{
+    // note: we may not belong to a team at this point
+    register volatile kmp_uint64         * spin          = spinner;
+    register          kmp_uint64           check         = checker;
+    register          kmp_uint32   spins;
+    register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
+    register          kmp_uint64           r;
+
+    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
+    KMP_INIT_YIELD( spins );
+    // main wait spin loop
+    while(!f(r = *spin, check))
+    {
+        KMP_FSYNC_SPIN_PREPARE( obj );
+        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
+           It causes problems with infinite recursion because of exit lock */
+        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
+            __kmp_abort_thread(); */
+
+        // if we are oversubscribed,
+        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
+        // pause is in the following code
+        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
+        KMP_YIELD_SPIN( spins );
+    }
+    KMP_FSYNC_SPIN_ACQUIRED( obj );
+    return r;
+}
+
+} // extern "C"
+
+#ifdef KMP_GOMP_COMPAT
+
+void
+__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
+                           kmp_int32 chunk, int push_ws )
+{
+    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
+                                      push_ws );
+}
+
+void
+__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
+                            kmp_int32 chunk, int push_ws )
+{
+    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
+                                       push_ws );
+}
+
+void
+__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
+                           kmp_int64 chunk, int push_ws )
+{
+    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
+                                      push_ws );
+}
+
+void
+__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
+                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
+                            kmp_int64 chunk, int push_ws )
+{
+    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
+                                       push_ws );
+}
+
+void
+__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
+{
+    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
+}
+
+void
+__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
+{
+    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
+}
+
+void
+__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
+{
+    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
+}
+
+void
+__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
+{
+    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
+}
+
+#endif /* KMP_GOMP_COMPAT */
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
diff --git a/final/runtime/src/kmp_environment.c b/final/runtime/src/kmp_environment.c
new file mode 100644
index 0000000..df10c69
--- /dev/null
+++ b/final/runtime/src/kmp_environment.c
@@ -0,0 +1,598 @@
+/*
+ * kmp_environment.c -- Handle environment variables OS-independently.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/*
+    ------------------------------------------------------------------------------------------------
+    We use GetEnvironmentVariable for Windows* OS instead of getenv because the act of
+    loading a DLL on Windows* OS makes any user-set environment variables (i.e. with putenv())
+    unavailable.  getenv() apparently gets a clean copy of the env variables as they existed
+    at the start of the run.
+    JH 12/23/2002
+    ------------------------------------------------------------------------------------------------
+    On Windows* OS, there are two environments (at least, see below):
+
+        1. Environment maintained by Windows* OS. Accessible through
+           GetEnvironmentVariable(), SetEnvironmentVariable(), and
+           GetEnvironmentStrings().
+
+        2. Environment maintained by C RTL. Accessible through getenv(), putenv().
+
+    The putenv() function updates both the C RTL and the Windows* OS environments. The
+    getenv() function searches the C RTL environment only. The Windows* OS functions work
+    *only* with the Windows* OS environment.
+
+    The Windows* OS environment is maintained by the OS, so there is always exactly one
+    per process. Changes in the Windows* OS environment are process-visible.
+
+    C environment maintained by C RTL. Multiple copies of C RTL may be present in the process, and
+    each C RTL maintains its own environment. :-(
+
+    Thus, the proper way to work with the environment on Windows* OS is:
+
+        1. Set variables with the putenv() function -- both the C and the Windows* OS
+           environments are updated. The Windows* OS environment may be considered the
+           primary target, while updating the C RTL environment is a free bonus.
+
+        2. Get variables with GetEnvironmentVariable() -- getenv() does not search the
+           Windows* OS environment and cannot see variables set with
+           SetEnvironmentVariable().
+
+    2007-04-05 -- lev
+    ------------------------------------------------------------------------------------------------
+*/
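+
+// Sketch of the convention described above (hypothetical variable and values):
+//     putenv( "KMP_BLOCKTIME=0" );   // updates both the C RTL and Windows* OS environments
+//     char buf[ 16 ];
+//     GetEnvironmentVariable( "KMP_BLOCKTIME", buf, sizeof( buf ) ); // reads it back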
+
+#include "kmp_environment.h"
+
+#include "kmp_os.h"    // KMP_OS_*.
+#include "kmp.h"       //
+#include "kmp_str.h"   // __kmp_str_*().
+#include "kmp_i18n.h"
+
+#if KMP_OS_UNIX
+    #include <stdlib.h>    // getenv, setenv, unsetenv.
+    #include <string.h>    // strlen, strcpy.
+    #if KMP_OS_LINUX || KMP_OS_FREEBSD
+        extern char * * environ;
+    #elif KMP_OS_DARWIN
+        #include <crt_externs.h>
+        #define environ (*_NSGetEnviron())
+    #else
+        #error Unknown or unsupported OS.
+    #endif
+#elif KMP_OS_WINDOWS
+    #include <windows.h>   // GetEnvironmentVariable, SetEnvironmentVariable, GetLastError.
+#else
+    #error Unknown or unsupported OS.
+#endif
+
+
+// TODO: Eliminate direct memory allocations, use string operations instead.
+
+static inline
+void *
+allocate(
+    size_t size
+) {
+    void * ptr = KMP_INTERNAL_MALLOC( size );
+    if ( ptr == NULL ) {
+        KMP_FATAL( MemoryAllocFailed );
+    }; // if
+    return ptr;
+} // allocate
+
+
+char *
+__kmp_env_get( char const * name ) {
+
+    char * result = NULL;
+
+    #if KMP_OS_UNIX
+        char const * value = getenv( name );
+        if ( value != NULL ) {
+            size_t len = KMP_STRLEN( value ) + 1;
+            result = (char *) KMP_INTERNAL_MALLOC( len );
+            if ( result == NULL ) {
+                KMP_FATAL( MemoryAllocFailed );
+            }; // if
+            KMP_STRNCPY_S( result, len, value, len );
+        }; // if
+    #elif KMP_OS_WINDOWS
+        /*
+            We use GetEnvironmentVariable for Windows* OS instead of getenv because the act of
+            loading a DLL on Windows* OS makes any user-set environment variables (i.e. with putenv())
+            unavailable.  getenv() apparently gets a clean copy of the env variables as they existed
+            at the start of the run.
+            JH 12/23/2002
+        */
+        DWORD rc;
+        rc = GetEnvironmentVariable( name, NULL, 0 );
+        if ( ! rc ) {
+            DWORD error = GetLastError();
+            if ( error != ERROR_ENVVAR_NOT_FOUND ) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG( CantGetEnvVar, name ),
+                    KMP_ERR( error ),
+                    __kmp_msg_null
+                );
+            }; // if
+            // Variable is not found, it's ok, just continue.
+        } else {
+            DWORD len = rc;
+            result = (char *) KMP_INTERNAL_MALLOC( len );
+            if ( result == NULL ) {
+                KMP_FATAL( MemoryAllocFailed );
+            }; // if
+            rc = GetEnvironmentVariable( name, result, len );
+            if ( ! rc ) {
+                // GetEnvironmentVariable() may return 0 if variable is empty.
+                // In such a case GetLastError() returns ERROR_SUCCESS.
+                DWORD error = GetLastError();
+                if ( error != ERROR_SUCCESS ) {
+                    // Unexpected error. The variable should be in the environment,
+                    // and buffer should be large enough.
+                    __kmp_msg(
+                        kmp_ms_fatal,
+                        KMP_MSG( CantGetEnvVar, name ),
+                        KMP_ERR( error ),
+                        __kmp_msg_null
+                    );
+                    KMP_INTERNAL_FREE( (void *) result );
+                    result = NULL;
+                }; // if
+            }; // if
+        }; // if
+    #else
+        #error Unknown or unsupported OS.
+    #endif
+
+    return result;
+
+} // func __kmp_env_get
+
+
+// TODO: Find and replace all regular free() with __kmp_env_free().
+
+void
+__kmp_env_free( char const * * value ) {
+
+    KMP_DEBUG_ASSERT( value != NULL );
+    KMP_INTERNAL_FREE( (void *) * value );
+    * value = NULL;
+
+} // func __kmp_env_free
+
+
+
+int
+__kmp_env_exists( char const * name ) {
+
+    #if KMP_OS_UNIX
+        char const * value = getenv( name );
+        return ( ( value == NULL ) ? ( 0 ) : ( 1 ) );
+    #elif KMP_OS_WINDOWS
+        DWORD rc;
+        rc = GetEnvironmentVariable( name, NULL, 0 );
+        if ( rc == 0 ) {
+            DWORD error = GetLastError();
+            if ( error != ERROR_ENVVAR_NOT_FOUND ) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG( CantGetEnvVar, name ),
+                    KMP_ERR( error ),
+                    __kmp_msg_null
+                );
+            }; // if
+            return 0;
+        }; // if
+        return 1;
+    #else
+        #error Unknown or unsupported OS.
+    #endif
+
+} // func __kmp_env_exists
+
+
+
+void
+__kmp_env_set( char const * name, char const * value, int overwrite ) {
+
+    #if KMP_OS_UNIX
+        int rc = setenv( name, value, overwrite );
+        if ( rc != 0 ) {
+            // Dead code. I tried to put too many variables into Linux* OS
+            // environment on IA-32 architecture. When application consumes
+            // more than ~2.5 GB of memory, entire system feels bad. Sometimes
+            // application is killed (by OS?), sometimes system stops 
+            // responding... But this error message never appears. --ln
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( CantSetEnvVar, name ),
+                KMP_HNT( NotEnoughMemory ),
+                __kmp_msg_null
+            );
+        }; // if
+    #elif KMP_OS_WINDOWS
+        BOOL rc;
+        if ( ! overwrite ) {
+            rc = GetEnvironmentVariable( name, NULL, 0 );
+            if ( rc ) {
+                // Variable exists, do not overwrite.
+                return;
+            }; // if
+            DWORD error = GetLastError();
+            if ( error != ERROR_ENVVAR_NOT_FOUND ) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG( CantGetEnvVar, name ),
+                    KMP_ERR( error ),
+                    __kmp_msg_null
+                );
+            }; // if
+        }; // if
+        rc = SetEnvironmentVariable( name, value );
+        if ( ! rc ) {
+            DWORD error = GetLastError();
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( CantSetEnvVar, name ),
+                KMP_ERR( error ),
+                __kmp_msg_null
+            );
+        }; // if
+    #else
+        #error Unknown or unsupported OS.
+    #endif
+
+} // func __kmp_env_set
+
+
+
+void
+__kmp_env_unset( char const * name ) {
+
+    #if KMP_OS_UNIX
+        unsetenv( name );
+    #elif KMP_OS_WINDOWS
+        BOOL rc = SetEnvironmentVariable( name, NULL );
+        if ( ! rc ) {
+            DWORD error = GetLastError();
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( CantSetEnvVar, name ),
+                KMP_ERR( error ),
+                __kmp_msg_null
+            );
+        }; // if
+    #else
+        #error Unknown or unsupported OS.
+    #endif
+
+} // func __kmp_env_unset
+
+// -------------------------------------------------------------------------------------------------
+
+/*
+    Intel OpenMP RTL string representation of environment: just a string of characters, variables
+    are separated with vertical bars, e. g.:
+
+        "KMP_WARNINGS=0|KMP_AFFINITY=compact|"
+
+    Empty variables are allowed and ignored:
+
+        "||KMP_WARNINGS=1||"
+
+*/
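+
+// For example, parsing "KMP_WARNINGS=0|KMP_AFFINITY=compact|" with the routine below
+// yields block->count == 2 with (illustrative):
+//     block->vars[ 0 ] == { "KMP_WARNINGS", "0"       }
+//     block->vars[ 1 ] == { "KMP_AFFINITY", "compact" }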
+
+static
+void
+___kmp_env_blk_parse_string(
+    kmp_env_blk_t * block,   // M: Env block to fill.
+    char const *    env      // I: String to parse.
+) {
+
+    char const chr_delimiter   = '|';
+    char const str_delimiter[] = { chr_delimiter, 0 };
+
+    char *          bulk       = NULL;
+    kmp_env_var_t * vars       = NULL;
+    int             count      = 0;  // Number of used elements in vars array.
+    int             delimiters = 0;  // Number of delimiters in input string.
+
+    // Copy original string, we will modify the copy.
+    bulk = __kmp_str_format( "%s", env );
+
+    // Loop thru all the vars in environment block. Count delimiters (maximum number of variables
+    // is number of delimiters plus one).
+    {
+        char const * ptr = bulk;
+        for ( ; ; ) {
+            ptr = strchr( ptr, chr_delimiter );
+            if ( ptr == NULL ) {
+                break;
+            }; // if
+            ++ delimiters;
+            ptr += 1;
+        }; // forever
+    }
+
+    // Allocate vars array.
+    vars = (kmp_env_var_t *) allocate( ( delimiters + 1 ) * sizeof( kmp_env_var_t ) );
+
+    // Loop thru all the variables.
+    {
+        char * var;     // Pointer to variable (both name and value).
+        char * name;    // Pointer to name of variable.
+        char * value;   // Pointer to value.
+        char * buf;     // Buffer for __kmp_str_token() function.
+        var = __kmp_str_token( bulk, str_delimiter, & buf );      // Get the first var.
+        while ( var != NULL ) {
+            // Save found variable in vars array.
+            __kmp_str_split( var, '=', & name, & value );
+            KMP_DEBUG_ASSERT( count < delimiters + 1 );
+            vars[ count ].name  = name;
+            vars[ count ].value = value;
+            ++ count;
+            // Get the next var.
+            var = __kmp_str_token( NULL, str_delimiter, & buf );
+        }; // while
+    }
+
+    // Fill out result.
+    block->bulk  = bulk;
+    block->vars  = vars;
+    block->count = count;
+
+}; // ___kmp_env_blk_parse_string
+
+
+
+/*
+    A Windows* OS (actually, DOS) environment block is a piece of memory holding environment
+    variables. Each variable is terminated with a zero byte, and the entire block is terminated
+    with one extra zero byte, so there are two zero bytes at the end of the block, e. g.:
+
+        "HOME=C:\\users\\lev\x00OS=Windows_NT\x00\x00"
+
+    It is not clear how an empty environment is represented. "\x00\x00"?
+*/
+
+#if KMP_OS_WINDOWS
+static
+void
+___kmp_env_blk_parse_windows(
+    kmp_env_blk_t * block,   // M: Env block to fill.
+    char const *    env      // I: Pointer to Windows* OS (DOS) environment block.
+) {
+
+    char *          bulk  = NULL;
+    kmp_env_var_t * vars  = NULL;
+    int             count = 0;     // Number of used elements in vars array.
+    int             size  = 0;     // Size of bulk.
+
+    char * name;    // Pointer to name of variable.
+    char * value;   // Pointer to value.
+
+    if ( env != NULL ) {
+
+        // Loop thru all the vars in environment block. Count variables, find size of block.
+        {
+            char const * var;     // Pointer to beginning of var.
+            int          len;     // Length of variable.
+            count = 0;
+            var = env;            // The first variable starts at the beginning of the environment block.
+            len = KMP_STRLEN( var );
+            while ( len != 0 ) {
+                ++ count;
+                size = size + len + 1;
+                var = var + len + 1; // Move pointer to the beginning of the next variable.
+                len = KMP_STRLEN( var );
+            }; // while
+            size = size + 1;         // Total size of env block, including terminating zero byte.
+        }
+
+        // Copy original block to bulk, we will modify bulk, not original block.
+        bulk = (char *) allocate( size );
+        KMP_MEMCPY_S( bulk, size, env, size );
+        // Allocate vars array.
+        vars = (kmp_env_var_t *) allocate( count * sizeof( kmp_env_var_t ) );
+
+        // Loop thru all the vars, now in bulk.
+        {
+            char * var;     // Pointer to beginning of var.
+            int    len;     // Length of variable.
+            count = 0;
+            var = bulk;
+            len = KMP_STRLEN( var );
+            while ( len != 0 ) {
+                // Save variable in vars array.
+                __kmp_str_split( var, '=', & name, & value );
+                vars[ count ].name  = name;
+                vars[ count ].value = value;
+                ++ count;
+                // Get the next var.
+                var = var + len + 1;
+                len = KMP_STRLEN( var );
+            }; // while
+        }
+
+    }; // if
+
+    // Fill out result.
+    block->bulk  = bulk;
+    block->vars  = vars;
+    block->count = count;
+
+}; // ___kmp_env_blk_parse_windows
+#endif
+
+
+/*
+    A Unix environment block is an array of pointers to variables; the last pointer in the array is NULL:
+
+        { "HOME=/home/lev", "TERM=xterm", NULL }
+*/
+
+static
+void
+___kmp_env_blk_parse_unix(
+    kmp_env_blk_t * block,   // M: Env block to fill.
+    char * *        env      // I: Unix environment to parse.
+) {
+
+    char *          bulk  = NULL;
+    kmp_env_var_t * vars  = NULL;
+    int             count = 0;
+    int             size  = 0;    // Size of bulk.
+
+    // Count number of variables and length of required bulk.
+    {
+        count = 0;
+        size  = 0;
+        while ( env[ count ] != NULL ) {
+            size += KMP_STRLEN( env[ count ] ) + 1;
+            ++ count;
+        }; // while
+    }
+
+    // Allocate memory.
+    bulk = (char *) allocate( size );
+    vars = (kmp_env_var_t *) allocate( count * sizeof( kmp_env_var_t ) );
+
+    // Loop thru all the vars.
+    {
+        char * var;     // Pointer to beginning of var.
+        char * name;    // Pointer to name of variable.
+        char * value;   // Pointer to value.
+        int    len;     // Length of variable.
+        int    i;
+        var = bulk;
+        for ( i = 0; i < count; ++ i ) {
+            // Copy variable to bulk.
+            len = KMP_STRLEN( env[ i ] );
+            KMP_MEMCPY_S( var, size, env[ i ], len + 1 );
+            // Save found variable in vars array.
+            __kmp_str_split( var, '=', & name, & value );
+            vars[ i ].name  = name;
+            vars[ i ].value = value;
+            // Move pointer.
+            var += len + 1;
+        }; // for
+    }
+
+    // Fill out result.
+    block->bulk  = bulk;
+    block->vars  = vars;
+    block->count = count;
+
+}; // ___kmp_env_blk_parse_unix
+
+
+
+void
+__kmp_env_blk_init(
+    kmp_env_blk_t * block,  // M: Block to initialize.
+    char const *    bulk    // I: Initialization string, or NULL.
+) {
+
+    if ( bulk != NULL ) {
+        ___kmp_env_blk_parse_string( block, bulk );
+    } else {
+        #if KMP_OS_UNIX
+            ___kmp_env_blk_parse_unix( block, environ );
+        #elif KMP_OS_WINDOWS
+            {
+                char * mem = GetEnvironmentStrings();
+                if ( mem == NULL ) {
+                    DWORD error = GetLastError();
+                    __kmp_msg(
+                        kmp_ms_fatal,
+                        KMP_MSG( CantGetEnvironment ),
+                        KMP_ERR( error ),
+                        __kmp_msg_null
+                    );
+                }; // if
+                ___kmp_env_blk_parse_windows( block, mem );
+                FreeEnvironmentStrings( mem );
+            }
+        #else
+            #error Unknown or unsupported OS.
+        #endif
+    }; // if
+
+} // __kmp_env_blk_init
+
+
+
+static
+int
+___kmp_env_var_cmp(                              // Comparison function for qsort().
+    kmp_env_var_t const * lhs,
+    kmp_env_var_t const * rhs
+) {
+    return strcmp( lhs->name, rhs->name );
+}
+
+void
+__kmp_env_blk_sort(
+    kmp_env_blk_t * block  // M: Block of environment variables to sort.
+) {
+
+    qsort(
+        (void *) block->vars,
+        block->count,
+        sizeof( kmp_env_var_t ),
+        ( int ( * )( void const *, void const * ) ) & ___kmp_env_var_cmp
+    );
+
+} // __kmp_env_blk_sort
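Note that the cast of ___kmp_env_var_cmp to int (*)( void const *, void const * )
works on the ABIs this runtime targets, but calling through a mismatched
function-pointer type is formally undefined in ISO C. A strictly portable
variant keeps the qsort() signature and casts inside the comparator; a small
sketch under that assumption (the standalone type name is hypothetical):

    #include <stdlib.h>
    #include <string.h>

    typedef struct { char const * name; char const * value; } env_var_t;

    /* Portable qsort() comparator: take void const *, cast inside. */
    static int env_var_cmp( void const * lhs, void const * rhs ) {
        return strcmp( ( (env_var_t const *) lhs )->name,
                       ( (env_var_t const *) rhs )->name );
    }

    /* usage: qsort( vars, count, sizeof( env_var_t ), env_var_cmp ); */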
+
+
+
+void
+__kmp_env_blk_free(
+    kmp_env_blk_t * block  // M: Block of environment variables to free.
+) {
+
+    KMP_INTERNAL_FREE( (void *) block->vars );
+    KMP_INTERNAL_FREE( (void *) block->bulk );
+
+    block->count = 0;
+    block->vars  = NULL;
+    block->bulk  = NULL;
+
+} // __kmp_env_blk_free
+
+
+
+char const *               // R: Value of variable or NULL if variable does not exist.
+__kmp_env_blk_var(
+    kmp_env_blk_t * block, // I: Block of environment variables.
+    char const *    name   // I: Name of variable to find.
+) {
+
+    int i;
+    for ( i = 0; i < block->count; ++ i ) {
+        if ( strcmp( block->vars[ i ].name, name ) == 0 ) {
+            return block->vars[ i ].value;
+        }; // if
+    }; // for
+    return NULL;
+
+} // __kmp_env_blk_var
+
+
+// end of file //
diff --git a/final/runtime/src/kmp_environment.h b/final/runtime/src/kmp_environment.h
new file mode 100644
index 0000000..243b547
--- /dev/null
+++ b/final/runtime/src/kmp_environment.h
@@ -0,0 +1,81 @@
+/*
+ * kmp_environment.h -- Handle environment variables OS-independently.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_ENVIRONMENT_H
+#define KMP_ENVIRONMENT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Return a copy of the value of the environment variable, or NULL if the variable does not exist.
+// *Note*: The returned pointer *must* be freed after use with __kmp_env_free().
+char * __kmp_env_get( char const * name );
+void   __kmp_env_free( char const * * value );
+
+// Return 1 if the environment variable exists, or 0 if it does not.
+int __kmp_env_exists( char const * name );
+
+// Set the environment variable.
+void __kmp_env_set( char const * name, char const * value, int overwrite );
+
+// Unset (remove) environment variable.
+void __kmp_env_unset( char const * name );
+
+
+// -------------------------------------------------------------------------------------------------
+//  Working with environment blocks.
+// -------------------------------------------------------------------------------------------------
+
+/*
+    kmp_env_blk_t is a read-only collection of environment variables (or environment-like strings). Usage:
+
+        kmp_env_blk_t block;
+        __kmp_env_blk_init( & block, NULL ); // Initialize block from process environment.
+        // or
+        __kmp_env_blk_init( & block, "KMP_WARNING=1|KMP_AFFINITY=none" ); // from string.
+        __kmp_env_blk_sort( & block ); // Optionally, sort list.
+        for ( i = 0; i < block.count; ++ i ) {
+            // Process block.vars[ i ].name and block.vars[ i ].value...
+        }; // for i
+        __kmp_env_blk_free( & block );
+*/
+
+struct __kmp_env_var {
+    char const * name;
+    char const * value;
+};
+typedef struct __kmp_env_var kmp_env_var_t;
+
+struct __kmp_env_blk {
+    char const *          bulk;
+    kmp_env_var_t const * vars;
+    int                   count;
+};
+typedef struct __kmp_env_blk kmp_env_blk_t;
+
+void         __kmp_env_blk_init( kmp_env_blk_t * block, char const * bulk );
+void         __kmp_env_blk_free( kmp_env_blk_t * block );
+void         __kmp_env_blk_sort( kmp_env_blk_t * block );
+char const * __kmp_env_blk_var(  kmp_env_blk_t * block, char const * name );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // KMP_ENVIRONMENT_H
+
+// end of file //
+
diff --git a/final/runtime/src/kmp_error.c b/final/runtime/src/kmp_error.c
new file mode 100644
index 0000000..6866df5
--- /dev/null
+++ b/final/runtime/src/kmp_error.c
@@ -0,0 +1,523 @@
+/*
+ * kmp_error.c -- KPTS functions for error checking at runtime
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_str.h"
+#include "kmp_error.h"
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#define MIN_STACK       100
+
+
+static char const * cons_text_c[] = {
+    "(none)",
+    "\"parallel\"",
+    "work-sharing",             /* this is not called "for" because of lowering of "sections" pragmas */
+    "\"ordered\" work-sharing", /* this is not called "for ordered" because of lowering of "sections" pragmas */
+    "\"sections\"",
+    "work-sharing",             /* this is not called "single" because of lowering of "sections" pragmas */
+    "\"taskq\"",
+    "\"taskq\"",
+    "\"taskq ordered\"",
+    "\"critical\"",
+    "\"ordered\"",              /* in PARALLEL */
+    "\"ordered\"",              /* in PDO */
+    "\"ordered\"",              /* in TASKQ */
+    "\"master\"",
+    "\"reduce\"",
+    "\"barrier\""
+};
+
+#define get_src( ident )   ( (ident) == NULL ? NULL : (ident)->psource )
+
+#define PUSH_MSG( ct, ident ) \
+    "\tpushing on stack: %s (%s)\n", cons_text_c[ (ct) ], get_src( (ident) )
+#define POP_MSG( p )                                  \
+    "\tpopping off stack: %s (%s)\n",                 \
+    cons_text_c[ (p)->stack_data[ tos ].type ],       \
+    get_src( (p)->stack_data[ tos ].ident )
+
+static int const cons_text_c_num    = sizeof( cons_text_c    ) / sizeof( char const * );
+
+/* ------------------------------------------------------------------------ */
+/* --------------- START OF STATIC LOCAL ROUTINES ------------------------- */
+/* ------------------------------------------------------------------------ */
+
+static void
+__kmp_check_null_func( void )
+{
+    /* nothing to do */
+}
+
+static void
+__kmp_expand_cons_stack( int gtid, struct cons_header *p )
+{
+    int    i;
+    struct cons_data *d;
+
+    /* TODO for monitor perhaps? */
+    if (gtid < 0)
+        __kmp_check_null_func();
+
+    KE_TRACE( 10, ("expand cons_stack (%d %d)\n", gtid, __kmp_get_gtid() ) );
+
+    d = p->stack_data;
+
+    p->stack_size = (p->stack_size * 2) + 100;
+
+    /* TODO free the old data */
+    p->stack_data = (struct cons_data *) __kmp_allocate( sizeof( struct cons_data ) * (p->stack_size+1) );
+
+    for (i = p->stack_top; i >= 0; --i)
+        p->stack_data[i] = d[i];
+
+    /* NOTE: we do not free the old stack_data */
+}
+
+// NOTE: Function returns allocated memory, caller must free it!
+static char const *
+__kmp_pragma(
+    int              ct,
+    ident_t const *  ident
+) {
+    char const * cons = NULL;  // Construct name.
+    char * file = NULL;  // File name.
+    char * func = NULL;  // Function (routine) name.
+    char * line = NULL;  // Line number.
+    kmp_str_buf_t buffer;
+    kmp_msg_t     prgm;
+    __kmp_str_buf_init( & buffer );
+    if ( 0 < ct && ct < cons_text_c_num ) {
+        cons = cons_text_c[ ct ];
+    } else {
+        KMP_DEBUG_ASSERT( 0 );
+    };
+    if ( ident != NULL && ident->psource != NULL ) {
+        char * tail = NULL;
+        __kmp_str_buf_print( & buffer, "%s", ident->psource ); // Copy source to buffer.
+        // Split string in buffer to file, func, and line.
+        tail = buffer.str;
+        __kmp_str_split( tail, ';', NULL,   & tail );
+        __kmp_str_split( tail, ';', & file, & tail );
+        __kmp_str_split( tail, ';', & func, & tail );
+        __kmp_str_split( tail, ';', & line, & tail );
+    }; // if
+    prgm = __kmp_msg_format( kmp_i18n_fmt_Pragma, cons, file, func, line );
+    __kmp_str_buf_free( & buffer );
+    return prgm.str;
+} // __kmp_pragma
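__kmp_pragma() depends on __kmp_str_split() cutting the buffer in place at each
';'. A compact sketch of that splitting idea on a psource-style string; the
exact field layout of psource and the helper below are assumptions for
illustration, not the runtime's own definitions:

    #include <stdio.h>
    #include <string.h>

    /* Cut *tail at the first 'sep': *head gets the left part (pass NULL to
       discard it), *tail advances past the separator, or becomes NULL. */
    static void str_split( char * str, char sep, char ** head, char ** tail ) {
        char * pos = ( str != NULL ) ? strchr( str, sep ) : NULL;
        if ( pos  != NULL ) { * pos  = '\0'; }
        if ( head != NULL ) { * head = str;  }
        * tail = ( pos != NULL ) ? pos + 1 : NULL;
    }

    int main( void ) {
        char psource[] = ";file.c;my_func;42;7;;";    /* assumed layout */
        char * tail = psource, * file, * func, * line;
        str_split( tail, ';', NULL,   & tail );       /* skip leading field */
        str_split( tail, ';', & file, & tail );
        str_split( tail, ';', & func, & tail );
        str_split( tail, ';', & line, & tail );
        printf( "%s:%s:%s\n", file, line, func );     /* file.c:42:my_func */
        return 0;
    }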
+
+/* ------------------------------------------------------------------------ */
+/* ----------------- END OF STATIC LOCAL ROUTINES ------------------------- */
+/* ------------------------------------------------------------------------ */
+
+
+void
+__kmp_error_construct(
+    kmp_i18n_id_t    id,     // Message identifier.
+    enum cons_type   ct,     // Construct type.
+    ident_t const *  ident   // Construct ident.
+) {
+    char const * construct = __kmp_pragma( ct, ident );
+    __kmp_msg( kmp_ms_fatal, __kmp_msg_format( id, construct ), __kmp_msg_null );
+    KMP_INTERNAL_FREE( (void *) construct );
+}
+
+void
+__kmp_error_construct2(
+    kmp_i18n_id_t            id,     // Message identifier.
+    enum cons_type           ct,     // First construct type.
+    ident_t const *          ident,  // First construct ident.
+    struct cons_data const * cons    // Second construct.
+) {
+    char const * construct1 = __kmp_pragma( ct, ident );
+    char const * construct2 = __kmp_pragma( cons->type, cons->ident );
+    __kmp_msg( kmp_ms_fatal, __kmp_msg_format( id, construct1, construct2 ), __kmp_msg_null );
+    KMP_INTERNAL_FREE( (void *) construct1 );
+    KMP_INTERNAL_FREE( (void *) construct2 );
+}
+
+
+struct cons_header *
+__kmp_allocate_cons_stack( int gtid )
+{
+    struct cons_header *p;
+
+    /* TODO for monitor perhaps? */
+    if ( gtid < 0 ) {
+        __kmp_check_null_func();
+    }; // if
+    KE_TRACE( 10, ("allocate cons_stack (%d)\n", gtid ) );
+    p = (struct cons_header *) __kmp_allocate( sizeof( struct cons_header ) );
+    p->p_top = p->w_top = p->s_top = 0;
+    p->stack_data = (struct cons_data *) __kmp_allocate( sizeof( struct cons_data ) * (MIN_STACK+1) );
+    p->stack_size = MIN_STACK;
+    p->stack_top  = 0;
+    p->stack_data[ 0 ].type = ct_none;
+    p->stack_data[ 0 ].prev = 0;
+    p->stack_data[ 0 ].ident = NULL;
+    return p;
+}
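One detail worth spelling out: stack_data is a single array that carries three
logical stacks at once. Each entry's prev field holds the index of the previous
entry of the same kind, and p_top, w_top, and s_top point at the topmost
PARALLEL, worksharing, and SYNC entries respectively, with index 0 serving as
the shared ct_none sentinel. A toy sketch of that interleaving (all names
hypothetical):

    #include <stdio.h>

    /* Two logical stacks ("a" and "b") interleaved in one array, linked
       through prev indices; index 0 is a shared sentinel. */
    struct entry { char kind; int prev; };

    int main( void ) {
        struct entry s[ 8 ] = { { '-', 0 } };
        int top = 0, a_top = 0, b_top = 0;

        s[ ++top ] = (struct entry){ 'a', a_top }; a_top = top;  /* push a */
        s[ ++top ] = (struct entry){ 'b', b_top }; b_top = top;  /* push b */
        s[ ++top ] = (struct entry){ 'a', a_top }; a_top = top;  /* push a */

        /* walk the "a" chain from its top: visits indices 3, then 1 */
        for ( int i = a_top; i != 0; i = s[ i ].prev )
            printf( "index %d holds kind %c\n", i, s[ i ].kind );
        return 0;
    }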
+
+void
+__kmp_free_cons_stack( void * ptr ) {
+    struct cons_header * p = (struct cons_header *) ptr;
+    if ( p != NULL ) {
+        if ( p->stack_data != NULL ) {
+            __kmp_free( p->stack_data );
+            p->stack_data = NULL;
+        }; // if
+        __kmp_free( p );
+    }; // if
+}
+
+
+#if KMP_DEBUG
+static void
+dump_cons_stack( int gtid, struct cons_header * p ) {
+    int i;
+    int tos = p->stack_top;
+    kmp_str_buf_t buffer;
+    __kmp_str_buf_init( & buffer );
+    __kmp_str_buf_print( & buffer, "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-\n" );
+    __kmp_str_buf_print( & buffer, "Begin construct stack with %d items for thread %d\n", tos, gtid );
+    __kmp_str_buf_print( & buffer, "     stack_top=%d { P=%d, W=%d, S=%d }\n", tos, p->p_top, p->w_top, p->s_top );
+    for ( i = tos; i > 0; i-- ) {
+        struct cons_data * c = & ( p->stack_data[ i ] );
+        __kmp_str_buf_print( & buffer, "        stack_data[%2d] = { %s (%s) %d %p }\n", i, cons_text_c[ c->type ], get_src( c->ident ), c->prev, c->name );
+    }; // for i
+    __kmp_str_buf_print( & buffer, "End construct stack for thread %d\n", gtid );
+    __kmp_str_buf_print( & buffer, "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-\n" );
+    __kmp_debug_printf( "%s", buffer.str );
+    __kmp_str_buf_free( & buffer );
+}
+#endif
+
+void
+__kmp_push_parallel( int gtid, ident_t const * ident )
+{
+    int tos;
+    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
+
+    KMP_DEBUG_ASSERT( __kmp_threads[ gtid ]-> th.th_cons );
+    KE_TRACE( 10, ("__kmp_push_parallel (%d %d)\n", gtid, __kmp_get_gtid() ) );
+    KE_TRACE( 100, ( PUSH_MSG( ct_parallel, ident ) ) );
+    if ( p->stack_top >= p->stack_size ) {
+        __kmp_expand_cons_stack( gtid, p );
+    }; // if
+    tos = ++p->stack_top;
+    p->stack_data[ tos ].type = ct_parallel;
+    p->stack_data[ tos ].prev = p->p_top;
+    p->stack_data[ tos ].ident = ident;
+    p->stack_data[ tos ].name = NULL;
+    p->p_top = tos;
+    KE_DUMP( 1000, dump_cons_stack( gtid, p ) );
+}
+
+void
+__kmp_check_workshare( int gtid, enum cons_type ct, ident_t const * ident )
+{
+    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
+
+    KMP_DEBUG_ASSERT( __kmp_threads[ gtid ]-> th.th_cons );
+    KE_TRACE( 10, ("__kmp_check_workshare (%d %d)\n", gtid, __kmp_get_gtid() ) );
+
+
+    if ( p->stack_top >= p->stack_size ) {
+        __kmp_expand_cons_stack( gtid, p );
+    }; // if
+    if ( p->w_top > p->p_top &&
+        !(IS_CONS_TYPE_TASKQ(p->stack_data[ p->w_top ].type) && IS_CONS_TYPE_TASKQ(ct))) {
+        // We are already in a WORKSHARE construct for this PARALLEL region.
+        __kmp_error_construct2( kmp_i18n_msg_CnsInvalidNesting, ct, ident, & p->stack_data[ p->w_top ] );
+    }; // if
+    if ( p->s_top > p->p_top ) {
+        // We are already in a SYNC construct for this PARALLEL region.
+        __kmp_error_construct2( kmp_i18n_msg_CnsInvalidNesting, ct, ident, & p->stack_data[ p->s_top ] );
+    }; // if
+}
+
+void
+__kmp_push_workshare( int gtid, enum cons_type ct, ident_t const * ident )
+{
+    int         tos;
+    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
+    KE_TRACE( 10, ("__kmp_push_workshare (%d %d)\n", gtid, __kmp_get_gtid() ) );
+    __kmp_check_workshare( gtid, ct, ident );
+    KE_TRACE( 100, ( PUSH_MSG( ct, ident ) ) );
+    tos = ++p->stack_top;
+    p->stack_data[ tos ].type = ct;
+    p->stack_data[ tos ].prev = p->w_top;
+    p->stack_data[ tos ].ident = ident;
+    p->stack_data[ tos ].name = NULL;
+    p->w_top = tos;
+    KE_DUMP( 1000, dump_cons_stack( gtid, p ) );
+}
+
+void
+#if KMP_USE_DYNAMIC_LOCK
+__kmp_check_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck, kmp_uint32 seq )
+#else
+__kmp_check_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck )
+#endif
+{
+    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
+
+    KE_TRACE( 10, ("__kmp_check_sync (gtid=%d)\n", __kmp_get_gtid() ) );
+
+    if (p->stack_top >= p->stack_size)
+       __kmp_expand_cons_stack( gtid, p );
+
+    if (ct == ct_ordered_in_parallel || ct == ct_ordered_in_pdo || ct == ct_ordered_in_taskq ) {
+        if (p->w_top <= p->p_top) {
+            /* we are not in a worksharing construct */
+            #ifdef BUILD_PARALLEL_ORDERED
+                /* do not report error messages for PARALLEL ORDERED */
+                KMP_ASSERT( ct == ct_ordered_in_parallel );
+            #else
+                __kmp_error_construct( kmp_i18n_msg_CnsBoundToWorksharing, ct, ident );
+            #endif /* BUILD_PARALLEL_ORDERED */
+        } else {
+            /* inside a WORKSHARING construct for this PARALLEL region */
+            if (!IS_CONS_TYPE_ORDERED(p->stack_data[ p->w_top ].type)) {
+                if (p->stack_data[ p->w_top ].type == ct_taskq) {
+                    __kmp_error_construct2(
+                        kmp_i18n_msg_CnsNotInTaskConstruct,
+                        ct, ident,
+                        & p->stack_data[ p->w_top ]
+                    );
+                } else {
+                    __kmp_error_construct2(
+                        kmp_i18n_msg_CnsNoOrderedClause,
+                        ct, ident,
+                        & p->stack_data[ p->w_top ]
+                    );
+               }
+            }
+        }
+        if (p->s_top > p->p_top && p->s_top > p->w_top) {
+            /* inside a sync construct which is inside a worksharing construct */
+            int index = p->s_top;
+            enum cons_type stack_type;
+
+            stack_type = p->stack_data[ index ].type;
+
+            if (stack_type == ct_critical ||
+                ( ( stack_type == ct_ordered_in_parallel ||
+                    stack_type == ct_ordered_in_pdo      ||
+                    stack_type == ct_ordered_in_taskq  ) &&     /* C doesn't allow named ordered; ordered in ordered gets an error */
+                 p->stack_data[ index ].ident != NULL &&
+                 (p->stack_data[ index ].ident->flags & KMP_IDENT_KMPC ))) {
+                /* we are in ORDERED which is inside an ORDERED or CRITICAL construct */
+                __kmp_error_construct2(
+                    kmp_i18n_msg_CnsInvalidNesting,
+                    ct, ident,
+                    & p->stack_data[ index ]
+                );
+            }
+        }
+    } else if ( ct == ct_critical ) {
+#if KMP_USE_DYNAMIC_LOCK
+        if ( lck != NULL && __kmp_get_user_lock_owner( lck, seq ) == gtid ) {    /* this same thread already has lock for this critical section */
+#else
+        if ( lck != NULL && __kmp_get_user_lock_owner( lck ) == gtid ) {    /* this same thread already has lock for this critical section */
+#endif
+            int index = p->s_top;
+            struct cons_data cons = { NULL, ct_critical, 0, NULL };
+            /* walk up construct stack and try to find critical with matching name */
+            while ( index != 0 && p->stack_data[ index ].name != lck ) {
+                index = p->stack_data[ index ].prev;
+            }
+            if ( index != 0 ) {
+                /* found a match on the stack (may not always happen because of interleaved critical sections in Fortran) */
+                cons = p->stack_data[ index ];
+            }
+            /* we are in CRITICAL which is inside a CRITICAL construct of the same name */
+            __kmp_error_construct2( kmp_i18n_msg_CnsNestingSameName, ct, ident, & cons );
+        }
+    } else if ( ct == ct_master || ct == ct_reduce ) {
+        if (p->w_top > p->p_top) {
+            /* inside a WORKSHARING construct for this PARALLEL region */
+           __kmp_error_construct2(
+               kmp_i18n_msg_CnsInvalidNesting,
+               ct, ident,
+               & p->stack_data[ p->w_top ]
+           );
+        }
+        if (ct == ct_reduce && p->s_top > p->p_top) {
+            /* inside another SYNC construct for this PARALLEL region */
+            __kmp_error_construct2(
+                kmp_i18n_msg_CnsInvalidNesting,
+                ct, ident,
+                & p->stack_data[ p->s_top ]
+            );
+        }; // if
+    }; // if
+}
+
+void
+#if KMP_USE_DYNAMIC_LOCK
+__kmp_push_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck, kmp_uint32 seq )
+#else
+__kmp_push_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck )
+#endif
+{
+    int         tos;
+    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
+
+    KMP_ASSERT( gtid == __kmp_get_gtid() );
+    KE_TRACE( 10, ("__kmp_push_sync (gtid=%d)\n", gtid ) );
+#if KMP_USE_DYNAMIC_LOCK
+    __kmp_check_sync( gtid, ct, ident, lck, seq );
+#else
+    __kmp_check_sync( gtid, ct, ident, lck );
+#endif
+    KE_TRACE( 100, ( PUSH_MSG( ct, ident ) ) );
+    tos = ++ p->stack_top;
+    p->stack_data[ tos ].type  = ct;
+    p->stack_data[ tos ].prev  = p->s_top;
+    p->stack_data[ tos ].ident = ident;
+    p->stack_data[ tos ].name  = lck;
+    p->s_top = tos;
+    KE_DUMP( 1000, dump_cons_stack( gtid, p ) );
+}
+
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_pop_parallel( int gtid, ident_t const * ident )
+{
+    int tos;
+    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
+    tos = p->stack_top;
+    KE_TRACE( 10, ("__kmp_pop_parallel (%d %d)\n", gtid, __kmp_get_gtid() ) );
+    if ( tos == 0 || p->p_top == 0 ) {
+        __kmp_error_construct( kmp_i18n_msg_CnsDetectedEnd, ct_parallel, ident );
+    }
+    if ( tos != p->p_top || p->stack_data[ tos ].type != ct_parallel ) {
+        __kmp_error_construct2(
+            kmp_i18n_msg_CnsExpectedEnd,
+            ct_parallel, ident,
+            & p->stack_data[ tos ]
+        );
+    }
+    KE_TRACE( 100, ( POP_MSG( p ) ) );
+    p->p_top = p->stack_data[ tos ].prev;
+    p->stack_data[ tos ].type = ct_none;
+    p->stack_data[ tos ].ident = NULL;
+    p->stack_top = tos - 1;
+    KE_DUMP( 1000, dump_cons_stack( gtid, p ) );
+}
+
+enum cons_type
+__kmp_pop_workshare( int gtid, enum cons_type ct, ident_t const * ident )
+{
+    int tos;
+    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
+
+    tos = p->stack_top;
+    KE_TRACE( 10, ("__kmp_pop_workshare (%d %d)\n", gtid, __kmp_get_gtid() ) );
+    if ( tos == 0 || p->w_top == 0 ) {
+        __kmp_error_construct( kmp_i18n_msg_CnsDetectedEnd, ct, ident );
+    }
+
+    if ( tos != p->w_top ||
+         ( p->stack_data[ tos ].type != ct &&
+          /* below are two exceptions to the rule that construct types must match */
+          ! ( p->stack_data[ tos ].type == ct_pdo_ordered && ct == ct_pdo ) &&
+          ! ( p->stack_data[ tos ].type == ct_task_ordered && ct == ct_task )
+         )
+       ) {
+        __kmp_check_null_func();
+        __kmp_error_construct2(
+            kmp_i18n_msg_CnsExpectedEnd,
+            ct, ident,
+            & p->stack_data[ tos ]
+        );
+    }
+    KE_TRACE( 100, ( POP_MSG( p ) ) );
+    p->w_top = p->stack_data[ tos ].prev;
+    p->stack_data[ tos ].type = ct_none;
+    p->stack_data[ tos ].ident = NULL;
+    p->stack_top = tos - 1;
+    KE_DUMP( 1000, dump_cons_stack( gtid, p ) );
+    return p->stack_data[ p->w_top ].type;
+}
+
+void
+__kmp_pop_sync( int gtid, enum cons_type ct, ident_t const * ident )
+{
+    int tos;
+    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
+    tos = p->stack_top;
+    KE_TRACE( 10, ("__kmp_pop_sync (%d %d)\n", gtid, __kmp_get_gtid() ) );
+    if ( tos == 0 || p->s_top == 0 ) {
+        __kmp_error_construct( kmp_i18n_msg_CnsDetectedEnd, ct, ident );
+    };
+    if ( tos != p->s_top || p->stack_data[ tos ].type != ct ) {
+        __kmp_check_null_func();
+        __kmp_error_construct2(
+            kmp_i18n_msg_CnsExpectedEnd,
+            ct, ident,
+            & p->stack_data[ tos ]
+        );
+    };
+    if ( gtid < 0 ) {
+        __kmp_check_null_func();
+    };
+    KE_TRACE( 100, ( POP_MSG( p ) ) );
+    p->s_top = p->stack_data[ tos ].prev;
+    p->stack_data[ tos ].type = ct_none;
+    p->stack_data[ tos ].ident = NULL;
+    p->stack_top = tos - 1;
+    KE_DUMP( 1000, dump_cons_stack( gtid, p ) );
+}
+
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_check_barrier( int gtid, enum cons_type ct, ident_t const * ident )
+{
+    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
+    KE_TRACE( 10, ("__kmp_check_barrier (loc: %p, gtid: %d %d)\n", ident, gtid, __kmp_get_gtid() ) );
+    if ( ident != 0 ) {
+        __kmp_check_null_func();
+    }
+    if ( p->w_top > p->p_top ) {
+        /* we are already in a WORKSHARING construct for this PARALLEL region */
+        __kmp_error_construct2(
+            kmp_i18n_msg_CnsInvalidNesting,
+            ct, ident,
+            & p->stack_data[ p->w_top ]
+        );
+    }
+    if (p->s_top > p->p_top) {
+        /* we are already in a SYNC construct for this PARALLEL region */
+        __kmp_error_construct2(
+            kmp_i18n_msg_CnsInvalidNesting,
+            ct, ident,
+            & p->stack_data[ p->s_top ]
+        );
+    }
+}
+
+/* ------------------------------------------------------------------------ */
+
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
diff --git a/final/runtime/src/kmp_error.h b/final/runtime/src/kmp_error.h
new file mode 100644
index 0000000..9dfe111
--- /dev/null
+++ b/final/runtime/src/kmp_error.h
@@ -0,0 +1,57 @@
+/*
+ * kmp_error.h -- KPTS functions for error checking at runtime.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_ERROR_H
+#define KMP_ERROR_H
+
+#include "kmp_i18n.h"
+
+/* ------------------------------------------------------------------------ */
+#ifdef __cplusplus
+    extern "C" {
+#endif
+
+void __kmp_error_construct(  kmp_i18n_id_t id, enum cons_type ct, ident_t const * ident );
+void __kmp_error_construct2( kmp_i18n_id_t id, enum cons_type ct, ident_t const * ident, struct cons_data const * cons );
+
+struct cons_header * __kmp_allocate_cons_stack( int gtid );
+void                 __kmp_free_cons_stack( void * ptr );
+
+void __kmp_push_parallel( int gtid, ident_t const * ident );
+void __kmp_push_workshare( int gtid, enum cons_type ct, ident_t const * ident );
+#if KMP_USE_DYNAMIC_LOCK
+void __kmp_push_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p name, kmp_uint32 );
+#else
+void __kmp_push_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p name );
+#endif
+
+void __kmp_check_workshare( int gtid, enum cons_type ct, ident_t const * ident );
+#if KMP_USE_DYNAMIC_LOCK
+void __kmp_check_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p name, kmp_uint32 );
+#else
+void __kmp_check_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p name );
+#endif
+
+void __kmp_pop_parallel( int gtid, ident_t const * ident );
+enum cons_type __kmp_pop_workshare( int gtid, enum cons_type ct, ident_t const * ident );
+void __kmp_pop_sync( int gtid, enum cons_type ct, ident_t const * ident );
+void __kmp_check_barrier( int gtid, enum cons_type ct, ident_t const * ident );
+
+#ifdef __cplusplus
+    } // extern "C"
+#endif
+
+#endif // KMP_ERROR_H
+
diff --git a/final/runtime/src/kmp_ftn_cdecl.c b/final/runtime/src/kmp_ftn_cdecl.c
new file mode 100644
index 0000000..51fa1bf
--- /dev/null
+++ b/final/runtime/src/kmp_ftn_cdecl.c
@@ -0,0 +1,35 @@
+/*
+ * kmp_ftn_cdecl.c -- Fortran __cdecl linkage support for OpenMP.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+
+#if KMP_OS_WINDOWS
+#   if defined  KMP_WIN_CDECL ||  !defined KMP_DYNAMIC_LIB
+#       define KMP_FTN_ENTRIES      KMP_FTN_UPPER
+#   endif
+#elif KMP_OS_UNIX
+#   define KMP_FTN_ENTRIES  KMP_FTN_PLAIN
+#endif
+
+// Note: This string is not printed when KMP_VERSION=1.
+char const __kmp_version_ftncdecl[] = KMP_VERSION_PREFIX "Fortran __cdecl OMP support: "
+#ifdef KMP_FTN_ENTRIES
+    "yes";
+#   define FTN_STDCALL 	/* no stdcall */
+#   include "kmp_ftn_os.h"
+#   include "kmp_ftn_entry.h"
+#else
+    "no";
+#endif /* KMP_FTN_ENTRIES */
diff --git a/final/runtime/src/kmp_ftn_entry.h b/final/runtime/src/kmp_ftn_entry.h
new file mode 100644
index 0000000..a04b284
--- /dev/null
+++ b/final/runtime/src/kmp_ftn_entry.h
@@ -0,0 +1,1245 @@
+/*
+ * kmp_ftn_entry.h -- Fortran entry linkage support for OpenMP.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef FTN_STDCALL
+# error The support file kmp_ftn_entry.h should not be compiled by itself.
+#endif
+
+#ifdef KMP_STUB
+    #include "kmp_stub.h"
+#endif
+
+#include "kmp_i18n.h"
+
+#ifdef __cplusplus
+    extern "C" {
+#endif // __cplusplus
+
+/*
+ * For compatibility with the GNU/MS OpenMP codegen, omp_set_num_threads(),
+ * omp_set_nested(), and omp_set_dynamic() [in lowercase on MS, and w/o
+ * a trailing underscore on Linux* OS] take call-by-value integer arguments.
+ * + omp_set_max_active_levels()
+ * + omp_set_schedule()
+ *
+ * For backward compatibility with 9.1 and previous Intel compilers, these
+ * entry points take call-by-reference integer arguments.
+ */
+#ifdef KMP_GOMP_COMPAT
+# if (KMP_FTN_ENTRIES == KMP_FTN_PLAIN) || (KMP_FTN_ENTRIES == KMP_FTN_UPPER)
+#  define PASS_ARGS_BY_VALUE 1
+# endif
+#endif
+#if KMP_OS_WINDOWS
+# if (KMP_FTN_ENTRIES == KMP_FTN_PLAIN) || (KMP_FTN_ENTRIES == KMP_FTN_APPEND)
+#  define PASS_ARGS_BY_VALUE 1
+# endif
+#endif
+
+// This macro helps to reduce code duplication.
+#ifdef PASS_ARGS_BY_VALUE
+    #define KMP_DEREF
+#else
+    #define KMP_DEREF *
+#endif
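The KMP_DEREF trick lets one function body serve both calling conventions: with
PASS_ARGS_BY_VALUE defined, "int KMP_DEREF arg" expands to "int arg" and every
use of "KMP_DEREF arg" to plain "arg"; otherwise the parameter becomes
"int * arg" and each use dereferences it. A toy demonstration of the same
pattern (the macro and function names are hypothetical, chosen to avoid
clashing with the real ones):

    #include <stdio.h>

    /* Define PASS_ARGS_BY_VALUE to get the by-value expansion instead. */
    #ifdef PASS_ARGS_BY_VALUE
        #define DEREF            /* argument arrives by value   */
    #else
        #define DEREF *          /* argument arrives by pointer */
    #endif

    static void set_stacksize( int DEREF arg ) {
        printf( "stacksize = %d\n", (int)( DEREF arg ) );
    }

    int main( void ) {
    #ifdef PASS_ARGS_BY_VALUE
        set_stacksize( 4096 );
    #else
        int v = 4096;
        set_stacksize( & v );
    #endif
        return 0;
    }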
+
+void  FTN_STDCALL
+FTN_SET_STACKSIZE( int KMP_DEREF arg )
+{
+    #ifdef KMP_STUB
+        __kmps_set_stacksize( KMP_DEREF arg );
+    #else
+        // __kmp_aux_set_stacksize initializes the library if needed
+        __kmp_aux_set_stacksize( (size_t) KMP_DEREF arg );
+    #endif
+}
+
+void  FTN_STDCALL
+FTN_SET_STACKSIZE_S( size_t KMP_DEREF arg )
+{
+    #ifdef KMP_STUB
+        __kmps_set_stacksize( KMP_DEREF arg );
+    #else
+        // __kmp_aux_set_stacksize initializes the library if needed
+        __kmp_aux_set_stacksize( KMP_DEREF arg );
+    #endif
+}
+
+int FTN_STDCALL
+FTN_GET_STACKSIZE( void )
+{
+    #ifdef KMP_STUB
+        return __kmps_get_stacksize();
+    #else
+        if ( ! __kmp_init_serial ) {
+            __kmp_serial_initialize();
+        };
+        return (int)__kmp_stksize;
+    #endif
+}
+
+size_t FTN_STDCALL
+FTN_GET_STACKSIZE_S( void )
+{
+    #ifdef KMP_STUB
+        return __kmps_get_stacksize();
+    #else
+        if ( ! __kmp_init_serial ) {
+            __kmp_serial_initialize();
+        };
+        return __kmp_stksize;
+    #endif
+}
+
+void FTN_STDCALL
+FTN_SET_BLOCKTIME( int KMP_DEREF arg )
+{
+    #ifdef KMP_STUB
+        __kmps_set_blocktime( KMP_DEREF arg );
+    #else
+	int gtid, tid;
+	kmp_info_t *thread;
+
+	gtid = __kmp_entry_gtid();
+	tid = __kmp_tid_from_gtid(gtid);
+	thread = __kmp_thread_from_gtid(gtid);
+
+        __kmp_aux_set_blocktime( KMP_DEREF arg, thread, tid );
+    #endif
+}
+
+int FTN_STDCALL
+FTN_GET_BLOCKTIME( void )
+{
+    #ifdef KMP_STUB
+        return __kmps_get_blocktime();
+    #else
+	int gtid, tid;
+	kmp_info_t *thread;
+        kmp_team_p *team;
+
+	gtid = __kmp_entry_gtid();
+	tid = __kmp_tid_from_gtid(gtid);
+	thread = __kmp_thread_from_gtid(gtid);
+        team = __kmp_threads[ gtid ] -> th.th_team;
+
+        /* These must match the settings used in __kmp_wait_sleep() */
+        if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
+	    KF_TRACE(10, ( "kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n",
+			  gtid, team->t.t_id, tid, KMP_MAX_BLOCKTIME) );
+            return KMP_MAX_BLOCKTIME;
+        }
+#ifdef KMP_ADJUST_BLOCKTIME
+        else if ( __kmp_zero_bt && !get__bt_set( team, tid ) ) {
+	    KF_TRACE(10, ( "kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n",
+			  gtid, team->t.t_id, tid, 0) );
+            return 0;
+        }
+#endif /* KMP_ADJUST_BLOCKTIME */
+        else {
+	    KF_TRACE(10, ( "kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n",
+              gtid, team->t.t_id, tid, get__blocktime( team, tid ) ) );
+            return get__blocktime( team, tid );
+        };
+    #endif
+}
+
+void FTN_STDCALL
+FTN_SET_LIBRARY_SERIAL( void )
+{
+    #ifdef KMP_STUB
+        __kmps_set_library( library_serial );
+    #else
+        // __kmp_user_set_library initializes the library if needed
+        __kmp_user_set_library( library_serial );
+    #endif
+}
+
+void FTN_STDCALL
+FTN_SET_LIBRARY_TURNAROUND( void )
+{
+    #ifdef KMP_STUB
+        __kmps_set_library( library_turnaround );
+    #else
+        // __kmp_user_set_library initializes the library if needed
+        __kmp_user_set_library( library_turnaround );
+    #endif
+}
+
+void FTN_STDCALL
+FTN_SET_LIBRARY_THROUGHPUT( void )
+{
+    #ifdef KMP_STUB
+        __kmps_set_library( library_throughput );
+    #else
+        // __kmp_user_set_library initializes the library if needed
+        __kmp_user_set_library( library_throughput );
+    #endif
+}
+
+void FTN_STDCALL
+FTN_SET_LIBRARY( int KMP_DEREF arg )
+{
+    #ifdef KMP_STUB
+        __kmps_set_library( KMP_DEREF arg );
+    #else
+        enum library_type lib;
+        lib = (enum library_type) KMP_DEREF arg;
+        // __kmp_user_set_library initializes the library if needed
+        __kmp_user_set_library( lib );
+    #endif
+}
+
+int FTN_STDCALL
+FTN_GET_LIBRARY (void)
+{
+    #ifdef KMP_STUB
+        return __kmps_get_library();
+    #else
+        if ( ! __kmp_init_serial ) {
+            __kmp_serial_initialize();
+        }
+        return ((int) __kmp_library);
+    #endif
+}
+
+int FTN_STDCALL
+FTN_SET_AFFINITY( void **mask )
+{
+    #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+        return -1;
+    #else
+        if ( ! TCR_4(__kmp_init_middle) ) {
+            __kmp_middle_initialize();
+        }
+        return __kmp_aux_set_affinity( mask );
+    #endif
+}
+
+int FTN_STDCALL
+FTN_GET_AFFINITY( void **mask )
+{
+    #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+        return -1;
+    #else
+        if ( ! TCR_4(__kmp_init_middle) ) {
+            __kmp_middle_initialize();
+        }
+        return __kmp_aux_get_affinity( mask );
+    #endif
+}
+
+int FTN_STDCALL
+FTN_GET_AFFINITY_MAX_PROC( void )
+{
+    #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+        return 0;
+    #else
+        //
+        // We really only NEED serial initialization here.
+        //
+        if ( ! TCR_4(__kmp_init_middle) ) {
+            __kmp_middle_initialize();
+        }
+        if ( ! ( KMP_AFFINITY_CAPABLE() ) ) {
+            return 0;
+        }
+
+    #if KMP_GROUP_AFFINITY
+        if ( __kmp_num_proc_groups > 1 ) {
+            return (int)KMP_CPU_SETSIZE;
+        }
+    #endif /* KMP_GROUP_AFFINITY */
+        return __kmp_xproc;
+    #endif
+}
+
+void FTN_STDCALL
+FTN_CREATE_AFFINITY_MASK( void **mask )
+{
+    #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+        *mask = NULL;
+    #else
+        //
+        // We really only NEED serial initialization here.
+        //
+        if ( ! TCR_4(__kmp_init_middle) ) {
+            __kmp_middle_initialize();
+        }
+        *mask = kmpc_malloc( __kmp_affin_mask_size );
+        KMP_CPU_ZERO( (kmp_affin_mask_t *)(*mask) );
+    #endif
+}
+
+void FTN_STDCALL
+FTN_DESTROY_AFFINITY_MASK( void **mask )
+{
+    #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+        // Nothing
+    #else
+        //
+        // We really only NEED serial initialization here.
+        //
+        if ( ! TCR_4(__kmp_init_middle) ) {
+            __kmp_middle_initialize();
+        }
+        if ( __kmp_env_consistency_check ) {
+            if ( *mask == NULL ) {
+	        KMP_FATAL( AffinityInvalidMask, "kmp_destroy_affinity_mask" );
+	    }
+        }
+        kmpc_free( *mask );
+        *mask = NULL;
+    #endif
+}
+
+int FTN_STDCALL
+FTN_SET_AFFINITY_MASK_PROC( int KMP_DEREF proc, void **mask )
+{
+    #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+        return -1;
+    #else
+        if ( ! TCR_4(__kmp_init_middle) ) {
+            __kmp_middle_initialize();
+        }
+        return __kmp_aux_set_affinity_mask_proc( KMP_DEREF proc, mask );
+    #endif
+}
+
+int FTN_STDCALL
+FTN_UNSET_AFFINITY_MASK_PROC( int KMP_DEREF proc, void **mask )
+{
+    #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+        return -1;
+    #else
+        if ( ! TCR_4(__kmp_init_middle) ) {
+            __kmp_middle_initialize();
+        }
+        return __kmp_aux_unset_affinity_mask_proc( KMP_DEREF proc, mask );
+    #endif
+}
+
+int FTN_STDCALL
+FTN_GET_AFFINITY_MASK_PROC( int KMP_DEREF proc, void **mask )
+{
+    #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+        return -1;
+    #else
+        if ( ! TCR_4(__kmp_init_middle) ) {
+            __kmp_middle_initialize();
+        }
+        return __kmp_aux_get_affinity_mask_proc( KMP_DEREF proc, mask );
+    #endif
+}
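Taken together, these entry points back the kmp_* affinity-mask extension API
that user code reaches through omp.h. A hedged user-side sketch -- the
0-on-success return convention is an assumption here, not something this diff
states:

    #include <omp.h>     /* Intel/LLVM OpenMP affinity-mask extensions */
    #include <stdio.h>

    int main( void ) {
        kmp_affinity_mask_t mask;
        kmp_create_affinity_mask( & mask );
        if ( kmp_set_affinity_mask_proc( 0, & mask ) == 0 &&
             kmp_set_affinity( & mask ) == 0 ) {
            printf( "bound to proc 0 of %d\n", kmp_get_affinity_max_proc() );
        }
        kmp_destroy_affinity_mask( & mask );
        return 0;
    }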
+
+
+/* ------------------------------------------------------------------------ */
+
+/* sets the requested number of threads for the next parallel region */
+
+void FTN_STDCALL
+xexpand(FTN_SET_NUM_THREADS)( int KMP_DEREF arg )
+{
+    #ifdef KMP_STUB
+        // Nothing.
+    #else
+        __kmp_set_num_threads( KMP_DEREF arg, __kmp_entry_gtid() );
+    #endif
+}
+
+
+/* returns the number of threads in current team */
+int FTN_STDCALL
+xexpand(FTN_GET_NUM_THREADS)( void )
+{
+    #ifdef KMP_STUB
+        return 1;
+    #else
+        // __kmpc_bound_num_threads initializes the library if needed
+        return __kmpc_bound_num_threads(NULL);
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_GET_MAX_THREADS)( void )
+{
+    #ifdef KMP_STUB
+        return 1;
+    #else
+        int         gtid;
+        kmp_info_t *thread;
+        if ( ! TCR_4(__kmp_init_middle) ) {
+            __kmp_middle_initialize();
+        }
+        gtid   = __kmp_entry_gtid();
+        thread = __kmp_threads[ gtid ];
+        //return thread -> th.th_team -> t.t_current_task[ thread->th.th_info.ds.ds_tid ] -> icvs.nproc;
+	return thread -> th.th_current_task -> td_icvs.nproc;
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_GET_THREAD_NUM)( void )
+{
+    #ifdef KMP_STUB
+        return 0;
+    #else
+        int gtid;
+
+        #if KMP_OS_DARWIN || KMP_OS_FREEBSD
+            gtid = __kmp_entry_gtid();
+        #elif KMP_OS_WINDOWS
+            if (!__kmp_init_parallel ||
+                (gtid = (int)((kmp_intptr_t)TlsGetValue( __kmp_gtid_threadprivate_key ))) == 0) {
+                // Either library isn't initialized or thread is not registered
+                // 0 is the correct TID in this case
+                return 0;
+            }
+            --gtid; // We keep (gtid+1) in TLS
+        #elif KMP_OS_LINUX
+            #ifdef KMP_TDATA_GTID
+            if ( __kmp_gtid_mode >= 3 ) {
+                if ((gtid = __kmp_gtid) == KMP_GTID_DNE) {
+                    return 0;
+                }
+            } else {
+            #endif
+                if (!__kmp_init_parallel ||
+                    (gtid = (kmp_intptr_t)(pthread_getspecific( __kmp_gtid_threadprivate_key ))) == 0) {
+                    return 0;
+                }
+                --gtid;
+            #ifdef KMP_TDATA_GTID
+            }
+            #endif
+        #else
+            #error Unknown or unsupported OS
+        #endif
+
+        return __kmp_tid_from_gtid( gtid );
+    #endif
+}
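Both the Windows and Linux branches above rely on the same convention: the
runtime stores gtid + 1 in thread-local storage, so a raw TLS value of 0
unambiguously means "no gtid registered" rather than "gtid 0". A sketch of that
convention with POSIX TLS (the key name is hypothetical):

    #include <pthread.h>
    #include <stdint.h>

    static pthread_key_t gtid_key;   /* hypothetical; created once elsewhere */

    static void store_gtid( int gtid ) {
        /* store gtid + 1, so a raw TLS value of 0 means "not registered" */
        pthread_setspecific( gtid_key, (void *)(intptr_t)( gtid + 1 ) );
    }

    static int load_gtid( void ) {
        int v = (int)(intptr_t) pthread_getspecific( gtid_key );
        return ( v == 0 ) ? -1 : v - 1;  /* -1: no gtid registered */
    }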
+
+int FTN_STDCALL
+FTN_GET_NUM_KNOWN_THREADS( void )
+{
+    #ifdef KMP_STUB
+        return 1;
+    #else
+        if ( ! __kmp_init_serial ) {
+            __kmp_serial_initialize();
+        }
+        /* NOTE: this is not synchronized, so it can change at any moment */
+        /* NOTE: this number also includes threads preallocated in hot-teams */
+        return TCR_4(__kmp_nth);
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_GET_NUM_PROCS)( void )
+{
+    #ifdef KMP_STUB
+        return 1;
+    #else
+        if ( ! TCR_4(__kmp_init_middle) ) {
+            __kmp_middle_initialize();
+        }
+        return __kmp_avail_proc;
+    #endif
+}
+
+void FTN_STDCALL
+xexpand(FTN_SET_NESTED)( int KMP_DEREF flag )
+{
+    #ifdef KMP_STUB
+        __kmps_set_nested( KMP_DEREF flag );
+    #else
+        kmp_info_t *thread;
+        /* For the thread-private internal controls implementation */
+        thread = __kmp_entry_thread();
+        __kmp_save_internal_controls( thread );
+        set__nested( thread, ( (KMP_DEREF flag) ? TRUE : FALSE ) );
+    #endif
+}
+
+
+int FTN_STDCALL
+xexpand(FTN_GET_NESTED)( void )
+{
+    #ifdef KMP_STUB
+        return __kmps_get_nested();
+    #else
+        kmp_info_t *thread;
+        thread = __kmp_entry_thread();
+        return get__nested( thread );
+    #endif
+}
+
+void FTN_STDCALL
+xexpand(FTN_SET_DYNAMIC)( int KMP_DEREF flag )
+{
+    #ifdef KMP_STUB
+        __kmps_set_dynamic( KMP_DEREF flag ? TRUE : FALSE );
+    #else
+        kmp_info_t *thread;
+        /* For the thread-private implementation of the internal controls */
+        thread = __kmp_entry_thread();
+        // !!! What if foreign thread calls it?
+        __kmp_save_internal_controls( thread );
+        set__dynamic( thread, KMP_DEREF flag ? TRUE : FALSE );
+    #endif
+}
+
+
+int FTN_STDCALL
+xexpand(FTN_GET_DYNAMIC)( void )
+{
+    #ifdef KMP_STUB
+        return __kmps_get_dynamic();
+    #else
+        kmp_info_t *thread;
+        thread = __kmp_entry_thread();
+        return get__dynamic( thread );
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_IN_PARALLEL)( void )
+{
+    #ifdef KMP_STUB
+        return 0;
+    #else
+        kmp_info_t *th = __kmp_entry_thread();
+#if OMP_40_ENABLED
+        if ( th->th.th_teams_microtask ) {
+            // AC: r_in_parallel does not work inside a teams construct,
+            //     where the real parallel region is inactive but all threads
+            //     share the same root, so setting it in one team affects other teams.
+            //     The solution is to use the per-team nesting level.
+            return ( th->th.th_team->t.t_active_level ? 1 : 0 );
+        }
+        else
+#endif /* OMP_40_ENABLED */
+            return ( th->th.th_root->r.r_in_parallel ? FTN_TRUE : FTN_FALSE );
+    #endif
+}
+
+void FTN_STDCALL
+xexpand(FTN_SET_SCHEDULE)( kmp_sched_t KMP_DEREF kind, int KMP_DEREF modifier )
+{
+    #ifdef KMP_STUB
+        __kmps_set_schedule( KMP_DEREF kind, KMP_DEREF modifier );
+    #else
+	/*  TO DO  */
+        /* For the per-task implementation of the internal controls */
+        __kmp_set_schedule( __kmp_entry_gtid(), KMP_DEREF kind, KMP_DEREF modifier );
+    #endif
+}
+
+void FTN_STDCALL
+xexpand(FTN_GET_SCHEDULE)( kmp_sched_t * kind, int * modifier )
+{
+    #ifdef KMP_STUB
+        __kmps_get_schedule( kind, modifier );
+    #else
+	/*  TO DO  */
+	/* For the per-task implementation of the internal controls */
+        __kmp_get_schedule( __kmp_entry_gtid(), kind, modifier );
+    #endif
+}
+
+void FTN_STDCALL
+xexpand(FTN_SET_MAX_ACTIVE_LEVELS)( int KMP_DEREF arg )
+{
+    #ifdef KMP_STUB
+	// Nothing.
+    #else
+	/*  TO DO  */
+        /* We want per-task implementation of this internal control */
+        __kmp_set_max_active_levels( __kmp_entry_gtid(), KMP_DEREF arg );
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_GET_MAX_ACTIVE_LEVELS)( void )
+{
+    #ifdef KMP_STUB
+	return 0;
+    #else
+	/*  TO DO  */
+	/* We want per-task implementation of this internal control */
+	return __kmp_get_max_active_levels( __kmp_entry_gtid() );
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_GET_ACTIVE_LEVEL)( void )
+{
+    #ifdef KMP_STUB
+	return 0; // returns 0 if it is called from the sequential part of the program
+    #else
+	/*  TO DO  */
+	/* For the per-task implementation of the internal controls */
+        return __kmp_entry_thread() -> th.th_team -> t.t_active_level;
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_GET_LEVEL)( void )
+{
+    #ifdef KMP_STUB
+	return 0; // returns 0 if it is called from the sequential part of the program
+    #else
+	/*  TO DO  */
+	/* For the per-task implementation of the internal controls */
+        return __kmp_entry_thread() -> th.th_team -> t.t_level;
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_GET_ANCESTOR_THREAD_NUM)( int KMP_DEREF level )
+{
+    #ifdef KMP_STUB
+	return ( KMP_DEREF level ) ? ( -1 ) : ( 0 );
+    #else
+	return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), KMP_DEREF level );
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_GET_TEAM_SIZE)( int KMP_DEREF level )
+{
+    #ifdef KMP_STUB
+        return ( KMP_DEREF level ) ? ( -1 ) : ( 1 );
+    #else
+        return __kmp_get_team_size( __kmp_entry_gtid(), KMP_DEREF level );
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_GET_THREAD_LIMIT)( void )
+{
+    #ifdef KMP_STUB
+	return 1;   // TO DO: clarify whether it returns 1 or 0?
+    #else
+        if ( ! __kmp_init_serial ) {
+            __kmp_serial_initialize();
+        };
+        /* global ICV */
+	return __kmp_max_nth;
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_IN_FINAL)( void )
+{
+    #ifdef KMP_STUB
+	return 0;   // TO DO: clarify whether it returns 1 or 0?
+    #else
+        if ( ! TCR_4(__kmp_init_parallel) ) {
+            return 0;
+        }
+	return __kmp_entry_thread() -> th.th_current_task -> td_flags.final;
+    #endif
+}
+
+#if OMP_40_ENABLED
+
+
+kmp_proc_bind_t FTN_STDCALL
+xexpand(FTN_GET_PROC_BIND)( void )
+{
+    #ifdef KMP_STUB
+        return __kmps_get_proc_bind();
+    #else
+        return get__proc_bind( __kmp_entry_thread() );
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_GET_NUM_TEAMS)( void )
+{
+    #ifdef KMP_STUB
+        return 1;
+    #else
+        kmp_info_t *thr = __kmp_entry_thread();
+        if ( thr->th.th_teams_microtask ) {
+            kmp_team_t *team = thr->th.th_team;
+            int tlevel = thr->th.th_teams_level;
+            int ii = team->t.t_level;            // the level of the teams construct
+            int dd = team -> t.t_serialized;
+            int level = tlevel + 1;
+            KMP_DEBUG_ASSERT( ii >= tlevel );
+            while( ii > level )
+            {
+                for( dd = team -> t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
+                {
+                }
+                if( team -> t.t_serialized && ( !dd ) ) {
+                    team = team->t.t_parent;
+                    continue;
+                }
+                if( ii > level ) {
+                    team = team->t.t_parent;
+                    ii--;
+                }
+            }
+            if ( dd > 1 ) {
+                return 1;  // teams region is serialized ( 1 team of 1 thread ).
+            } else {
+                return team->t.t_parent->t.t_nproc;
+            }
+        } else {
+            return 1;
+        }
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_GET_TEAM_NUM)( void )
+{
+    #ifdef KMP_STUB
+        return 0;
+    #else
+        kmp_info_t *thr = __kmp_entry_thread();
+        if ( thr->th.th_teams_microtask ) {
+            kmp_team_t *team = thr->th.th_team;
+            int tlevel = thr->th.th_teams_level; // the level of the teams construct
+            int ii = team->t.t_level;
+            int dd = team -> t.t_serialized;
+            int level = tlevel + 1;
+            KMP_DEBUG_ASSERT( ii >= tlevel );
+            while( ii > level )
+            {
+                for( dd = team -> t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
+                {
+                }
+                if( team -> t.t_serialized && ( !dd ) ) {
+                    team = team->t.t_parent;
+                    continue;
+                }
+                if( ii > level ) {
+                    team = team->t.t_parent;
+                    ii--;
+                }
+            }
+            if ( dd > 1 ) {
+                return 0;  // teams region is serialized ( 1 team of 1 thread ).
+            } else {
+                return team->t.t_master_tid;
+            }
+        } else {
+            return 0;
+        }
+    #endif
+}
+
+#if KMP_MIC || KMP_OS_DARWIN
+
+static int __kmp_default_device = 0;
+
+int FTN_STDCALL
+FTN_GET_DEFAULT_DEVICE( void )
+{
+    return __kmp_default_device;
+}
+
+void FTN_STDCALL
+FTN_SET_DEFAULT_DEVICE( int KMP_DEREF arg )
+{
+    __kmp_default_device = KMP_DEREF arg;
+}
+
+int FTN_STDCALL
+FTN_GET_NUM_DEVICES( void )
+{
+    return 0;
+}
+
+#endif // KMP_MIC || KMP_OS_DARWIN
+
+#if ! KMP_OS_LINUX
+
+int FTN_STDCALL
+xexpand(FTN_IS_INITIAL_DEVICE)( void )
+{
+    return 1;
+}
+
+#else
+
+// This internal function is used when the entry from the offload library
+// is not found.
+int _Offload_get_device_number( void )  __attribute__((weak));
+
+int FTN_STDCALL
+xexpand(FTN_IS_INITIAL_DEVICE)( void )
+{
+    if( _Offload_get_device_number ) {
+        return _Offload_get_device_number() == -1;
+    } else {
+        return 1;
+    }
+}
+
+#endif // ! KMP_OS_LINUX
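The weak declaration above is a standard ELF technique: if no strong definition
of _Offload_get_device_number is linked in, the symbol's address resolves to
NULL, so taking its address doubles as a presence test. A generic sketch of the
pattern (the function name is hypothetical):

    #include <stdio.h>

    /* Weak declaration: resolves to NULL if no object file defines it. */
    int optional_feature( void ) __attribute__(( weak ));

    int main( void ) {
        if ( optional_feature ) {        /* non-NULL => a definition exists */
            printf( "feature: %d\n", optional_feature() );
        } else {
            printf( "feature not linked in\n" );
        }
        return 0;
    }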
+
+#endif // OMP_40_ENABLED
+
+#ifdef KMP_STUB
+typedef enum { UNINIT = -1, UNLOCKED, LOCKED } kmp_stub_lock_t;
+#endif /* KMP_STUB */
+
+#if KMP_USE_DYNAMIC_LOCK
+void FTN_STDCALL
+FTN_INIT_LOCK_HINTED( void **user_lock, int KMP_DEREF hint )
+{
+    #ifdef KMP_STUB
+        *((kmp_stub_lock_t *)user_lock) = UNLOCKED;
+    #else
+        __kmp_init_lock_hinted( user_lock, KMP_DEREF hint );
+    #endif
+}
+
+void FTN_STDCALL
+FTN_INIT_NEST_LOCK_HINTED( void **user_lock, int KMP_DEREF hint )
+{
+    #ifdef KMP_STUB
+        *((kmp_stub_lock_t *)user_lock) = UNLOCKED;
+    #else
+        __kmp_init_nest_lock_hinted( user_lock, KMP_DEREF hint );
+    #endif
+}
+#endif
+
+/* initialize the lock */
+void FTN_STDCALL
+xexpand(FTN_INIT_LOCK)( void **user_lock )
+{
+    #ifdef KMP_STUB
+        *((kmp_stub_lock_t *)user_lock) = UNLOCKED;
+    #else
+        __kmpc_init_lock( NULL, __kmp_entry_gtid(), user_lock );
+    #endif
+}
+
+/* initialize the lock */
+void FTN_STDCALL
+xexpand(FTN_INIT_NEST_LOCK)( void **user_lock )
+{
+    #ifdef KMP_STUB
+        *((kmp_stub_lock_t *)user_lock) = UNLOCKED;
+    #else
+        __kmpc_init_nest_lock( NULL, __kmp_entry_gtid(), user_lock );
+    #endif
+}
+
+void FTN_STDCALL
+xexpand(FTN_DESTROY_LOCK)( void **user_lock )
+{
+    #ifdef KMP_STUB
+        *((kmp_stub_lock_t *)user_lock) = UNINIT;
+    #else
+        __kmpc_destroy_lock( NULL, __kmp_entry_gtid(), user_lock );
+    #endif
+}
+
+void FTN_STDCALL
+xexpand(FTN_DESTROY_NEST_LOCK)( void **user_lock )
+{
+    #ifdef KMP_STUB
+        *((kmp_stub_lock_t *)user_lock) = UNINIT;
+    #else
+        __kmpc_destroy_nest_lock( NULL, __kmp_entry_gtid(), user_lock );
+    #endif
+}
+
+void FTN_STDCALL
+xexpand(FTN_SET_LOCK)( void **user_lock )
+{
+    #ifdef KMP_STUB
+        if ( *((kmp_stub_lock_t *)user_lock) == UNINIT ) {
+            // TODO: Issue an error.
+        }; // if
+        if ( *((kmp_stub_lock_t *)user_lock) != UNLOCKED ) {
+            // TODO: Issue an error.
+        }; // if
+        *((kmp_stub_lock_t *)user_lock) = LOCKED;
+    #else
+        __kmpc_set_lock( NULL, __kmp_entry_gtid(), user_lock );
+    #endif
+}
+
+void FTN_STDCALL
+xexpand(FTN_SET_NEST_LOCK)( void **user_lock )
+{
+    #ifdef KMP_STUB
+        if ( *((kmp_stub_lock_t *)user_lock) == UNINIT ) {
+            // TODO: Issue an error.
+        }; // if
+        (*((int *)user_lock))++;
+    #else
+        __kmpc_set_nest_lock( NULL, __kmp_entry_gtid(), user_lock );
+    #endif
+}
+
+void FTN_STDCALL
+xexpand(FTN_UNSET_LOCK)( void **user_lock )
+{
+    #ifdef KMP_STUB
+        if ( *((kmp_stub_lock_t *)user_lock) == UNINIT ) {
+            // TODO: Issue an error.
+        }; // if
+        if ( *((kmp_stub_lock_t *)user_lock) == UNLOCKED ) {
+            // TODO: Issue an error.
+        }; // if
+        *((kmp_stub_lock_t *)user_lock) = UNLOCKED;
+    #else
+        __kmpc_unset_lock( NULL, __kmp_entry_gtid(), user_lock );
+    #endif
+}
+
+void FTN_STDCALL
+xexpand(FTN_UNSET_NEST_LOCK)( void **user_lock )
+{
+    #ifdef KMP_STUB
+        if ( *((kmp_stub_lock_t *)user_lock) == UNINIT ) {
+            // TODO: Issue an error.
+        }; // if
+        if ( *((kmp_stub_lock_t *)user_lock) == UNLOCKED ) {
+            // TODO: Issue an error.
+        }; // if
+        (*((int *)user_lock))--;
+    #else
+        __kmpc_unset_nest_lock( NULL, __kmp_entry_gtid(), user_lock );
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_TEST_LOCK)( void **user_lock )
+{
+    #ifdef KMP_STUB
+        if ( *((kmp_stub_lock_t *)user_lock) == UNINIT ) {
+            // TODO: Issue an error.
+        }; // if
+        if ( *((kmp_stub_lock_t *)user_lock) == LOCKED ) {
+            return 0;
+        }; // if
+        *((kmp_stub_lock_t *)user_lock) = LOCKED;
+        return 1;
+    #else
+        return __kmpc_test_lock( NULL, __kmp_entry_gtid(), user_lock );
+    #endif
+}
+
+int FTN_STDCALL
+xexpand(FTN_TEST_NEST_LOCK)( void **user_lock )
+{
+    #ifdef KMP_STUB
+        if ( *((kmp_stub_lock_t *)user_lock) == UNINIT ) {
+            // TODO: Issue an error.
+        }; // if
+        return ++(*((int *)user_lock));
+    #else
+        return __kmpc_test_nest_lock( NULL, __kmp_entry_gtid(), user_lock );
+    #endif
+}
+
+double FTN_STDCALL
+xexpand(FTN_GET_WTIME)( void )
+{
+    #ifdef KMP_STUB
+        return __kmps_get_wtime();
+    #else
+        double data;
+        #if ! KMP_OS_LINUX
+        // We don't need library initialization to get the time on Linux* OS.
+        // The routine can be used to measure library initialization time on Linux* OS now.
+        if ( ! __kmp_init_serial ) {
+            __kmp_serial_initialize();
+        };
+        #endif
+        __kmp_elapsed( & data );
+        return data;
+    #endif
+}
+
+double FTN_STDCALL
+xexpand(FTN_GET_WTICK)( void )
+{
+    #ifdef KMP_STUB
+        return __kmps_get_wtick();
+    #else
+        double data;
+        if ( ! __kmp_init_serial ) {
+            __kmp_serial_initialize();
+        };
+        __kmp_elapsed_tick( & data );
+        return data;
+    #endif
+}
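These two entry points back the standard omp_get_wtime()/omp_get_wtick()
timers. Typical user-side usage:

    #include <omp.h>
    #include <stdio.h>

    int main( void ) {
        double t0 = omp_get_wtime();
        /* ... work to be timed ... */
        double t1 = omp_get_wtime();
        printf( "elapsed: %f s (tick = %g s)\n", t1 - t0, omp_get_wtick() );
        return 0;
    }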
+
+/* ------------------------------------------------------------------------ */
+
+void * FTN_STDCALL
+FTN_MALLOC( size_t KMP_DEREF size )
+{
+    // kmpc_malloc initializes the library if needed
+    return kmpc_malloc( KMP_DEREF size );
+}
+
+void * FTN_STDCALL
+FTN_CALLOC( size_t KMP_DEREF nelem, size_t KMP_DEREF elsize )
+{
+    // kmpc_calloc initializes the library if needed
+    return kmpc_calloc( KMP_DEREF nelem, KMP_DEREF elsize );
+}
+
+void * FTN_STDCALL
+FTN_REALLOC( void * KMP_DEREF ptr, size_t KMP_DEREF size )
+{
+    // kmpc_realloc initializes the library if needed
+    return kmpc_realloc( KMP_DEREF ptr, KMP_DEREF size );
+}
+
+void FTN_STDCALL
+FTN_FREE( void * KMP_DEREF ptr )
+{
+    // does nothing if the library is not initialized
+    kmpc_free( KMP_DEREF ptr );
+}
+
+void FTN_STDCALL
+FTN_SET_WARNINGS_ON( void )
+{
+    #ifndef KMP_STUB
+        __kmp_generate_warnings = kmp_warnings_explicit;
+    #endif
+}
+
+void FTN_STDCALL
+FTN_SET_WARNINGS_OFF( void )
+{
+    #ifndef KMP_STUB
+        __kmp_generate_warnings = FALSE;
+    #endif
+}
+
+void FTN_STDCALL
+FTN_SET_DEFAULTS( char const * str
+    #ifndef PASS_ARGS_BY_VALUE
+        , int len
+    #endif
+)
+{
+    #ifndef KMP_STUB
+        #ifdef PASS_ARGS_BY_VALUE
+            int len = (int)KMP_STRLEN( str );
+        #endif
+        __kmp_aux_set_defaults( str, len );
+    #endif
+}
+
+/* ------------------------------------------------------------------------ */
+
+
+#if OMP_40_ENABLED
+/* returns the status of cancellation */
+int FTN_STDCALL
+xexpand(FTN_GET_CANCELLATION)(void) {
+#ifdef KMP_STUB
+    return 0 /* false */;
+#else
+    // initialize the library if needed
+    if ( ! __kmp_init_serial ) {
+        __kmp_serial_initialize();
+    }
+    return __kmp_omp_cancellation;
+#endif
+}
+
+int FTN_STDCALL
+FTN_GET_CANCELLATION_STATUS(int cancel_kind) {
+#ifdef KMP_STUB
+    return 0 /* false */;
+#else
+    return __kmp_get_cancellation_status(cancel_kind);
+#endif
+}
+
+#endif // OMP_40_ENABLED
+
+// GCC compatibility (versioned symbols)
+#ifdef KMP_USE_VERSION_SYMBOLS
+
+/*
+    The following sections create function aliases (dummy symbols) for the omp_* routines.
+    These aliases are then versioned according to how libgomp ``versions'' its
+    symbols (OMP_1.0, OMP_2.0, OMP_3.0, ...), while also retaining the
+    default version that libomp uses: VERSION (defined in exports_so.txt).
+    If you want to see the versioned symbols for libgomp.so.1 then just type:
+
+    objdump -T /path/to/libgomp.so.1 | grep omp_
+
+    Example:
+    Step 1)  Create __kmp_api_omp_set_num_threads_10_alias
+             which is alias of __kmp_api_omp_set_num_threads
+    Step 2)  Set __kmp_api_omp_set_num_threads_10_alias to version: omp_set_num_threads@OMP_1.0
+    Step 2B) Set __kmp_api_omp_set_num_threads to default version : omp_set_num_threads@@VERSION
+*/
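For concreteness, the usual GNU mechanism behind such macros pairs an alias
with a .symver directive, with the version nodes (OMP_1.0, VERSION, ...)
supplied by a linker version script. A hedged sketch of the general technique
-- the actual xaliasify/xversionify definitions live elsewhere in the runtime
and may differ; the names below are illustrative:

    /* Step 0: the implementation symbol itself. */
    void impl_set_num_threads( int n ) { (void) n; /* ... real work ... */ }

    /* Step 1: a dummy alias symbol for the implementation. */
    void impl_set_num_threads_10_alias( int n )
        __attribute__(( alias( "impl_set_num_threads" ) ));

    /* Step 2: bind the alias to OMP_1.0, and the real symbol to the
       default version (note the double @@). */
    __asm__( ".symver impl_set_num_threads_10_alias, omp_set_num_threads@OMP_1.0" );
    __asm__( ".symver impl_set_num_threads,          omp_set_num_threads@@VERSION" );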
+
+// OMP_1.0 aliases
+xaliasify(FTN_SET_NUM_THREADS,   10);
+xaliasify(FTN_GET_NUM_THREADS,   10);
+xaliasify(FTN_GET_MAX_THREADS,   10);
+xaliasify(FTN_GET_THREAD_NUM,    10);
+xaliasify(FTN_GET_NUM_PROCS,     10);
+xaliasify(FTN_IN_PARALLEL,       10);
+xaliasify(FTN_SET_DYNAMIC,       10);
+xaliasify(FTN_GET_DYNAMIC,       10);
+xaliasify(FTN_SET_NESTED,        10);
+xaliasify(FTN_GET_NESTED,        10);
+xaliasify(FTN_INIT_LOCK,         10);
+xaliasify(FTN_INIT_NEST_LOCK,    10);
+xaliasify(FTN_DESTROY_LOCK,      10);
+xaliasify(FTN_DESTROY_NEST_LOCK, 10);
+xaliasify(FTN_SET_LOCK,          10);
+xaliasify(FTN_SET_NEST_LOCK,     10);
+xaliasify(FTN_UNSET_LOCK,        10);
+xaliasify(FTN_UNSET_NEST_LOCK,   10);
+xaliasify(FTN_TEST_LOCK,         10);
+xaliasify(FTN_TEST_NEST_LOCK,    10);
+
+// OMP_2.0 aliases
+xaliasify(FTN_GET_WTICK, 20);
+xaliasify(FTN_GET_WTIME, 20);
+
+// OMP_3.0 aliases
+xaliasify(FTN_SET_SCHEDULE,            30);
+xaliasify(FTN_GET_SCHEDULE,            30);
+xaliasify(FTN_GET_THREAD_LIMIT,        30);
+xaliasify(FTN_SET_MAX_ACTIVE_LEVELS,   30);
+xaliasify(FTN_GET_MAX_ACTIVE_LEVELS,   30);
+xaliasify(FTN_GET_LEVEL,               30);
+xaliasify(FTN_GET_ANCESTOR_THREAD_NUM, 30);
+xaliasify(FTN_GET_TEAM_SIZE,           30);
+xaliasify(FTN_GET_ACTIVE_LEVEL,        30);
+xaliasify(FTN_INIT_LOCK,               30);
+xaliasify(FTN_INIT_NEST_LOCK,          30);
+xaliasify(FTN_DESTROY_LOCK,            30);
+xaliasify(FTN_DESTROY_NEST_LOCK,       30);
+xaliasify(FTN_SET_LOCK,                30);
+xaliasify(FTN_SET_NEST_LOCK,           30);
+xaliasify(FTN_UNSET_LOCK,              30);
+xaliasify(FTN_UNSET_NEST_LOCK,         30);
+xaliasify(FTN_TEST_LOCK,               30);
+xaliasify(FTN_TEST_NEST_LOCK,          30);
+
+// OMP_3.1 aliases
+xaliasify(FTN_IN_FINAL, 31);
+
+#if OMP_40_ENABLED
+// OMP_4.0 aliases
+xaliasify(FTN_GET_PROC_BIND, 40);
+xaliasify(FTN_GET_NUM_TEAMS, 40);
+xaliasify(FTN_GET_TEAM_NUM, 40);
+xaliasify(FTN_GET_CANCELLATION, 40);
+xaliasify(FTN_IS_INITIAL_DEVICE, 40);
+#endif /* OMP_40_ENABLED */
+
+#if OMP_41_ENABLED
+// OMP_4.1 aliases
+#endif
+
+#if OMP_50_ENABLED
+// OMP_5.0 aliases
+#endif
+
+// OMP_1.0 versioned symbols
+xversionify(FTN_SET_NUM_THREADS,   10, "OMP_1.0");
+xversionify(FTN_GET_NUM_THREADS,   10, "OMP_1.0");
+xversionify(FTN_GET_MAX_THREADS,   10, "OMP_1.0");
+xversionify(FTN_GET_THREAD_NUM,    10, "OMP_1.0");
+xversionify(FTN_GET_NUM_PROCS,     10, "OMP_1.0");
+xversionify(FTN_IN_PARALLEL,       10, "OMP_1.0");
+xversionify(FTN_SET_DYNAMIC,       10, "OMP_1.0");
+xversionify(FTN_GET_DYNAMIC,       10, "OMP_1.0");
+xversionify(FTN_SET_NESTED,        10, "OMP_1.0");
+xversionify(FTN_GET_NESTED,        10, "OMP_1.0");
+xversionify(FTN_INIT_LOCK,         10, "OMP_1.0");
+xversionify(FTN_INIT_NEST_LOCK,    10, "OMP_1.0");
+xversionify(FTN_DESTROY_LOCK,      10, "OMP_1.0");
+xversionify(FTN_DESTROY_NEST_LOCK, 10, "OMP_1.0");
+xversionify(FTN_SET_LOCK,          10, "OMP_1.0");
+xversionify(FTN_SET_NEST_LOCK,     10, "OMP_1.0");
+xversionify(FTN_UNSET_LOCK,        10, "OMP_1.0");
+xversionify(FTN_UNSET_NEST_LOCK,   10, "OMP_1.0");
+xversionify(FTN_TEST_LOCK,         10, "OMP_1.0");
+xversionify(FTN_TEST_NEST_LOCK,    10, "OMP_1.0");
+
+// OMP_2.0 versioned symbols
+xversionify(FTN_GET_WTICK,         20, "OMP_2.0");
+xversionify(FTN_GET_WTIME,         20, "OMP_2.0");
+
+// OMP_3.0 versioned symbols
+xversionify(FTN_SET_SCHEDULE,      30, "OMP_3.0");
+xversionify(FTN_GET_SCHEDULE,      30, "OMP_3.0");
+xversionify(FTN_GET_THREAD_LIMIT,        30, "OMP_3.0");
+xversionify(FTN_SET_MAX_ACTIVE_LEVELS,   30, "OMP_3.0");
+xversionify(FTN_GET_MAX_ACTIVE_LEVELS,   30, "OMP_3.0");
+xversionify(FTN_GET_ANCESTOR_THREAD_NUM, 30, "OMP_3.0");
+xversionify(FTN_GET_LEVEL,               30, "OMP_3.0");
+xversionify(FTN_GET_TEAM_SIZE,     30, "OMP_3.0");
+xversionify(FTN_GET_ACTIVE_LEVEL,  30, "OMP_3.0");
+
+// the lock routines have a 1.0 and 3.0 version
+xversionify(FTN_INIT_LOCK,         30, "OMP_3.0");
+xversionify(FTN_INIT_NEST_LOCK,    30, "OMP_3.0");
+xversionify(FTN_DESTROY_LOCK,      30, "OMP_3.0");
+xversionify(FTN_DESTROY_NEST_LOCK, 30, "OMP_3.0");
+xversionify(FTN_SET_LOCK,          30, "OMP_3.0");
+xversionify(FTN_SET_NEST_LOCK,     30, "OMP_3.0");
+xversionify(FTN_UNSET_LOCK,        30, "OMP_3.0");
+xversionify(FTN_UNSET_NEST_LOCK,   30, "OMP_3.0");
+xversionify(FTN_TEST_LOCK,         30, "OMP_3.0");
+xversionify(FTN_TEST_NEST_LOCK,    30, "OMP_3.0");
+
+// OMP_3.1 versioned symbol
+xversionify(FTN_IN_FINAL,          31, "OMP_3.1");
+
+#if OMP_40_ENABLED
+// OMP_4.0 versioned symbols
+xversionify(FTN_GET_PROC_BIND,     40, "OMP_4.0");
+xversionify(FTN_GET_NUM_TEAMS,     40, "OMP_4.0");
+xversionify(FTN_GET_TEAM_NUM,      40, "OMP_4.0");
+xversionify(FTN_GET_CANCELLATION,  40, "OMP_4.0");
+xversionify(FTN_IS_INITIAL_DEVICE, 40, "OMP_4.0");
+#endif /* OMP_40_ENABLED */
+
+#if OMP_41_ENABLED
+// OMP_4.1 versioned symbols
+#endif
+
+#if OMP_50_ENABLED
+// OMP_5.0 versioned symbols
+#endif
+
+#endif // KMP_USE_VERSION_SYMBOLS
+
+#ifdef __cplusplus
+    } //extern "C"
+#endif // __cplusplus
+
+// end of file //
diff --git a/final/runtime/src/kmp_ftn_extra.c b/final/runtime/src/kmp_ftn_extra.c
new file mode 100644
index 0000000..1d0fb4c
--- /dev/null
+++ b/final/runtime/src/kmp_ftn_extra.c
@@ -0,0 +1,33 @@
+/*
+ * kmp_ftn_extra.c -- Fortran 'extra' linkage support for OpenMP.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+
+#if KMP_OS_WINDOWS
+#   define KMP_FTN_ENTRIES KMP_FTN_PLAIN
+#elif KMP_OS_UNIX
+#   define KMP_FTN_ENTRIES KMP_FTN_APPEND
+#endif
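+
+// Illustrative note: with the mapping in kmp_ftn_os.h, KMP_FTN_PLAIN keeps
+// plain lower-case names (e.g. kmp_set_defaults) while KMP_FTN_APPEND adds a
+// trailing underscore (kmp_set_defaults_), matching common Unix Fortran name
+// mangling.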
+
+// Note: This string is not printed when KMP_VERSION=1.
+char const __kmp_version_ftnextra[] = KMP_VERSION_PREFIX "Fortran \"extra\" OMP support: "
+#ifdef KMP_FTN_ENTRIES
+    "yes";
+#   define FTN_STDCALL /* nothing to do */
+#   include "kmp_ftn_os.h"
+#   include "kmp_ftn_entry.h"
+#else
+    "no";
+#endif /* KMP_FTN_ENTRIES */
diff --git a/final/runtime/src/kmp_ftn_os.h b/final/runtime/src/kmp_ftn_os.h
new file mode 100644
index 0000000..197779f
--- /dev/null
+++ b/final/runtime/src/kmp_ftn_os.h
@@ -0,0 +1,532 @@
+/*
+ * kmp_ftn_os.h -- KPTS Fortran defines header file.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_FTN_OS_H
+#define KMP_FTN_OS_H
+
+// KMP_FTN_ENTRIES may be one of: KMP_FTN_PLAIN, KMP_FTN_UPPER, KMP_FTN_APPEND, KMP_FTN_UAPPEND.
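+//
+// For illustration, under each mangling scheme below omp_get_thread_num is
+// exported as:
+//     KMP_FTN_PLAIN:    omp_get_thread_num
+//     KMP_FTN_APPEND:   omp_get_thread_num_
+//     KMP_FTN_UPPER:    OMP_GET_THREAD_NUM
+//     KMP_FTN_UAPPEND:  OMP_GET_THREAD_NUM_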
+
+
+/* -------------------------- External definitions ------------------------ */
+
+#if KMP_FTN_ENTRIES == KMP_FTN_PLAIN
+
+    #define FTN_SET_STACKSIZE                    kmp_set_stacksize
+    #define FTN_SET_STACKSIZE_S                  kmp_set_stacksize_s
+    #define FTN_GET_STACKSIZE                    kmp_get_stacksize
+    #define FTN_GET_STACKSIZE_S                  kmp_get_stacksize_s
+    #define FTN_SET_BLOCKTIME                    kmp_set_blocktime
+    #define FTN_GET_BLOCKTIME                    kmp_get_blocktime
+    #define FTN_SET_LIBRARY_SERIAL               kmp_set_library_serial
+    #define FTN_SET_LIBRARY_TURNAROUND           kmp_set_library_turnaround
+    #define FTN_SET_LIBRARY_THROUGHPUT           kmp_set_library_throughput
+    #define FTN_SET_LIBRARY                      kmp_set_library
+    #define FTN_GET_LIBRARY                      kmp_get_library
+    #define FTN_SET_DEFAULTS                     kmp_set_defaults
+    #define FTN_SET_AFFINITY                     kmp_set_affinity
+    #define FTN_GET_AFFINITY                     kmp_get_affinity
+    #define FTN_GET_AFFINITY_MAX_PROC            kmp_get_affinity_max_proc
+    #define FTN_CREATE_AFFINITY_MASK             kmp_create_affinity_mask
+    #define FTN_DESTROY_AFFINITY_MASK            kmp_destroy_affinity_mask
+    #define FTN_SET_AFFINITY_MASK_PROC           kmp_set_affinity_mask_proc
+    #define FTN_UNSET_AFFINITY_MASK_PROC         kmp_unset_affinity_mask_proc
+    #define FTN_GET_AFFINITY_MASK_PROC           kmp_get_affinity_mask_proc
+
+    #define FTN_MALLOC                           kmp_malloc
+    #define FTN_CALLOC                           kmp_calloc
+    #define FTN_REALLOC                          kmp_realloc
+    #define FTN_FREE                             kmp_free
+
+    #define FTN_GET_NUM_KNOWN_THREADS            kmp_get_num_known_threads
+
+    #define FTN_SET_NUM_THREADS                  omp_set_num_threads
+    #define FTN_GET_NUM_THREADS                  omp_get_num_threads
+    #define FTN_GET_MAX_THREADS                  omp_get_max_threads
+    #define FTN_GET_THREAD_NUM                   omp_get_thread_num
+    #define FTN_GET_NUM_PROCS                    omp_get_num_procs
+    #define FTN_SET_DYNAMIC                      omp_set_dynamic
+    #define FTN_GET_DYNAMIC                      omp_get_dynamic
+    #define FTN_SET_NESTED                       omp_set_nested
+    #define FTN_GET_NESTED                       omp_get_nested
+    #define FTN_IN_PARALLEL                      omp_in_parallel
+    #define FTN_GET_THREAD_LIMIT                 omp_get_thread_limit
+    #define FTN_SET_SCHEDULE                     omp_set_schedule
+    #define FTN_GET_SCHEDULE                     omp_get_schedule
+    #define FTN_SET_MAX_ACTIVE_LEVELS            omp_set_max_active_levels
+    #define FTN_GET_MAX_ACTIVE_LEVELS            omp_get_max_active_levels
+    #define FTN_GET_ACTIVE_LEVEL                 omp_get_active_level
+    #define FTN_GET_LEVEL                        omp_get_level
+    #define FTN_GET_ANCESTOR_THREAD_NUM          omp_get_ancestor_thread_num
+    #define FTN_GET_TEAM_SIZE                    omp_get_team_size
+    #define FTN_IN_FINAL                         omp_in_final
+//  #define FTN_SET_PROC_BIND                    omp_set_proc_bind
+    #define FTN_GET_PROC_BIND                    omp_get_proc_bind
+//  #define FTN_CURR_PROC_BIND                   omp_curr_proc_bind
+#if OMP_40_ENABLED
+    #define FTN_GET_NUM_TEAMS                    omp_get_num_teams
+    #define FTN_GET_TEAM_NUM                     omp_get_team_num
+#endif
+    #define FTN_INIT_LOCK                        omp_init_lock
+#if KMP_USE_DYNAMIC_LOCK
+    #define FTN_INIT_LOCK_HINTED                 kmp_init_lock_hinted
+    #define FTN_INIT_NEST_LOCK_HINTED            kmp_init_nest_lock_hinted
+#endif
+    #define FTN_DESTROY_LOCK                     omp_destroy_lock
+    #define FTN_SET_LOCK                         omp_set_lock
+    #define FTN_UNSET_LOCK                       omp_unset_lock
+    #define FTN_TEST_LOCK                        omp_test_lock
+    #define FTN_INIT_NEST_LOCK                   omp_init_nest_lock
+    #define FTN_DESTROY_NEST_LOCK                omp_destroy_nest_lock
+    #define FTN_SET_NEST_LOCK                    omp_set_nest_lock
+    #define FTN_UNSET_NEST_LOCK                  omp_unset_nest_lock
+    #define FTN_TEST_NEST_LOCK                   omp_test_nest_lock
+
+    #define FTN_SET_WARNINGS_ON                  kmp_set_warnings_on
+    #define FTN_SET_WARNINGS_OFF                 kmp_set_warnings_off
+
+    #define FTN_GET_WTIME                        omp_get_wtime
+    #define FTN_GET_WTICK                        omp_get_wtick
+
+#if OMP_40_ENABLED
+#if KMP_MIC || KMP_OS_DARWIN
+    #define FTN_GET_DEFAULT_DEVICE               omp_get_default_device
+    #define FTN_SET_DEFAULT_DEVICE               omp_set_default_device
+    #define FTN_GET_NUM_DEVICES                  omp_get_num_devices
+#endif
+    #define FTN_IS_INITIAL_DEVICE                omp_is_initial_device
+#endif
+
+#if OMP_40_ENABLED
+    #define FTN_GET_CANCELLATION                 omp_get_cancellation
+    #define FTN_GET_CANCELLATION_STATUS          kmp_get_cancellation_status
+#endif
+
+#endif /* KMP_FTN_PLAIN */
+
+/* ------------------------------------------------------------------------ */
+
+#if KMP_FTN_ENTRIES == KMP_FTN_APPEND
+
+    #define FTN_SET_STACKSIZE                    kmp_set_stacksize_
+    #define FTN_SET_STACKSIZE_S                  kmp_set_stacksize_s_
+    #define FTN_GET_STACKSIZE                    kmp_get_stacksize_
+    #define FTN_GET_STACKSIZE_S                  kmp_get_stacksize_s_
+    #define FTN_SET_BLOCKTIME                    kmp_set_blocktime_
+    #define FTN_GET_BLOCKTIME                    kmp_get_blocktime_
+    #define FTN_SET_LIBRARY_SERIAL               kmp_set_library_serial_
+    #define FTN_SET_LIBRARY_TURNAROUND           kmp_set_library_turnaround_
+    #define FTN_SET_LIBRARY_THROUGHPUT           kmp_set_library_throughput_
+    #define FTN_SET_LIBRARY                      kmp_set_library_
+    #define FTN_GET_LIBRARY                      kmp_get_library_
+    #define FTN_SET_DEFAULTS                     kmp_set_defaults_
+    #define FTN_SET_AFFINITY                     kmp_set_affinity_
+    #define FTN_GET_AFFINITY                     kmp_get_affinity_
+    #define FTN_GET_AFFINITY_MAX_PROC            kmp_get_affinity_max_proc_
+    #define FTN_CREATE_AFFINITY_MASK             kmp_create_affinity_mask_
+    #define FTN_DESTROY_AFFINITY_MASK            kmp_destroy_affinity_mask_
+    #define FTN_SET_AFFINITY_MASK_PROC           kmp_set_affinity_mask_proc_
+    #define FTN_UNSET_AFFINITY_MASK_PROC         kmp_unset_affinity_mask_proc_
+    #define FTN_GET_AFFINITY_MASK_PROC           kmp_get_affinity_mask_proc_
+
+    #define FTN_MALLOC                           kmp_malloc_
+    #define FTN_CALLOC                           kmp_calloc_
+    #define FTN_REALLOC                          kmp_realloc_
+    #define FTN_FREE                             kmp_free_
+
+    #define FTN_GET_NUM_KNOWN_THREADS            kmp_get_num_known_threads_
+
+    #define FTN_SET_NUM_THREADS                  omp_set_num_threads_
+    #define FTN_GET_NUM_THREADS                  omp_get_num_threads_
+    #define FTN_GET_MAX_THREADS                  omp_get_max_threads_
+    #define FTN_GET_THREAD_NUM                   omp_get_thread_num_
+    #define FTN_GET_NUM_PROCS                    omp_get_num_procs_
+    #define FTN_SET_DYNAMIC                      omp_set_dynamic_
+    #define FTN_GET_DYNAMIC                      omp_get_dynamic_
+    #define FTN_SET_NESTED                       omp_set_nested_
+    #define FTN_GET_NESTED                       omp_get_nested_
+    #define FTN_IN_PARALLEL                      omp_in_parallel_
+    #define FTN_GET_THREAD_LIMIT                 omp_get_thread_limit_
+    #define FTN_SET_SCHEDULE                     omp_set_schedule_
+    #define FTN_GET_SCHEDULE                     omp_get_schedule_
+    #define FTN_SET_MAX_ACTIVE_LEVELS            omp_set_max_active_levels_
+    #define FTN_GET_MAX_ACTIVE_LEVELS            omp_get_max_active_levels_
+    #define FTN_GET_ACTIVE_LEVEL                 omp_get_active_level_
+    #define FTN_GET_LEVEL                        omp_get_level_
+    #define FTN_GET_ANCESTOR_THREAD_NUM          omp_get_ancestor_thread_num_
+    #define FTN_GET_TEAM_SIZE                    omp_get_team_size_
+    #define FTN_IN_FINAL                         omp_in_final_
+//  #define FTN_SET_PROC_BIND                    omp_set_proc_bind_
+    #define FTN_GET_PROC_BIND                    omp_get_proc_bind_
+//  #define FTN_CURR_PROC_BIND                   omp_curr_proc_bind_
+#if OMP_40_ENABLED
+    #define FTN_GET_NUM_TEAMS                    omp_get_num_teams_
+    #define FTN_GET_TEAM_NUM                     omp_get_team_num_
+#endif
+    #define FTN_INIT_LOCK                        omp_init_lock_
+#if KMP_USE_DYNAMIC_LOCK
+    #define FTN_INIT_LOCK_HINTED                 kmp_init_lock_hinted_
+    #define FTN_INIT_NEST_LOCK_HINTED            kmp_init_nest_lock_hinted_
+#endif
+    #define FTN_DESTROY_LOCK                     omp_destroy_lock_
+    #define FTN_SET_LOCK                         omp_set_lock_
+    #define FTN_UNSET_LOCK                       omp_unset_lock_
+    #define FTN_TEST_LOCK                        omp_test_lock_
+    #define FTN_INIT_NEST_LOCK                   omp_init_nest_lock_
+    #define FTN_DESTROY_NEST_LOCK                omp_destroy_nest_lock_
+    #define FTN_SET_NEST_LOCK                    omp_set_nest_lock_
+    #define FTN_UNSET_NEST_LOCK                  omp_unset_nest_lock_
+    #define FTN_TEST_NEST_LOCK                   omp_test_nest_lock_
+
+    #define FTN_SET_WARNINGS_ON                  kmp_set_warnings_on_
+    #define FTN_SET_WARNINGS_OFF                 kmp_set_warnings_off_
+
+    #define FTN_GET_WTIME                        omp_get_wtime_
+    #define FTN_GET_WTICK                        omp_get_wtick_
+
+#if OMP_40_ENABLED
+#if KMP_MIC || KMP_OS_DARWIN
+    #define FTN_GET_DEFAULT_DEVICE               omp_get_default_device_
+    #define FTN_SET_DEFAULT_DEVICE               omp_set_default_device_
+    #define FTN_GET_NUM_DEVICES                  omp_get_num_devices_
+#endif
+    #define FTN_IS_INITIAL_DEVICE                omp_is_initial_device_
+#endif
+
+
+#if OMP_40_ENABLED
+    #define FTN_GET_CANCELLATION                 omp_get_cancellation_
+    #define FTN_GET_CANCELLATION_STATUS          kmp_get_cancellation_status_
+#endif
+
+#endif /* KMP_FTN_APPEND */
+
+/* ------------------------------------------------------------------------ */
+
+#if KMP_FTN_ENTRIES == KMP_FTN_UPPER
+
+    #define FTN_SET_STACKSIZE                    KMP_SET_STACKSIZE
+    #define FTN_SET_STACKSIZE_S                  KMP_SET_STACKSIZE_S
+    #define FTN_GET_STACKSIZE                    KMP_GET_STACKSIZE
+    #define FTN_GET_STACKSIZE_S                  KMP_GET_STACKSIZE_S
+    #define FTN_SET_BLOCKTIME                    KMP_SET_BLOCKTIME
+    #define FTN_GET_BLOCKTIME                    KMP_GET_BLOCKTIME
+    #define FTN_SET_LIBRARY_SERIAL               KMP_SET_LIBRARY_SERIAL
+    #define FTN_SET_LIBRARY_TURNAROUND           KMP_SET_LIBRARY_TURNAROUND
+    #define FTN_SET_LIBRARY_THROUGHPUT           KMP_SET_LIBRARY_THROUGHPUT
+    #define FTN_SET_LIBRARY                      KMP_SET_LIBRARY
+    #define FTN_GET_LIBRARY                      KMP_GET_LIBRARY
+    #define FTN_SET_DEFAULTS                     KMP_SET_DEFAULTS
+    #define FTN_SET_AFFINITY                     KMP_SET_AFFINITY
+    #define FTN_GET_AFFINITY                     KMP_GET_AFFINITY
+    #define FTN_GET_AFFINITY_MAX_PROC            KMP_GET_AFFINITY_MAX_PROC
+    #define FTN_CREATE_AFFINITY_MASK             KMP_CREATE_AFFINITY_MASK
+    #define FTN_DESTROY_AFFINITY_MASK            KMP_DESTROY_AFFINITY_MASK
+    #define FTN_SET_AFFINITY_MASK_PROC           KMP_SET_AFFINITY_MASK_PROC
+    #define FTN_UNSET_AFFINITY_MASK_PROC         KMP_UNSET_AFFINITY_MASK_PROC
+    #define FTN_GET_AFFINITY_MASK_PROC           KMP_GET_AFFINITY_MASK_PROC
+
+    #define FTN_MALLOC                           KMP_MALLOC
+    #define FTN_CALLOC                           KMP_CALLOC
+    #define FTN_REALLOC                          KMP_REALLOC
+    #define FTN_FREE                             KMP_FREE
+
+    #define FTN_GET_NUM_KNOWN_THREADS            KMP_GET_NUM_KNOWN_THREADS
+
+    #define FTN_SET_NUM_THREADS                  OMP_SET_NUM_THREADS
+    #define FTN_GET_NUM_THREADS                  OMP_GET_NUM_THREADS
+    #define FTN_GET_MAX_THREADS                  OMP_GET_MAX_THREADS
+    #define FTN_GET_THREAD_NUM                   OMP_GET_THREAD_NUM
+    #define FTN_GET_NUM_PROCS                    OMP_GET_NUM_PROCS
+    #define FTN_SET_DYNAMIC                      OMP_SET_DYNAMIC
+    #define FTN_GET_DYNAMIC                      OMP_GET_DYNAMIC
+    #define FTN_SET_NESTED                       OMP_SET_NESTED
+    #define FTN_GET_NESTED                       OMP_GET_NESTED
+    #define FTN_IN_PARALLEL                      OMP_IN_PARALLEL
+    #define FTN_GET_THREAD_LIMIT                 OMP_GET_THREAD_LIMIT
+    #define FTN_SET_SCHEDULE                     OMP_SET_SCHEDULE
+    #define FTN_GET_SCHEDULE                     OMP_GET_SCHEDULE
+    #define FTN_SET_MAX_ACTIVE_LEVELS            OMP_SET_MAX_ACTIVE_LEVELS
+    #define FTN_GET_MAX_ACTIVE_LEVELS            OMP_GET_MAX_ACTIVE_LEVELS
+    #define FTN_GET_ACTIVE_LEVEL                 OMP_GET_ACTIVE_LEVEL
+    #define FTN_GET_LEVEL                        OMP_GET_LEVEL
+    #define FTN_GET_ANCESTOR_THREAD_NUM          OMP_GET_ANCESTOR_THREAD_NUM
+    #define FTN_GET_TEAM_SIZE                    OMP_GET_TEAM_SIZE
+    #define FTN_IN_FINAL                         OMP_IN_FINAL
+//  #define FTN_SET_PROC_BIND                    OMP_SET_PROC_BIND
+    #define FTN_GET_PROC_BIND                    OMP_GET_PROC_BIND
+//  #define FTN_CURR_PROC_BIND                   OMP_CURR_PROC_BIND
+#if OMP_40_ENABLED
+    #define FTN_GET_NUM_TEAMS                    OMP_GET_NUM_TEAMS
+    #define FTN_GET_TEAM_NUM                     OMP_GET_TEAM_NUM
+#endif
+    #define FTN_INIT_LOCK                        OMP_INIT_LOCK
+#if KMP_USE_DYNAMIC_LOCK
+    #define FTN_INIT_LOCK_HINTED                 KMP_INIT_LOCK_HINTED
+    #define FTN_INIT_NEST_LOCK_HINTED            KMP_INIT_NEST_LOCK_HINTED
+#endif
+    #define FTN_DESTROY_LOCK                     OMP_DESTROY_LOCK
+    #define FTN_SET_LOCK                         OMP_SET_LOCK
+    #define FTN_UNSET_LOCK                       OMP_UNSET_LOCK
+    #define FTN_TEST_LOCK                        OMP_TEST_LOCK
+    #define FTN_INIT_NEST_LOCK                   OMP_INIT_NEST_LOCK
+    #define FTN_DESTROY_NEST_LOCK                OMP_DESTROY_NEST_LOCK
+    #define FTN_SET_NEST_LOCK                    OMP_SET_NEST_LOCK
+    #define FTN_UNSET_NEST_LOCK                  OMP_UNSET_NEST_LOCK
+    #define FTN_TEST_NEST_LOCK                   OMP_TEST_NEST_LOCK
+
+    #define FTN_SET_WARNINGS_ON                  KMP_SET_WARNINGS_ON
+    #define FTN_SET_WARNINGS_OFF                 KMP_SET_WARNINGS_OFF
+
+    #define FTN_GET_WTIME                        OMP_GET_WTIME
+    #define FTN_GET_WTICK                        OMP_GET_WTICK
+
+#if OMP_40_ENABLED
+#if KMP_MIC || KMP_OS_DARWIN
+    #define FTN_GET_DEFAULT_DEVICE               OMP_GET_DEFAULT_DEVICE
+    #define FTN_SET_DEFAULT_DEVICE               OMP_SET_DEFAULT_DEVICE
+    #define FTN_GET_NUM_DEVICES                  OMP_GET_NUM_DEVICES
+#endif
+    #define FTN_IS_INITIAL_DEVICE                OMP_IS_INITIAL_DEVICE
+#endif
+
+
+#if OMP_40_ENABLED
+    #define FTN_GET_CANCELLATION                 OMP_GET_CANCELLATION
+    #define FTN_GET_CANCELLATION_STATUS          KMP_GET_CANCELLATION_STATUS
+#endif
+
+#endif /* KMP_FTN_UPPER */
+
+/* ------------------------------------------------------------------------ */
+
+#if KMP_FTN_ENTRIES == KMP_FTN_UAPPEND
+
+    #define FTN_SET_STACKSIZE                    KMP_SET_STACKSIZE_
+    #define FTN_SET_STACKSIZE_S                  KMP_SET_STACKSIZE_S_
+    #define FTN_GET_STACKSIZE                    KMP_GET_STACKSIZE_
+    #define FTN_GET_STACKSIZE_S                  KMP_GET_STACKSIZE_S_
+    #define FTN_SET_BLOCKTIME                    KMP_SET_BLOCKTIME_
+    #define FTN_GET_BLOCKTIME                    KMP_GET_BLOCKTIME_
+    #define FTN_SET_LIBRARY_SERIAL               KMP_SET_LIBRARY_SERIAL_
+    #define FTN_SET_LIBRARY_TURNAROUND           KMP_SET_LIBRARY_TURNAROUND_
+    #define FTN_SET_LIBRARY_THROUGHPUT           KMP_SET_LIBRARY_THROUGHPUT_
+    #define FTN_SET_LIBRARY                      KMP_SET_LIBRARY_
+    #define FTN_GET_LIBRARY                      KMP_GET_LIBRARY_
+    #define FTN_SET_DEFAULTS                     KMP_SET_DEFAULTS_
+    #define FTN_SET_AFFINITY                     KMP_SET_AFFINITY_
+    #define FTN_GET_AFFINITY                     KMP_GET_AFFINITY_
+    #define FTN_GET_AFFINITY_MAX_PROC            KMP_GET_AFFINITY_MAX_PROC_
+    #define FTN_CREATE_AFFINITY_MASK             KMP_CREATE_AFFINITY_MASK_
+    #define FTN_DESTROY_AFFINITY_MASK            KMP_DESTROY_AFFINITY_MASK_
+    #define FTN_SET_AFFINITY_MASK_PROC           KMP_SET_AFFINITY_MASK_PROC_
+    #define FTN_UNSET_AFFINITY_MASK_PROC         KMP_UNSET_AFFINITY_MASK_PROC_
+    #define FTN_GET_AFFINITY_MASK_PROC           KMP_GET_AFFINITY_MASK_PROC_
+
+    #define FTN_MALLOC                           KMP_MALLOC_
+    #define FTN_CALLOC                           KMP_CALLOC_
+    #define FTN_REALLOC                          KMP_REALLOC_
+    #define FTN_FREE                             KMP_FREE_
+
+    #define FTN_GET_NUM_KNOWN_THREADS            KMP_GET_NUM_KNOWN_THREADS_
+
+    #define FTN_SET_NUM_THREADS                  OMP_SET_NUM_THREADS_
+    #define FTN_GET_NUM_THREADS                  OMP_GET_NUM_THREADS_
+    #define FTN_GET_MAX_THREADS                  OMP_GET_MAX_THREADS_
+    #define FTN_GET_THREAD_NUM                   OMP_GET_THREAD_NUM_
+    #define FTN_GET_NUM_PROCS                    OMP_GET_NUM_PROCS_
+    #define FTN_SET_DYNAMIC                      OMP_SET_DYNAMIC_
+    #define FTN_GET_DYNAMIC                      OMP_GET_DYNAMIC_
+    #define FTN_SET_NESTED                       OMP_SET_NESTED_
+    #define FTN_GET_NESTED                       OMP_GET_NESTED_
+    #define FTN_IN_PARALLEL                      OMP_IN_PARALLEL_
+    #define FTN_GET_THREAD_LIMIT                 OMP_GET_THREAD_LIMIT_
+    #define FTN_SET_SCHEDULE                     OMP_SET_SCHEDULE_
+    #define FTN_GET_SCHEDULE                     OMP_GET_SCHEDULE_
+    #define FTN_SET_MAX_ACTIVE_LEVELS            OMP_SET_MAX_ACTIVE_LEVELS_
+    #define FTN_GET_MAX_ACTIVE_LEVELS            OMP_GET_MAX_ACTIVE_LEVELS_
+    #define FTN_GET_ACTIVE_LEVEL                 OMP_GET_ACTIVE_LEVEL_
+    #define FTN_GET_LEVEL                        OMP_GET_LEVEL_
+    #define FTN_GET_ANCESTOR_THREAD_NUM          OMP_GET_ANCESTOR_THREAD_NUM_
+    #define FTN_GET_TEAM_SIZE                    OMP_GET_TEAM_SIZE_
+    #define FTN_IN_FINAL                         OMP_IN_FINAL_
+//  #define FTN_SET_PROC_BIND                    OMP_SET_PROC_BIND_
+    #define FTN_GET_PROC_BIND                    OMP_GET_PROC_BIND_
+//  #define FTN_CURR_PROC_BIND                   OMP_CURR_PROC_BIND_
+#if OMP_40_ENABLED
+    #define FTN_GET_NUM_TEAMS                    OMP_GET_NUM_TEAMS_
+    #define FTN_GET_TEAM_NUM                     OMP_GET_TEAM_NUM_
+#endif
+    #define FTN_INIT_LOCK                        OMP_INIT_LOCK_
+#if KMP_USE_DYNAMIC_LOCK
+    #define FTN_INIT_LOCK_HINTED                 KMP_INIT_LOCK_HINTED_
+    #define FTN_INIT_NEST_LOCK_HINTED            KMP_INIT_NEST_LOCK_HINTED_
+#endif
+    #define FTN_DESTROY_LOCK                     OMP_DESTROY_LOCK_
+    #define FTN_SET_LOCK                         OMP_SET_LOCK_
+    #define FTN_UNSET_LOCK                       OMP_UNSET_LOCK_
+    #define FTN_TEST_LOCK                        OMP_TEST_LOCK_
+    #define FTN_INIT_NEST_LOCK                   OMP_INIT_NEST_LOCK_
+    #define FTN_DESTROY_NEST_LOCK                OMP_DESTROY_NEST_LOCK_
+    #define FTN_SET_NEST_LOCK                    OMP_SET_NEST_LOCK_
+    #define FTN_UNSET_NEST_LOCK                  OMP_UNSET_NEST_LOCK_
+    #define FTN_TEST_NEST_LOCK                   OMP_TEST_NEST_LOCK_
+
+    #define FTN_SET_WARNINGS_ON                  KMP_SET_WARNINGS_ON_
+    #define FTN_SET_WARNINGS_OFF                 KMP_SET_WARNINGS_OFF_
+
+    #define FTN_GET_WTIME                        OMP_GET_WTIME_
+    #define FTN_GET_WTICK                        OMP_GET_WTICK_
+
+#if OMP_40_ENABLED
+#if KMP_MIC || KMP_OS_DARWIN
+    #define FTN_GET_DEFAULT_DEVICE               OMP_GET_DEFAULT_DEVICE_
+    #define FTN_SET_DEFAULT_DEVICE               OMP_SET_DEFAULT_DEVICE_
+    #define FTN_GET_NUM_DEVICES                  OMP_GET_NUM_DEVICES_
+#endif
+    #define FTN_IS_INITIAL_DEVICE                OMP_IS_INITIAL_DEVICE_
+#endif
+
+
+#if OMP_40_ENABLED
+    #define FTN_GET_CANCELLATION                 OMP_GET_CANCELLATION_
+    #define FTN_GET_CANCELLATION_STATUS          KMP_GET_CANCELLATION_STATUS_
+#endif
+
+#endif /* KMP_FTN_UAPPEND */
+
+/* ------------------------------------------------------------------ */
+/* -------------------------- GOMP API NAMES ------------------------ */
+// All GOMP_1.0 symbols
+#define KMP_API_NAME_GOMP_ATOMIC_END                   GOMP_atomic_end
+#define KMP_API_NAME_GOMP_ATOMIC_START                 GOMP_atomic_start
+#define KMP_API_NAME_GOMP_BARRIER                      GOMP_barrier
+#define KMP_API_NAME_GOMP_CRITICAL_END                 GOMP_critical_end
+#define KMP_API_NAME_GOMP_CRITICAL_NAME_END            GOMP_critical_name_end
+#define KMP_API_NAME_GOMP_CRITICAL_NAME_START          GOMP_critical_name_start
+#define KMP_API_NAME_GOMP_CRITICAL_START               GOMP_critical_start
+#define KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT            GOMP_loop_dynamic_next
+#define KMP_API_NAME_GOMP_LOOP_DYNAMIC_START           GOMP_loop_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_END                     GOMP_loop_end
+#define KMP_API_NAME_GOMP_LOOP_END_NOWAIT              GOMP_loop_end_nowait
+#define KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT             GOMP_loop_guided_next
+#define KMP_API_NAME_GOMP_LOOP_GUIDED_START            GOMP_loop_guided_start
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT    GOMP_loop_ordered_dynamic_next
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START   GOMP_loop_ordered_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT     GOMP_loop_ordered_guided_next
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START    GOMP_loop_ordered_guided_start
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT    GOMP_loop_ordered_runtime_next
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START   GOMP_loop_ordered_runtime_start
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT     GOMP_loop_ordered_static_next
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START    GOMP_loop_ordered_static_start
+#define KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT            GOMP_loop_runtime_next
+#define KMP_API_NAME_GOMP_LOOP_RUNTIME_START           GOMP_loop_runtime_start
+#define KMP_API_NAME_GOMP_LOOP_STATIC_NEXT             GOMP_loop_static_next
+#define KMP_API_NAME_GOMP_LOOP_STATIC_START            GOMP_loop_static_start
+#define KMP_API_NAME_GOMP_ORDERED_END                  GOMP_ordered_end
+#define KMP_API_NAME_GOMP_ORDERED_START                GOMP_ordered_start
+#define KMP_API_NAME_GOMP_PARALLEL_END                 GOMP_parallel_end
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START  GOMP_parallel_loop_dynamic_start
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START   GOMP_parallel_loop_guided_start
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START  GOMP_parallel_loop_runtime_start
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START   GOMP_parallel_loop_static_start
+#define KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START      GOMP_parallel_sections_start
+#define KMP_API_NAME_GOMP_PARALLEL_START               GOMP_parallel_start
+#define KMP_API_NAME_GOMP_SECTIONS_END                 GOMP_sections_end
+#define KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT          GOMP_sections_end_nowait
+#define KMP_API_NAME_GOMP_SECTIONS_NEXT                GOMP_sections_next
+#define KMP_API_NAME_GOMP_SECTIONS_START               GOMP_sections_start
+#define KMP_API_NAME_GOMP_SINGLE_COPY_END              GOMP_single_copy_end
+#define KMP_API_NAME_GOMP_SINGLE_COPY_START            GOMP_single_copy_start
+#define KMP_API_NAME_GOMP_SINGLE_START                 GOMP_single_start
+
+// All GOMP_2.0 symbols
+#define KMP_API_NAME_GOMP_TASK                           GOMP_task
+#define KMP_API_NAME_GOMP_TASKWAIT                       GOMP_taskwait
+#define KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT          GOMP_loop_ull_dynamic_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START         GOMP_loop_ull_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT           GOMP_loop_ull_guided_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START          GOMP_loop_ull_guided_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT  GOMP_loop_ull_ordered_dynamic_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START GOMP_loop_ull_ordered_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT   GOMP_loop_ull_ordered_guided_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START  GOMP_loop_ull_ordered_guided_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT  GOMP_loop_ull_ordered_runtime_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START GOMP_loop_ull_ordered_runtime_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT   GOMP_loop_ull_ordered_static_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START  GOMP_loop_ull_ordered_static_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT          GOMP_loop_ull_runtime_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START         GOMP_loop_ull_runtime_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT           GOMP_loop_ull_static_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START          GOMP_loop_ull_static_start
+
+// All GOMP_3.0 symbols
+#define KMP_API_NAME_GOMP_TASKYIELD                      GOMP_taskyield
+
+// All GOMP_4.0 symbols 
+// TODO: As of 2013-10-14, none of the GOMP_4.0 functions are implemented in libomp
+#define KMP_API_NAME_GOMP_BARRIER_CANCEL                 GOMP_barrier_cancel
+#define KMP_API_NAME_GOMP_CANCEL                         GOMP_cancel
+#define KMP_API_NAME_GOMP_CANCELLATION_POINT             GOMP_cancellation_point
+#define KMP_API_NAME_GOMP_LOOP_END_CANCEL                GOMP_loop_end_cancel
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC          GOMP_parallel_loop_dynamic
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED           GOMP_parallel_loop_guided
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME          GOMP_parallel_loop_runtime
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC           GOMP_parallel_loop_static
+#define KMP_API_NAME_GOMP_PARALLEL_SECTIONS              GOMP_parallel_sections
+#define KMP_API_NAME_GOMP_PARALLEL                       GOMP_parallel
+#define KMP_API_NAME_GOMP_SECTIONS_END_CANCEL            GOMP_sections_end_cancel
+#define KMP_API_NAME_GOMP_TASKGROUP_START                GOMP_taskgroup_start
+#define KMP_API_NAME_GOMP_TASKGROUP_END                  GOMP_taskgroup_end
+/* Target functions should be taken care of by liboffload */
+#define KMP_API_NAME_GOMP_TARGET                         GOMP_target
+#define KMP_API_NAME_GOMP_TARGET_DATA                    GOMP_target_data
+#define KMP_API_NAME_GOMP_TARGET_END_DATA                GOMP_target_end_data
+#define KMP_API_NAME_GOMP_TARGET_UPDATE                  GOMP_target_update
+#define KMP_API_NAME_GOMP_TEAMS                          GOMP_teams
+
+#ifdef KMP_USE_VERSION_SYMBOLS
+    #define xstr(x) str(x)
+    #define str(x) #x
+
+    // If Linux, xexpand prepends __kmp_api_ to the real API name
+    #define xexpand(api_name) expand(api_name)
+    #define expand(api_name) __kmp_api_##api_name
+
+    #define xaliasify(api_name,ver) aliasify(api_name,ver)
+    #define aliasify(api_name,ver) __typeof__(__kmp_api_##api_name) __kmp_api_##api_name##_##ver##_alias __attribute__((alias(xstr(__kmp_api_##api_name))))
+
+    #define xversionify(api_name, version_num, version_str) versionify(api_name, version_num, version_str, "VERSION")
+    #define versionify(api_name, version_num, version_str, default_ver) \
+    __asm__(".symver " xstr(__kmp_api_##api_name##_##version_num##_alias) "," xstr(api_name) "@" version_str "\n\t"); \
+    __asm__(".symver " xstr(__kmp_api_##api_name) "," xstr(api_name) "@@" default_ver "\n\t")
+
+#else // KMP_USE_VERSION_SYMBOLS
+    #define xstr(x) /* Nothing */
+    #define str(x)  /* Nothing */
+
+    // If Windows or Mac, xexpand does no name transformation
+    #define xexpand(api_name) expand(api_name)
+    #define expand(api_name) api_name
+
+    #define xaliasify(api_name,ver) /* Nothing */
+    #define aliasify(api_name,ver)  /* Nothing */
+
+    #define xversionify(api_name, version_num, version_str) /* Nothing */
+    #define versionify(api_name, version_num, version_str, default_ver) /* Nothing */
+
+#endif // KMP_USE_VERSION_SYMBOLS
+
+#endif /* KMP_FTN_OS_H */
+
diff --git a/final/runtime/src/kmp_ftn_stdcall.c b/final/runtime/src/kmp_ftn_stdcall.c
new file mode 100644
index 0000000..cf70d74
--- /dev/null
+++ b/final/runtime/src/kmp_ftn_stdcall.c
@@ -0,0 +1,35 @@
+/*
+ * kmp_ftn_stdcall.c -- Fortran __stdcall linkage support for OpenMP.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+
+// Note: This string is not printed when KMP_VERSION=1.
+char const __kmp_version_ftnstdcall[] = KMP_VERSION_PREFIX "Fortran __stdcall OMP support: "
+#ifdef USE_FTN_STDCALL
+    "yes";
+#else
+    "no";
+#endif
+
+#ifdef USE_FTN_STDCALL
+
+#define FTN_STDCALL     KMP_STDCALL
+#define KMP_FTN_ENTRIES USE_FTN_STDCALL
+
+#include "kmp_ftn_os.h"
+#include "kmp_ftn_entry.h"
+
+#endif /* USE_FTN_STDCALL */
+
diff --git a/final/runtime/src/kmp_global.c b/final/runtime/src/kmp_global.c
new file mode 100644
index 0000000..49ddbd4
--- /dev/null
+++ b/final/runtime/src/kmp_global.c
@@ -0,0 +1,471 @@
+/*
+ * kmp_global.c -- KPTS global variables for runtime support library
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+
+#ifdef KMP_SETVERSION
+char __kmp_setversion_string[] = VERSION_STRING;
+#endif
+
+kmp_key_t __kmp_gtid_threadprivate_key;
+
+kmp_cpuinfo_t   __kmp_cpuinfo = { 0 }; // Not initialized
+
+#if KMP_STATS_ENABLED
+#include "kmp_stats.h"
+// lock for modifying the global __kmp_stats_list
+kmp_tas_lock_t __kmp_stats_lock = KMP_TAS_LOCK_INITIALIZER(__kmp_stats_lock);
+
+// global list of per-thread stats; the head is a sentinel node which accumulates all stats produced before __kmp_create_worker is called.
+kmp_stats_list __kmp_stats_list;
+
+// thread local pointer to stats node within list
+__thread kmp_stats_list* __kmp_stats_thread_ptr = &__kmp_stats_list;
+
+// gives reference tick for all events (considered the 0 tick)
+tsc_tick_count __kmp_stats_start_time;
+#endif
+
+/* ----------------------------------------------------- */
+/* INITIALIZATION VARIABLES */
+/* they are synchronized for writing during init, but read anytime */
+volatile int        __kmp_init_serial     = FALSE;
+volatile int        __kmp_init_gtid       = FALSE;
+volatile int        __kmp_init_common     = FALSE;
+volatile int        __kmp_init_middle     = FALSE;
+volatile int        __kmp_init_parallel   = FALSE;
+volatile int        __kmp_init_monitor    = 0;  /* 1 - launched, 2 - actually started (Windows* OS only) */
+volatile int        __kmp_init_user_locks = FALSE;
+
+/* list of address of allocated caches for commons */
+kmp_cached_addr_t  *__kmp_threadpriv_cache_list = NULL;
+
+int                 __kmp_init_counter  = 0;
+int                 __kmp_root_counter  = 0;
+int                 __kmp_version       = 0;
+
+volatile kmp_uint32 __kmp_team_counter  = 0;
+volatile kmp_uint32 __kmp_task_counter  = 0;
+
+unsigned int __kmp_init_wait = KMP_DEFAULT_INIT_WAIT;   /* initial number of spin-tests   */
+unsigned int __kmp_next_wait = KMP_DEFAULT_NEXT_WAIT;   /* subsequent number of spin-tests */
+
+size_t      __kmp_stksize         = KMP_DEFAULT_STKSIZE;
+size_t      __kmp_monitor_stksize = 0;  // auto adjust
+size_t      __kmp_stkoffset       = KMP_DEFAULT_STKOFFSET;
+int         __kmp_stkpadding      = KMP_MIN_STKPADDING;
+
+size_t    __kmp_malloc_pool_incr  = KMP_DEFAULT_MALLOC_POOL_INCR;
+
+/* Barrier method defaults, settings, and strings */
+/* branch factor = 2^branch_bits (only relevant for tree and hyper barrier types) */
+#if KMP_ARCH_X86_64
+kmp_uint32 __kmp_barrier_gather_bb_dflt      = 2;  /* branch_factor = 4 */ /* hyper2: C78980 */
+kmp_uint32 __kmp_barrier_release_bb_dflt     = 2;  /* branch_factor = 4 */ /* hyper2: C78980 */
+#else
+kmp_uint32 __kmp_barrier_gather_bb_dflt      = 2;  /* branch_factor = 4 */ /* communication in core for MIC */
+kmp_uint32 __kmp_barrier_release_bb_dflt     = 2;  /* branch_factor = 4 */ /* communication in core for MIC */
+#endif // KMP_ARCH_X86_64
+#if KMP_ARCH_X86_64
+kmp_bar_pat_e __kmp_barrier_gather_pat_dflt  = bp_hyper_bar;  /* hyper2: C78980 */
+kmp_bar_pat_e __kmp_barrier_release_pat_dflt = bp_hyper_bar;  /* hyper2: C78980 */
+#else
+kmp_bar_pat_e __kmp_barrier_gather_pat_dflt  = bp_linear_bar;
+kmp_bar_pat_e __kmp_barrier_release_pat_dflt = bp_linear_bar;
+#endif
+kmp_uint32 __kmp_barrier_gather_branch_bits  [ bs_last_barrier ] = { 0 };
+kmp_uint32 __kmp_barrier_release_branch_bits [ bs_last_barrier ] = { 0 };
+kmp_bar_pat_e __kmp_barrier_gather_pattern   [ bs_last_barrier ] = { bp_linear_bar };
+kmp_bar_pat_e __kmp_barrier_release_pattern  [ bs_last_barrier ] = { bp_linear_bar };
+char const *__kmp_barrier_branch_bit_env_name [ bs_last_barrier ] =
+                            { "KMP_PLAIN_BARRIER", "KMP_FORKJOIN_BARRIER"
+                                #if KMP_FAST_REDUCTION_BARRIER
+                                    , "KMP_REDUCTION_BARRIER"
+                                #endif // KMP_FAST_REDUCTION_BARRIER
+                            };
+char const *__kmp_barrier_pattern_env_name    [ bs_last_barrier ] =
+                            { "KMP_PLAIN_BARRIER_PATTERN", "KMP_FORKJOIN_BARRIER_PATTERN"
+                                #if KMP_FAST_REDUCTION_BARRIER
+                                    , "KMP_REDUCTION_BARRIER_PATTERN"
+                                #endif // KMP_FAST_REDUCTION_BARRIER
+                            };
+char const *__kmp_barrier_type_name           [ bs_last_barrier ] =
+                            { "plain", "forkjoin"
+                                #if KMP_FAST_REDUCTION_BARRIER
+                                    , "reduction"
+                                #endif // KMP_FAST_REDUCTION_BARRIER
+                            };
+char const *__kmp_barrier_pattern_name [ bp_last_bar ] = { "linear", "tree", "hyper", "hierarchical" };
+
+
+int       __kmp_allThreadsSpecified = 0;
+size_t    __kmp_align_alloc = CACHE_LINE;
+
+
+int     __kmp_generate_warnings = kmp_warnings_low;
+int          __kmp_reserve_warn = 0;
+int                 __kmp_xproc = 0;
+int            __kmp_avail_proc = 0;
+size_t    __kmp_sys_min_stksize = KMP_MIN_STKSIZE;
+int           __kmp_sys_max_nth = KMP_MAX_NTH;
+int               __kmp_max_nth = 0;
+int      __kmp_threads_capacity = 0;
+int         __kmp_dflt_team_nth = 0;
+int      __kmp_dflt_team_nth_ub = 0;
+int           __kmp_tp_capacity = 0;
+int             __kmp_tp_cached = 0;
+int           __kmp_dflt_nested = FALSE;
+int __kmp_dflt_max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; /* max_active_levels limit */
+#if KMP_NESTED_HOT_TEAMS
+int __kmp_hot_teams_mode         = 0; /* 0 - free extra threads when reduced */
+                                      /* 1 - keep extra threads when reduced */
+int __kmp_hot_teams_max_level    = 1; /* nesting level of hot teams */
+#endif
+enum library_type __kmp_library = library_none;
+enum sched_type     __kmp_sched = kmp_sch_default;  /* scheduling method for runtime scheduling */
+enum sched_type    __kmp_static = kmp_sch_static_greedy; /* default static scheduling method */
+enum sched_type    __kmp_guided = kmp_sch_guided_iterative_chunked; /* default guided scheduling method */
+enum sched_type      __kmp_auto = kmp_sch_guided_analytical_chunked; /* default auto scheduling method */
+int        __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
+int       __kmp_monitor_wakeups = KMP_MIN_MONITOR_WAKEUPS;
+int          __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( KMP_DEFAULT_BLOCKTIME, KMP_MIN_MONITOR_WAKEUPS );
+#ifdef KMP_ADJUST_BLOCKTIME
+int               __kmp_zero_bt = FALSE;
+#endif /* KMP_ADJUST_BLOCKTIME */
+#ifdef KMP_DFLT_NTH_CORES
+int                __kmp_ncores = 0;
+#endif
+int                 __kmp_chunk = 0;
+int           __kmp_abort_delay = 0;
+#if KMP_OS_LINUX && defined(KMP_TDATA_GTID)
+int             __kmp_gtid_mode = 3; /* use __declspec(thread) TLS to store gtid */
+int      __kmp_adjust_gtid_mode = FALSE;
+#elif KMP_OS_WINDOWS
+int             __kmp_gtid_mode = 2; /* use TLS functions to store gtid */
+int      __kmp_adjust_gtid_mode = FALSE;
+#else
+int             __kmp_gtid_mode = 0; /* select method to get gtid based on #threads */
+int      __kmp_adjust_gtid_mode = TRUE;
+#endif /* KMP_OS_LINUX && defined(KMP_TDATA_GTID) */
+#ifdef KMP_TDATA_GTID
+#if KMP_OS_WINDOWS
+__declspec(thread) int __kmp_gtid = KMP_GTID_DNE;
+#else
+__thread int __kmp_gtid = KMP_GTID_DNE;
+#endif /* KMP_OS_WINDOWS - workaround because Intel(R) Many Integrated Core compiler 20110316 doesn't accept __declspec */
+#endif /* KMP_TDATA_GTID */
+int          __kmp_tls_gtid_min = INT_MAX;
+int            __kmp_foreign_tp = TRUE;
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+int    __kmp_inherit_fp_control = TRUE;
+kmp_int16  __kmp_init_x87_fpu_control_word = 0;
+kmp_uint32     __kmp_init_mxcsr = 0;
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#ifdef USE_LOAD_BALANCE
+double  __kmp_load_balance_interval   = 1.0;
+#endif /* USE_LOAD_BALANCE */
+
+kmp_nested_nthreads_t __kmp_nested_nth  = { NULL, 0, 0 };
+
+#if KMP_USE_ADAPTIVE_LOCKS
+
+kmp_adaptive_backoff_params_t __kmp_adaptive_backoff_params = { 1, 1024 }; // TODO: tune it!
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+char * __kmp_speculative_statsfile = "-";
+#endif
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+#if OMP_40_ENABLED
+int __kmp_display_env           = FALSE;
+int __kmp_display_env_verbose   = FALSE;
+int __kmp_omp_cancellation      = FALSE;
+#endif
+
+/* map OMP 3.0 schedule types with our internal schedule types */
+enum sched_type __kmp_sch_map[ kmp_sched_upper - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ] = {
+    kmp_sch_static_chunked,     // ==> kmp_sched_static            = 1
+    kmp_sch_dynamic_chunked,    // ==> kmp_sched_dynamic           = 2
+    kmp_sch_guided_chunked,     // ==> kmp_sched_guided            = 3
+    kmp_sch_auto,               // ==> kmp_sched_auto              = 4
+    kmp_sch_trapezoidal         // ==> kmp_sched_trapezoidal       = 101
+                                // will likely not be used; introduced here just to debug the code
+                                // of the public Intel extension schedules
+};
+
+#if KMP_OS_LINUX
+enum clock_function_type __kmp_clock_function;
+int __kmp_clock_function_param;
+#endif /* KMP_OS_LINUX */
+
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+enum mic_type __kmp_mic_type = non_mic;
+#endif
+
+#if KMP_AFFINITY_SUPPORTED
+
+# if KMP_GROUP_AFFINITY
+
+int __kmp_num_proc_groups = 1;
+
+kmp_GetActiveProcessorCount_t __kmp_GetActiveProcessorCount = NULL;
+kmp_GetActiveProcessorGroupCount_t __kmp_GetActiveProcessorGroupCount = NULL;
+kmp_GetThreadGroupAffinity_t __kmp_GetThreadGroupAffinity = NULL;
+kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity = NULL;
+
+# endif /* KMP_GROUP_AFFINITY */
+
+size_t   __kmp_affin_mask_size = 0;
+enum affinity_type __kmp_affinity_type = affinity_default;
+enum affinity_gran __kmp_affinity_gran = affinity_gran_default;
+int __kmp_affinity_gran_levels  = -1;
+int __kmp_affinity_dups = TRUE;
+enum affinity_top_method __kmp_affinity_top_method = affinity_top_method_default;
+int      __kmp_affinity_compact      = 0;
+int      __kmp_affinity_offset       = 0;
+int      __kmp_affinity_verbose      = FALSE;
+int      __kmp_affinity_warnings     = TRUE;
+int      __kmp_affinity_respect_mask = affinity_respect_mask_default;
+char *   __kmp_affinity_proclist     = NULL;
+kmp_affin_mask_t *__kmp_affinity_masks = NULL;
+unsigned __kmp_affinity_num_masks    = 0;
+
+char const *  __kmp_cpuinfo_file     = NULL;
+
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+#if OMP_40_ENABLED
+kmp_nested_proc_bind_t __kmp_nested_proc_bind = { NULL, 0, 0 };
+int __kmp_affinity_num_places = 0;
+#endif
+
+int __kmp_place_num_cores = 0;
+int __kmp_place_num_threads_per_core = 0;
+int __kmp_place_core_offset = 0;
+
+kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams;
+
+/* This check ensures that the compiler is passing the correct data type
+ * for the flags formal parameter of the function kmpc_omp_task_alloc().
+ * If the type is not a 4-byte type, then give an error message about
+ * a non-positive length array pointing here.  If that happens, the
+ * kmp_tasking_flags_t structure must be redefined to have exactly 32 bits.
+ */
+KMP_BUILD_ASSERT( sizeof(kmp_tasking_flags_t) == 4 );
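+// A minimal sketch of the build-assert technique (the real KMP_BUILD_ASSERT
+// is defined elsewhere in the runtime; the names here are hypothetical):
+//     #define BUILD_ASSERT_SKETCH( cond ) \
+//         typedef char build_assert_failed[ (cond) ? 1 : -1 ]
+// A false condition yields a negative array length, producing exactly the
+// kind of diagnostic the comment above describes.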
+
+kmp_int32 __kmp_task_stealing_constraint = 1;   /* Constrain task stealing by default */
+
+#ifdef DEBUG_SUSPEND
+int         __kmp_suspend_count = 0;
+#endif
+
+int     __kmp_settings = FALSE;
+int     __kmp_duplicate_library_ok = 0;
+#if USE_ITT_BUILD
+int     __kmp_forkjoin_frames = 1;
+int     __kmp_forkjoin_frames_mode = 3;
+#endif
+PACKED_REDUCTION_METHOD_T __kmp_force_reduction_method = reduction_method_not_defined;
+int     __kmp_determ_red = FALSE;
+
+#ifdef KMP_DEBUG
+int     kmp_a_debug = 0;
+int     kmp_b_debug = 0;
+int     kmp_c_debug = 0;
+int     kmp_d_debug = 0;
+int     kmp_e_debug = 0;
+int     kmp_f_debug = 0;
+int     kmp_diag    = 0;
+#endif
+
+/* For debug information logging using rotating buffer */
+int     __kmp_debug_buf = FALSE;        /* TRUE means use buffer, FALSE means print to stderr */
+int     __kmp_debug_buf_lines = KMP_DEBUG_BUF_LINES_INIT; /* Lines of debug stored in buffer */
+int     __kmp_debug_buf_chars = KMP_DEBUG_BUF_CHARS_INIT; /* Characters allowed per line in buffer */
+int     __kmp_debug_buf_atomic = FALSE; /* TRUE means use atomic update of buffer entry pointer */
+
+char   *__kmp_debug_buffer = NULL;      /* Debug buffer itself */
+int     __kmp_debug_count = 0;          /* Counter for number of lines printed in buffer so far */
+int     __kmp_debug_buf_warn_chars = 0; /* Keep track of char increase recommended in warnings */
+/* end rotating debug buffer */
+
+#ifdef KMP_DEBUG
+int     __kmp_par_range;           /* +1 => only go par for constructs in range */
+                                   /* -1 => only go par for constructs outside range */
+char    __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN] = { '\0' };
+char    __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN] = { '\0' };
+int     __kmp_par_range_lb = 0;
+int     __kmp_par_range_ub = INT_MAX;
+#endif /* KMP_DEBUG */
+
+/* For printing out dynamic storage map for threads and teams */
+int     __kmp_storage_map = FALSE;         /* True means print storage map for threads and teams */
+int     __kmp_storage_map_verbose = FALSE; /* True means storage map includes placement info */
+int     __kmp_storage_map_verbose_specified = FALSE;
+/* Initialize the library data structures when we fork a child process, defaults to TRUE */
+int     __kmp_need_register_atfork = TRUE; /* At initialization, call pthread_atfork to install fork handler */
+int     __kmp_need_register_atfork_specified = TRUE;
+
+
+int        __kmp_env_chunk       = FALSE;  /* KMP_CHUNK specified?     */
+int        __kmp_env_stksize     = FALSE;  /* KMP_STACKSIZE specified? */
+int        __kmp_env_omp_stksize = FALSE;  /* OMP_STACKSIZE specified? */
+int        __kmp_env_all_threads     = FALSE;  /* KMP_ALL_THREADS or KMP_MAX_THREADS specified? */
+int        __kmp_env_omp_all_threads = FALSE;  /* OMP_THREAD_LIMIT specified? */
+int        __kmp_env_blocktime   = FALSE;  /* KMP_BLOCKTIME specified? */
+int        __kmp_env_checks      = FALSE;  /* KMP_CHECKS specified?    */
+int        __kmp_env_consistency_check  = FALSE;  /* KMP_CONSISTENCY_CHECK specified? */
+
+kmp_uint32 __kmp_yield_init = KMP_INIT_WAIT;
+kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT;
+kmp_uint32 __kmp_yielding_on = 1;
+#if KMP_OS_CNK
+kmp_uint32 __kmp_yield_cycle = 0;
+#else
+kmp_uint32 __kmp_yield_cycle = 1;     /* Yield-cycle is on by default */
+#endif
+kmp_int32  __kmp_yield_on_count = 10; /* By default, yielding is on for 10 monitor periods. */
+kmp_int32  __kmp_yield_off_count = 1; /* By default, yielding is off for 1 monitor period. */
+/* ----------------------------------------------------- */
+
+
+/* ------------------------------------------------------ */
+/* STATE mostly synchronized with global lock */
+/* data written to rarely by masters, read often by workers */
+/*
+ * TODO:  None of this global padding stuff works consistently because
+ * the order of declaration is not necessarily correlated to storage order.
+ * To fix this, all the important globals must be put in a big structure
+ * instead.
+ */
+KMP_ALIGN_CACHE
+         kmp_info_t **__kmp_threads     = NULL;
+         kmp_root_t **__kmp_root        = NULL;
+
+/* data read/written to often by masters */
+KMP_ALIGN_CACHE
+volatile int          __kmp_nth                    = 0;
+volatile int          __kmp_all_nth                = 0;
+int                   __kmp_thread_pool_nth        = 0;
+volatile kmp_info_t  *__kmp_thread_pool            = NULL;
+volatile kmp_team_t  *__kmp_team_pool              = NULL;
+
+KMP_ALIGN_CACHE
+volatile int          __kmp_thread_pool_active_nth = 0;
+
+/* -------------------------------------------------
+ * GLOBAL/ROOT STATE */
+KMP_ALIGN_CACHE
+kmp_global_t __kmp_global = {{ 0 }};
+
+/* ----------------------------------------------- */
+/* GLOBAL SYNCHRONIZATION LOCKS */
+/* TODO verify the need for these locks and if they need to be global */
+
+#if KMP_USE_INTERNODE_ALIGNMENT
+/* Multinode systems have larger cache line granularity which can cause
+ * false sharing if the alignment is not large enough for these locks */
+KMP_ALIGN_CACHE_INTERNODE
+
+kmp_bootstrap_lock_t __kmp_initz_lock   = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_initz_lock   ); /* Control initializations */
+KMP_ALIGN_CACHE_INTERNODE
+kmp_bootstrap_lock_t __kmp_forkjoin_lock; /* control fork/join access */
+KMP_ALIGN_CACHE_INTERNODE
+kmp_bootstrap_lock_t __kmp_exit_lock;   /* exit() is not always thread-safe */
+KMP_ALIGN_CACHE_INTERNODE
+kmp_bootstrap_lock_t __kmp_monitor_lock; /* control monitor thread creation */
+KMP_ALIGN_CACHE_INTERNODE
+kmp_bootstrap_lock_t __kmp_tp_cached_lock; /* used for the hack to allow threadprivate cache and __kmp_threads expansion to co-exist */
+
+KMP_ALIGN_CACHE_INTERNODE
+kmp_lock_t __kmp_global_lock;           /* Control OS/global access */
+KMP_ALIGN_CACHE_INTERNODE
+kmp_queuing_lock_t __kmp_dispatch_lock;         /* Control dispatch access  */
+KMP_ALIGN_CACHE_INTERNODE
+kmp_lock_t __kmp_debug_lock;            /* Control I/O access for KMP_DEBUG */
+#else
+KMP_ALIGN_CACHE
+
+kmp_bootstrap_lock_t __kmp_initz_lock   = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_initz_lock   ); /* Control initializations */
+kmp_bootstrap_lock_t __kmp_forkjoin_lock; /* control fork/join access */
+kmp_bootstrap_lock_t __kmp_exit_lock;   /* exit() is not always thread-safe */
+kmp_bootstrap_lock_t __kmp_monitor_lock; /* control monitor thread creation */
+kmp_bootstrap_lock_t __kmp_tp_cached_lock; /* used for the hack to allow threadprivate cache and __kmp_threads expansion to co-exist */
+
+KMP_ALIGN(128)
+kmp_lock_t __kmp_global_lock;           /* Control OS/global access */
+KMP_ALIGN(128)
+kmp_queuing_lock_t __kmp_dispatch_lock;         /* Control dispatch access  */
+KMP_ALIGN(128)
+kmp_lock_t __kmp_debug_lock;            /* Control I/O access for KMP_DEBUG */
+#endif
+
+/* ----------------------------------------------- */
+
+#if KMP_HANDLE_SIGNALS
+    /*
+        Signal handling is disabled by default, because it confuses users: in case of SIGSEGV
+        (or other trouble) in user code, the signal handler catches the signal, which then "appears"
+        in the monitor thread (when the monitor executes the raise() function). Users see the signal
+        in the monitor thread and blame the OpenMP RTL.
+
+        Grant said signal handling was required on some older OSes (Irix?) supported by KAI, because
+        bad applications hung but did not abort. Currently it is not a problem for Linux* OS, OS X*,
+        and Windows* OS.
+
+        Grant: Found new hangs for EL4, EL5, and a Fedora Core machine.  So I'm putting
+        the default back for now to see if that fixes hangs on those machines.
+
+        2010-04-13 Lev: It was a bug in the Fortran RTL. The Fortran RTL prints a kind of stack
+        backtrace when a program is aborting, but the code is not signal-safe. When multiple signals
+        are raised at the same time (which occurs in dynamic negative tests because all the worker
+        threads detect the same error), the Fortran RTL may hang. The bug was finally fixed in the
+        Fortran RTL library provided by Steve R., and will be available soon.
+    */
+    int __kmp_handle_signals = FALSE;
+#endif
+
+/* ----------------------------------------------- */
+#ifdef BUILD_TV
+kmp_key_t __kmp_tv_key = 0;
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#ifdef DEBUG_SUSPEND
+int
+get_suspend_count_( void ) {
+    int count = __kmp_suspend_count;
+    __kmp_suspend_count = 0;
+    return count;
+}
+void
+set_suspend_count_( int * value ) {
+    __kmp_suspend_count = *value;
+}
+#endif
+
+// Symbols used to detect at link time that exactly one OpenMP library is
+// linked (mutual detection with the Microsoft OpenMP library on Windows).
+int _You_must_link_with_exactly_one_OpenMP_library = 1;
+int _You_must_link_with_Intel_OpenMP_library       = 1;
+#if KMP_OS_WINDOWS && ( KMP_VERSION_MAJOR > 4 )
+    int _You_must_link_with_Microsoft_OpenMP_library = 1;
+#endif
+
+// end of file //
diff --git a/final/runtime/src/kmp_gsupport.c b/final/runtime/src/kmp_gsupport.c
new file mode 100644
index 0000000..9397e6f
--- /dev/null
+++ b/final/runtime/src/kmp_gsupport.c
@@ -0,0 +1,1533 @@
+/*
+ * kmp_gsupport.c
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#if defined(__x86_64) || defined (__powerpc64__) || defined(__aarch64__)
+# define KMP_I8
+#endif
+#include "kmp.h"
+#include "kmp_atomic.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+#ifdef __cplusplus
+    extern "C" {
+#endif // __cplusplus
+
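+// MKLOC builds a static ident_t source-location record for the named routine;
+// the GOMP entry points carry no real source information, hence the
+// ";unknown;unknown;0;0;;" placeholder location string.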
+#define MKLOC(loc,routine) \
+    static ident_t (loc) = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;" };
+
+#include "kmp_ftn_os.h"
+
+void
+xexpand(KMP_API_NAME_GOMP_BARRIER)(void)
+{
+    int gtid = __kmp_entry_gtid();
+    MKLOC(loc, "GOMP_barrier");
+    KA_TRACE(20, ("GOMP_barrier: T#%d\n", gtid));
+    __kmpc_barrier(&loc, gtid);
+}
+
+
+//
+// Mutual exclusion
+//
+
+//
+// The symbol that icc/ifort generates for unnamed critical sections
+// - .gomp_critical_user_ - is defined using .comm in any object that
+// references it.  We can't reference it directly here in C code, as the
+// symbol contains a ".".
+//
+// The RTL contains an assembly language definition of .gomp_critical_user_
+// with another symbol __kmp_unnamed_critical_addr initialized with its
+// address.
+//
+extern kmp_critical_name *__kmp_unnamed_critical_addr;
+
+
+void
+xexpand(KMP_API_NAME_GOMP_CRITICAL_START)(void)
+{
+    int gtid = __kmp_entry_gtid();
+    MKLOC(loc, "GOMP_critical_start");
+    KA_TRACE(20, ("GOMP_critical_start: T#%d\n", gtid));
+    __kmpc_critical(&loc, gtid, __kmp_unnamed_critical_addr);
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_CRITICAL_END)(void)
+{
+    int gtid = __kmp_get_gtid();
+    MKLOC(loc, "GOMP_critical_end");
+    KA_TRACE(20, ("GOMP_critical_end: T#%d\n", gtid));
+    __kmpc_end_critical(&loc, gtid, __kmp_unnamed_critical_addr);
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_CRITICAL_NAME_START)(void **pptr)
+{
+    int gtid = __kmp_entry_gtid();
+    MKLOC(loc, "GOMP_critical_name_start");
+    KA_TRACE(20, ("GOMP_critical_name_start: T#%d\n", gtid));
+    __kmpc_critical(&loc, gtid, (kmp_critical_name *)pptr);
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_CRITICAL_NAME_END)(void **pptr)
+{
+    int gtid = __kmp_get_gtid();
+    MKLOC(loc, "GOMP_critical_name_end");
+    KA_TRACE(20, ("GOMP_critical_name_end: T#%d\n", gtid));
+    __kmpc_end_critical(&loc, gtid, (kmp_critical_name *)pptr);
+}
+
+
+//
+// The Gnu codegen tries to use locked operations to perform atomic updates
+// inline.  If it can't, then it calls GOMP_atomic_start() before performing
+// the update and GOMP_atomic_end() afterward, regardless of the data type.
+//
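+//
+// Illustrative sketch (not part of this interface): for
+//     #pragma omp atomic
+//     x += expr;
+// gcc emits a lock-prefixed instruction when it can, and otherwise falls
+// back to
+//     GOMP_atomic_start(); x += expr; GOMP_atomic_end();
+// which serializes on the single global __kmp_atomic_lock acquired below.
+//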
+
+void
+xexpand(KMP_API_NAME_GOMP_ATOMIC_START)(void)
+{
+    int gtid = __kmp_entry_gtid();
+    KA_TRACE(20, ("GOMP_atomic_start: T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+    __ompt_thread_assign_wait_id(0);
+#endif
+
+    __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid);
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_ATOMIC_END)(void)
+{
+    int gtid = __kmp_get_gtid();
+    KA_TRACE(20, ("GOMP_atomic_start: T#%d\n", gtid));
+    __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid);
+}
+
+
+int
+xexpand(KMP_API_NAME_GOMP_SINGLE_START)(void)
+{
+    int gtid = __kmp_entry_gtid();
+    MKLOC(loc, "GOMP_single_start");
+    KA_TRACE(20, ("GOMP_single_start: T#%d\n", gtid));
+
+    if (! TCR_4(__kmp_init_parallel))
+        __kmp_parallel_initialize();
+
+    //
+    // 3rd parameter == FALSE prevents kmp_enter_single from pushing a
+    // workshare when USE_CHECKS is defined.  We need to avoid the push,
+    // as there is no corresponding GOMP_single_end() call.
+    //
+    return __kmp_enter_single(gtid, &loc, FALSE);
+}
+
+
+void *
+xexpand(KMP_API_NAME_GOMP_SINGLE_COPY_START)(void)
+{
+    void *retval;
+    int gtid = __kmp_entry_gtid();
+    MKLOC(loc, "GOMP_single_copy_start");
+    KA_TRACE(20, ("GOMP_single_copy_start: T#%d\n", gtid));
+
+    if (! TCR_4(__kmp_init_parallel))
+        __kmp_parallel_initialize();
+
+    //
+    // If this is the first thread to enter, return NULL.  The generated
+    // code will then call GOMP_single_copy_end() for this thread only,
+    // with the copyprivate data pointer as an argument.
+    //
+    if (__kmp_enter_single(gtid, &loc, FALSE))
+        return NULL;
+
+    //
+    // Wait for the first thread to set the copyprivate data pointer,
+    // and for all other threads to reach this point.
+    //
+    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+
+    //
+    // Retrieve the value of the copyprivate data pointer, and wait for all
+    // threads to do likewise, then return.
+    //
+    retval = __kmp_team_from_gtid(gtid)->t.t_copypriv_data;
+    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+    return retval;
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_SINGLE_COPY_END)(void *data)
+{
+    int gtid = __kmp_get_gtid();
+    KA_TRACE(20, ("GOMP_single_copy_end: T#%d\n", gtid));
+
+    //
+    // Set the copyprivate data pointer for the team, then hit the barrier
+    // so that the other threads will continue on and read it.  Hit another
+    // barrier before continuing, so that we know that the copyprivate
+    // data pointer has been propagated to all threads before trying to
+    // reuse the t_copypriv_data field.
+    //
+    __kmp_team_from_gtid(gtid)->t.t_copypriv_data = data;
+    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+}
+
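+// Illustrative sketch of the codegen these two entry points serve: for
+//     #pragma omp single copyprivate(x)
+// gcc emits code of roughly this shape:
+//     void *p = GOMP_single_copy_start();
+//     if (p == NULL) { /* body sets x */ GOMP_single_copy_end(&x); }
+//     else x = *(T *)p;
+// so exactly one thread publishes &x and all others copy the value out.
+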
+
+void
+xexpand(KMP_API_NAME_GOMP_ORDERED_START)(void)
+{
+    int gtid = __kmp_entry_gtid();
+    MKLOC(loc, "GOMP_ordered_start");
+    KA_TRACE(20, ("GOMP_ordered_start: T#%d\n", gtid));
+    __kmpc_ordered(&loc, gtid);
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_ORDERED_END)(void)
+{
+    int gtid = __kmp_get_gtid();
+    MKLOC(loc, "GOMP_ordered_end");
+    KA_TRACE(20, ("GOMP_ordered_start: T#%d\n", gtid));
+    __kmpc_end_ordered(&loc, gtid);
+}
+
+
+//
+// Dispatch macro defs
+//
+// They come in two flavors: 64-bit unsigned, and either 32-bit signed
+// (IA-32 architecture) or 64-bit signed (Intel(R) 64).
+//
+
+#if KMP_ARCH_X86 || KMP_ARCH_ARM
+# define KMP_DISPATCH_INIT              __kmp_aux_dispatch_init_4
+# define KMP_DISPATCH_FINI_CHUNK        __kmp_aux_dispatch_fini_chunk_4
+# define KMP_DISPATCH_NEXT              __kmpc_dispatch_next_4
+#else
+# define KMP_DISPATCH_INIT              __kmp_aux_dispatch_init_8
+# define KMP_DISPATCH_FINI_CHUNK        __kmp_aux_dispatch_fini_chunk_8
+# define KMP_DISPATCH_NEXT              __kmpc_dispatch_next_8
+#endif /* KMP_ARCH_X86 */
+
+# define KMP_DISPATCH_INIT_ULL          __kmp_aux_dispatch_init_8u
+# define KMP_DISPATCH_FINI_CHUNK_ULL    __kmp_aux_dispatch_fini_chunk_8u
+# define KMP_DISPATCH_NEXT_ULL          __kmpc_dispatch_next_8u
+
+
+//
+// The parallel construct
+//
+
+#ifndef KMP_DEBUG
+static
+#endif /* KMP_DEBUG */
+void
+__kmp_GOMP_microtask_wrapper(int *gtid, int *npr, void (*task)(void *),
+  void *data)
+{
+#if OMPT_SUPPORT
+    kmp_info_t *thr;
+    ompt_frame_t *ompt_frame;
+    ompt_state_t enclosing_state;
+
+    if (ompt_status & ompt_status_track) {
+        // get pointer to thread data structure
+        thr = __kmp_threads[*gtid];
+
+        // save enclosing task state; set current state for task
+        enclosing_state = thr->th.ompt_thread_info.state;
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+
+        // set task frame
+        ompt_frame = __ompt_get_task_frame_internal(0);
+        ompt_frame->exit_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
+    task(data);
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        // clear task frame
+        ompt_frame->exit_runtime_frame = NULL;
+
+        // restore enclosing state
+        thr->th.ompt_thread_info.state = enclosing_state;
+    }
+#endif
+}
+
+
+#ifndef KMP_DEBUG
+static
+#endif /* KMP_DEBUG */
+void
+__kmp_GOMP_parallel_microtask_wrapper(int *gtid, int *npr,
+  void (*task)(void *), void *data, unsigned num_threads, ident_t *loc,
+  enum sched_type schedule, long start, long end, long incr, long chunk_size)
+{
+    //
+    // Initialize the loop worksharing construct.
+    //
+    KMP_DISPATCH_INIT(loc, *gtid, schedule, start, end, incr, chunk_size,
+      schedule != kmp_sch_static);
+
+#if OMPT_SUPPORT
+    kmp_info_t *thr;
+    ompt_frame_t *ompt_frame;
+    ompt_state_t enclosing_state;
+
+    if (ompt_status & ompt_status_track) {
+        thr = __kmp_threads[*gtid];
+        // save enclosing task state; set current state for task
+        enclosing_state = thr->th.ompt_thread_info.state;
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+
+        // set task frame
+        ompt_frame = __ompt_get_task_frame_internal(0);
+        ompt_frame->exit_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
+    //
+    // Now invoke the microtask.
+    //
+    task(data);
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        // clear task frame
+        ompt_frame->exit_runtime_frame = NULL;
+
+        // reset enclosing state
+        thr->th.ompt_thread_info.state = enclosing_state;
+    }
+#endif
+}
+
+
+#ifndef KMP_DEBUG
+static
+#endif /* KMP_DEBUG */
+void
+__kmp_GOMP_fork_call(ident_t *loc, int gtid, void (*unwrapped_task)(void *), microtask_t wrapper, int argc,...)
+{
+    int rc;
+    kmp_info_t *thr = __kmp_threads[gtid];
+    kmp_team_t *team = thr->th.th_team;
+    int tid = __kmp_tid_from_gtid(gtid);
+
+    va_list ap;
+    va_start(ap, argc);
+
+    rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc,
+#if OMPT_SUPPORT
+      VOLATILE_CAST(void *) unwrapped_task,
+#endif
+      wrapper, __kmp_invoke_task_func,
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+      &ap
+#else
+      ap
+#endif
+      );
+
+    va_end(ap);
+
+    if (rc) {
+        __kmp_run_before_invoked_task(gtid, tid, thr, team);
+    }
+
+#if OMPT_SUPPORT 
+    if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+
+        // implicit task callback
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                team_info->parallel_id, task_info->task_id);
+        }
+#endif
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+    }
+#endif
+}
+
+static void
+__kmp_GOMP_serialized_parallel(ident_t *loc, kmp_int32 gtid, void (*task)(void *))
+{
+    __kmp_serialized_parallel(loc, gtid);
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        ompt_task_id_t ompt_task_id = __ompt_get_task_id_internal(0);
+        ompt_frame_t  *ompt_frame = __ompt_get_task_frame_internal(0);
+        kmp_info_t *thr = __kmp_threads[gtid];
+
+        ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(gtid);
+        ompt_task_id_t my_ompt_task_id = __ompt_task_id_new(gtid);
+
+        ompt_frame->exit_runtime_frame = NULL;
+
+        // parallel region callback
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
+            int team_size = 1;
+            ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
+                ompt_task_id, ompt_frame, ompt_parallel_id,
+                team_size, (void *) task);
+        }
+
+        // set up lightweight task
+        ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
+            __kmp_allocate(sizeof(ompt_lw_taskteam_t));
+        __ompt_lw_taskteam_init(lwt, thr, gtid, (void *) task, ompt_parallel_id);
+        lwt->ompt_task_info.task_id = my_ompt_task_id;
+        lwt->ompt_task_info.frame.exit_runtime_frame = 0;
+        __ompt_lw_taskteam_link(lwt, thr);
+
+#if OMPT_TRACE
+        // implicit task callback
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                ompt_parallel_id, my_ompt_task_id);
+        }
+        thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+#endif
+    }
+#endif
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_PARALLEL_START)(void (*task)(void *), void *data, unsigned num_threads)
+{
+    int gtid = __kmp_entry_gtid();
+
+#if OMPT_SUPPORT
+    ompt_frame_t *parent_frame;
+
+    if (ompt_status & ompt_status_track) {
+        parent_frame = __ompt_get_task_frame_internal(0);
+        parent_frame->reenter_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
+    MKLOC(loc, "GOMP_parallel_start");
+    KA_TRACE(20, ("GOMP_parallel_start: T#%d\n", gtid));
+
+    if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {
+        if (num_threads != 0) {
+            __kmp_push_num_threads(&loc, gtid, num_threads);
+        }
+        __kmp_GOMP_fork_call(&loc, gtid, task,
+          (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, data);
+    }
+    else {
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
+    }
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        parent_frame->reenter_runtime_frame = NULL;
+    }
+#endif
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_PARALLEL_END)(void)
+{
+    int gtid = __kmp_get_gtid();
+    kmp_info_t *thr;
+
+    thr = __kmp_threads[gtid];
+
+    MKLOC(loc, "GOMP_parallel_end");
+    KA_TRACE(20, ("GOMP_parallel_end: T#%d\n", gtid));
+
+
+#if OMPT_SUPPORT
+    ompt_parallel_id_t parallel_id;
+    ompt_frame_t *ompt_frame = NULL;
+
+    if (ompt_status & ompt_status_track) {
+        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+        parallel_id = team_info->parallel_id;
+
+        ompt_frame = __ompt_get_task_frame_internal(0);
+        ompt_frame->exit_runtime_frame = __builtin_frame_address(0);
+
+#if OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                parallel_id, task_info->task_id);
+        }
+#endif
+
+        // unlink if necessary. no-op if there is not a lightweight task.
+        ompt_lw_taskteam_t *lwt = __ompt_lw_taskteam_unlink(thr);
+        // GOMP allocates/frees lwt since it can't be kept on the stack
+        if (lwt) __kmp_free(lwt);
+    }
+#endif
+
+    if (! __kmp_threads[gtid]->th.th_team->t.t_serialized) {
+        kmp_info_t *thr = __kmp_threads[gtid];
+        __kmp_run_after_invoked_task(gtid, __kmp_tid_from_gtid(gtid), thr,
+          thr->th.th_team);
+        __kmp_join_call(&loc, gtid);
+    }
+    else {
+        __kmpc_end_serialized_parallel(&loc, gtid);
+
+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            if ((ompt_status == ompt_status_track_callback) &&
+                ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+                ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                    parallel_id, task_info->task_id);
+            }
+
+            thr->th.ompt_thread_info.state =
+                (((thr->th.th_team)->t.t_serialized) ?
+                ompt_state_work_serial : ompt_state_work_parallel);
+        }
+#endif
+
+    }
+}
+
+
+//
+// Loop worksharing constructs
+//
+
+//
+// The Gnu codegen passes in an exclusive upper bound for the overall range,
+// but the libguide dispatch code expects an inclusive upper bound, hence the
+// "end - incr" 5th argument to KMP_DISPATCH_INIT (and the " ub - str" 11th
+// argument to __kmp_GOMP_fork_call).
+//
+// Conversely, KMP_DISPATCH_NEXT returns an inclusive upper bound in *p_ub,
+// but the Gnu codegen expects an exclusive upper bound, so the adjustment
+// "*p_ub += stride" compensates for the discrepancy.
+//
+// Correction: the Gnu codegen always adjusts the upper bound by +-1, not the
+// stride value.  We adjust the dispatch parameters accordingly (by +-1), and
+// the sign of the +-1 adjustment to *p_ub follows the sign of the actual stride.
+//
+// The "runtime" versions do not take a chunk_sz parameter.
+//
+// The profile lib cannot support construct checking of unordered loops that
+// are predetermined by the compiler to be statically scheduled, as the gcc
+// codegen will not always emit calls to GOMP_loop_static_next() to get the
+// next iteration.  Instead, it emits inline code to call omp_get_thread_num()
+// and calculate the iteration space using the result.  It doesn't do this
+// with ordered static loops, so they can be checked.
+//
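+// For example (illustrative): with lb 0, ub 10 (exclusive) and str 3, the
+// dispatcher is initialized with the inclusive bound 9; if KMP_DISPATCH_NEXT
+// then returns the whole range as the inclusive chunk [0,9], the +1
+// adjustment hands [0,10) back to the Gnu codegen, which iterates 0, 3, 6, 9.
+//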
+
+#define LOOP_START(func,schedule) \
+    int func (long lb, long ub, long str, long chunk_sz, long *p_lb,         \
+      long *p_ub)                                                            \
+    {                                                                        \
+        int status;                                                          \
+        long stride;                                                         \
+        int gtid = __kmp_entry_gtid();                                       \
+        MKLOC(loc, #func);                                                   \
+        KA_TRACE(20, ( #func ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n",  \
+          gtid, lb, ub, str, chunk_sz ));                                    \
+                                                                             \
+        if ((str > 0) ? (lb < ub) : (lb > ub)) {                             \
+            KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                    \
+              (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,                \
+              (schedule) != kmp_sch_static);                                 \
+            status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,    \
+              (kmp_int *)p_ub, (kmp_int *)&stride);                          \
+            if (status) {                                                    \
+                KMP_DEBUG_ASSERT(stride == str);                             \
+                *p_ub += (str > 0) ? 1 : -1;                                 \
+            }                                                                \
+        }                                                                    \
+        else {                                                               \
+            status = 0;                                                      \
+        }                                                                    \
+                                                                             \
+        KA_TRACE(20, ( #func " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n", \
+          gtid, *p_lb, *p_ub, status));                                      \
+        return status;                                                       \
+    }
+
+
+#define LOOP_RUNTIME_START(func,schedule) \
+    int func (long lb, long ub, long str, long *p_lb, long *p_ub)            \
+    {                                                                        \
+        int status;                                                          \
+        long stride;                                                         \
+        long chunk_sz = 0;                                                   \
+        int gtid = __kmp_entry_gtid();                                       \
+        MKLOC(loc, #func);                                                   \
+        KA_TRACE(20, ( #func ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz %d\n",  \
+          gtid, lb, ub, str, chunk_sz ));                                    \
+                                                                             \
+        if ((str > 0) ? (lb < ub) : (lb > ub)) {                             \
+            KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                    \
+              (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, TRUE);         \
+            status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,    \
+              (kmp_int *)p_ub, (kmp_int *)&stride);                          \
+            if (status) {                                                    \
+                KMP_DEBUG_ASSERT(stride == str);                             \
+                *p_ub += (str > 0) ? 1 : -1;                                 \
+            }                                                                \
+        }                                                                    \
+        else {                                                               \
+            status = 0;                                                      \
+        }                                                                    \
+                                                                             \
+        KA_TRACE(20, ( #func " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n", \
+          gtid, *p_lb, *p_ub, status));                                      \
+        return status;                                                       \
+    }
+
+
+#define LOOP_NEXT(func,fini_code) \
+    int func(long *p_lb, long *p_ub)                                         \
+    {                                                                        \
+        int status;                                                          \
+        long stride;                                                         \
+        int gtid = __kmp_get_gtid();                                         \
+        MKLOC(loc, #func);                                                   \
+        KA_TRACE(20, ( #func ": T#%d\n", gtid));                             \
+                                                                             \
+        fini_code                                                            \
+        status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,        \
+          (kmp_int *)p_ub, (kmp_int *)&stride);                              \
+        if (status) {                                                        \
+            *p_ub += (stride > 0) ? 1 : -1;                                  \
+        }                                                                    \
+                                                                             \
+        KA_TRACE(20, ( #func " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, stride 0x%lx, "  \
+          "returning %d\n", gtid, *p_lb, *p_ub, stride, status));            \
+        return status;                                                       \
+    }
+
+
+LOOP_START(xexpand(KMP_API_NAME_GOMP_LOOP_STATIC_START), kmp_sch_static)
+LOOP_NEXT(xexpand(KMP_API_NAME_GOMP_LOOP_STATIC_NEXT), {})
+LOOP_START(xexpand(KMP_API_NAME_GOMP_LOOP_DYNAMIC_START), kmp_sch_dynamic_chunked)
+LOOP_NEXT(xexpand(KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT), {})
+LOOP_START(xexpand(KMP_API_NAME_GOMP_LOOP_GUIDED_START), kmp_sch_guided_chunked)
+LOOP_NEXT(xexpand(KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT), {})
+LOOP_RUNTIME_START(xexpand(KMP_API_NAME_GOMP_LOOP_RUNTIME_START), kmp_sch_runtime)
+LOOP_NEXT(xexpand(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT), {})
+
+LOOP_START(xexpand(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START), kmp_ord_static)
+LOOP_NEXT(xexpand(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT), \
+    { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); })
+LOOP_START(xexpand(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START), kmp_ord_dynamic_chunked)
+LOOP_NEXT(xexpand(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT), \
+    { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); })
+LOOP_START(xexpand(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START), kmp_ord_guided_chunked)
+LOOP_NEXT(xexpand(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT), \
+    { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); })
+LOOP_RUNTIME_START(xexpand(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START), kmp_ord_runtime)
+LOOP_NEXT(xexpand(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT), \
+    { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); })
+
+
+void
+xexpand(KMP_API_NAME_GOMP_LOOP_END)(void)
+{
+    int gtid = __kmp_get_gtid();
+    KA_TRACE(20, ("GOMP_loop_end: T#%d\n", gtid))
+
+    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+
+    KA_TRACE(20, ("GOMP_loop_end exit: T#%d\n", gtid))
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_LOOP_END_NOWAIT)(void)
+{
+    KA_TRACE(20, ("GOMP_loop_end_nowait: T#%d\n", __kmp_get_gtid()))
+}
+
+
+//
+// Unsigned long long loop worksharing constructs
+//
+// These are new with gcc 4.4
+//
+
+#define LOOP_START_ULL(func,schedule) \
+    int func (int up, unsigned long long lb, unsigned long long ub,          \
+      unsigned long long str, unsigned long long chunk_sz,                   \
+      unsigned long long *p_lb, unsigned long long *p_ub)                    \
+    {                                                                        \
+        int status;                                                          \
+        long long str2 = up ? ((long long)str) : -((long long)str);          \
+        long long stride;                                                    \
+        int gtid = __kmp_entry_gtid();                                       \
+        MKLOC(loc, #func);                                                   \
+                                                                             \
+        KA_TRACE(20, ( #func ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str 0x%llx, chunk_sz 0x%llx\n", \
+          gtid, up, lb, ub, str, chunk_sz ));                                \
+                                                                             \
+        if ((str > 0) ? (lb < ub) : (lb > ub)) {                             \
+            KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb,                \
+              (str2 > 0) ? (ub - 1) : (ub + 1), str2, chunk_sz,              \
+              (schedule) != kmp_sch_static);                                 \
+            status = KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL,                 \
+              (kmp_uint64 *)p_lb, (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \
+            if (status) {                                                    \
+                KMP_DEBUG_ASSERT(stride == str2);                            \
+                *p_ub += (str > 0) ? 1 : -1;                                 \
+            }                                                                \
+        }                                                                    \
+        else {                                                               \
+            status = 0;                                                      \
+        }                                                                    \
+                                                                             \
+        KA_TRACE(20, ( #func " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \
+          gtid, *p_lb, *p_ub, status));                                      \
+        return status;                                                       \
+    }
+
+
+#define LOOP_RUNTIME_START_ULL(func,schedule) \
+    int func (int up, unsigned long long lb, unsigned long long ub,          \
+      unsigned long long str, unsigned long long *p_lb,                      \
+      unsigned long long *p_ub)                                              \
+    {                                                                        \
+        int status;                                                          \
+        long long str2 = up ? ((long long)str) : -((long long)str);          \
+        unsigned long long stride;                                           \
+        unsigned long long chunk_sz = 0;                                     \
+        int gtid = __kmp_entry_gtid();                                       \
+        MKLOC(loc, #func);                                                   \
+                                                                             \
+        KA_TRACE(20, ( #func ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str 0x%llx, chunk_sz 0x%llx\n", \
+          gtid, up, lb, ub, str, chunk_sz ));                                \
+                                                                             \
+        if ((str > 0) ? (lb < ub) : (lb > ub)) {                             \
+            KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb,                \
+              (str2 > 0) ? (ub - 1) : (ub + 1), str2, chunk_sz, TRUE);       \
+            status = KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL,                 \
+              (kmp_uint64 *)p_lb, (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \
+            if (status) {                                                    \
+                KMP_DEBUG_ASSERT((long long)stride == str2);                 \
+                *p_ub += (str > 0) ? 1 : -1;                                 \
+            }                                                                \
+        }                                                                    \
+        else {                                                               \
+            status = 0;                                                      \
+        }                                                                    \
+                                                                             \
+        KA_TRACE(20, ( #func " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \
+          gtid, *p_lb, *p_ub, status));                                      \
+        return status;                                                       \
+    }
+
+
+#define LOOP_NEXT_ULL(func,fini_code) \
+    int func(unsigned long long *p_lb, unsigned long long *p_ub)             \
+    {                                                                        \
+        int status;                                                          \
+        long long stride;                                                    \
+        int gtid = __kmp_get_gtid();                                         \
+        MKLOC(loc, #func);                                                   \
+        KA_TRACE(20, ( #func ": T#%d\n", gtid));                             \
+                                                                             \
+        fini_code                                                            \
+        status = KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb, \
+          (kmp_uint64 *)p_ub, (kmp_int64 *)&stride);                         \
+        if (status) {                                                        \
+            *p_ub += (stride > 0) ? 1 : -1;                                  \
+        }                                                                    \
+                                                                             \
+        KA_TRACE(20, ( #func " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, stride 0x%llx, " \
+          "returning %d\n", gtid, *p_lb, *p_ub, stride, status));            \
+        return status;                                                       \
+    }
+
+
+LOOP_START_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START), kmp_sch_static)
+LOOP_NEXT_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT), {})
+LOOP_START_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START), kmp_sch_dynamic_chunked)
+LOOP_NEXT_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT), {})
+LOOP_START_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START), kmp_sch_guided_chunked)
+LOOP_NEXT_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT), {})
+LOOP_RUNTIME_START_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START), kmp_sch_runtime)
+LOOP_NEXT_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT), {})
+
+LOOP_START_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START), kmp_ord_static)
+LOOP_NEXT_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT), \
+    { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); })
+LOOP_START_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START), kmp_ord_dynamic_chunked)
+LOOP_NEXT_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT), \
+    { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); })
+LOOP_START_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START), kmp_ord_guided_chunked)
+LOOP_NEXT_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT), \
+    { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); })
+LOOP_RUNTIME_START_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START), kmp_ord_runtime)
+LOOP_NEXT_ULL(xexpand(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT), \
+    { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); })
+
+
+//
+// Combined parallel / loop worksharing constructs
+//
+// There are no ull versions (yet).
+//
+
+#define PARALLEL_LOOP_START(func, schedule) \
+    void func (void (*task) (void *), void *data, unsigned num_threads,      \
+      long lb, long ub, long str, long chunk_sz)                             \
+    {                                                                        \
+        int gtid = __kmp_entry_gtid();                                       \
+        MKLOC(loc, #func);                                                   \
+        KA_TRACE(20, ( #func ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n",        \
+          gtid, lb, ub, str, chunk_sz ));                                    \
+                                                                             \
+        if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {                 \
+            if (num_threads != 0) {                                          \
+                __kmp_push_num_threads(&loc, gtid, num_threads);             \
+            }                                                                \
+            __kmp_GOMP_fork_call(&loc, gtid, task,                           \
+              (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9,         \
+              task, data, num_threads, &loc, (schedule), lb,                 \
+              (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz);               \
+        }                                                                    \
+        else {                                                               \
+            __kmp_GOMP_serialized_parallel(&loc, gtid, task);                \
+        }                                                                    \
+                                                                             \
+        KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                        \
+          (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,                    \
+          (schedule) != kmp_sch_static);                                     \
+                                                                             \
+        KA_TRACE(20, ( #func " exit: T#%d\n", gtid));                        \
+    }
+
+
+PARALLEL_LOOP_START(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START), kmp_sch_static)
+PARALLEL_LOOP_START(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START), kmp_sch_dynamic_chunked)
+PARALLEL_LOOP_START(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START), kmp_sch_guided_chunked)
+PARALLEL_LOOP_START(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START), kmp_sch_runtime)
+
+
+//
+// Tasking constructs
+//
+
+void
+xexpand(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data, void (*copy_func)(void *, void *),
+  long arg_size, long arg_align, int if_cond, unsigned gomp_flags)
+{
+    MKLOC(loc, "GOMP_task");
+    int gtid = __kmp_entry_gtid();
+    kmp_int32 flags = 0;
+    kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *) & flags;
+
+    KA_TRACE(20, ("GOMP_task: T#%d\n", gtid));
+
+    // The low-order bit is the "tied" flag
+    if (gomp_flags & 1) {
+        input_flags->tiedness = 1;
+    }
+    input_flags->native = 1;
+    // __kmp_task_alloc() sets up all other flags
+
+    if (! if_cond) {
+        arg_size = 0;
+    }
+
+    kmp_task_t *task = __kmp_task_alloc(&loc, gtid, input_flags,
+      sizeof(kmp_task_t), arg_size ? arg_size + arg_align - 1 : 0,
+      (kmp_routine_entry_t)func);
+
+    if (arg_size > 0) {
+        if (arg_align > 0) {
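+            // Round the shareds pointer up to the next multiple of arg_align
+            // (the extra arg_align - 1 bytes were reserved in the allocation
+            // above).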
+            task->shareds = (void *)((((size_t)task->shareds)
+              + arg_align - 1) / arg_align * arg_align);
+        }
+        // else: should an arg_align of 0 be treated as an error?
+
+        if (copy_func) {
+            (*copy_func)(task->shareds, data);
+        }
+        else {
+            KMP_MEMCPY(task->shareds, data, arg_size);
+        }
+    }
+
+    if (if_cond) {
+        __kmpc_omp_task(&loc, gtid, task);
+    }
+    else {
+#if OMPT_SUPPORT
+        ompt_thread_info_t oldInfo;
+        kmp_info_t *thread;
+        kmp_taskdata_t *taskdata;
+        if (ompt_status & ompt_status_track) {
+            // Store the thread's state and restore it after the task
+            thread = __kmp_threads[ gtid ];
+            taskdata = KMP_TASK_TO_TASKDATA(task);
+            oldInfo = thread->th.ompt_thread_info;
+            thread->th.ompt_thread_info.wait_id = 0;
+            thread->th.ompt_thread_info.state = ompt_state_work_parallel;
+            taskdata->ompt_task_info.frame.exit_runtime_frame =
+                __builtin_frame_address(0);
+        }
+#endif
+
+        __kmpc_omp_task_begin_if0(&loc, gtid, task);
+        func(data);
+        __kmpc_omp_task_complete_if0(&loc, gtid, task);
+
+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            thread->th.ompt_thread_info = oldInfo;
+            taskdata->ompt_task_info.frame.exit_runtime_frame = 0;
+        }
+#endif
+    }
+
+    KA_TRACE(20, ("GOMP_task exit: T#%d\n", gtid));
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_TASKWAIT)(void)
+{
+    MKLOC(loc, "GOMP_taskwait");
+    int gtid = __kmp_entry_gtid();
+
+    KA_TRACE(20, ("GOMP_taskwait: T#%d\n", gtid));
+
+    __kmpc_omp_taskwait(&loc, gtid);
+
+    KA_TRACE(20, ("GOMP_taskwait exit: T#%d\n", gtid));
+}
+
+
+//
+// Sections worksharing constructs
+//
+
+//
+// For the sections construct, we initialize a dynamically scheduled loop
+// worksharing construct with lb 1 and stride 1, and use the iteration #'s
+// that it returns as section ids.
+//
+// There are no special entry points for ordered sections, so we always use
+// the dynamically scheduled workshare, even if the sections aren't ordered.
+//
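+// Illustrative sketch: for
+//     #pragma omp sections
+// gcc emits a loop of roughly this shape:
+//     for (unsigned id = GOMP_sections_start(n); id != 0;
+//          id = GOMP_sections_next())
+//         switch (id) { case 1: /* first section */ break; /* ... */ }
+//     GOMP_sections_end();
+// where a return value of 0 means "no more sections for this thread".
+//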
+
+unsigned
+xexpand(KMP_API_NAME_GOMP_SECTIONS_START)(unsigned count)
+{
+    int status;
+    kmp_int lb, ub, stride;
+    int gtid = __kmp_entry_gtid();
+    MKLOC(loc, "GOMP_sections_start");
+    KA_TRACE(20, ("GOMP_sections_start: T#%d\n", gtid));
+
+    KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);
+
+    status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, &lb, &ub, &stride);
+    if (status) {
+        KMP_DEBUG_ASSERT(stride == 1);
+        KMP_DEBUG_ASSERT(lb > 0);
+        KMP_ASSERT(lb == ub);
+    }
+    else {
+        lb = 0;
+    }
+
+    KA_TRACE(20, ("GOMP_sections_start exit: T#%d returning %u\n", gtid,
+      (unsigned)lb));
+    return (unsigned)lb;
+}
+
+
+unsigned
+xexpand(KMP_API_NAME_GOMP_SECTIONS_NEXT)(void)
+{
+    int status;
+    kmp_int lb, ub, stride;
+    int gtid = __kmp_get_gtid();
+    MKLOC(loc, "GOMP_sections_next");
+    KA_TRACE(20, ("GOMP_sections_next: T#%d\n", gtid));
+
+    status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, &lb, &ub, &stride);
+    if (status) {
+        KMP_DEBUG_ASSERT(stride == 1);
+        KMP_DEBUG_ASSERT(lb > 0);
+        KMP_ASSERT(lb == ub);
+    }
+    else {
+        lb = 0;
+    }
+
+    KA_TRACE(20, ("GOMP_sections_next exit: T#%d returning %u\n", gtid,
+      (unsigned)lb));
+    return (unsigned)lb;
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START)(void (*task) (void *), void *data,
+  unsigned num_threads, unsigned count)
+{
+    int gtid = __kmp_entry_gtid();
+
+#if OMPT_SUPPORT
+    ompt_frame_t *parent_frame;
+
+    if (ompt_status & ompt_status_track) {
+        parent_frame = __ompt_get_task_frame_internal(0);
+        parent_frame->reenter_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
+    MKLOC(loc, "GOMP_parallel_sections_start");
+    KA_TRACE(20, ("GOMP_parallel_sections_start: T#%d\n", gtid));
+
+    if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {
+        if (num_threads != 0) {
+            __kmp_push_num_threads(&loc, gtid, num_threads);
+        }
+        __kmp_GOMP_fork_call(&loc, gtid, task,
+          (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, task, data,
+          num_threads, &loc, kmp_nm_dynamic_chunked, (kmp_int)1,
+          (kmp_int)count, (kmp_int)1, (kmp_int)1);
+    }
+    else {
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
+    }
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        parent_frame->reenter_runtime_frame = NULL;
+    }
+#endif
+
+    KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);
+
+    KA_TRACE(20, ("GOMP_parallel_sections_start exit: T#%d\n", gtid));
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_SECTIONS_END)(void)
+{
+    int gtid = __kmp_get_gtid();
+    KA_TRACE(20, ("GOMP_sections_end: T#%d\n", gtid))
+
+    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+
+    KA_TRACE(20, ("GOMP_sections_end exit: T#%d\n", gtid))
+}
+
+
+void
+xexpand(KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT)(void)
+{
+    KA_TRACE(20, ("GOMP_sections_end_nowait: T#%d\n", __kmp_get_gtid()))
+}
+
+// libgomp has an empty function for GOMP_taskyield as of 2013-10-10
+void
+xexpand(KMP_API_NAME_GOMP_TASKYIELD)(void)
+{
+    KA_TRACE(20, ("GOMP_taskyield: T#%d\n", __kmp_get_gtid()))
+    return;
+}
+
+#if OMP_40_ENABLED // these are new GOMP_4.0 entry points
+
+void
+xexpand(KMP_API_NAME_GOMP_PARALLEL)(void (*task)(void *), void *data, unsigned num_threads, unsigned int flags)
+{
+    int gtid = __kmp_entry_gtid();
+    MKLOC(loc, "GOMP_parallel");
+    KA_TRACE(20, ("GOMP_parallel: T#%d\n", gtid));
+
+    if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {
+        if (num_threads != 0) {
+            __kmp_push_num_threads(&loc, gtid, num_threads);
+        }
+        if(flags != 0) {
+            __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);
+        }
+        __kmp_GOMP_fork_call(&loc, gtid, task,
+          (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, data);
+    }
+    else {
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
+    }
+    task(data);
+    xexpand(KMP_API_NAME_GOMP_PARALLEL_END)();
+}
+
+void
+xexpand(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task) (void *), void *data,
+  unsigned num_threads, unsigned count, unsigned flags)
+{
+    int gtid = __kmp_entry_gtid();
+    MKLOC(loc, "GOMP_parallel_sections");
+    KA_TRACE(20, ("GOMP_parallel_sections: T#%d\n", gtid));
+
+    if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {
+        if (num_threads != 0) {
+            __kmp_push_num_threads(&loc, gtid, num_threads);
+        }
+        if(flags != 0) {
+            __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);
+        }
+        __kmp_GOMP_fork_call(&loc, gtid, task,
+          (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, task, data,
+          num_threads, &loc, kmp_nm_dynamic_chunked, (kmp_int)1,
+          (kmp_int)count, (kmp_int)1, (kmp_int)1);
+    }
+    else {
+        __kmp_GOMP_serialized_parallel(&loc, gtid, task);
+    }
+
+    KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);
+
+    task(data);
+    xexpand(KMP_API_NAME_GOMP_PARALLEL_END)();
+    KA_TRACE(20, ("GOMP_parallel_sections exit: T#%d\n", gtid));
+}
+
+#define PARALLEL_LOOP(func, schedule) \
+    void func (void (*task) (void *), void *data, unsigned num_threads,      \
+      long lb, long ub, long str, long chunk_sz, unsigned flags)             \
+    {                                                                        \
+        int gtid = __kmp_entry_gtid();                                       \
+        MKLOC(loc, #func);                                                   \
+        KA_TRACE(20, ( #func ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n",        \
+          gtid, lb, ub, str, chunk_sz ));                                    \
+                                                                             \
+        if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {                 \
+            if (num_threads != 0) {                                          \
+                __kmp_push_num_threads(&loc, gtid, num_threads);             \
+            }                                                                \
+            if (flags != 0) {                                                \
+                __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);    \
+            }                                                                \
+            __kmp_GOMP_fork_call(&loc, gtid, task,                           \
+              (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9,         \
+              task, data, num_threads, &loc, (schedule), lb,                 \
+              (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz);               \
+        }                                                                    \
+        else {                                                               \
+            __kmp_GOMP_serialized_parallel(&loc, gtid, task);                \
+        }                                                                    \
+                                                                             \
+        KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                        \
+          (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,                    \
+          (schedule) != kmp_sch_static);                                     \
+        task(data);                                                          \
+        xexpand(KMP_API_NAME_GOMP_PARALLEL_END)();                           \
+                                                                             \
+        KA_TRACE(20, ( #func " exit: T#%d\n", gtid));                        \
+    }
+
+PARALLEL_LOOP(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC), kmp_sch_static)
+PARALLEL_LOOP(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC), kmp_sch_dynamic_chunked)
+PARALLEL_LOOP(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED), kmp_sch_guided_chunked)
+PARALLEL_LOOP(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME), kmp_sch_runtime)
+
+
+void
+xexpand(KMP_API_NAME_GOMP_TASKGROUP_START)(void)
+{
+    int gtid = __kmp_get_gtid();
+    MKLOC(loc, "GOMP_taskgroup_start");
+    KA_TRACE(20, ("GOMP_taskgroup_start: T#%d\n", gtid));
+
+    __kmpc_taskgroup(&loc, gtid);
+
+    return;
+}
+
+void
+xexpand(KMP_API_NAME_GOMP_TASKGROUP_END)(void)
+{
+    int gtid = __kmp_get_gtid();
+    MKLOC(loc, "GOMP_taskgroup_end");
+    KA_TRACE(20, ("GOMP_taskgroup_end: T#%d\n", gtid));
+
+    __kmpc_end_taskgroup(&loc, gtid);
+
+    return;
+}
+
+#ifndef KMP_DEBUG
+static
+#endif /* KMP_DEBUG */
+kmp_int32 __kmp_gomp_to_omp_cancellation_kind(int gomp_kind) {
+    kmp_int32 cncl_kind = 0;
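+    // The gomp_kind encoding (1 = parallel, 2 = loop, 4 = sections,
+    // 8 = taskgroup) follows the constants libgomp passes in.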
+    switch(gomp_kind) {
+      case 1:
+        cncl_kind = cancel_parallel;
+        break;
+      case 2:
+        cncl_kind = cancel_loop;
+        break;
+      case 4:
+        cncl_kind = cancel_sections;
+        break;
+      case 8:
+        cncl_kind = cancel_taskgroup;
+        break;
+    }
+    return cncl_kind;
+}
+
+bool
+xexpand(KMP_API_NAME_GOMP_CANCELLATION_POINT)(int which)
+{
+    if(__kmp_omp_cancellation) {
+        KMP_FATAL(NoGompCancellation);
+    }
+    int gtid = __kmp_get_gtid();
+    MKLOC(loc, "GOMP_cancellation_point");
+    KA_TRACE(20, ("GOMP_cancellation_point: T#%d\n", gtid));
+
+    kmp_int32 cncl_kind = __kmp_gomp_to_omp_cancellation_kind(which);
+
+    return __kmpc_cancellationpoint(&loc, gtid, cncl_kind);
+}
+
+bool
+xexpand(KMP_API_NAME_GOMP_BARRIER_CANCEL)(void)
+{
+    if(__kmp_omp_cancellation) {
+        KMP_FATAL(NoGompCancellation);
+    }
+    int gtid = __kmp_get_gtid();
+    MKLOC(loc, "GOMP_barrier_cancel");
+    KA_TRACE(20, ("GOMP_barrier_cancel: T#%d\n", gtid));
+
+    return __kmpc_cancel_barrier(&loc, gtid);
+}
+
+bool
+xexpand(KMP_API_NAME_GOMP_CANCEL)(int which, bool do_cancel)
+{
+    if(__kmp_omp_cancellation) {
+        KMP_FATAL(NoGompCancellation);
+    } else {
+        return FALSE;
+    }
+
+    int gtid = __kmp_get_gtid();
+    MKLOC(loc, "GOMP_cancel");
+    KA_TRACE(20, ("GOMP_cancel: T#%d\n", gtid));
+
+    kmp_int32 cncl_kind = __kmp_gomp_to_omp_cancellation_kind(which);
+
+    if(do_cancel == FALSE) {
+        return xexpand(KMP_API_NAME_GOMP_CANCELLATION_POINT)(which);
+    } else {
+        return __kmpc_cancel(&loc, gtid, cncl_kind);
+    }
+}
+
+bool
+xexpand(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL)(void)
+{
+    if(__kmp_omp_cancellation) {
+        KMP_FATAL(NoGompCancellation);
+    }
+    int gtid = __kmp_get_gtid();
+    MKLOC(loc, "GOMP_sections_end_cancel");
+    KA_TRACE(20, ("GOMP_sections_end_cancel: T#%d\n", gtid));
+
+    return __kmpc_cancel_barrier(&loc, gtid);
+}
+
+bool
+xexpand(KMP_API_NAME_GOMP_LOOP_END_CANCEL)(void)
+{
+    if(__kmp_omp_cancellation) {
+        KMP_FATAL(NoGompCancellation);
+    }
+    int gtid = __kmp_get_gtid();
+    MKLOC(loc, "GOMP_loop_end_cancel");
+    KA_TRACE(20, ("GOMP_loop_end_cancel: T#%d\n", gtid));
+
+    return __kmpc_cancel_barrier(&loc, gtid);
+}
+
+// All target functions are empty as of 2014-05-29
+void
+xexpand(KMP_API_NAME_GOMP_TARGET)(int device, void (*fn) (void *), const void *openmp_target,
+             size_t mapnum, void **hostaddrs, size_t *sizes, unsigned char *kinds)
+{
+    return;
+}
+
+void
+xexpand(KMP_API_NAME_GOMP_TARGET_DATA)(int device, const void *openmp_target, size_t mapnum,
+                  void **hostaddrs, size_t *sizes, unsigned char *kinds)
+{
+    return;
+}
+
+void
+xexpand(KMP_API_NAME_GOMP_TARGET_END_DATA)(void)
+{
+    return;
+}
+
+void
+xexpand(KMP_API_NAME_GOMP_TARGET_UPDATE)(int device, const void *openmp_target, size_t mapnum,
+                    void **hostaddrs, size_t *sizes, unsigned char *kinds)
+{
+    return;
+}
+
+void
+xexpand(KMP_API_NAME_GOMP_TEAMS)(unsigned int num_teams, unsigned int thread_limit)
+{
+    return;
+}
+#endif // OMP_40_ENABLED
+
+
+/*
+    The following sections of code create aliases for the GOMP_* functions,
+    then create versioned symbols using the assembler directive .symver.
+    This is only pertinent for an ELF .so library.
+    xaliasify and xversionify are defined in kmp_ftn_os.h
+*/
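+
+/*
+    Illustrative expansion (the real macros live in kmp_ftn_os.h): for
+    GOMP_parallel_start the alias/version pair amounts to roughly
+        __attribute__((alias(...)))   // per-version alias of the definition
+        __asm__(".symver <alias>, GOMP_parallel_start@GOMP_1.0");
+    so one implementation is exported under each GOMP_* version node.
+*/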
+
+#ifdef KMP_USE_VERSION_SYMBOLS
+
+// GOMP_1.0 aliases
+xaliasify(KMP_API_NAME_GOMP_ATOMIC_END, 10);
+xaliasify(KMP_API_NAME_GOMP_ATOMIC_START, 10);
+xaliasify(KMP_API_NAME_GOMP_BARRIER, 10);
+xaliasify(KMP_API_NAME_GOMP_CRITICAL_END, 10);
+xaliasify(KMP_API_NAME_GOMP_CRITICAL_NAME_END, 10);
+xaliasify(KMP_API_NAME_GOMP_CRITICAL_NAME_START, 10);
+xaliasify(KMP_API_NAME_GOMP_CRITICAL_START, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_DYNAMIC_START, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_END, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_END_NOWAIT, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_GUIDED_START, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_RUNTIME_START, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_STATIC_NEXT, 10);
+xaliasify(KMP_API_NAME_GOMP_LOOP_STATIC_START, 10);
+xaliasify(KMP_API_NAME_GOMP_ORDERED_END, 10);
+xaliasify(KMP_API_NAME_GOMP_ORDERED_START, 10);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_END, 10);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START, 10);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START, 10);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START, 10);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START, 10);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START, 10);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_START, 10);
+xaliasify(KMP_API_NAME_GOMP_SECTIONS_END, 10);
+xaliasify(KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT, 10);
+xaliasify(KMP_API_NAME_GOMP_SECTIONS_NEXT, 10);
+xaliasify(KMP_API_NAME_GOMP_SECTIONS_START, 10);
+xaliasify(KMP_API_NAME_GOMP_SINGLE_COPY_END, 10);
+xaliasify(KMP_API_NAME_GOMP_SINGLE_COPY_START, 10);
+xaliasify(KMP_API_NAME_GOMP_SINGLE_START, 10);
+
+// GOMP_2.0 aliases
+xaliasify(KMP_API_NAME_GOMP_TASK, 20);
+xaliasify(KMP_API_NAME_GOMP_TASKWAIT, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT, 20);
+xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START, 20);
+
+// GOMP_3.0 aliases
+xaliasify(KMP_API_NAME_GOMP_TASKYIELD, 30);
+
+// GOMP_4.0 aliases
+// The GOMP_parallel* entry points below aren't OpenMP 4.0 related.
+#if OMP_40_ENABLED
+xaliasify(KMP_API_NAME_GOMP_PARALLEL, 40);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_SECTIONS, 40);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC, 40);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED, 40);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME, 40);
+xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC, 40);
+xaliasify(KMP_API_NAME_GOMP_TASKGROUP_START, 40);
+xaliasify(KMP_API_NAME_GOMP_TASKGROUP_END, 40);
+xaliasify(KMP_API_NAME_GOMP_BARRIER_CANCEL, 40);
+xaliasify(KMP_API_NAME_GOMP_CANCEL, 40);
+xaliasify(KMP_API_NAME_GOMP_CANCELLATION_POINT, 40);
+xaliasify(KMP_API_NAME_GOMP_LOOP_END_CANCEL, 40);
+xaliasify(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL, 40);
+xaliasify(KMP_API_NAME_GOMP_TARGET, 40);
+xaliasify(KMP_API_NAME_GOMP_TARGET_DATA, 40);
+xaliasify(KMP_API_NAME_GOMP_TARGET_END_DATA, 40);
+xaliasify(KMP_API_NAME_GOMP_TARGET_UPDATE, 40);
+xaliasify(KMP_API_NAME_GOMP_TEAMS, 40);
+#endif
+
+// GOMP_1.0 versioned symbols
+xversionify(KMP_API_NAME_GOMP_ATOMIC_END, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_ATOMIC_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_BARRIER, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_CRITICAL_END, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_CRITICAL_NAME_END, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_CRITICAL_NAME_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_CRITICAL_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_DYNAMIC_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_END, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_END_NOWAIT, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_GUIDED_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_RUNTIME_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_STATIC_NEXT, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_STATIC_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_ORDERED_END, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_ORDERED_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_END, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_SECTIONS_END, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_SECTIONS_NEXT, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_SECTIONS_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_SINGLE_COPY_END, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_SINGLE_COPY_START, 10, "GOMP_1.0");
+xversionify(KMP_API_NAME_GOMP_SINGLE_START, 10, "GOMP_1.0");
+
+// GOMP_2.0 versioned symbols
+xversionify(KMP_API_NAME_GOMP_TASK, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_TASKWAIT, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT, 20, "GOMP_2.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START, 20, "GOMP_2.0");
+
+// GOMP_3.0 versioned symbols
+xversionify(KMP_API_NAME_GOMP_TASKYIELD, 30, "GOMP_3.0");
+
+// GOMP_4.0 versioned symbols
+#if OMP_40_ENABLED
+xversionify(KMP_API_NAME_GOMP_PARALLEL, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_SECTIONS, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_TASKGROUP_START, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_TASKGROUP_END, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_BARRIER_CANCEL, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_CANCEL, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_CANCELLATION_POINT, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_LOOP_END_CANCEL, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_TARGET, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_TARGET_DATA, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_TARGET_END_DATA, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_TARGET_UPDATE, 40, "GOMP_4.0");
+xversionify(KMP_API_NAME_GOMP_TEAMS, 40, "GOMP_4.0");
+#endif
+
+#endif // KMP_USE_VERSION_SYMBOLS
+
+#ifdef __cplusplus
+    } //extern "C"
+#endif // __cplusplus
+
+
diff --git a/final/runtime/src/kmp_i18n.c b/final/runtime/src/kmp_i18n.c
new file mode 100644
index 0000000..8dad255
--- /dev/null
+++ b/final/runtime/src/kmp_i18n.c
@@ -0,0 +1,974 @@
+/*
+ * kmp_i18n.c
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+#include "kmp_i18n.h"
+
+#include "kmp_os.h"
+#include "kmp_debug.h"
+#include "kmp.h"
+#include "kmp_lock.h"
+#include "kmp_io.h"          // __kmp_printf.
+
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <locale.h>
+#include <stdarg.h>
+
+#include "kmp_i18n_default.inc"
+#include "kmp_str.h"
+#include "kmp_environment.h"
+
+#undef KMP_I18N_OK
+
+#define get_section( id )  ( (id) >> 16 )
+#define get_number( id )   ( (id) & 0xFFFF )
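+// For illustration (hypothetical value): an id of 0x0004000A decodes to section 4,
+// message number 10 under the two macros above.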
+
+kmp_msg_t           __kmp_msg_empty = { kmp_mt_dummy, 0, "", 0  };
+kmp_msg_t           __kmp_msg_null  = { kmp_mt_dummy, 0, NULL, 0 };
+static char const * no_message_available = "(No message available)";
+
+enum kmp_i18n_cat_status {
+    KMP_I18N_CLOSED,    // Not yet opened or closed.
+    KMP_I18N_OPENED,    // Opened successfully, ready to use.
+    KMP_I18N_ABSENT     // Opening failed, message catalog should not be used.
+}; // enum kmp_i18n_cat_status
+typedef enum kmp_i18n_cat_status  kmp_i18n_cat_status_t;
+static volatile kmp_i18n_cat_status_t  status = KMP_I18N_CLOSED;
+
+/*
+    The message catalog is opened at first use, so we have to synchronize opening to avoid races
+    and multiple openings.
+
+    Closing does not require synchronization, because the catalog is closed very late, at library
+    shutdown, when no other threads are alive.
+*/
+
+static void __kmp_i18n_do_catopen();
+static kmp_bootstrap_lock_t  lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( lock );
+    // The `lock' variable could be declared inside the __kmp_i18n_catopen function, because it is
+    // used only by that function, but we are afraid a (buggy) compiler might mishandle it, so we
+    // keep it outside the function just in case.
+
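+// Double-checked locking: __kmp_i18n_catopen() tests `status' once without the lock and again
+// under the lock, so that only one thread actually opens the catalog.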
+void
+__kmp_i18n_catopen(
+) {
+    if ( status == KMP_I18N_CLOSED ) {
+        __kmp_acquire_bootstrap_lock( & lock );
+        if ( status == KMP_I18N_CLOSED ) {
+            __kmp_i18n_do_catopen();
+        }; // if
+        __kmp_release_bootstrap_lock( & lock );
+    }; // if
+} // func __kmp_i18n_catopen
+
+
+/*
+    ================================================================================================
+    Linux* OS and OS X* part.
+    ================================================================================================
+*/
+
+#if KMP_OS_UNIX
+#define KMP_I18N_OK
+
+#include <nl_types.h>
+
+#define KMP_I18N_NULLCAT ((nl_catd)( -1 ))
+static nl_catd       cat  = KMP_I18N_NULLCAT;    // !!! Shall it be volatile?
+static char const *  name = ( KMP_VERSION_MAJOR == 4 ? "libguide.cat" : "libomp.cat" );
+
+/*
+    Useful links:
+        http://www.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html#tag_08_02
+        http://www.opengroup.org/onlinepubs/000095399/functions/catopen.html
+        http://www.opengroup.org/onlinepubs/000095399/functions/setlocale.html
+*/
+
+void
+__kmp_i18n_do_catopen(
+) {
+    int    english = 0;
+    char * lang    = __kmp_env_get( "LANG" );
+    // TODO: What about LC_ALL or LC_MESSAGES?
+
+    KMP_DEBUG_ASSERT( status == KMP_I18N_CLOSED );
+    KMP_DEBUG_ASSERT( cat    == KMP_I18N_NULLCAT );
+
+    english =
+        lang == NULL                 ||  // In all these cases the English language is used.
+        strcmp( lang, "" )      == 0 ||
+        strcmp( lang, " " )     == 0 ||
+            // Workaround for Fortran RTL bug DPD200137873 "Fortran runtime resets LANG env var
+            // to space if it is not set".
+        strcmp( lang, "C" )     == 0 ||
+        strcmp( lang, "POSIX" ) == 0;
+
+    if ( ! english ) {  // English is not detected yet; let us continue checking.
+        // Format of LANG is: [language[_territory][.codeset][@modifier]]
+        // Strip all parts except language.
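+        // E.g. (illustrative): "en_US.UTF-8" should be stripped down to "en",
+        // and "fr_FR@euro" to "fr".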
+        char * tail = NULL;
+        __kmp_str_split( lang, '@', & lang, & tail );
+        __kmp_str_split( lang, '.', & lang, & tail );
+        __kmp_str_split( lang, '_', & lang, & tail );
+        english = ( strcmp( lang, "en" ) == 0 );
+    }; // if
+
+    KMP_INTERNAL_FREE( lang );
+
+    // Do not try to open the English catalog, because the internal messages are
+    // an exact copy of the messages in the English catalog.
+    if ( english ) {
+        status = KMP_I18N_ABSENT;  // Mark the catalog as absent so it will not be re-opened.
+        return;
+    }
+
+    cat = catopen( name, 0 );
+    // TODO: Why do we pass 0 in flags?
+    status = ( cat == KMP_I18N_NULLCAT ? KMP_I18N_ABSENT : KMP_I18N_OPENED );
+
+    if ( status == KMP_I18N_ABSENT ) {
+      if (__kmp_generate_warnings > kmp_warnings_low) { // AC: only issue a warning if explicitly asked to
+        int    error   = errno; // Save errno immediately.
+        char * nlspath = __kmp_env_get( "NLSPATH" );
+        char * lang    = __kmp_env_get( "LANG" );
+
+        // Infinite recursion will not occur -- status is KMP_I18N_ABSENT now, so
+        // __kmp_i18n_catgets() will not try to open the catalog, but will return a default message.
+        __kmp_msg(
+            kmp_ms_warning,
+            KMP_MSG( CantOpenMessageCatalog, name ),
+            KMP_ERR( error ),
+            KMP_HNT( CheckEnvVar, "NLSPATH", nlspath ),
+            KMP_HNT( CheckEnvVar, "LANG", lang ),
+            __kmp_msg_null
+        );
+        KMP_INFORM( WillUseDefaultMessages );
+        KMP_INTERNAL_FREE( nlspath );
+        KMP_INTERNAL_FREE( lang );
+      }
+    } else { // status == KMP_I18N_OPENED
+
+        int section = get_section( kmp_i18n_prp_Version );
+        int number  = get_number( kmp_i18n_prp_Version );
+        char const * expected = __kmp_i18n_default_table.sect[ section ].str[ number ];
+            // Expected version of the catalog.
+        kmp_str_buf_t version;   // Actual version of the catalog.
+        __kmp_str_buf_init( & version );
+        __kmp_str_buf_print( & version, "%s", catgets( cat, section, number, NULL ) );
+
+            // String returned by catgets is invalid after closing the catalog, so copy it.
+        if ( strcmp( version.str, expected ) != 0 ) {
+            __kmp_i18n_catclose();     // Close bad catalog.
+            status = KMP_I18N_ABSENT;  // And mark it as absent.
+            if (__kmp_generate_warnings > kmp_warnings_low) { // AC: only issue a warning if explicitly asked to
+                // And now print a warning using default messages.
+                char const * name    = "NLSPATH";
+                char const * nlspath = __kmp_env_get( name );
+                __kmp_msg(
+                    kmp_ms_warning,
+                    KMP_MSG( WrongMessageCatalog, name, version.str, expected ),
+                    KMP_HNT( CheckEnvVar, name, nlspath ),
+                    __kmp_msg_null
+                );
+                KMP_INFORM( WillUseDefaultMessages );
+                KMP_INTERNAL_FREE( (void *) nlspath );
+            } // __kmp_generate_warnings
+        }; // if
+        __kmp_str_buf_free( & version );
+
+    }; // if
+
+} // func __kmp_i18n_do_catopen
+
+
+void
+__kmp_i18n_catclose(
+) {
+    if ( status == KMP_I18N_OPENED ) {
+        KMP_DEBUG_ASSERT( cat != KMP_I18N_NULLCAT );
+        catclose( cat );
+        cat = KMP_I18N_NULLCAT;
+    }; // if
+    status = KMP_I18N_CLOSED;
+} // func __kmp_i18n_catclose
+
+
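+// Return the message for `id', preferring the open catalog and falling back to the compiled-in
+// default table, then to a generic "(No message available)" string.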
+char const *
+__kmp_i18n_catgets(
+    kmp_i18n_id_t  id
+) {
+
+    int section = get_section( id );
+    int number  = get_number( id );
+    char const * message = NULL;
+
+    if ( 1 <= section && section <= __kmp_i18n_default_table.size ) {
+        if ( 1 <= number && number <= __kmp_i18n_default_table.sect[ section ].size ) {
+            if ( status == KMP_I18N_CLOSED ) {
+                __kmp_i18n_catopen();
+            }; // if
+            if ( status == KMP_I18N_OPENED ) {
+                message =
+                    catgets(
+                        cat,
+                        section, number,
+                        __kmp_i18n_default_table.sect[ section ].str[ number ]
+                    );
+            }; // if
+            if ( message == NULL ) {
+                message = __kmp_i18n_default_table.sect[ section ].str[ number ];
+            }; // if
+        }; // if
+    }; // if
+    if ( message == NULL ) {
+        message = no_message_available;
+    }; // if
+    return message;
+
+} // func __kmp_i18n_catgets
+
+
+#endif // KMP_OS_UNIX
+
+/*
+    ================================================================================================
+    Windows* OS part.
+    ================================================================================================
+*/
+
+#if KMP_OS_WINDOWS
+#define KMP_I18N_OK
+
+#include "kmp_environment.h"
+#include <windows.h>
+
+#define KMP_I18N_NULLCAT  NULL
+static HMODULE       cat  = KMP_I18N_NULLCAT;    // !!! Shall it be volatile?
+static char const *  name = ( KMP_VERSION_MAJOR == 4 ? "libguide40ui.dll" : "libompui.dll" );
+
+static kmp_i18n_table_t  table             = { 0, NULL };
+    // Messages formatted by FormatMessage() should be freed, but the catgets() interface assumes
+    // the user will not free messages. So we cache all the retrieved messages in the table; they
+    // are freed at catclose().
+static UINT const        default_code_page = CP_OEMCP;
+static UINT              code_page         = default_code_page;
+
+static char const * ___catgets( kmp_i18n_id_t  id );
+static UINT         get_code_page();
+static void         kmp_i18n_table_free( kmp_i18n_table_t * table );
+
+
+static UINT
+get_code_page(
+) {
+
+    UINT cp = default_code_page;
+    char const * value = __kmp_env_get( "KMP_CODEPAGE" );
+    if ( value != NULL ) {
+        if ( _stricmp( value, "ANSI" ) == 0 ) {
+            cp = CP_ACP;
+        } else if ( _stricmp( value, "OEM" ) == 0 ) {
+            cp = CP_OEMCP;
+        } else if ( _stricmp( value, "UTF-8" ) == 0 || _stricmp( value, "UTF8" ) == 0 ) {
+            cp = CP_UTF8;
+        } else if ( _stricmp( value, "UTF-7" ) == 0 || _stricmp( value, "UTF7" ) == 0 ) {
+            cp = CP_UTF7;
+        } else {
+            // !!! TODO: Issue a warning?
+        }; // if
+    }; // if
+    KMP_INTERNAL_FREE( (void *) value );
+    return cp;
+
+} // func get_code_page
+
+
+static void
+kmp_i18n_table_free(
+    kmp_i18n_table_t * table
+) {
+    int s;
+    int m;
+    for ( s = 0; s < table->size; ++ s ) {
+        for ( m = 0; m < table->sect[ s ].size; ++ m ) {
+            // Free message.
+            KMP_INTERNAL_FREE( (void *) table->sect[ s ].str[ m ] );
+            table->sect[ s ].str[ m ] = NULL;
+        }; // for m
+        table->sect[ s ].size = 0;
+        // Free section itself.
+        KMP_INTERNAL_FREE ( (void *) table->sect[ s ].str );
+        table->sect[ s ].str = NULL;
+    }; // for s
+    table->size = 0;
+    KMP_INTERNAL_FREE( (void *) table->sect );
+    table->sect = NULL;
+} // func kmp_i18n_table_free
+
+
+void
+__kmp_i18n_do_catopen(
+) {
+
+    LCID          locale_id = GetThreadLocale();
+    WORD          lang_id = LANGIDFROMLCID( locale_id );
+    WORD          primary_lang_id = PRIMARYLANGID( lang_id );
+    kmp_str_buf_t path;
+
+    KMP_DEBUG_ASSERT( status == KMP_I18N_CLOSED );
+    KMP_DEBUG_ASSERT( cat    == KMP_I18N_NULLCAT );
+
+    __kmp_str_buf_init( & path );
+
+    // Do not try to open the English catalog, because the internal messages are
+    // an exact copy of the messages in the English catalog.
+    if ( primary_lang_id == LANG_ENGLISH ) {
+        status = KMP_I18N_ABSENT;  // Mark the catalog as absent so it will not be re-opened.
+        goto end;
+    }; // if
+
+    // Construct resource DLL name.
+    /*
+        A simple
+            LoadLibrary( name )
+        is not suitable due to a security issue (see
+        http://www.microsoft.com/technet/security/advisory/2269637.mspx). We have to specify the
+        full path to the message catalog.
+    */
+    {
+
+        // Get handle of our DLL first.
+        HMODULE handle;
+        BOOL brc =
+            GetModuleHandleEx(
+                GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+                reinterpret_cast< LPCSTR >( & __kmp_i18n_do_catopen ),
+                & handle
+            );
+        if ( ! brc ) {    // Error occurred.
+            status = KMP_I18N_ABSENT;  // mark catalog as absent so it will not be re-opened.
+            goto end;
+            // TODO: Enable multiple messages (KMP_MSG) to be passed to __kmp_msg; and print
+            // a proper warning.
+        }; // if
+
+        // Now get the path to our DLL.
+        for ( ; ; ) {
+            DWORD drc = GetModuleFileName( handle, path.str, path.size );
+            if ( drc == 0 ) {    // Error occurred.
+                status = KMP_I18N_ABSENT;
+                goto end;
+            }; // if
+            if ( drc < path.size ) {
+                path.used = drc;
+                break;
+            }; // if
+            __kmp_str_buf_reserve( & path, path.size * 2 );
+        }; // forever
+
+        // Now construct the name of the message catalog.
+        kmp_str_fname fname;
+        __kmp_str_fname_init( & fname, path.str );
+        __kmp_str_buf_clear( & path );
+        __kmp_str_buf_print( & path, "%s%lu/%s", fname.dir, (unsigned long)( locale_id ), name );
+        __kmp_str_fname_free( & fname );
+
+    }
+
+    // For security reasons, use LoadLibraryEx() and load the message catalog as a data file.
+    cat = LoadLibraryEx( path.str, NULL, LOAD_LIBRARY_AS_DATAFILE );
+    status = ( cat == KMP_I18N_NULLCAT ? KMP_I18N_ABSENT : KMP_I18N_OPENED );
+
+    if ( status == KMP_I18N_ABSENT ) {
+      if (__kmp_generate_warnings > kmp_warnings_low) { // AC: only issue a warning if explicitly asked to
+        DWORD error = GetLastError();
+        // Infinite recursion will not occur -- status is KMP_I18N_ABSENT now, so
+        // __kmp_i18n_catgets() will not try to open the catalog but will return a default message.
+        /*
+         If a message catalog for another architecture is found (e.g. the OpenMP RTL
+         for the IA-32 architecture opens libompui.dll built for Intel(R) 64),
+         Windows* OS returns error 193 (ERROR_BAD_EXE_FORMAT). However,
+         FormatMessage fails to return a message for this error, so the user
+         will see:
+
+         OMP: Warning #2: Cannot open message catalog "1041\libompui.dll":
+         OMP: System error #193: (No system error message available)
+         OMP: Info #3: Default messages will be used.
+
+         Issue a hint in this case to make the cause of the trouble more understandable.
+        */
+        __kmp_msg(
+            kmp_ms_warning,
+            KMP_MSG( CantOpenMessageCatalog, path.str ),
+            KMP_SYSERRCODE( error ),
+            ( error == ERROR_BAD_EXE_FORMAT ? KMP_HNT( BadExeFormat, path.str, KMP_ARCH_STR ) : __kmp_msg_null ),
+            __kmp_msg_null
+        );
+        KMP_INFORM( WillUseDefaultMessages );
+      }
+    } else { // status == KMP_I18N_OPENED
+
+        int section = get_section( kmp_i18n_prp_Version );
+        int number  = get_number( kmp_i18n_prp_Version );
+        char const * expected = __kmp_i18n_default_table.sect[ section ].str[ number ];
+        kmp_str_buf_t version;   // Actual version of the catalog.
+        __kmp_str_buf_init( & version );
+        __kmp_str_buf_print( & version, "%s", ___catgets( kmp_i18n_prp_Version ) );
+            // String returned by catgets is invalid after closing the catalog, so copy it.
+        if ( strcmp( version.str, expected ) != 0 ) {
+            // Close bad catalog.
+            __kmp_i18n_catclose();
+            status = KMP_I18N_ABSENT;  // And mark it as absent.
+            if (__kmp_generate_warnings > kmp_warnings_low) {
+                // And now print a warning using default messages.
+                __kmp_msg(
+                    kmp_ms_warning,
+                    KMP_MSG( WrongMessageCatalog, path.str, version.str, expected ),
+                    __kmp_msg_null
+                );
+                KMP_INFORM( WillUseDefaultMessages );
+            } // __kmp_generate_warnings
+        }; // if
+        __kmp_str_buf_free( & version );
+
+    }; // if
+    code_page = get_code_page();
+
+    end:
+        __kmp_str_buf_free( & path );
+        return;
+
+} // func __kmp_i18n_do_catopen
+
+
+void
+__kmp_i18n_catclose(
+) {
+    if ( status == KMP_I18N_OPENED ) {
+        KMP_DEBUG_ASSERT( cat != KMP_I18N_NULLCAT );
+        kmp_i18n_table_free( & table );
+        FreeLibrary( cat );
+        cat = KMP_I18N_NULLCAT;
+    }; // if
+    code_page = default_code_page;
+    status = KMP_I18N_CLOSED;
+} // func __kmp_i18n_catclose
+
+/*
+    We use FormatMessage() to get strings from the catalog, to get system error messages, etc.
+    FormatMessage() tends to return Windows* OS-style end-of-lines, "\r\n". When the string is
+    printed, printf() also replaces all the occurrences of "\n" with "\r\n" (again!), so sequences
+    like "\r\r\r\n" appear in the output. That is not good.
+
+    An additional mess comes from the message catalog: our catalog source file en_US.mc (generated
+    by message-converter.pl) contains only "\n" characters, but the en_US_msg_1033.bin file
+    (produced by mc.exe) may contain "\r\n" or just "\n". This mess propagates from the
+    en_US_msg_1033.bin file into the message catalog, libompui.dll. For example, the message
+
+        Error
+
+    (there is "\n" at the end) is compiled by mc.exe to "Error\r\n", while
+
+        OMP: Error %1!d!: %2!s!\n
+
+    (there is "\n" at the end as well) is compiled to "OMP: Error %1!d!: %2!s!\r\n\n".
+
+    Thus, stripping all "\r" characters normalizes the string and returns it to canonical form, so
+    printf() will produce correct end-of-line sequences.
+
+    ___strip_crs() serves this purpose: it removes all the occurrences of "\r" in-place and
+    returns the new length of the string.
+*/
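+// For example (illustrative): given the buffer "Error\r\n\n", ___strip_crs() rewrites it
+// in place to "Error\n\n" and returns 7, the new string length.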
+static
+int
+___strip_crs(
+    char * str
+) {
+    int in  = 0;  // Input character index.
+    int out = 0;  // Output character index.
+    for ( ; ; ) {
+        if ( str[ in ] != '\r' ) {
+            str[ out ] = str[ in ];
+            ++ out;
+        }; // if
+        if ( str[ in ] == 0 ) {
+            break;
+        }; // if
+        ++ in;
+    }; // forever
+    return out - 1;
+} // func ___strip_crs
+
+
+static
+char const *
+___catgets(
+    kmp_i18n_id_t  id
+) {
+
+    char *    result = NULL;
+    PVOID     addr   = NULL;
+    wchar_t * wmsg   = NULL;
+    DWORD     wlen   = 0;
+    char *    msg    = NULL;
+    int       len    = 0;
+    int       rc;
+
+    KMP_DEBUG_ASSERT( cat != KMP_I18N_NULLCAT );
+    wlen =    // wlen does *not* include terminating null.
+        FormatMessageW(
+            FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_HMODULE |
+                FORMAT_MESSAGE_IGNORE_INSERTS,
+            cat,
+            id,
+            0,             // LangId
+            (LPWSTR) & addr,
+            0,             // Size in elements, not in bytes.
+            NULL
+        );
+    if ( wlen <= 0 ) {
+        goto end;
+    }; // if
+    wmsg = (wchar_t *) addr;  // Warning: wmsg may not be null-terminated!
+
+    // Calculate length of multibyte message.
+    len =     // Since wlen does not include terminating null, len does not include it also.
+        WideCharToMultiByte(
+            code_page,
+            0,                // Flags.
+            wmsg, wlen,       // Wide buffer and size.
+            NULL, 0,          // Buffer and size.
+            NULL, NULL        // Default char and used default char.
+        );
+    if ( len <= 0 ) {
+        goto end;
+    }; // if
+
+    // Allocate memory.
+    msg = (char *) KMP_INTERNAL_MALLOC( len + 1 );
+
+    // Convert wide message to multibyte one.
+    rc =
+        WideCharToMultiByte(
+            code_page,
+            0,                // Flags.
+            wmsg, wlen,       // Wide buffer and size.
+            msg, len,         // Buffer and size.
+            NULL, NULL        // Default char and used default char.
+        );
+    if ( rc <= 0 || rc > len ) {
+        goto end;
+    }; // if
+    KMP_DEBUG_ASSERT( rc == len );
+    len = rc;
+    msg[ len ] = 0;           // Put terminating null to the end.
+
+    // Stripping all "\r" before stripping last end-of-line simplifies the task.
+    len = ___strip_crs( msg );
+
+    // Every message in catalog is terminated with "\n". Strip it.
+    if ( len >= 1 && msg[ len - 1 ] == '\n' ) {
+        -- len;
+        msg[ len ] = 0;
+    }; // if
+
+    // Everything looks ok.
+    result = msg;
+    msg    = NULL;
+
+    end:
+
+    if ( msg != NULL ) {
+        KMP_INTERNAL_FREE( msg );
+    }; // if
+    if ( wmsg != NULL ) {
+        LocalFree( wmsg );
+    }; // if
+
+    return result;
+
+} // ___catgets
+
+
+char const *
+__kmp_i18n_catgets(
+    kmp_i18n_id_t  id
+) {
+
+    int section = get_section( id );
+    int number  = get_number( id );
+    char const * message = NULL;
+
+    if ( 1 <= section && section <= __kmp_i18n_default_table.size ) {
+        if ( 1 <= number && number <= __kmp_i18n_default_table.sect[ section ].size ) {
+            if ( status == KMP_I18N_CLOSED ) {
+                __kmp_i18n_catopen();
+            }; // if
+            if ( cat != KMP_I18N_NULLCAT ) {
+                if ( table.size == 0 ) {
+                    table.sect = (kmp_i18n_section_t *)
+                        KMP_INTERNAL_CALLOC(
+                            ( __kmp_i18n_default_table.size + 2 ),
+                            sizeof( kmp_i18n_section_t )
+                        );
+                    table.size = __kmp_i18n_default_table.size;
+                }; // if
+                if ( table.sect[ section ].size == 0 ) {
+                    table.sect[ section ].str = (const char **)
+                        KMP_INTERNAL_CALLOC(
+                            __kmp_i18n_default_table.sect[ section ].size + 2,
+                            sizeof( char const * )
+                        );
+                    table.sect[ section ].size = __kmp_i18n_default_table.sect[ section ].size;
+                }; // if
+                if ( table.sect[ section ].str[ number ] == NULL ) {
+                    table.sect[ section ].str[ number ] = ___catgets( id );
+                }; // if
+                message = table.sect[ section ].str[ number ];
+            }; // if
+            if ( message == NULL ) {
+                // Catalog is not opened or message is not found, return default message.
+                message = __kmp_i18n_default_table.sect[ section ].str[ number ];
+            }; // if
+        }; // if
+    }; // if
+    if ( message == NULL ) {
+        message = no_message_available;
+    }; // if
+    return message;
+
+} // func __kmp_i18n_catgets
+
+
+#endif // KMP_OS_WINDOWS
+
+// -------------------------------------------------------------------------------------------------
+
+#ifndef KMP_I18N_OK
+    #error I18n support is not implemented for this OS.
+#endif // KMP_I18N_OK
+
+// -------------------------------------------------------------------------------------------------
+
+void
+__kmp_i18n_dump_catalog(
+    kmp_str_buf_t * buffer
+) {
+
+    struct kmp_i18n_id_range_t {
+        kmp_i18n_id_t  first;
+        kmp_i18n_id_t  last;
+    }; // struct kmp_i18n_id_range_t
+
+    static struct kmp_i18n_id_range_t ranges[] = {
+        { kmp_i18n_prp_first, kmp_i18n_prp_last },
+        { kmp_i18n_str_first, kmp_i18n_str_last },
+        { kmp_i18n_fmt_first, kmp_i18n_fmt_last },
+        { kmp_i18n_msg_first, kmp_i18n_msg_last },
+        { kmp_i18n_hnt_first, kmp_i18n_hnt_last }
+    }; // ranges
+
+    int           num_of_ranges = sizeof( ranges ) / sizeof( struct kmp_i18n_id_range_t );
+    int           range;
+    kmp_i18n_id_t id;
+
+    for ( range = 0; range < num_of_ranges; ++ range ) {
+        __kmp_str_buf_print( buffer, "*** Set #%d ***\n", range + 1 );
+        for ( id = (kmp_i18n_id_t)( ranges[ range ].first + 1 );
+              id < ranges[ range ].last;
+              id = (kmp_i18n_id_t)( id + 1 ) ) {
+             __kmp_str_buf_print( buffer, "%d: <<%s>>\n", id, __kmp_i18n_catgets( id ) );
+        }; // for id
+    }; // for range
+
+    __kmp_printf( "%s", buffer->str );
+
+} // __kmp_i18n_dump_catalog
+
+// -------------------------------------------------------------------------------------------------
+
+kmp_msg_t
+__kmp_msg_format(
+    kmp_i18n_id_t id,
+    ...
+) {
+
+    kmp_msg_t      msg;
+    va_list        args;
+    kmp_str_buf_t  buffer;
+    __kmp_str_buf_init( & buffer );
+
+    va_start( args, id );
+    #if KMP_OS_UNIX
+        // On Linux* OS and OS X*, printf() family functions process parameter numbers, for example:
+        // "%2$s %1$s".
+        __kmp_str_buf_vprint( & buffer, __kmp_i18n_catgets( id ), args );
+    #elif KMP_OS_WINDOWS
+        // On Windows, the printf() family of functions does not recognize GNU-style parameter
+        // numbers, so we have to use FormatMessage() instead. It recognizes parameter numbers,
+        // e.g.: "%2!s! %1!s!".
+        {
+            LPTSTR str = NULL;
+            int    len;
+            FormatMessage(
+                FORMAT_MESSAGE_FROM_STRING | FORMAT_MESSAGE_ALLOCATE_BUFFER,
+                __kmp_i18n_catgets( id ),
+                0, 0,
+                (LPTSTR)( & str ),
+                0,
+                & args
+            );
+            len = ___strip_crs( str );
+            __kmp_str_buf_cat( & buffer, str, len );
+            LocalFree( str );
+        }
+    #else
+        #error
+    #endif
+    va_end( args );
+    __kmp_str_buf_detach( & buffer );
+
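+    // The message id packs the catalog section (which maps onto kmp_msg_type_t) in the high
+    // 16 bits and the message number in the low 16 bits; see get_section()/get_number() above.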
+    msg.type = (kmp_msg_type_t)( id >> 16 );
+    msg.num  = id & 0xFFFF;
+    msg.str  = buffer.str;
+    msg.len  = buffer.used;
+
+    return msg;
+
+} // __kmp_msg_format
+
+// -------------------------------------------------------------------------------------------------
+
+static
+char *
+sys_error(
+    int err
+) {
+
+    char * message = NULL;
+
+    #if KMP_OS_WINDOWS
+
+        LPVOID  buffer = NULL;
+        int     len;
+        DWORD   rc;
+        rc =
+            FormatMessage(
+                FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
+                NULL,
+                err,
+                MAKELANGID( LANG_NEUTRAL, SUBLANG_DEFAULT ), // Default language.
+                (LPTSTR) & buffer,
+                0,
+                NULL
+            );
+        if ( rc > 0 ) {
+            // Message formatted. Copy it (so we can free it later with normal free()).
+            message = __kmp_str_format( "%s", (char *) buffer );
+            len = ___strip_crs( message ); // Delete carriage returns if any.
+            // Strip trailing newlines.
+            while ( len > 0 && message[ len - 1 ] == '\n' ) {
+                -- len;
+            }; // while
+            message[ len ] = 0;
+        } else {
+            // FormatMessage() failed to format the system error message. GetLastError() would give
+            // us an error code, which we would convert to a message... but this is a dangerous
+            // recursion which cannot clarify the original error, so we do not even start it.
+        }; // if
+        if ( buffer != NULL ) {
+            LocalFree( buffer );
+        }; // if
+
+    #else // Non-Windows* OS: Linux* OS or OS X*
+
+        /*
+            There are 2 incompatible versions of strerror_r:
+
+                char * strerror_r( int, char *, size_t );  // GNU version
+                int    strerror_r( int, char *, size_t );  // XSI version
+        */
+
+        #if KMP_OS_LINUX
+
+            // GNU version of strerror_r.
+
+            char   buffer[ 2048 ];
+            char * const err_msg = strerror_r( err, buffer, sizeof( buffer ) );
+                // Do not eliminate this assignment to a temporary variable, otherwise the compiler
+                // would not issue a warning if strerror_r() returned `int' instead of the expected
+                // `char *'.
+            message = __kmp_str_format( "%s", err_msg );
+
+        #else // OS X*, FreeBSD* etc.
+
+            // XSI version of strerror_r.
+
+            int    size   = 2048;
+            // TODO: Add checking result of malloc().
+            char * buffer = (char *) KMP_INTERNAL_MALLOC( size );
+            int    rc;
+            rc = strerror_r( err, buffer, size );
+            if ( rc == -1 ) {
+                rc = errno;            // XSI version sets errno.
+            }; // if
+            while ( rc == ERANGE ) {   // ERANGE means the buffer is too small.
+                KMP_INTERNAL_FREE( buffer );
+                size *= 2;
+                buffer = (char *) KMP_INTERNAL_MALLOC( size );
+                rc = strerror_r( err, buffer, size );
+                if ( rc == -1 ) {
+                    rc = errno;        // XSI version sets errno.
+                }; // if
+            }; // while
+            if ( rc == 0 ) {
+                message = buffer;
+            } else {
+                // Buffer is unused. Free it.
+                KMP_INTERNAL_FREE( buffer );
+            }; // if
+
+        #endif
+
+    #endif /* KMP_OS_WINDOWS */
+
+    if ( message == NULL ) {
+        // TODO: I18n this message.
+        message = __kmp_str_format( "%s", "(No system error message available)" );
+    }; // if
+    return message;
+
+} // sys_error
+
+// -------------------------------------------------------------------------------------------------
+
+kmp_msg_t
+__kmp_msg_error_code(
+    int  code
+) {
+
+    kmp_msg_t      msg;
+    msg.type = kmp_mt_syserr;
+    msg.num  = code;
+    msg.str  = sys_error( code );
+    msg.len  = KMP_STRLEN( msg.str );
+    return msg;
+
+} // __kmp_msg_error_code
+
+// -------------------------------------------------------------------------------------------------
+
+kmp_msg_t
+__kmp_msg_error_mesg(
+    char const * mesg
+) {
+
+    kmp_msg_t      msg;
+    msg.type = kmp_mt_syserr;
+    msg.num  = 0;
+    msg.str  = __kmp_str_format( "%s", mesg );
+    msg.len  = KMP_STRLEN( msg.str );
+    return msg;
+
+} // __kmp_msg_error_mesg
+
+// -------------------------------------------------------------------------------------------------
+
+void
+__kmp_msg(
+    kmp_msg_severity_t  severity,
+    kmp_msg_t           message,
+    ...
+) {
+
+    va_list        args;
+    kmp_i18n_id_t  format;      // format identifier
+    kmp_msg_t      fmsg;        // formatted message
+    kmp_str_buf_t  buffer;
+
+    if ( severity != kmp_ms_fatal && __kmp_generate_warnings == kmp_warnings_off )
+        return; // No reason to build a string we are not going to print.
+
+    __kmp_str_buf_init( & buffer );
+
+    // Format the primary message.
+    switch ( severity ) {
+        case kmp_ms_inform : {
+            format = kmp_i18n_fmt_Info;
+        } break;
+        case kmp_ms_warning : {
+            format = kmp_i18n_fmt_Warning;
+        } break;
+        case kmp_ms_fatal : {
+            format = kmp_i18n_fmt_Fatal;
+        } break;
+        default : {
+            KMP_DEBUG_ASSERT( 0 );
+        };
+    }; // switch
+    fmsg = __kmp_msg_format( format, message.num, message.str );
+    KMP_INTERNAL_FREE( (void *) message.str );
+    __kmp_str_buf_cat( & buffer, fmsg.str, fmsg.len );
+    KMP_INTERNAL_FREE( (void *) fmsg.str );
+
+    // Format other messages.
+    va_start( args, message );
+    for ( ; ; ) {
+        message = va_arg( args, kmp_msg_t );
+        if ( message.type == kmp_mt_dummy && message.str == NULL ) {
+            break;
+        }; // if
+        if ( message.type == kmp_mt_dummy && message.str == __kmp_msg_empty.str ) {
+            continue;
+        }; // if
+        switch ( message.type ) {
+            case kmp_mt_hint : {
+                format = kmp_i18n_fmt_Hint;
+            } break;
+            case kmp_mt_syserr : {
+                format = kmp_i18n_fmt_SysErr;
+            } break;
+            default : {
+                KMP_DEBUG_ASSERT( 0 );
+            };
+        }; // switch
+        fmsg = __kmp_msg_format( format, message.num, message.str );
+        KMP_INTERNAL_FREE( (void *) message.str );
+        __kmp_str_buf_cat( & buffer, fmsg.str, fmsg.len );
+        KMP_INTERNAL_FREE( (void *) fmsg.str );
+    }; // forever
+    va_end( args );
+
+    // Print formatted messages.
+    // This lock prevents multiple fatal errors on the same problem.
+    // __kmp_acquire_bootstrap_lock( & lock );    // GEH - This lock causing tests to hang on OS X*.
+    __kmp_printf( "%s", buffer.str );
+    __kmp_str_buf_free( & buffer );
+
+    if ( severity == kmp_ms_fatal ) {
+        #if KMP_OS_WINDOWS
+        __kmp_thread_sleep( 500 );   /* Delay to give message a chance to appear before reaping */
+        #endif
+        __kmp_abort_process();
+    }; // if
+
+    // __kmp_release_bootstrap_lock( & lock );  // GEH - this lock causing tests to hang on OS X*.
+
+} // __kmp_msg
+
+// -------------------------------------------------------------------------------------------------
+
+// end of file //
diff --git a/final/runtime/src/kmp_i18n.h b/final/runtime/src/kmp_i18n.h
new file mode 100644
index 0000000..82ec51b
--- /dev/null
+++ b/final/runtime/src/kmp_i18n.h
@@ -0,0 +1,193 @@
+/*
+ * kmp_i18n.h
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_I18N_H
+#define KMP_I18N_H
+
+#include "kmp_str.h"
+
+#ifdef __cplusplus
+    extern "C" {
+#endif // __cplusplus
+
+/*
+    kmp_i18n_id.inc defines kmp_i18n_id_t type. It is an enumeration with identifiers of all the
+    messages in the catalog. There is one special identifier: kmp_i18n_null, which denotes absence
+    of message.
+*/
+#include "kmp_i18n_id.inc" // Generated file. Do not edit it manually.
+
+/*
+    Low-level functions handling the message catalog: __kmp_i18n_catopen() opens the message
+    catalog, __kmp_i18n_catclose() closes it. Explicit opening is not required: if the message
+    catalog is not yet open, __kmp_i18n_catgets() will open it implicitly. However, the catalog
+    should be explicitly closed, otherwise resources (memory, handles) may leak.
+
+    __kmp_i18n_catgets() returns a read-only string. It should not be freed.
+
+    The KMP_I18N_STR macro simplifies access to strings in the message catalog a bit. The
+    following two lines are equivalent:
+
+        __kmp_i18n_catgets( kmp_i18n_str_Warning )
+        KMP_I18N_STR( Warning )
+*/
+
+void            __kmp_i18n_catopen();
+void            __kmp_i18n_catclose();
+char const *    __kmp_i18n_catgets( kmp_i18n_id_t id );
+
+#define KMP_I18N_STR( id )    __kmp_i18n_catgets( kmp_i18n_str_ ## id )
+
+
+/*
+    ------------------------------------------------------------------------------------------------
+
+    High-level interface for printing strings targeted to the user.
+
+    All the strings are divided into 3 types:
+
+        * messages,
+        * hints,
+        * system errors.
+
+    There are 3 kinds of message severity:
+
+        * informational messages,
+        * warnings (non-fatal errors),
+        * fatal errors.
+
+    For example:
+
+        OMP: Warning #2: Cannot open message catalog "libguide.cat":   (1)
+        OMP: System error #2: No such file or directory                (2)
+        OMP: Hint: Please check NLSPATH environment variable.          (3)
+        OMP: Info #3: Default messages will be used.                   (4)
+
+    where
+
+        (1) is a message of warning severity,
+        (2) is a system error that caused the previous warning,
+        (3) is a hint for the user how to fix the problem,
+        (4) is a message of informational severity.
+
+   Usage in complex cases (the message is accompanied by hints and system errors):
+
+       int error = errno;   // We need to save errno immediately, because it may be changed.
+       __kmp_msg(
+           kmp_ms_warning,                            // Severity
+           KMP_MSG( CantOpenMessageCatalog, name ),   // Primary message
+           KMP_ERR( error ),                          // System error
+           KMP_HNT( CheckNLSPATH ),                   // Hint
+           __kmp_msg_null                             // Variadic argument list finisher
+       );
+
+    Usage in simple cases (just a message, no system errors or hints):
+
+        KMP_INFORM( WillUseDefaultMessages );
+        KMP_WARNING( CantOpenMessageCatalog, name );
+        KMP_FATAL( StackOverlap );
+        KMP_SYSFAIL( "pthread_create", status );
+        KMP_CHECK_SYSFAIL( "pthread_create", status );
+        KMP_CHECK_SYSFAIL_ERRNO( "gettimeofday", status );
+
+    ------------------------------------------------------------------------------------------------
+*/
+
+enum kmp_msg_type {
+    kmp_mt_dummy  =  0, // Special type for internal purposes.
+    kmp_mt_mesg   =  4, // Primary OpenMP message, could be information, warning, or fatal.
+    kmp_mt_hint   =  5, // Hint to the user.
+    kmp_mt_syserr = -1  // System error message.
+}; // enum kmp_msg_type
+typedef enum kmp_msg_type  kmp_msg_type_t;
+
+struct kmp_msg {
+    kmp_msg_type_t  type;
+    int             num;
+    char const *    str;
+    int             len;
+}; // struct kmp_msg
+typedef struct kmp_msg  kmp_msg_t;
+
+// Two special messages.
+extern kmp_msg_t __kmp_msg_empty;  // Can be used where a message is required syntactically.
+extern kmp_msg_t __kmp_msg_null;   // Denotes the end of variadic list of arguments.
+
+// Helper functions. They create messages either from the message catalog or from the system.
+// Note: these functions allocate memory. You should pass created messages to the __kmp_msg()
+// function; it will print the messages and destroy them.
+kmp_msg_t  __kmp_msg_format( kmp_i18n_id_t id, ... );
+kmp_msg_t  __kmp_msg_error_code( int code );
+kmp_msg_t  __kmp_msg_error_mesg( char const * mesg );
+
+// Helper macros to make calls shorter.
+#define KMP_MSG( ...  )   __kmp_msg_format( kmp_i18n_msg_ ## __VA_ARGS__ )
+#define KMP_HNT( ...  )   __kmp_msg_format( kmp_i18n_hnt_ ## __VA_ARGS__ )
+#define KMP_SYSERRCODE( code )  __kmp_msg_error_code( code )
+#define KMP_SYSERRMESG( mesg )  __kmp_msg_error_mesg( mesg )
+#define KMP_ERR KMP_SYSERRCODE
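+// Note how KMP_MSG( CantOpenMessageCatalog, name ) expands: the kmp_i18n_msg_ prefix is pasted
+// onto the first argument, yielding __kmp_msg_format( kmp_i18n_msg_CantOpenMessageCatalog, name ).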
+
+// Message severity.
+enum kmp_msg_severity {
+    kmp_ms_inform,      // Just information for the user.
+    kmp_ms_warning,     // Non-fatal error, execution continues.
+    kmp_ms_fatal        // Fatal error, program aborts.
+}; // enum kmp_msg_severity
+typedef enum kmp_msg_severity  kmp_msg_severity_t;
+
+// Primary function for printing messages for the user. The first message is mandatory. Any number
+// of system errors and hints may be specified. Argument list must be finished with __kmp_msg_null.
+void    __kmp_msg( kmp_msg_severity_t severity, kmp_msg_t message, ... );
+
+// Helper macros to make calls shorter in simple cases.
+#define KMP_INFORM( ...  ) __kmp_msg( kmp_ms_inform,  KMP_MSG( __VA_ARGS__ ), __kmp_msg_null )
+#define KMP_WARNING( ... ) __kmp_msg( kmp_ms_warning, KMP_MSG( __VA_ARGS__ ), __kmp_msg_null )
+#define KMP_FATAL(   ... ) __kmp_msg( kmp_ms_fatal,   KMP_MSG( __VA_ARGS__ ), __kmp_msg_null )
+#define KMP_SYSFAIL( func, error )                                                                 \
+    __kmp_msg(                                                                                     \
+        kmp_ms_fatal,                                                                              \
+        KMP_MSG( FunctionError, func ),                                                            \
+        KMP_SYSERRCODE( error ),                                                                   \
+        __kmp_msg_null                                                                             \
+    )
+
+// Check error; if not zero, generate a fatal error message.
+#define KMP_CHECK_SYSFAIL( func, error )                                                           \
+    {                                                                                              \
+        if ( error ) {                                                                             \
+            KMP_SYSFAIL( func, error );                                                            \
+        };                                                                                         \
+    }
+
+// Check status; if not zero, generate a fatal error message using errno.
+#define KMP_CHECK_SYSFAIL_ERRNO( func, status )                                                    \
+    {                                                                                              \
+        if ( status != 0 ) {                                                                       \
+            int error = errno;                                                                     \
+            KMP_SYSFAIL( func, error );                                                            \
+        };                                                                                         \
+    }
+
+#ifdef KMP_DEBUG
+    void __kmp_i18n_dump_catalog( kmp_str_buf_t * buffer );
+#endif // KMP_DEBUG
+
+#ifdef __cplusplus
+    }; // extern "C"
+#endif // __cplusplus
+
+#endif // KMP_I18N_H
+
+// end of file //
diff --git a/final/runtime/src/kmp_import.c b/final/runtime/src/kmp_import.c
new file mode 100644
index 0000000..42fba41
--- /dev/null
+++ b/final/runtime/src/kmp_import.c
@@ -0,0 +1,42 @@
+/*
+ * kmp_import.c
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/*
+    ------------------------------------------------------------------------------------------------
+    The object generated from this source file is linked into the Windows* OS DLL import library
+    (libompmd.lib) only! It is not part of the regular static or dynamic OpenMP RTL. Any code that
+    needs to go into libompmd.lib (but not into libompmt.lib or libompmd.dll) should be placed in
+    this file.
+    ------------------------------------------------------------------------------------------------
+*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+    These symbols are required for mutual exclusion with the Microsoft OpenMP RTL (and for
+    compatibility with the MS compiler).
+*/
+
+int _You_must_link_with_exactly_one_OpenMP_library = 1;
+int _You_must_link_with_Intel_OpenMP_library       = 1;
+int _You_must_link_with_Microsoft_OpenMP_library   = 1;
+
+#ifdef __cplusplus
+}
+#endif
+
+// end of file //
diff --git a/final/runtime/src/kmp_io.c b/final/runtime/src/kmp_io.c
new file mode 100644
index 0000000..ef808af
--- /dev/null
+++ b/final/runtime/src/kmp_io.c
@@ -0,0 +1,248 @@
+/*
+ * kmp_io.c -- RTL IO
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <string.h>
+#ifndef __ABSOFT_WIN
+# include <sys/types.h>
+#endif
+
+#include "kmp_os.h"
+#include "kmp_lock.h"
+#include "kmp_str.h"
+#include "kmp_io.h"
+#include "kmp.h" // KMP_GTID_DNE, __kmp_debug_buf, etc
+
+#if KMP_OS_WINDOWS
+# pragma warning( push )
+# pragma warning( disable: 271 310 )
+# include <windows.h>
+# pragma warning( pop )
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+kmp_bootstrap_lock_t __kmp_stdio_lock   = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_stdio_lock   ); /* Control stdio functions */
+kmp_bootstrap_lock_t __kmp_console_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_console_lock ); /* Control console initialization */
+
+#if KMP_OS_WINDOWS
+
+    # ifdef KMP_DEBUG 
+    /* __kmp_stdout is used only for dev build */
+    static HANDLE    __kmp_stdout = NULL;
+    # endif
+    static HANDLE        __kmp_stderr = NULL;
+    static int           __kmp_console_exists = FALSE;
+    static kmp_str_buf_t __kmp_console_buf;
+
+    static int
+    is_console( void )
+    {
+        char buffer[ 128 ];
+        DWORD rc  = 0;
+        DWORD err = 0;
+        // Try to get console title.
+        SetLastError( 0 );
+            // GetConsoleTitle does not reset last error in case of success or short buffer,
+            // so we need to clear it explicitly.
+        rc = GetConsoleTitle( buffer, sizeof( buffer ) );
+        if ( rc == 0 ) {
+            // rc == 0 means getting console title failed. Let us find out why.
+            err = GetLastError();
+            // err == 0 means buffer too short (we suppose console exists).
+            // In Windows applications we usually have err == 6 (invalid handle).
+        }; // if
+        return rc > 0 || err == 0;
+    }
+
+    void
+    __kmp_close_console( void )
+    {
+        /* wait until user presses return before closing window */
+        /* TODO only close if a window was opened */
+        if( __kmp_console_exists ) {
+            #ifdef KMP_DEBUG 
+            /* standard out is used only in dev build */
+            __kmp_stdout = NULL;
+            #endif
+            __kmp_stderr = NULL;
+            __kmp_str_buf_free( &__kmp_console_buf );
+            __kmp_console_exists = FALSE;
+        }
+    }
+
+    /* For Windows, call this before stdout, stderr, or stdin are used.
+     * It opens a console window and starts processing */
+    static void
+    __kmp_redirect_output( void )
+    {
+        __kmp_acquire_bootstrap_lock( &__kmp_console_lock );
+
+        if( ! __kmp_console_exists ) {
+            #ifdef KMP_DEBUG 
+            /* standard out is used only in dev build */
+            HANDLE ho;
+            #endif
+            HANDLE he;
+
+            __kmp_str_buf_init( &__kmp_console_buf );
+
+            AllocConsole();
+            // We do not check the result of AllocConsole because
+            //  1. the call is harmless
+            //  2. it is not clear how to communicate failure
+            //  3. we will detect failure later when we get handle(s)
+
+            #ifdef KMP_DEBUG
+                ho = GetStdHandle( STD_OUTPUT_HANDLE );
+                if ( ho == INVALID_HANDLE_VALUE || ho == NULL ) {
+
+                    DWORD  err = GetLastError();
+                    // TODO: output error somehow (maybe message box)
+                    __kmp_stdout = NULL;
+
+                } else {
+
+                    __kmp_stdout = ho; // temporary code, need new global for ho
+
+                }
+            #endif
+            he = GetStdHandle( STD_ERROR_HANDLE );
+            if ( he == INVALID_HANDLE_VALUE || he == NULL ) {
+
+                DWORD  err = GetLastError();
+                // TODO: output error somehow (maybe message box)
+                __kmp_stderr = NULL;
+
+            } else {
+
+                __kmp_stderr = he; // temporary code, need new global
+            }
+            __kmp_console_exists = TRUE;
+        }
+        __kmp_release_bootstrap_lock( &__kmp_console_lock );
+    }
+
+#else
+    #define       __kmp_stderr     (stderr)
+#endif /* KMP_OS_WINDOWS */
+
+void
+__kmp_vprintf( enum kmp_io __kmp_io, char const * format, va_list ap )
+{
+    #if KMP_OS_WINDOWS
+        if( !__kmp_console_exists ) {
+            __kmp_redirect_output();
+        }
+        if( ! __kmp_stderr && __kmp_io == kmp_err ) {
+            return;
+        }
+        #ifdef KMP_DEBUG
+            if( ! __kmp_stdout && __kmp_io == kmp_out ) {
+                return;
+            }
+        #endif
+    #endif /* KMP_OS_WINDOWS */
+
+    if ( __kmp_debug_buf && __kmp_debug_buffer != NULL ) {
+
+        int dc = ( __kmp_debug_buf_atomic ?
+                   KMP_TEST_THEN_INC32( & __kmp_debug_count) : __kmp_debug_count++ )
+                   % __kmp_debug_buf_lines;
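+        // dc selects a line slot in the circular debug buffer; the counter is bumped
+        // atomically when __kmp_debug_buf_atomic is set, so concurrent writers get
+        // distinct counter values.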
+        char *db = & __kmp_debug_buffer[ dc * __kmp_debug_buf_chars ];
+        int chars = 0;
+
+        #ifdef KMP_DEBUG_PIDS
+            chars = KMP_SNPRINTF( db, __kmp_debug_buf_chars, "pid=%d: ", (kmp_int32)getpid() );
+        #endif
+        chars += KMP_VSNPRINTF( db, __kmp_debug_buf_chars, format, ap );
+
+        if ( chars + 1 > __kmp_debug_buf_chars ) {
+            if ( chars + 1 > __kmp_debug_buf_warn_chars ) {
+                #if KMP_OS_WINDOWS
+                    DWORD count;
+                    __kmp_str_buf_print( &__kmp_console_buf,
+                        "OMP warning: Debugging buffer overflow; increase KMP_DEBUG_BUF_CHARS to %d\n",
+                        chars + 1 );
+                    WriteFile( __kmp_stderr, __kmp_console_buf.str, __kmp_console_buf.used, &count, NULL );
+                    __kmp_str_buf_clear( &__kmp_console_buf );
+                #else
+                    fprintf( __kmp_stderr,
+                         "OMP warning: Debugging buffer overflow; increase KMP_DEBUG_BUF_CHARS to %d\n",
+                         chars + 1 );
+                    fflush( __kmp_stderr );
+                #endif
+                __kmp_debug_buf_warn_chars = chars + 1;
+            }
+            /* terminate string if overflow occurred */
+            db[ __kmp_debug_buf_chars - 2 ] = '\n';
+            db[ __kmp_debug_buf_chars - 1 ] = '\0';
+        }
+    } else {
+        #if KMP_OS_WINDOWS
+            DWORD count;
+            #ifdef KMP_DEBUG_PIDS
+                __kmp_str_buf_print( &__kmp_console_buf, "pid=%d: ",
+                  (kmp_int32)getpid() );
+            #endif
+            __kmp_str_buf_vprint( &__kmp_console_buf, format, ap );
+            WriteFile(
+                __kmp_stderr,
+                __kmp_console_buf.str,
+                __kmp_console_buf.used,
+                &count,
+                NULL
+            );
+            __kmp_str_buf_clear( &__kmp_console_buf );
+        #else
+            #ifdef KMP_DEBUG_PIDS
+                fprintf( __kmp_stderr, "pid=%d: ", (kmp_int32)getpid() );
+            #endif
+            vfprintf( __kmp_stderr, format, ap );
+            fflush( __kmp_stderr );
+        #endif
+    }
+}
+
+void
+__kmp_printf( char const * format, ... )
+{
+    va_list ap;
+    va_start( ap, format );
+
+    __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
+    __kmp_vprintf( kmp_err, format, ap );
+    __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
+
+    va_end( ap );
+}
+
+void
+__kmp_printf_no_lock( char const * format, ... )
+{
+    va_list ap;
+    va_start( ap, format );
+
+    __kmp_vprintf( kmp_err, format, ap );
+
+    va_end( ap );
+}
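+
+/*
+    Usage sketch (illustrative only, not part of the runtime): both wrappers route
+    through __kmp_vprintf(), which writes to the debug ring buffer when the debug
+    buffer is enabled and to stderr otherwise:
+
+        __kmp_printf( "gtid=%d reached barrier\n", gtid );        // takes __kmp_stdio_lock
+        __kmp_printf_no_lock( "lock already held: %d\n", n );     // no locking
+
+    The no-lock variant is for contexts where acquiring __kmp_stdio_lock could
+    deadlock, e.g. when the caller already holds it.
+*/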
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
diff --git a/final/runtime/src/kmp_io.h b/final/runtime/src/kmp_io.h
new file mode 100644
index 0000000..a0caa64
--- /dev/null
+++ b/final/runtime/src/kmp_io.h
@@ -0,0 +1,44 @@
+/*
+ * kmp_io.h -- RTL IO header file.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_IO_H
+#define KMP_IO_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+enum kmp_io {
+    kmp_out = 0,
+    kmp_err
+};
+
+extern kmp_bootstrap_lock_t __kmp_stdio_lock;     /* Control stdio functions */
+extern kmp_bootstrap_lock_t __kmp_console_lock;   /* Control console initialization */
+
+extern void __kmp_vprintf( enum kmp_io __kmp_io, char const * format, va_list ap );
+extern void __kmp_printf( char const * format, ... );
+extern void __kmp_printf_no_lock( char const * format, ... );
+extern void __kmp_close_console( void );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* KMP_IO_H */
+
diff --git a/final/runtime/src/kmp_itt.c b/final/runtime/src/kmp_itt.c
new file mode 100644
index 0000000..77ac809
--- /dev/null
+++ b/final/runtime/src/kmp_itt.c
@@ -0,0 +1,142 @@
+#if USE_ITT_BUILD
+/*
+ * kmp_itt.c -- ITT Notify interface.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp_itt.h"
+
+#if KMP_DEBUG
+    #include "kmp_itt.inl"
+#endif
+
+
+#if USE_ITT_NOTIFY
+
+    kmp_int32 __kmp_barrier_domain_count;
+    kmp_int32 __kmp_region_domain_count;
+    __itt_domain* __kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS];
+    __itt_domain* __kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS];
+    __itt_domain* __kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS];
+    kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS];
+    __itt_domain * metadata_domain = NULL;
+
+    #include "kmp_version.h"
+    #include "kmp_i18n.h"
+    #include "kmp_str.h"
+
+    KMP_BUILD_ASSERT( sizeof( kmp_itt_mark_t ) == sizeof( __itt_mark_type ) );
+
+    /*
+        Previously used warnings:
+
+        KMP_WARNING( IttAllNotifDisabled );
+        KMP_WARNING( IttObjNotifDisabled );
+        KMP_WARNING( IttMarkNotifDisabled );
+        KMP_WARNING( IttUnloadLibFailed, libittnotify );
+    */
+
+
+    kmp_int32 __kmp_itt_prepare_delay = 0;
+    kmp_bootstrap_lock_t __kmp_itt_debug_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_itt_debug_lock );
+
+#endif // USE_ITT_NOTIFY
+
+void __kmp_itt_initialize() {
+
+    // The ITTNotify library is loaded and initialized at the first call to any ittnotify
+    // function, so we do not need to load it explicitly any more.
+    // Just report the OMP RTL version to ITTNotify.
+
+    #if USE_ITT_NOTIFY
+        // Report OpenMP RTL version.
+        kmp_str_buf_t       buf;
+        __itt_mark_type     version;
+        __kmp_str_buf_init( & buf );
+        __kmp_str_buf_print(
+            & buf,
+            "OMP RTL Version %d.%d.%d",
+            __kmp_version_major,
+            __kmp_version_minor,
+            __kmp_version_build
+        );
+        if ( __itt_api_version_ptr != NULL ) {
+            __kmp_str_buf_print( & buf, ":%s", __itt_api_version() );
+        }; // if
+        version = __itt_mark_create( buf.str );
+        __itt_mark( version, NULL );
+        __kmp_str_buf_free( & buf );
+    #endif
+
+} // __kmp_itt_initialize
+
+
+void __kmp_itt_destroy() {
+    #if USE_ITT_NOTIFY
+        __kmp_itt_fini_ittlib();
+    #endif
+} // __kmp_itt_destroy
+
+
+extern "C"
+void
+__itt_error_handler(
+    __itt_error_code err,
+    va_list args
+) {
+
+    switch ( err ) {
+        case __itt_error_no_module : {
+            char const * library = va_arg( args, char const * );
+            #if KMP_OS_WINDOWS
+                int sys_err = va_arg( args, int );
+                __kmp_msg( kmp_ms_warning, KMP_MSG( IttLoadLibFailed, library ), KMP_SYSERRCODE( sys_err ), __kmp_msg_null );
+            #else
+                char const * sys_err = va_arg( args, char const * );
+                __kmp_msg( kmp_ms_warning, KMP_MSG( IttLoadLibFailed, library ), KMP_SYSERRMESG( sys_err ), __kmp_msg_null );
+            #endif
+        } break;
+        case __itt_error_no_symbol : {
+            char const * library = va_arg( args, char const * );
+            char const * symbol  = va_arg( args, char const * );
+            KMP_WARNING( IttLookupFailed, symbol, library );
+        } break;
+        case __itt_error_unknown_group : {
+            char const * var   = va_arg( args, char const * );
+            char const * group = va_arg( args, char const * );
+            KMP_WARNING( IttUnknownGroup, var, group );
+        } break;
+        case __itt_error_env_too_long : {
+            char const * var     = va_arg( args, char const * );
+            size_t       act_len = va_arg( args, size_t );
+            size_t       max_len = va_arg( args, size_t );
+            KMP_WARNING( IttEnvVarTooLong, var, (unsigned long) act_len, (unsigned long) max_len );
+        } break;
+        case __itt_error_cant_read_env : {
+            char const * var     = va_arg( args, char const * );
+            int          sys_err = va_arg( args, int );
+            __kmp_msg( kmp_ms_warning, KMP_MSG( CantGetEnvVar, var ), KMP_ERR( sys_err ), __kmp_msg_null );
+        } break;
+        case __itt_error_system : {
+            char const * func    = va_arg( args, char const * );
+            int          sys_err = va_arg( args, int );
+            __kmp_msg( kmp_ms_warning, KMP_MSG( IttFunctionError, func ), KMP_SYSERRCODE( sys_err ), __kmp_msg_null );
+        } break;
+        default : {
+            KMP_WARNING( IttUnknownError, err );
+        };
+    }; // switch
+
+} // __itt_error_handler
+
+#endif /* USE_ITT_BUILD */
diff --git a/final/runtime/src/kmp_itt.h b/final/runtime/src/kmp_itt.h
new file mode 100644
index 0000000..925a4f0
--- /dev/null
+++ b/final/runtime/src/kmp_itt.h
@@ -0,0 +1,328 @@
+#if USE_ITT_BUILD
+/*
+ * kmp_itt.h -- ITT Notify interface.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_ITT_H
+#define KMP_ITT_H
+
+#include "kmp_lock.h"
+
+#define INTEL_ITTNOTIFY_API_PRIVATE
+#include "ittnotify.h"
+#include "legacy/ittnotify.h"
+
+#if KMP_DEBUG
+    #define __kmp_inline           // Turn off inlining in debug mode.
+#else
+    #define __kmp_inline static inline
+#endif
+
+#if USE_ITT_NOTIFY
+    extern kmp_int32  __kmp_itt_prepare_delay;
+# ifdef __cplusplus
+    extern "C" void __kmp_itt_fini_ittlib(void);
+# else
+    extern void __kmp_itt_fini_ittlib(void);
+# endif
+#endif
+
+// Simplify the handling of an argument that is only required when USE_ITT_BUILD is enabled.
+#define USE_ITT_BUILD_ARG(x) ,x
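+
+// Illustrative use (the prototype below is a made-up example, not from this file):
+//     extern void __example_wait( void * spin, kmp_uint32 check
+//                                 USE_ITT_BUILD_ARG( void * itt_sync_obj ) );
+// The trailing parameter exists only when USE_ITT_BUILD is defined; callers pass the
+// argument the same way, wrapped in USE_ITT_BUILD_ARG( obj ).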
+
+void __kmp_itt_initialize();
+void __kmp_itt_destroy();
+
+// -------------------------------------------------------------------------------------------------
+// New stuff for reporting high-level constructs.
+// -------------------------------------------------------------------------------------------------
+
+// Note the naming convention:
+//     __kmp_itt_xxxing() function should be called before action, while
+//     __kmp_itt_xxxed()  function should be called after action.
+
+// --- Parallel region reporting ---
+__kmp_inline void __kmp_itt_region_forking(  int gtid, int team_size, int barriers, int serialized = 0 ); // Master only, before forking threads.
+__kmp_inline void __kmp_itt_region_joined(   int gtid, int serialized = 0 ); // Master only, after joining threads.
+    // (*) Note: A thread may execute tasks after this point, though.
+
+// --- Frame reporting ---
+// region = 0 - no regions, region = 1 - parallel, region = 2 - serialized parallel
+__kmp_inline void __kmp_itt_frame_submit( int gtid, __itt_timestamp begin, __itt_timestamp end, int imbalance, ident_t *loc, int team_size, int region = 0 );
+
+// --- Metadata reporting ---
+// begin/end - begin/end timestamps of a barrier frame, imbalance - aggregated wait time value, reduction - if this is a reduction barrier
+__kmp_inline void __kmp_itt_metadata_imbalance( int gtid, kmp_uint64 begin, kmp_uint64 end, kmp_uint64 imbalance, kmp_uint64 reduction );
+// sched_type: 0 - static, 1 - dynamic, 2 - guided, 3 - custom (all others); iterations - loop trip count, chunk - chunk size
+__kmp_inline void __kmp_itt_metadata_loop( ident_t * loc, kmp_uint64 sched_type, kmp_uint64 iterations, kmp_uint64 chunk );
+__kmp_inline void __kmp_itt_metadata_single( ident_t * loc );
+
+// --- Barrier reporting ---
+__kmp_inline void * __kmp_itt_barrier_object( int gtid, int bt, int set_name = 0, int delta = 0 );
+__kmp_inline void   __kmp_itt_barrier_starting( int gtid, void * object );
+__kmp_inline void   __kmp_itt_barrier_middle(   int gtid, void * object );
+__kmp_inline void   __kmp_itt_barrier_finished( int gtid, void * object );
+
+// --- Taskwait reporting ---
+__kmp_inline void * __kmp_itt_taskwait_object( int gtid );
+__kmp_inline void   __kmp_itt_taskwait_starting( int gtid, void * object );
+__kmp_inline void   __kmp_itt_taskwait_finished(   int gtid, void * object );
+
+// --- Task reporting ---
+__kmp_inline void   __kmp_itt_task_starting( void * object );
+__kmp_inline void   __kmp_itt_task_finished( void * object );
+
+// --- Lock reporting ---
+#if KMP_USE_DYNAMIC_LOCK
+__kmp_inline void   __kmp_itt_lock_creating(  kmp_user_lock_p lock, const ident_t * );
+#else
+__kmp_inline void   __kmp_itt_lock_creating(  kmp_user_lock_p lock );
+#endif
+__kmp_inline void   __kmp_itt_lock_acquiring( kmp_user_lock_p lock );
+__kmp_inline void   __kmp_itt_lock_acquired(  kmp_user_lock_p lock );
+__kmp_inline void   __kmp_itt_lock_releasing( kmp_user_lock_p lock );
+__kmp_inline void   __kmp_itt_lock_cancelled( kmp_user_lock_p lock );
+__kmp_inline void   __kmp_itt_lock_destroyed( kmp_user_lock_p lock );
+
+// --- Critical reporting ---
+#if KMP_USE_DYNAMIC_LOCK
+__kmp_inline void   __kmp_itt_critical_creating(  kmp_user_lock_p lock, const ident_t * );
+#else
+__kmp_inline void   __kmp_itt_critical_creating(  kmp_user_lock_p lock );
+#endif
+__kmp_inline void   __kmp_itt_critical_acquiring( kmp_user_lock_p lock );
+__kmp_inline void   __kmp_itt_critical_acquired(  kmp_user_lock_p lock );
+__kmp_inline void   __kmp_itt_critical_releasing( kmp_user_lock_p lock );
+__kmp_inline void   __kmp_itt_critical_destroyed( kmp_user_lock_p lock );
+
+// --- Single reporting ---
+__kmp_inline void   __kmp_itt_single_start( int gtid );
+__kmp_inline void   __kmp_itt_single_end(   int gtid );
+
+// --- Ordered reporting ---
+__kmp_inline void   __kmp_itt_ordered_init(  int gtid );
+__kmp_inline void   __kmp_itt_ordered_prep(  int gtid );
+__kmp_inline void   __kmp_itt_ordered_start( int gtid );
+__kmp_inline void   __kmp_itt_ordered_end(   int gtid );
+
+// --- Threads reporting ---
+__kmp_inline void  __kmp_itt_thread_ignore();
+__kmp_inline void  __kmp_itt_thread_name( int gtid );
+
+// --- System objects ---
+__kmp_inline void   __kmp_itt_system_object_created( void * object, char const * name );
+
+// --- Stack stitching ---
+__kmp_inline __itt_caller __kmp_itt_stack_caller_create(void);
+__kmp_inline void __kmp_itt_stack_caller_destroy(__itt_caller);
+__kmp_inline void __kmp_itt_stack_callee_enter(__itt_caller);
+__kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller);
+
+// -------------------------------------------------------------------------------------------------
+// Old stuff for reporting low-level internal synchronization.
+// -------------------------------------------------------------------------------------------------
+
+#if USE_ITT_NOTIFY
+
+    /*
+     * Support for SSC marks, which are used by SDE
+     * http://software.intel.com/en-us/articles/intel-software-development-emulator
+     * to mark points in instruction traces that represent spin-loops and are
+     * therefore uninteresting when collecting traces for architecture simulation.
+     */
+    #ifndef INCLUDE_SSC_MARKS
+    # define INCLUDE_SSC_MARKS (KMP_OS_LINUX && KMP_ARCH_X86_64)
+    #endif
+
+    /* Linux 64 only for now */
+    #if (INCLUDE_SSC_MARKS && KMP_OS_LINUX && KMP_ARCH_X86_64)
+    // Portable (at least for gcc and icc) code to insert the necessary instructions
+    // to set %ebx and execute the unlikely no-op.
+      #if defined( __INTEL_COMPILER )
+      # define INSERT_SSC_MARK(tag) __SSC_MARK(tag)
+      #else
+      # define INSERT_SSC_MARK(tag)                                          \
+      __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(tag):"%ebx")
+      #endif
+    #else
+    # define INSERT_SSC_MARK(tag) ((void)0)
+    #endif
+
+    /* Markers for the start and end of regions that represent polling and
+     * are therefore uninteresting to architectural simulations. 0x4376 and
+     * 0x4377 are arbitrary numbers that should be unique in the space of
+     * SSC tags, but there is no central issuing authority; rather,
+     * randomness is expected to work.
+     */
+    #define SSC_MARK_SPIN_START() INSERT_SSC_MARK(0x4376)
+    #define SSC_MARK_SPIN_END()   INSERT_SSC_MARK(0x4377)
+
+    // Markers for architecture simulation.
+    // FORKING      : Before the master thread forks.
+    // JOINING      : At the start of the join.
+    // INVOKING     : Before the threads invoke microtasks.
+    // DISPATCH_INIT: At the start of a dynamically scheduled loop.
+    // DISPATCH_NEXT: After claiming the next iteration of a dynamically scheduled loop.
+    #define SSC_MARK_FORKING()          INSERT_SSC_MARK(0xd693)
+    #define SSC_MARK_JOINING()          INSERT_SSC_MARK(0xd694)
+    #define SSC_MARK_INVOKING()         INSERT_SSC_MARK(0xd695)
+    #define SSC_MARK_DISPATCH_INIT()    INSERT_SSC_MARK(0xd696)
+    #define SSC_MARK_DISPATCH_NEXT()    INSERT_SSC_MARK(0xd697)
+
+    // The object is an address that identifies a specific set of prepare, acquire,
+    // release, and cancel operations.
+
+    /* Sync prepare indicates a thread is going to start waiting for another thread
+       to send a release event.  This operation should be done just before the thread
+       begins checking for the existence of the release event. */
+
+    /* Sync cancel indicates a thread is cancelling a wait on another thread and
+       continuing execution without waiting for the other thread to release it. */
+
+    /* Sync acquired indicates a thread has received a release event from another
+       thread and has stopped waiting.  This operation must occur only after the release
+       event is received. */
+
+    /* Sync release indicates a thread is going to send a release event to another thread
+       so it will stop waiting and continue execution. This operation must just happen before
+       the release event. */
+
+    #define KMP_FSYNC_PREPARE(   obj )  __itt_fsync_prepare(   (void *)( obj ) )
+    #define KMP_FSYNC_CANCEL(    obj )  __itt_fsync_cancel(    (void *)( obj ) )
+    #define KMP_FSYNC_ACQUIRED(  obj )  __itt_fsync_acquired(  (void *)( obj ) )
+    #define KMP_FSYNC_RELEASING( obj )  __itt_fsync_releasing( (void *)( obj ) )
+
+    /*
+        In case of waiting in a spin loop, ITT wants KMP_FSYNC_PREPARE() to be called with a delay
+        (and not called at all if the waiting time is small). So, in spin loops, do not use
+        KMP_FSYNC_PREPARE(), but use KMP_FSYNC_SPIN_INIT() (before the spin loop),
+        KMP_FSYNC_SPIN_PREPARE() (within the spin loop), and KMP_FSYNC_SPIN_ACQUIRED().
+        See KMP_WAIT_YIELD() for an example.
+    */
+
+    #undef  KMP_FSYNC_SPIN_INIT
+    #define KMP_FSYNC_SPIN_INIT( obj, spin )    \
+        int sync_iters = 0;                     \
+        if ( __itt_fsync_prepare_ptr ) {        \
+            if ( obj == NULL ) {                \
+                obj = spin;                     \
+            } /* if */                          \
+        } /* if */                              \
+        SSC_MARK_SPIN_START()
+
+    #undef  KMP_FSYNC_SPIN_PREPARE
+    #define KMP_FSYNC_SPIN_PREPARE( obj ) do {                          \
+        if ( __itt_fsync_prepare_ptr && sync_iters < __kmp_itt_prepare_delay ) { \
+            ++ sync_iters;                                              \
+            if ( sync_iters >= __kmp_itt_prepare_delay ) {              \
+                KMP_FSYNC_PREPARE( (void*) obj );                       \
+            } /* if */                                                  \
+        } /* if */                                                      \
+     } while (0)
+    #undef  KMP_FSYNC_SPIN_ACQUIRED
+    #define KMP_FSYNC_SPIN_ACQUIRED( obj ) do {         \
+        SSC_MARK_SPIN_END();                            \
+        if ( sync_iters >= __kmp_itt_prepare_delay ) {  \
+            KMP_FSYNC_ACQUIRED( (void*) obj );          \
+        } /* if */                                      \
+     } while (0)
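+
+    /* Illustrative spin-wait sketch (everything except the macros is made up):
+
+           void * obj = NULL;
+           KMP_FSYNC_SPIN_INIT( obj, (void *) & flag );   // declares sync_iters
+           while ( ! flag_is_set( & flag ) ) {            // flag_is_set() is hypothetical
+               KMP_FSYNC_SPIN_PREPARE( obj );
+               // yield / pause ...
+           }
+           KMP_FSYNC_SPIN_ACQUIRED( obj );
+
+       KMP_FSYNC_PREPARE() fires only once sync_iters reaches __kmp_itt_prepare_delay,
+       so short waits generate no ITT traffic at all. */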
+
+    /* ITT will not report objects created within KMP_ITT_IGNORE(), e. g.:
+           KMP_ITT_IGNORE(
+               ptr = malloc( size );
+           );
+    */
+    #define KMP_ITT_IGNORE( statement ) do {                            \
+            __itt_state_t __itt_state_;                                 \
+            if ( __itt_state_get_ptr ) {                                \
+                __itt_state_ = __itt_state_get();                       \
+                __itt_obj_mode_set( __itt_obj_prop_ignore, __itt_obj_state_set ); \
+            }  /* if */                                                 \
+            { statement }                                               \
+            if ( __itt_state_get_ptr ) {                                \
+                __itt_state_set( __itt_state_ );                        \
+            }  /* if */                                                 \
+    } while (0)
+
+    const int KMP_MAX_FRAME_DOMAINS = 512; // Maximum number of frame domains to use (maps to
+                                           // different OpenMP regions in the user source code).
+    extern kmp_int32 __kmp_barrier_domain_count;
+    extern kmp_int32 __kmp_region_domain_count;
+    extern __itt_domain* __kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS];
+    extern __itt_domain* __kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS];
+    extern __itt_domain* __kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS];
+    extern kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS];
+    extern __itt_domain * metadata_domain;
+
+#else
+
+// Null definitions of the synchronization tracing functions.
+# define KMP_FSYNC_PREPARE(   obj )        ((void)0)
+# define KMP_FSYNC_CANCEL(    obj )        ((void)0)
+# define KMP_FSYNC_ACQUIRED(  obj )        ((void)0)
+# define KMP_FSYNC_RELEASING( obj )        ((void)0)
+
+# define KMP_FSYNC_SPIN_INIT( obj, spin )  ((void)0)
+# define KMP_FSYNC_SPIN_PREPARE(  obj )    ((void)0)
+# define KMP_FSYNC_SPIN_ACQUIRED( obj )    ((void)0)
+
+# define KMP_ITT_IGNORE(stmt ) do { stmt } while (0)
+
+#endif // USE_ITT_NOTIFY
+
+#if ! KMP_DEBUG
+    // In release mode include definitions of inline functions.
+    #include "kmp_itt.inl"
+#endif
+
+#endif // KMP_ITT_H
+
+#else  /* USE_ITT_BUILD */
+
+// Null definitions of the synchronization tracing functions.
+// If USE_ITT_BUILD is not enabled, USE_ITT_NOTIFY cannot be either.
+// By defining these we avoid unpleasant ifdef tests in many places.
+# define KMP_FSYNC_PREPARE(   obj )        ((void)0)
+# define KMP_FSYNC_CANCEL(    obj )        ((void)0)
+# define KMP_FSYNC_ACQUIRED(  obj )        ((void)0)
+# define KMP_FSYNC_RELEASING( obj )        ((void)0)
+
+# define KMP_FSYNC_SPIN_INIT( obj, spin )  ((void)0)
+# define KMP_FSYNC_SPIN_PREPARE(  obj )    ((void)0)
+# define KMP_FSYNC_SPIN_ACQUIRED( obj )    ((void)0)
+
+# define KMP_ITT_IGNORE(stmt ) do { stmt } while (0)
+
+# define USE_ITT_BUILD_ARG(x)
+
+#endif /* USE_ITT_BUILD */
diff --git a/final/runtime/src/kmp_itt.inl b/final/runtime/src/kmp_itt.inl
new file mode 100644
index 0000000..8f89457
--- /dev/null
+++ b/final/runtime/src/kmp_itt.inl
@@ -0,0 +1,1177 @@
+#if USE_ITT_BUILD
+/*
+ * kmp_itt.inl -- Inline functions of ITT Notify.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+// Inline function definitions. This file should be included into the kmp_itt.h file for a
+// production build (to let the compiler inline functions) or into the kmp_itt.c file for a
+// debug build (to reduce the number of files to recompile and save build time).
+
+
+#include "kmp.h"
+#include "kmp_str.h"
+
+#if KMP_ITT_DEBUG
+    extern kmp_bootstrap_lock_t __kmp_itt_debug_lock;
+    #define KMP_ITT_DEBUG_LOCK() {                                   \
+        __kmp_acquire_bootstrap_lock( & __kmp_itt_debug_lock );      \
+    }
+    #define KMP_ITT_DEBUG_PRINT( ... ) {                             \
+        fprintf( stderr, "#%02d: ", __kmp_get_gtid() );              \
+        fprintf( stderr, __VA_ARGS__ );                              \
+        fflush( stderr );                                            \
+        __kmp_release_bootstrap_lock( & __kmp_itt_debug_lock );      \
+    }
+#else
+    #define KMP_ITT_DEBUG_LOCK()
+    #define KMP_ITT_DEBUG_PRINT( ... )
+#endif // KMP_ITT_DEBUG
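+
+// Note: KMP_ITT_DEBUG_LOCK() acquires __kmp_itt_debug_lock and KMP_ITT_DEBUG_PRINT()
+// releases it, so the two must always be used as a pair, e.g.:
+//
+//     KMP_ITT_DEBUG_LOCK();
+//     __itt_sync_prepare( object );
+//     KMP_ITT_DEBUG_PRINT( "[bar sta] spre( %p )\n", object );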
+
+// Ensure that the functions are static if they're supposed to be
+// inlined. Otherwise they cannot be used in more than one file,
+// since there will be multiple definitions.
+#if KMP_DEBUG
+# define LINKAGE
+#else
+# define LINKAGE static inline
+#endif
+
+// ZCA interface used by Intel(R) Inspector. Intel(R) Parallel Amplifier uses this
+// API to support user-defined synchronization primitives, but does not use ZCA;
+// it would be safe to turn this off until wider support becomes available.
+#if USE_ITT_ZCA
+#ifdef __INTEL_COMPILER
+#   if __INTEL_COMPILER >= 1200
+#       undef __itt_sync_acquired
+#       undef __itt_sync_releasing
+#       define __itt_sync_acquired(addr)    __notify_zc_intrinsic((char *)"sync_acquired", addr)
+#       define __itt_sync_releasing(addr)   __notify_intrinsic((char *)"sync_releasing", addr)
+#   endif
+#endif
+#endif
+
+static kmp_bootstrap_lock_t  metadata_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( metadata_lock );
+
+/*
+    ------------------------------------------------------------------------------------------------
+    Parallel region reporting.
+
+        * __kmp_itt_region_forking should be called by the master thread of a team. The exact
+          moment of the call does not matter, but it should be completed before any thread of
+          this team calls __kmp_itt_region_starting.
+        * __kmp_itt_region_starting should be called by each thread of a team just before entering
+          the parallel region body.
+        * __kmp_itt_region_finished should be called by each thread of a team right after returning
+          from the parallel region body.
+        * __kmp_itt_region_joined should be called by the master thread of a team, after all threads
+          have called __kmp_itt_region_finished.
+
+    Note: A thread waiting at the join barrier (after __kmp_itt_region_finished) can execute some
+    more user code -- such a thread can execute tasks.
+
+    Note: The overhead of logging region_starting and region_finished in each thread is too large,
+    so these calls are not used.
+
+    ------------------------------------------------------------------------------------------------
+*/
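+
+/*
+    Call-order sketch for one parallel region (illustrative; region_starting /
+    region_finished are intentionally not used, as noted above):
+
+        master:      __kmp_itt_region_forking( gtid, team_size, barriers );
+        all threads: ... parallel region body ...
+        master:      __kmp_itt_region_joined( gtid );
+*/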
+
+// -------------------------------------------------------------------------------------------------
+
+LINKAGE void
+__kmp_itt_region_forking( int gtid, int team_size, int barriers, int serialized ) {
+#if USE_ITT_NOTIFY
+    kmp_team_t *      team = __kmp_team_from_gtid( gtid );
+    if (team->t.t_active_level + serialized > 1)
+    {
+        // The frame notifications are only supported for the outermost teams.
+        return;
+    }
+    ident_t *         loc  = __kmp_thread_from_gtid( gtid )->th.th_ident;
+    if (loc) {
+        // Use the reserved_2 field to store the index into the region domain array.
+        // Assume that reserved_2 contains zero initially. Since zero is a special
+        // value here, store the index into the domain array increased by 1.
+        if (loc->reserved_2 == 0) {
+            if (__kmp_region_domain_count < KMP_MAX_FRAME_DOMAINS) {
+                int frm = KMP_TEST_THEN_INC32( & __kmp_region_domain_count ); // get "old" value
+                if (frm >= KMP_MAX_FRAME_DOMAINS) {
+                    KMP_TEST_THEN_DEC32( & __kmp_region_domain_count );       // revert the count
+                    return;                      // loc->reserved_2 is still 0
+                }
+                //if (!KMP_COMPARE_AND_STORE_ACQ32( &loc->reserved_2, 0, frm + 1 )) {
+                //    frm = loc->reserved_2 - 1;   // get value saved by other thread for same loc
+                //} // AC: this block is to replace next unsynchronized line
+
+                // We need to save indexes for both region and barrier frames. We'll use the
+                // loc->reserved_2 field: the region index goes in the low two bytes and the
+                // barrier index in the high two bytes. This is OK because KMP_MAX_FRAME_DOMAINS = 512.
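+                // Layout of loc->reserved_2 (for illustration):
+                //     bits  0..15 : region  frame index + 1  (0 means "not assigned yet")
+                //     bits 16..31 : barrier frame index + 1  (0 means "not assigned yet")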
+                loc->reserved_2 |= (frm + 1);                                    // save "new" value
+
+                // Transform compiler-generated region location into the format
+                // that the tools more or less standardized on:
+                //                               "<func>$omp$parallel@[file:]<line>[:<col>]"
+                const char * buff = NULL;
+                kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 );
+                buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d",
+                                        str_loc.func, team_size, str_loc.file,
+                                        str_loc.line, str_loc.col);
+
+                __itt_suppress_push(__itt_suppress_memory_errors);
+                __kmp_itt_region_domains[ frm ] = __itt_domain_create( buff );
+                __itt_suppress_pop();
+
+                __kmp_str_free( &buff );
+                if( barriers ) {
+                    if (__kmp_barrier_domain_count < KMP_MAX_FRAME_DOMAINS) {
+                        int frm = KMP_TEST_THEN_INC32( & __kmp_barrier_domain_count ); // get "old" value
+                        if (frm >= KMP_MAX_FRAME_DOMAINS) {
+                            KMP_TEST_THEN_DEC32( & __kmp_barrier_domain_count );       // revert the count
+                            return;                      // loc->reserved_2 is still 0
+                        }
+                        const char * buff = NULL;
+                        buff = __kmp_str_format("%s$omp$barrier@%s:%d",
+                                                str_loc.func, str_loc.file, str_loc.col);
+                        __itt_suppress_push(__itt_suppress_memory_errors);
+                        __kmp_itt_barrier_domains[ frm ] = __itt_domain_create( buff );
+                        __itt_suppress_pop();
+                        __kmp_str_free( &buff );
+                        // Save the barrier frame index to the high two bytes.
+                        loc->reserved_2 |= (frm + 1) << 16;
+                    }
+                }
+                __kmp_str_loc_free( &str_loc );
+                __itt_frame_begin_v3(__kmp_itt_region_domains[ frm ], NULL);
+            }
+        } else { // Region domain exists for this location
+            // Check if the team size has changed; if so, create a new region domain for this location.
+            int frm = (loc->reserved_2 & 0x0000FFFF) - 1;
+            if( __kmp_itt_region_team_size[frm] != team_size ) {
+                const char * buff = NULL;
+                kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 );
+                buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d",
+                                        str_loc.func, team_size, str_loc.file,
+                                        str_loc.line, str_loc.col);
+
+                __itt_suppress_push(__itt_suppress_memory_errors);
+                __kmp_itt_region_domains[ frm ] = __itt_domain_create( buff );
+                __itt_suppress_pop();
+
+                __kmp_str_free( &buff );
+                __kmp_str_loc_free( &str_loc );
+                __kmp_itt_region_team_size[frm] = team_size;
+                __itt_frame_begin_v3(__kmp_itt_region_domains[frm], NULL);
+            } else { // Team size was not changed. Use existing domain.
+                __itt_frame_begin_v3(__kmp_itt_region_domains[frm], NULL);
+            }
+        }
+        KMP_ITT_DEBUG_LOCK();
+        KMP_ITT_DEBUG_PRINT( "[frm beg] gtid=%d, idx=%x, serialized:%d, loc:%p\n",
+                         gtid, loc->reserved_2, serialized, loc );
+    }
+#endif
+} // __kmp_itt_region_forking
+
+// -------------------------------------------------------------------------------------------------
+
+LINKAGE void
+__kmp_itt_frame_submit( int gtid, __itt_timestamp begin, __itt_timestamp end, int imbalance, ident_t * loc, int team_size, int region ) {
+#if USE_ITT_NOTIFY
+    if( region ) {
+        kmp_team_t *      team = __kmp_team_from_gtid( gtid );
+        int serialized = ( region == 2 ? 1 : 0 );
+        if (team->t.t_active_level + serialized > 1)
+        {
+            // The frame notifications are only supported for the outermost teams.
+            return;
+        }
+         // Check that the region domain has not been created before; its index is saved in the low two bytes.
+         if ((loc->reserved_2 & 0x0000FFFF) == 0) {
+             if (__kmp_region_domain_count < KMP_MAX_FRAME_DOMAINS) {
+                 int frm = KMP_TEST_THEN_INC32( & __kmp_region_domain_count ); // get "old" value
+                 if (frm >= KMP_MAX_FRAME_DOMAINS) {
+                     KMP_TEST_THEN_DEC32( & __kmp_region_domain_count );       // revert the count
+                     return;                      // loc->reserved_2 is still 0
+                 }
+
+                 // We need to save indexes for both region and barrier frames. We'll use the
+                 // loc->reserved_2 field: the region index goes in the low two bytes and the
+                 // barrier index in the high two bytes. This is OK because KMP_MAX_FRAME_DOMAINS = 512.
+                 loc->reserved_2 |= (frm + 1);                                 // save "new" value
+
+                 // Transform compiler-generated region location into the format
+                 // that the tools more or less standardized on:
+                 //                               "<func>$omp$parallel:team_size@[file:]<line>[:<col>]"
+                 const char * buff = NULL;
+                 kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 );
+                 buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d",
+                                         str_loc.func, team_size, str_loc.file,
+                                         str_loc.line, str_loc.col);
+
+                 __itt_suppress_push(__itt_suppress_memory_errors);
+                 __kmp_itt_region_domains[ frm ] = __itt_domain_create( buff );
+                 __itt_suppress_pop();
+
+                 __kmp_str_free( &buff );
+                 __kmp_str_loc_free( &str_loc );
+                 __kmp_itt_region_team_size[frm] = team_size;
+                 __itt_frame_submit_v3(__kmp_itt_region_domains[ frm ], NULL, begin, end );
+             }
+         } else { // Region domain exists for this location
+             // Check if the team size has changed; if so, create a new region domain for this location.
+             int frm = (loc->reserved_2 & 0x0000FFFF) - 1;
+             if( __kmp_itt_region_team_size[frm] != team_size ) {
+                 const char * buff = NULL;
+                 kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 );
+                 buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d",
+                                         str_loc.func, team_size, str_loc.file,
+                                         str_loc.line, str_loc.col);
+
+                 __itt_suppress_push(__itt_suppress_memory_errors);
+                 __kmp_itt_region_domains[ frm ] = __itt_domain_create( buff );
+                 __itt_suppress_pop();
+
+                 __kmp_str_free( &buff );
+                 __kmp_str_loc_free( &str_loc );
+                 __kmp_itt_region_team_size[frm] = team_size;
+                 __itt_frame_submit_v3(__kmp_itt_region_domains[ frm ], NULL, begin, end );
+             } else { // Team size was not changed. Use existing domain.
+                 __itt_frame_submit_v3(__kmp_itt_region_domains[ frm ], NULL, begin, end );
+             }
+         }
+         KMP_ITT_DEBUG_LOCK();
+         KMP_ITT_DEBUG_PRINT( "[reg sub] gtid=%d, idx=%x, region:%d, loc:%p, beg:%llu, end:%llu\n",
+                          gtid, loc->reserved_2, region, loc, begin, end );
+         return;
+    } else { // called for barrier reporting
+        if (loc) {
+            if ((loc->reserved_2 & 0xFFFF0000) == 0) {
+                if (__kmp_barrier_domain_count < KMP_MAX_FRAME_DOMAINS) {
+                    int frm = KMP_TEST_THEN_INC32( & __kmp_barrier_domain_count ); // get "old" value
+                    if (frm >= KMP_MAX_FRAME_DOMAINS) {
+                        KMP_TEST_THEN_DEC32( & __kmp_barrier_domain_count );       // revert the count
+                        return;                      // loc->reserved_2 is still 0
+                    }
+                    // Save the barrier frame index to the high two bytes.
+                    loc->reserved_2 |= (frm + 1) << 16;                          // save "new" value
+
+                    // Transform compiler-generated region location into the format
+                    // that the tools more or less standardized on:
+                    //                               "<func>$omp$frame@[file:]<line>[:<col>]"
+                    kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 );
+                    if( imbalance ) {
+                        const char * buff_imb = NULL;
+                        buff_imb = __kmp_str_format("%s$omp$barrier-imbalance:%d@%s:%d",
+                                                str_loc.func, team_size, str_loc.file, str_loc.col);
+                        __itt_suppress_push(__itt_suppress_memory_errors);
+                        __kmp_itt_imbalance_domains[ frm ] = __itt_domain_create( buff_imb );
+                        __itt_suppress_pop();
+                        __itt_frame_submit_v3(__kmp_itt_imbalance_domains[ frm ], NULL, begin, end );
+                        __kmp_str_free( &buff_imb );
+                    } else {
+                        const char * buff = NULL;
+                        buff = __kmp_str_format("%s$omp$barrier@%s:%d",
+                                                str_loc.func, str_loc.file, str_loc.col);
+                        __itt_suppress_push(__itt_suppress_memory_errors);
+                        __kmp_itt_barrier_domains[ frm ] = __itt_domain_create( buff );
+                        __itt_suppress_pop();
+                        __itt_frame_submit_v3(__kmp_itt_barrier_domains[ frm ], NULL, begin, end );
+                        __kmp_str_free( &buff );
+                    }
+                    __kmp_str_loc_free( &str_loc );
+                }
+            } else { // if it is not 0 then it should be <= KMP_MAX_FRAME_DOMAINS
+                if( imbalance ) {
+                    __itt_frame_submit_v3(__kmp_itt_imbalance_domains[ (loc->reserved_2 >> 16) - 1 ], NULL, begin, end );
+                } else {
+                    __itt_frame_submit_v3(__kmp_itt_barrier_domains[(loc->reserved_2 >> 16) - 1], NULL, begin, end );
+                }
+            }
+            KMP_ITT_DEBUG_LOCK();
+            KMP_ITT_DEBUG_PRINT( "[frm sub] gtid=%d, idx=%x, loc:%p, beg:%llu, end:%llu\n",
+                             gtid, loc->reserved_2, loc, begin, end );
+        }
+    }
+#endif
+} // __kmp_itt_frame_submit
+
+// -------------------------------------------------------------------------------------------------
+
+LINKAGE void
+__kmp_itt_metadata_imbalance( int gtid, kmp_uint64 begin, kmp_uint64 end, kmp_uint64 imbalance, kmp_uint64 reduction ) {
+#if USE_ITT_NOTIFY
+    if( metadata_domain == NULL) {
+        __kmp_acquire_bootstrap_lock( & metadata_lock );
+        if( metadata_domain == NULL) {
+            __itt_suppress_push(__itt_suppress_memory_errors);
+            metadata_domain = __itt_domain_create( "OMP Metadata" );
+            __itt_suppress_pop();
+        }
+        __kmp_release_bootstrap_lock( & metadata_lock );
+    }
+
+    __itt_string_handle * string_handle = __itt_string_handle_create( "omp_metadata_imbalance");
+
+    kmp_uint64 imbalance_data[ 4 ];
+    imbalance_data[ 0 ] = begin;
+    imbalance_data[ 1 ] = end;
+    imbalance_data[ 2 ] = imbalance;
+    imbalance_data[ 3 ] = reduction;
+
+    __itt_metadata_add(metadata_domain, __itt_null, string_handle, __itt_metadata_u64, 4, imbalance_data);
+#endif
+} // __kmp_itt_metadata_imbalance
+
+// -------------------------------------------------------------------------------------------------
+
+LINKAGE void
+__kmp_itt_metadata_loop( ident_t * loc, kmp_uint64 sched_type, kmp_uint64 iterations, kmp_uint64 chunk ) {
+#if USE_ITT_NOTIFY
+    if( metadata_domain == NULL) {
+        __kmp_acquire_bootstrap_lock( & metadata_lock );
+        if( metadata_domain == NULL) {
+            __itt_suppress_push(__itt_suppress_memory_errors);
+            metadata_domain = __itt_domain_create( "OMP Metadata" );
+            __itt_suppress_pop();
+        }
+        __kmp_release_bootstrap_lock( & metadata_lock );
+    }
+
+    __itt_string_handle * string_handle = __itt_string_handle_create( "omp_metadata_loop");
+    kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 );
+
+    kmp_uint64 loop_data[ 5 ];
+    loop_data[ 0 ] = str_loc.line;
+    loop_data[ 1 ] = str_loc.col;
+    loop_data[ 2 ] = sched_type;
+    loop_data[ 3 ] = iterations;
+    loop_data[ 4 ] = chunk;
+
+    __kmp_str_loc_free( &str_loc );
+
+    __itt_metadata_add(metadata_domain, __itt_null, string_handle, __itt_metadata_u64, 5, loop_data);
+#endif
+} // __kmp_itt_metadata_loop
+
+// -------------------------------------------------------------------------------------------------
+
+LINKAGE void
+__kmp_itt_metadata_single( ident_t * loc ) {
+#if USE_ITT_NOTIFY
+    if( metadata_domain == NULL) {
+        __kmp_acquire_bootstrap_lock( & metadata_lock );
+        if( metadata_domain == NULL) {
+            __itt_suppress_push(__itt_suppress_memory_errors);
+            metadata_domain = __itt_domain_create( "OMP Metadata" );
+            __itt_suppress_pop();
+        }
+        __kmp_release_bootstrap_lock( & metadata_lock );
+    }
+
+    __itt_string_handle * string_handle = __itt_string_handle_create( "omp_metadata_single");
+    kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 );
+    kmp_uint64 single_data[ 2 ];
+    single_data[ 0 ] = str_loc.line;
+    single_data[ 1 ] = str_loc.col;
+
+    __kmp_str_loc_free( &str_loc );
+
+    __itt_metadata_add(metadata_domain, __itt_null, string_handle, __itt_metadata_u64, 2, single_data);
+#endif
+} // __kmp_itt_metadata_single
+
+// -------------------------------------------------------------------------------------------------
+
+LINKAGE void
+__kmp_itt_region_starting( int gtid ) {
+#if USE_ITT_NOTIFY
+#endif
+} // __kmp_itt_region_starting
+
+// -------------------------------------------------------------------------------------------------
+
+LINKAGE void
+__kmp_itt_region_finished( int gtid ) {
+#if USE_ITT_NOTIFY
+#endif
+} // __kmp_itt_region_finished
+
+// -------------------------------------------------------------------------------------------------
+
+LINKAGE void
+__kmp_itt_region_joined( int gtid, int serialized ) {
+#if USE_ITT_NOTIFY
+    kmp_team_t *      team = __kmp_team_from_gtid( gtid );
+    if (team->t.t_active_level + serialized > 1)
+    {
+        // The frame notifications are only supported for the outermost teams.
+        return;
+    }
+    ident_t *         loc  = __kmp_thread_from_gtid( gtid )->th.th_ident;
+    if (loc && loc->reserved_2)
+    {
+        int frm = (loc->reserved_2 & 0x0000FFFF) - 1;
+        if(frm < KMP_MAX_FRAME_DOMAINS) {
+            KMP_ITT_DEBUG_LOCK();
+            __itt_frame_end_v3(__kmp_itt_region_domains[frm], NULL);
+            KMP_ITT_DEBUG_PRINT( "[frm end] gtid=%d, idx=%x, serialized:%d, loc:%p\n",
+                         gtid, loc->reserved_2, serialized, loc );
+        }
+    }
+#endif
+} // __kmp_itt_region_joined
+
+/*
+    ------------------------------------------------------------------------------------------------
+    Barriers reporting.
+
+    A barrier consists of two phases:
+
+        1. Gather -- the master waits for the arrival of all the worker threads; each worker
+           thread registers its arrival and goes further.
+        2. Release -- each worker thread waits until the master lets it go; the master lets
+           the worker threads go.
+
+    These functions should be called by each thread:
+
+        * __kmp_itt_barrier_starting() -- before arriving to the gather phase.
+        * __kmp_itt_barrier_middle()   -- between gather and release phases.
+        * __kmp_itt_barrier_finished() -- after release phase.
+
+    Note: Call __kmp_itt_barrier_object() before the call to __kmp_itt_barrier_starting() and save
+    the result in a local variable. __kmp_itt_barrier_object(), if called too late (e. g. after the
+    gather phase), would return the ITT sync object for the next barrier!
+
+    ITT needs an address (void *) to be specified as a sync object. The OpenMP RTL does not have a
+    barrier object or barrier data structure. A barrier is just a counter in the team and thread
+    structures. We could use the address of the team structure as a barrier sync object, but ITT
+    wants different objects for different barriers (even within the same team). So let us use the
+    team address as the barrier sync object for the first barrier, then increase it by one for the
+    next barrier, and so on (but wrap it so as not to use addresses outside of the team structure).
+
+    ------------------------------------------------------------------------------------------------
+*/
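+
+/*
+    Typical per-thread sequence (illustrative):
+
+        void * b = __kmp_itt_barrier_object( gtid, bs_plain_barrier );  // capture first!
+        __kmp_itt_barrier_starting( gtid, b );    // before the gather phase
+        ... gather ...
+        __kmp_itt_barrier_middle( gtid, b );      // between gather and release
+        ... release ...
+        __kmp_itt_barrier_finished( gtid, b );    // after the release phase
+*/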
+
+void *
+__kmp_itt_barrier_object(
+    int  gtid,
+    int  bt,
+    int  set_name,
+    int  delta    // 0 (current barrier) is default value; specify -1 to get previous barrier.
+) {
+    void * object = NULL;
+#if USE_ITT_NOTIFY
+    kmp_info_t *    thr  = __kmp_thread_from_gtid( gtid );
+    kmp_team_t *    team = thr->th.th_team;
+
+    // NOTE:
+    // If the function is called from __kmp_fork_barrier, team pointer can be NULL. This "if"
+    // helps to avoid crash. However, this is not complete solution, and reporting fork/join
+    // barriers to ITT should be revisited.
+
+    if ( team != NULL ) {
+
+        // Master thread increases b_arrived by KMP_BARRIER_STATE_BUMP each time. Divide b_arrived
+        // by KMP_BARRIER_STATE_BUMP to get plain barrier counter.
+        kmp_uint counter = team->t.t_bar[ bt ].b_arrived / KMP_BARRIER_STATE_BUMP + delta;
+        // Now form the barrier id. Encode barrier type (bt) in barrier id too, so barriers of
+        // different types do not have the same ids.
+        KMP_BUILD_ASSERT( sizeof( kmp_team_t ) >= bs_last_barrier );
+            // This condition is a must (we would have a divide by zero otherwise).
+        KMP_BUILD_ASSERT( sizeof( kmp_team_t ) >= 2 * bs_last_barrier );
+            // Stronger condition: make sure we have room for at least two different ids
+            // (for each barrier type).
+        object =
+            reinterpret_cast< void * >(
+                kmp_uintptr_t( team )
+                    + counter % ( sizeof( kmp_team_t ) / bs_last_barrier ) * bs_last_barrier
+                    + bt
+            );
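+        // Worked example (numbers are illustrative): if sizeof( kmp_team_t ) == 192 and
+        // bs_last_barrier == 3, counters 0, 1, 2, ... map to offsets 0, 3, ..., 189 before
+        // wrapping, so successive barriers get distinct addresses inside the team
+        // structure, and the "+ bt" term keeps the barrier types apart.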
+        KMP_ITT_DEBUG_LOCK();
+        KMP_ITT_DEBUG_PRINT( "[bar obj] type=%d, counter=%d, object=%p\n", bt, counter, object );
+
+        if ( set_name ) {
+            ident_t const * loc  = NULL;
+            char const *    src  = NULL;
+            char const *    type = "OMP Barrier";
+            switch ( bt ) {
+                case bs_plain_barrier : {
+                    // For plain barrier compiler calls __kmpc_barrier() function, which saves
+                    // location in thr->th.th_ident.
+                    loc = thr->th.th_ident;
+                    // Get the barrier type from flags provided by compiler.
+                    kmp_int32   expl = 0;
+                    kmp_uint32  impl = 0;
+                    if ( loc != NULL ) {
+                        src  = loc->psource;
+                        expl = ( loc->flags & KMP_IDENT_BARRIER_EXPL ) != 0;
+                        impl = ( loc->flags & KMP_IDENT_BARRIER_IMPL ) != 0;
+                    }; // if
+                    if ( impl ) {
+                        switch ( loc->flags & KMP_IDENT_BARRIER_IMPL_MASK ) {
+                            case KMP_IDENT_BARRIER_IMPL_FOR : {
+                                type = "OMP For Barrier";
+                            } break;
+                            case KMP_IDENT_BARRIER_IMPL_SECTIONS : {
+                                type = "OMP Sections Barrier";
+                            } break;
+                            case KMP_IDENT_BARRIER_IMPL_SINGLE : {
+                                type = "OMP Single Barrier";
+                            } break;
+                            case KMP_IDENT_BARRIER_IMPL_WORKSHARE : {
+                                type = "OMP Workshare Barrier";
+                            } break;
+                            default : {
+                                type = "OMP Implicit Barrier";
+                                KMP_DEBUG_ASSERT( 0 );
+                            };
+                        }; /* switch */
+                    } else if ( expl ) {
+                        type = "OMP Explicit Barrier";
+                    }; /* if */
+                } break;
+                case bs_forkjoin_barrier : {
+                    // In case of the fork/join barrier we cannot rely on thr->th.th_ident, because
+                    // it contains the location of the last passed construct (and the join barrier
+                    // is not such a construct). Use th_ident of the master thread instead --
+                    // __kmp_join_call() called by the master thread saves the location.
+                    //
+                    // AC: we cannot read from the master because __kmp_join_call may not be called
+                    //    yet, so we read the location from the team. This is the same location.
+                    //    And the team is valid at the entry to the join barrier where this happens.
+                    loc  = team->t.t_ident;
+                    if ( loc != NULL ) {
+                        src  = loc->psource;
+                    }; // if
+                    type = "OMP Join Barrier";
+                } break;
+            }; // switch
+            KMP_ITT_DEBUG_LOCK();
+            __itt_sync_create( object, type, src, __itt_attr_barrier );
+            KMP_ITT_DEBUG_PRINT( "[bar sta] scre( %p, \"%s\", \"%s\", __itt_attr_barrier )\n", object, type, src );
+        }; // if
+
+    }; // if
+#endif
+    return object;
+} // __kmp_itt_barrier_object
+
+// -------------------------------------------------------------------------------------------------
+
+void
+__kmp_itt_barrier_starting( int gtid, void * object ) {
+#if USE_ITT_NOTIFY
+    if ( !KMP_MASTER_GTID( gtid ) ) {
+        KMP_ITT_DEBUG_LOCK();
+        __itt_sync_releasing( object );
+        KMP_ITT_DEBUG_PRINT( "[bar sta] srel( %p )\n", object );
+    }; // if
+    KMP_ITT_DEBUG_LOCK();
+    __itt_sync_prepare( object );
+    KMP_ITT_DEBUG_PRINT( "[bar sta] spre( %p )\n", object );
+#endif
+} // __kmp_itt_barrier_starting
+
+// -------------------------------------------------------------------------------------------------
+
+void
+__kmp_itt_barrier_middle( int gtid, void * object ) {
+#if USE_ITT_NOTIFY
+    if ( KMP_MASTER_GTID( gtid ) ) {
+        KMP_ITT_DEBUG_LOCK();
+        __itt_sync_acquired( object );
+        KMP_ITT_DEBUG_PRINT( "[bar mid] sacq( %p )\n", object );
+        KMP_ITT_DEBUG_LOCK();
+        __itt_sync_releasing( object );
+        KMP_ITT_DEBUG_PRINT( "[bar mid] srel( %p )\n", object );
+    }; // if
+#endif
+} // __kmp_itt_barrier_middle
+
+// -------------------------------------------------------------------------------------------------
+
+void
+__kmp_itt_barrier_finished( int gtid, void * object ) {
+#if USE_ITT_NOTIFY
+    if ( ! KMP_MASTER_GTID( gtid ) ) {
+        KMP_ITT_DEBUG_LOCK();
+        __itt_sync_acquired( object );
+        KMP_ITT_DEBUG_PRINT( "[bar end] sacq( %p )\n", object );
+    }; // if
+#endif
+} // __kmp_itt_barrier_finished
+
+/*
+    ------------------------------------------------------------------------------------------------
+    Taskwait reporting.
+
+    ITT needs an address (void *) to be specified as a sync object. The OpenMP RTL does not have a
+    taskwait structure, so we need to construct something.
+
+*/
+
+void *
+__kmp_itt_taskwait_object( int gtid ) {
+    void * object = NULL;
+#if USE_ITT_NOTIFY
+    if ( __itt_sync_create_ptr ) {
+        kmp_info_t *     thread   = __kmp_thread_from_gtid( gtid );
+        kmp_taskdata_t * taskdata = thread -> th.th_current_task;
+        object =
+            reinterpret_cast< void * >(
+                kmp_uintptr_t( taskdata ) + taskdata->td_taskwait_counter % sizeof( kmp_taskdata_t )
+            );
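+        // The counter wraps modulo sizeof( kmp_taskdata_t ), so each successive taskwait
+        // of the same task gets a distinct address that still lies within the taskdata
+        // structure -- the same trick as for barrier objects above.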
+    }; // if
+#endif
+    return object;
+} // __kmp_itt_taskwait_object
+
+void
+__kmp_itt_taskwait_starting(
+    int     gtid,
+    void *  object
+) {
+#if USE_ITT_NOTIFY
+    kmp_info_t *     thread   = __kmp_thread_from_gtid( gtid );
+    kmp_taskdata_t * taskdata = thread -> th.th_current_task;
+    ident_t const *  loc      = taskdata->td_taskwait_ident;
+    char const *     src      = ( loc == NULL? NULL : loc->psource );
+    KMP_ITT_DEBUG_LOCK();
+    __itt_sync_create( object, "OMP Taskwait", src, 0 );
+    KMP_ITT_DEBUG_PRINT( "[twa sta] scre( %p, \"OMP Taskwait\", \"%s\", 0 )\n", object, src );
+    KMP_ITT_DEBUG_LOCK();
+    __itt_sync_prepare( object );
+    KMP_ITT_DEBUG_PRINT( "[twa sta] spre( %p )\n", object );
+#endif
+} // __kmp_itt_taskwait_starting
+
+void
+__kmp_itt_taskwait_finished(
+    int     gtid,
+    void *  object
+) {
+#if USE_ITT_NOTIFY
+    KMP_ITT_DEBUG_LOCK();
+    __itt_sync_acquired( object );
+    KMP_ITT_DEBUG_PRINT( "[twa end] sacq( %p )\n", object );
+    KMP_ITT_DEBUG_LOCK();
+    __itt_sync_destroy( object );
+    KMP_ITT_DEBUG_PRINT( "[twa end] sdes( %p )\n", object );
+#endif
+} // __kmp_itt_taskwait_finished
+
+/*
+    ------------------------------------------------------------------------------------------------
+    Task reporting.
+
+    Only those tasks are reported that are executed by a thread spinning at a barrier (or taskwait).
+    The sync object passed to the function must be the barrier or taskwait the thread is waiting at.
+    ------------------------------------------------------------------------------------------------
+*/
+
+void
+__kmp_itt_task_starting(
+    void * object     // ITT sync object: barrier or taskwait.
+) {
+#if USE_ITT_NOTIFY
+    if ( object != NULL ) {
+        KMP_ITT_DEBUG_LOCK();
+        __itt_sync_cancel( object );
+        KMP_ITT_DEBUG_PRINT( "[tsk sta] scan( %p )\n", object );
+    }; // if
+#endif
+} // __kmp_itt_task_starting
+
+// -------------------------------------------------------------------------------------------------
+
+void
+__kmp_itt_task_finished(
+    void * object     // ITT sync object: barrier or taskwait.
+) {
+#if USE_ITT_NOTIFY
+    KMP_ITT_DEBUG_LOCK();
+    __itt_sync_prepare( object );
+    KMP_ITT_DEBUG_PRINT( "[tsk end] spre( %p )\n", object );
+#endif
+} // __kmp_itt_task_finished
+
+// -------------------------------------------------------------------------------------------------
+
+/*
+    ------------------------------------------------------------------------------------------------
+    Lock reporting.
+
+        * __kmp_itt_lock_creating( lock ) should be called *before* the first lock operation
+          (set/unset). It is not a real event shown to the user but just sets a name for the
+          synchronization object. `lock' is the address of the sync object; the same address
+          should be used in all subsequent calls.
+
+        * __kmp_itt_lock_acquiring() should be called before setting the lock.
+
+        * __kmp_itt_lock_acquired() should be called after setting the lock.
+
+        * __kmp_itt_lock_releasing() should be called before unsetting the lock.
+
+        * __kmp_itt_lock_cancelled() should be called after a thread has cancelled waiting for the lock.
+
+        * __kmp_itt_lock_destroyed( lock ) should be called after the last lock operation. After
+          __kmp_itt_lock_destroyed() all references to the same address will be considered
+          to belong to another sync object, unrelated to the original one.
+    ------------------------------------------------------------------------------------------------
+*/
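+
+/*
+    Illustrative lifetime of one reported lock (caller-side sketch):
+
+        __kmp_itt_lock_creating( lck );       // name the sync object, before first use
+        __kmp_itt_lock_acquiring( lck );      // before each set
+        ... acquire ...
+        __kmp_itt_lock_acquired( lck );       // after each set
+        __kmp_itt_lock_releasing( lck );      // before each unset
+        ... release ...
+        __kmp_itt_lock_destroyed( lck );      // after the last operation
+*/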
+
+// -------------------------------------------------------------------------------------------------
+
+#if KMP_USE_DYNAMIC_LOCK
+// Takes location information directly
+__kmp_inline
+void
+___kmp_itt_lock_init( kmp_user_lock_p lock, char const *type, const ident_t *loc ) {
+#if USE_ITT_NOTIFY
+    if ( __itt_sync_create_ptr ) {
+        char const *    src = ( loc == NULL ? NULL : loc->psource );
+        KMP_ITT_DEBUG_LOCK();
+        __itt_sync_create( lock, type, src, 0 );
+        KMP_ITT_DEBUG_PRINT( "[lck ini] scre( %p, \"%s\", \"%s\", 0 )\n", lock, type, src );
+    }
+#endif
+}
+#else // KMP_USE_DYNAMIC_LOCK
+// Internal guts -- common code for locks and critical sections, do not call directly.
+__kmp_inline
+void
+___kmp_itt_lock_init( kmp_user_lock_p lock, char const * type ) {
+#if USE_ITT_NOTIFY
+    if ( __itt_sync_create_ptr ) {
+        ident_t const * loc = NULL;
+        if ( __kmp_get_user_lock_location_ != NULL )
+            loc = __kmp_get_user_lock_location_( (lock) );
+        char const *    src = ( loc == NULL ? NULL : loc->psource );
+        KMP_ITT_DEBUG_LOCK();
+        __itt_sync_create( lock, type, src, 0 );
+        KMP_ITT_DEBUG_PRINT( "[lck ini] scre( %p, \"%s\", \"%s\", 0 )\n", lock, type, src );
+    }; // if
+#endif
+} // ___kmp_itt_lock_init
+#endif // KMP_USE_DYNAMIC_LOCK
+
+// Internal guts -- common code for locks and critical sections, do not call directly.
+__kmp_inline
+void
+___kmp_itt_lock_fini( kmp_user_lock_p lock, char const * type ) {
+#if USE_ITT_NOTIFY
+    KMP_ITT_DEBUG_LOCK();
+    __itt_sync_destroy( lock );
+    KMP_ITT_DEBUG_PRINT( "[lck dst] sdes( %p )\n", lock );
+#endif
+} // ___kmp_itt_lock_fini
+
+
+// -------------------------------------------------------------------------------------------------
+
+#if KMP_USE_DYNAMIC_LOCK
+void
+__kmp_itt_lock_creating( kmp_user_lock_p lock, const ident_t *loc ) {
+    ___kmp_itt_lock_init( lock, "OMP Lock", loc );
+}
+#else
+void
+__kmp_itt_lock_creating( kmp_user_lock_p lock ) {
+    ___kmp_itt_lock_init( lock, "OMP Lock" );
+} // __kmp_itt_lock_creating
+#endif
+
+void
+__kmp_itt_lock_acquiring( kmp_user_lock_p lock ) {
+#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY
+    // postpone lock object access
+    if ( __itt_sync_prepare_ptr ) {
+        if ( DYNA_EXTRACT_D_TAG(lock) == 0 ) {
+            kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(lock);
+            __itt_sync_prepare( ilk->lock );
+        } else {
+            __itt_sync_prepare( lock );
+        }
+    }
+#else
+    __itt_sync_prepare( lock );
+#endif
+} // __kmp_itt_lock_acquiring
+
+void
+__kmp_itt_lock_acquired( kmp_user_lock_p lock ) {
+#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY
+    // postpone lock object access
+    if ( __itt_sync_acquired_ptr ) {
+        if ( DYNA_EXTRACT_D_TAG(lock) == 0 ) {
+            kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(lock);
+            __itt_sync_acquired( ilk->lock );
+        } else {
+            __itt_sync_acquired( lock );
+        }
+    }
+#else
+    __itt_sync_acquired( lock );
+#endif
+} // __kmp_itt_lock_acquired
+
+void
+__kmp_itt_lock_releasing( kmp_user_lock_p lock ) {
+#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY
+    if ( __itt_sync_releasing_ptr ) {
+        if ( DYNA_EXTRACT_D_TAG(lock) == 0 ) {
+            kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(lock);
+            __itt_sync_releasing( ilk->lock );
+        } else {
+            __itt_sync_releasing( lock );
+        }
+    }
+#else
+    __itt_sync_releasing( lock );
+#endif
+} // __kmp_itt_lock_releasing
+
+void
+__kmp_itt_lock_cancelled( kmp_user_lock_p lock ) {
+#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY
+    if ( __itt_sync_cancel_ptr ) {
+        if ( DYNA_EXTRACT_D_TAG(lock) == 0 ) {
+            kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(lock);
+            __itt_sync_cancel( ilk->lock );
+        } else {
+            __itt_sync_cancel( lock );
+        }
+    }
+#else
+    __itt_sync_cancel( lock );
+#endif
+} // __kmp_itt_lock_cancelled
+
+void
+__kmp_itt_lock_destroyed( kmp_user_lock_p lock ) {
+    ___kmp_itt_lock_fini( lock, "OMP Lock" );
+} // __kmp_itt_lock_destroyed
+
+/*
+    ------------------------------------------------------------------------------------------------
+    Critical reporting.
+
+    Critical sections are treated exactly as locks (but have different object type).
+    ------------------------------------------------------------------------------------------------
+*/
+#if KMP_USE_DYNAMIC_LOCK
+void
+__kmp_itt_critical_creating( kmp_user_lock_p lock, const ident_t *loc ) {
+    ___kmp_itt_lock_init( lock, "OMP Critical", loc);
+}
+#else
+void
+__kmp_itt_critical_creating( kmp_user_lock_p lock ) {
+    ___kmp_itt_lock_init( lock, "OMP Critical" );
+} // __kmp_itt_critical_creating
+#endif
+
+void
+__kmp_itt_critical_acquiring( kmp_user_lock_p lock ) {
+    __itt_sync_prepare( lock );
+} // __kmp_itt_critical_acquiring
+
+void
+__kmp_itt_critical_acquired( kmp_user_lock_p lock ) {
+    __itt_sync_acquired( lock );
+} // __kmp_itt_critical_acquired
+
+void
+__kmp_itt_critical_releasing( kmp_user_lock_p lock ) {
+    __itt_sync_releasing( lock );
+} // __kmp_itt_critical_releasing
+
+void
+__kmp_itt_critical_destroyed( kmp_user_lock_p lock ) {
+    ___kmp_itt_lock_fini( lock, "OMP Critical" );
+} // __kmp_itt_critical_destroyed
+
+/*
+    ------------------------------------------------------------------------------------------------
+    Single reporting.
+    ------------------------------------------------------------------------------------------------
+*/
+
+void
+__kmp_itt_single_start( int gtid ) {
+#if USE_ITT_NOTIFY
+    if ( __itt_mark_create_ptr || KMP_ITT_DEBUG ) {
+        kmp_info_t *   thr = __kmp_thread_from_gtid( (gtid) );
+        ident_t *      loc = thr->th.th_ident;
+        char const *   src = ( loc == NULL ? NULL : loc->psource );
+        kmp_str_buf_t  name;
+        __kmp_str_buf_init( & name );
+        __kmp_str_buf_print( & name, "OMP Single-%s", src );
+        KMP_ITT_DEBUG_LOCK();
+        thr->th.th_itt_mark_single = __itt_mark_create( name.str );
+        KMP_ITT_DEBUG_PRINT( "[sin sta] mcre( \"%s\") -> %d\n", name.str, thr->th.th_itt_mark_single );
+        __kmp_str_buf_free( & name );
+        KMP_ITT_DEBUG_LOCK();
+        __itt_mark( thr->th.th_itt_mark_single, NULL );
+        KMP_ITT_DEBUG_PRINT( "[sin sta] mark( %d, NULL )\n", thr->th.th_itt_mark_single );
+    }; // if
+#endif
+} // __kmp_itt_single_start
+
+void
+__kmp_itt_single_end( int gtid ) {
+#if USE_ITT_NOTIFY
+    __itt_mark_type  mark = __kmp_thread_from_gtid( gtid )->th.th_itt_mark_single;
+    KMP_ITT_DEBUG_LOCK();
+    __itt_mark_off( mark );
+    KMP_ITT_DEBUG_PRINT( "[sin end] moff( %d )\n", mark );
+#endif
+} // __kmp_itt_single_end
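+
+// Mark lifecycle behind the two functions above (a sketch; the names are the ITT mark
+// API calls used there):
+//
+//     mark = __itt_mark_create( "OMP Single-<source location>" );   // in single_start
+//     __itt_mark( mark, NULL );                                     // region begins
+//     /* ... body of the single construct ... */
+//     __itt_mark_off( mark );                                       // in single_end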
+
+/*
+    ------------------------------------------------------------------------------------------------
+    Ordered reporting.
+
+    __kmp_itt_ordered_init is called by each thread *before* first use of the sync
+    object. The ITT team would like it to be called once, but that would require extra
+    synchronization.
+
+    __kmp_itt_ordered_prep is called when a thread is about to enter an ordered section
+    (before synchronization).
+
+    __kmp_itt_ordered_start is called just before entering user code (after
+    synchronization).
+
+    __kmp_itt_ordered_end is called after returning from user code.
+
+    The sync object is th->th.th_dispatch->th_dispatch_sh_current.
+
+    Events are not generated in the case of a serialized team.
+    ------------------------------------------------------------------------------------------------
+*/
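+
+// Illustrative sequence for one thread passing through an ordered section (a sketch,
+// not runtime code; the serialized-team early-outs are omitted):
+//
+//     __kmp_itt_ordered_init( gtid );    // once per thread, before first use
+//     __kmp_itt_ordered_prep( gtid );    // about to wait for our turn
+//     /* ... synchronization: wait until the previous iteration finishes ... */
+//     __kmp_itt_ordered_start( gtid );   // our turn: entering user code
+//     /* ... user code inside the ordered construct ... */
+//     __kmp_itt_ordered_end( gtid );     // left user code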
+
+void
+__kmp_itt_ordered_init( int gtid ) {
+#if USE_ITT_NOTIFY
+    if ( __itt_sync_create_ptr ) {
+        kmp_info_t *   thr   = __kmp_thread_from_gtid( gtid );
+        ident_t const * loc  = thr->th.th_ident;
+        char const *    src  = ( loc == NULL ? NULL : loc->psource );
+        __itt_sync_create(
+            thr->th.th_dispatch->th_dispatch_sh_current, "OMP Ordered", src, 0
+        );
+    }; // if
+#endif
+} // __kmp_itt_ordered_init
+
+void
+__kmp_itt_ordered_prep( int gtid ) {
+#if USE_ITT_NOTIFY
+    if ( __itt_sync_create_ptr ) {
+        kmp_team_t * t = __kmp_team_from_gtid( gtid );
+        if ( ! t->t.t_serialized ) {
+            kmp_info_t * th = __kmp_thread_from_gtid( gtid );
+            __itt_sync_prepare( th->th.th_dispatch->th_dispatch_sh_current );
+        }; // if
+    }; // if
+#endif
+} // __kmp_itt_ordered_prep
+
+void
+__kmp_itt_ordered_start( int gtid ) {
+#if USE_ITT_NOTIFY
+    if ( __itt_sync_create_ptr ) {
+        kmp_team_t * t = __kmp_team_from_gtid( gtid );
+        if ( ! t->t.t_serialized ) {
+            kmp_info_t * th = __kmp_thread_from_gtid( gtid );
+            __itt_sync_acquired( th->th.th_dispatch->th_dispatch_sh_current );
+        }; // if
+    }; // if
+#endif
+} // __kmp_itt_ordered_start
+
+void
+__kmp_itt_ordered_end( int gtid ) {
+#if USE_ITT_NOTIFY
+    if ( __itt_sync_create_ptr ) {
+        kmp_team_t * t = __kmp_team_from_gtid( gtid );
+        if ( ! t->t.t_serialized ) {
+            kmp_info_t * th = __kmp_thread_from_gtid( gtid );
+            __itt_sync_releasing( th->th.th_dispatch->th_dispatch_sh_current );
+        }; // if
+    }; // if
+#endif
+} // __kmp_itt_ordered_end
+
+
+/*
+    ------------------------------------------------------------------------------------------------
+    Threads reporting.
+    ------------------------------------------------------------------------------------------------
+*/
+
+void
+__kmp_itt_thread_ignore() {
+    __itt_thr_ignore();
+} // __kmp_itt_thread_ignore
+
+void
+__kmp_itt_thread_name( int gtid ) {
+#if USE_ITT_NOTIFY
+    if ( __itt_thr_name_set_ptr ) {
+        kmp_str_buf_t name;
+        __kmp_str_buf_init( & name );
+        if( KMP_MASTER_GTID(gtid) ) {
+            __kmp_str_buf_print( & name, "OMP Master Thread #%d", gtid );
+        } else {
+            __kmp_str_buf_print( & name, "OMP Worker Thread #%d", gtid );
+        }
+        KMP_ITT_DEBUG_LOCK();
+        __itt_thr_name_set( name.str, name.used );
+        KMP_ITT_DEBUG_PRINT( "[thr nam] name( \"%s\")\n", name.str );
+        __kmp_str_buf_free( & name );
+    }; // if
+#endif
+} // __kmp_itt_thread_name
+
+
+/*
+    --------------------------------------------------------------------------
+    System object reporting.
+
+    ITT catches operations on system sync objects (such as Windows* OS critical
+    sections and events on IA-32 architecture). We only need to set a name
+    ("OMP Scheduler") on the object to let ITT know it is used by the OpenMP
+    RTL for internal purposes.
+    --------------------------------------------------------------------------
+*/
+
+void
+__kmp_itt_system_object_created( void * object, char const * name ) {
+#if USE_ITT_NOTIFY
+    KMP_ITT_DEBUG_LOCK();
+    __itt_sync_create( object, "OMP Scheduler", name, 0 );
+    KMP_ITT_DEBUG_PRINT( "[sys obj] scre( %p, \"OMP Scheduler\", \"%s\", 0 )\n", object, name );
+#endif
+} // __kmp_itt_system_object_created
+
+
+/*
+    ------------------------------------------------------------------------------------------------
+    Stack stitching api.
+
+    The master calls "create" and puts the stitching id into the team structure.
+    Workers read the stitching id and call the "enter" / "leave" API.
+    The master calls "destroy" at the end of the parallel region.
+    ------------------------------------------------------------------------------------------------
+*/
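+
+// Illustrative division of labor (a sketch, not runtime code; `stack_id' is hypothetical
+// shorthand for wherever the team structure actually stores the id):
+//
+//     // master, at parallel region start:
+//     team->stack_id = __kmp_itt_stack_caller_create();
+//
+//     // each worker, around its outlined microtask:
+//     __kmp_itt_stack_callee_enter( team->stack_id );
+//     /* ... microtask body ... */
+//     __kmp_itt_stack_callee_leave( team->stack_id );
+//
+//     // master, at parallel region end:
+//     __kmp_itt_stack_caller_destroy( team->stack_id );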
+
+__itt_caller
+__kmp_itt_stack_caller_create()
+{
+#if USE_ITT_NOTIFY
+    if ( !__itt_stack_caller_create_ptr )
+        return NULL;
+    KMP_ITT_DEBUG_LOCK();
+    __itt_caller id = __itt_stack_caller_create();
+    KMP_ITT_DEBUG_PRINT( "[stk cre] %p\n", id );
+    return id;
+#endif
+    return NULL;
+}
+
+void
+__kmp_itt_stack_caller_destroy( __itt_caller id )
+{
+#if USE_ITT_NOTIFY
+    if ( __itt_stack_caller_destroy_ptr ) {
+        KMP_ITT_DEBUG_LOCK();
+        __itt_stack_caller_destroy( id );
+        KMP_ITT_DEBUG_PRINT( "[stk des] %p\n", id );
+    }
+#endif
+}
+
+void
+__kmp_itt_stack_callee_enter( __itt_caller id )
+{
+#if USE_ITT_NOTIFY
+    if ( __itt_stack_callee_enter_ptr ) {
+        KMP_ITT_DEBUG_LOCK();
+        __itt_stack_callee_enter( id );
+        KMP_ITT_DEBUG_PRINT( "[stk ent] %p\n", id );
+    }
+#endif
+}
+
+void
+__kmp_itt_stack_callee_leave( __itt_caller id )
+{
+#if USE_ITT_NOTIFY
+    if ( __itt_stack_callee_leave_ptr ) {
+        KMP_ITT_DEBUG_LOCK();
+        __itt_stack_callee_leave( id );
+        KMP_ITT_DEBUG_PRINT( "[stk lea] %p\n", id );
+    }
+#endif
+}
+
+#endif /* USE_ITT_BUILD */
diff --git a/final/runtime/src/kmp_lock.cpp b/final/runtime/src/kmp_lock.cpp
new file mode 100644
index 0000000..21c87e6
--- /dev/null
+++ b/final/runtime/src/kmp_lock.cpp
@@ -0,0 +1,4161 @@
+/*
+ * kmp_lock.cpp -- lock-related functions
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include <stddef.h>
+
+#include "kmp.h"
+#include "kmp_itt.h"
+#include "kmp_i18n.h"
+#include "kmp_lock.h"
+#include "kmp_io.h"
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+# include <unistd.h>
+# include <sys/syscall.h>
+// We should really include <futex.h>, but that causes compatibility problems on different
+// Linux* OS distributions that either require that you include (or break when you try to
+// include) <pci/types.h>.
+// Since all we need is the two macros below (which are part of the kernel ABI, so can't
+// change), we just define the constants here and don't include <futex.h>.
+# ifndef FUTEX_WAIT
+#  define FUTEX_WAIT    0
+# endif
+# ifndef FUTEX_WAKE
+#  define FUTEX_WAKE    1
+# endif
+#endif
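+
+// Hedged sketch of how the two constants are used further down (see
+// __kmp_acquire_futex_lock / __kmp_release_futex_lock; not standalone code):
+//
+//     // waiter: sleep in the kernel while *addr still holds the value we last saw
+//     syscall( __NR_futex, addr, FUTEX_WAIT, seen_val, NULL, NULL, 0 );
+//
+//     // releaser: wake up to one thread blocked on addr
+//     syscall( __NR_futex, addr, FUTEX_WAKE, 1, NULL, NULL, 0 );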
+
+/* Implement spin locks for internal library use.             */
+/* The algorithm implemented is Lamport's bakery lock [1974]. */
+
+void
+__kmp_validate_locks( void )
+{
+    int i;
+    kmp_uint32  x, y;
+
+    /* Check to make sure unsigned arithmetic wraps properly */
+    x = ~((kmp_uint32) 0) - 2;
+    y = x - 2;
+
+    for (i = 0; i < 8; ++i, ++x, ++y) {
+        kmp_uint32 z = (x - y);
+        KMP_ASSERT( z == 2 );
+    }
+
+    KMP_ASSERT( offsetof( kmp_base_queuing_lock, tail_id ) % 8 == 0 );
+}
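+
+// Worked instance of the loop above, assuming 32-bit unsigned wraparound: x starts at
+// 0xFFFFFFFD and y at 0xFFFFFFFB; incrementing both walks x through 0xFFFFFFFE,
+// 0xFFFFFFFF, 0x00000000, 0x00000001, ... while x - y stays exactly 2 across the wrap,
+// which is the property the bakery lock's ticket arithmetic relies on.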
+
+
+/* ------------------------------------------------------------------------ */
+/* test and set locks */
+
+//
+// For the non-nested locks, we can only assume that the first 4 bytes were
+// allocated, since gcc only allocates 4 bytes for omp_lock_t, and the Intel
+// compiler only allocates a 4 byte pointer on IA-32 architecture.  On
+// Windows* OS on Intel(R) 64, we can assume that all 8 bytes were allocated.
+//
+// gcc reserves >= 8 bytes for nested locks, so we can assume that the
+// entire 8 bytes were allocated for nested locks on all 64-bit platforms.
+//
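+
+// Consequence for the consistency checks below: a lock accessed through omp_lock_t has a
+// readable depth_locked field only when the whole kmp_tas_lock_t fits into the user's
+// storage, hence the recurring guard (a sketch of the pattern used in every
+// *_with_checks function in this section):
+//
+//     if ( ( sizeof( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE )   // depth field was allocated
+//       && __kmp_is_tas_lock_nestable( lck ) ) {             // and marks the lock nestable
+//         KMP_FATAL( LockNestableUsedAsSimple, func );
+//     }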
+
+static kmp_int32
+__kmp_get_tas_lock_owner( kmp_tas_lock_t *lck )
+{
+    return DYNA_LOCK_STRIP(TCR_4( lck->lk.poll )) - 1;
+}
+
+static inline bool
+__kmp_is_tas_lock_nestable( kmp_tas_lock_t *lck )
+{
+    return lck->lk.depth_locked != -1;
+}
+
+__forceinline static void
+__kmp_acquire_tas_lock_timed_template( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_MB();
+
+#ifdef USE_LOCK_PROFILE
+    kmp_uint32 curr = TCR_4( lck->lk.poll );
+    if ( ( curr != 0 ) && ( curr != gtid + 1 ) )
+        __kmp_printf( "LOCK CONTENTION: %p\n", lck );
+    /* else __kmp_printf( "." );*/
+#endif /* USE_LOCK_PROFILE */
+
+    if ( ( lck->lk.poll == DYNA_LOCK_FREE(tas) )
+      && KMP_COMPARE_AND_STORE_ACQ32( & ( lck->lk.poll ), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas) ) ) {
+        KMP_FSYNC_ACQUIRED(lck);
+        return;
+    }
+
+    kmp_uint32 spins;
+    KMP_FSYNC_PREPARE( lck );
+    KMP_INIT_YIELD( spins );
+    if ( TCR_4( __kmp_nth ) > ( __kmp_avail_proc ? __kmp_avail_proc :
+      __kmp_xproc ) ) {
+        KMP_YIELD( TRUE );
+    }
+    else {
+        KMP_YIELD_SPIN( spins );
+    }
+
+    while ( ( lck->lk.poll != DYNA_LOCK_FREE(tas) ) ||
+      ( ! KMP_COMPARE_AND_STORE_ACQ32( & ( lck->lk.poll ), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas) ) ) ) {
+        //
+        // FIXME - use exponential backoff here
+        //
+        if ( TCR_4( __kmp_nth ) > ( __kmp_avail_proc ? __kmp_avail_proc :
+          __kmp_xproc ) ) {
+            KMP_YIELD( TRUE );
+        }
+        else {
+            KMP_YIELD_SPIN( spins );
+        }
+    }
+    KMP_FSYNC_ACQUIRED( lck );
+}
+
+void
+__kmp_acquire_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    __kmp_acquire_tas_lock_timed_template( lck, gtid );
+}
+
+static void
+__kmp_acquire_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_set_lock";
+    if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE )
+      && __kmp_is_tas_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( ( gtid >= 0 ) && ( __kmp_get_tas_lock_owner( lck ) == gtid ) ) {
+        KMP_FATAL( LockIsAlreadyOwned, func );
+    }
+    __kmp_acquire_tas_lock( lck, gtid );
+}
+
+int
+__kmp_test_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    if ( ( lck->lk.poll == DYNA_LOCK_FREE(tas) )
+      && KMP_COMPARE_AND_STORE_ACQ32( & ( lck->lk.poll ), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas) ) ) {
+        KMP_FSYNC_ACQUIRED( lck );
+        return TRUE;
+    }
+    return FALSE;
+}
+
+static int
+__kmp_test_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_test_lock";
+    if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE )
+      && __kmp_is_tas_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    return __kmp_test_tas_lock( lck, gtid );
+}
+
+int
+__kmp_release_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KMP_FSYNC_RELEASING(lck);
+    KMP_ST_REL32( &(lck->lk.poll), DYNA_LOCK_FREE(tas) );
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KMP_YIELD( TCR_4( __kmp_nth ) > ( __kmp_avail_proc ? __kmp_avail_proc :
+      __kmp_xproc ) );
+    return KMP_LOCK_RELEASED;
+}
+
+static int
+__kmp_release_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_unset_lock";
+    KMP_MB();  /* in case another processor initialized lock */
+    if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE )
+      && __kmp_is_tas_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( __kmp_get_tas_lock_owner( lck ) == -1 ) {
+        KMP_FATAL( LockUnsettingFree, func );
+    }
+    if ( ( gtid >= 0 ) && ( __kmp_get_tas_lock_owner( lck ) >= 0 )
+      && ( __kmp_get_tas_lock_owner( lck ) != gtid ) ) {
+        KMP_FATAL( LockUnsettingSetByAnother, func );
+    }
+    return __kmp_release_tas_lock( lck, gtid );
+}
+
+void
+__kmp_init_tas_lock( kmp_tas_lock_t * lck )
+{
+    TCW_4( lck->lk.poll, DYNA_LOCK_FREE(tas) );
+}
+
+static void
+__kmp_init_tas_lock_with_checks( kmp_tas_lock_t * lck )
+{
+    __kmp_init_tas_lock( lck );
+}
+
+void
+__kmp_destroy_tas_lock( kmp_tas_lock_t *lck )
+{
+    lck->lk.poll = 0;
+}
+
+static void
+__kmp_destroy_tas_lock_with_checks( kmp_tas_lock_t *lck )
+{
+    char const * const func = "omp_destroy_lock";
+    if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE )
+      && __kmp_is_tas_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( __kmp_get_tas_lock_owner( lck ) != -1 ) {
+        KMP_FATAL( LockStillOwned, func );
+    }
+    __kmp_destroy_tas_lock( lck );
+}
+
+
+//
+// nested test and set locks
+//
+
+void
+__kmp_acquire_nested_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    if ( __kmp_get_tas_lock_owner( lck ) == gtid ) {
+        lck->lk.depth_locked += 1;
+    }
+    else {
+        __kmp_acquire_tas_lock_timed_template( lck, gtid );
+        lck->lk.depth_locked = 1;
+    }
+}
+
+static void
+__kmp_acquire_nested_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_set_nest_lock";
+    if ( ! __kmp_is_tas_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    __kmp_acquire_nested_tas_lock( lck, gtid );
+}
+
+int
+__kmp_test_nested_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    int retval;
+
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    if ( __kmp_get_tas_lock_owner( lck ) == gtid ) {
+        retval = ++lck->lk.depth_locked;
+    }
+    else if ( !__kmp_test_tas_lock( lck, gtid ) ) {
+        retval = 0;
+    }
+    else {
+        KMP_MB();
+        retval = lck->lk.depth_locked = 1;
+    }
+    return retval;
+}
+
+static int
+__kmp_test_nested_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_test_nest_lock";
+    if ( ! __kmp_is_tas_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    return __kmp_test_nested_tas_lock( lck, gtid );
+}
+
+int
+__kmp_release_nested_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    KMP_MB();
+    if ( --(lck->lk.depth_locked) == 0 ) {
+        __kmp_release_tas_lock( lck, gtid );
+        return KMP_LOCK_RELEASED;
+    }
+    return KMP_LOCK_STILL_HELD;
+}
+
+static int
+__kmp_release_nested_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_unset_nest_lock";
+    KMP_MB();  /* in case another processor initialized lock */
+    if ( ! __kmp_is_tas_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    if ( __kmp_get_tas_lock_owner( lck ) == -1 ) {
+        KMP_FATAL( LockUnsettingFree, func );
+    }
+    if ( __kmp_get_tas_lock_owner( lck ) != gtid ) {
+        KMP_FATAL( LockUnsettingSetByAnother, func );
+    }
+    return __kmp_release_nested_tas_lock( lck, gtid );
+}
+
+void
+__kmp_init_nested_tas_lock( kmp_tas_lock_t * lck )
+{
+    __kmp_init_tas_lock( lck );
+    lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
+}
+
+static void
+__kmp_init_nested_tas_lock_with_checks( kmp_tas_lock_t * lck )
+{
+    __kmp_init_nested_tas_lock( lck );
+}
+
+void
+__kmp_destroy_nested_tas_lock( kmp_tas_lock_t *lck )
+{
+    __kmp_destroy_tas_lock( lck );
+    lck->lk.depth_locked = 0;
+}
+
+static void
+__kmp_destroy_nested_tas_lock_with_checks( kmp_tas_lock_t *lck )
+{
+    char const * const func = "omp_destroy_nest_lock";
+    if ( ! __kmp_is_tas_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    if ( __kmp_get_tas_lock_owner( lck ) != -1 ) {
+        KMP_FATAL( LockStillOwned, func );
+    }
+    __kmp_destroy_nested_tas_lock( lck );
+}
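+
+// Depth bookkeeping in miniature (a sketch of the nestable protocol above as seen
+// through the user API, with gtid the calling thread):
+//
+//     omp_set_nest_lock( &l );     // owner != gtid: real acquire, depth_locked = 1
+//     omp_set_nest_lock( &l );     // owner == gtid: depth_locked = 2, no atomics needed
+//     omp_unset_nest_lock( &l );   // depth_locked = 1, lock still held
+//     omp_unset_nest_lock( &l );   // depth_locked = 0: real release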
+
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+
+/* ------------------------------------------------------------------------ */
+/* futex locks */
+
+// futex locks are really just test and set locks, with a different method
+// of handling contention.  They take the same amount of space as test and
+// set locks, and are allocated the same way (i.e. use the area allocated by
+// the compiler for non-nested locks / allocate nested locks on the heap).
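+
+// Sketch of the poll-word encoding used below (illustrative; it ignores the DYNA_* tag
+// bits that come into play when KMP_USE_DYNAMIC_LOCK is on):
+//
+//     poll == 0                    unlocked
+//     poll == (gtid + 1) << 1      locked by gtid, no known waiters
+//     poll with the LSB set        locked, and at least one waiter slept in the kernel,
+//                                  so the releaser must issue FUTEX_WAKE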
+
+static kmp_int32
+__kmp_get_futex_lock_owner( kmp_futex_lock_t *lck )
+{
+    return DYNA_LOCK_STRIP(( TCR_4( lck->lk.poll ) >> 1 )) - 1;
+}
+
+static inline bool
+__kmp_is_futex_lock_nestable( kmp_futex_lock_t *lck )
+{
+    return lck->lk.depth_locked != -1;
+}
+
+__forceinline static void
+__kmp_acquire_futex_lock_timed_template( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    kmp_int32 gtid_code = ( gtid + 1 ) << 1;
+
+    KMP_MB();
+
+#ifdef USE_LOCK_PROFILE
+    kmp_uint32 curr = TCR_4( lck->lk.poll );
+    if ( ( curr != 0 ) && ( curr != gtid_code ) )
+        __kmp_printf( "LOCK CONTENTION: %p\n", lck );
+    /* else __kmp_printf( "." );*/
+#endif /* USE_LOCK_PROFILE */
+
+    KMP_FSYNC_PREPARE( lck );
+    KA_TRACE( 1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d entering\n",
+      lck, lck->lk.poll, gtid ) );
+
+    kmp_int32 poll_val;
+
+    while ( ( poll_val = KMP_COMPARE_AND_STORE_RET32( & ( lck->lk.poll ), DYNA_LOCK_FREE(futex),
+             DYNA_LOCK_BUSY(gtid_code, futex) ) ) != DYNA_LOCK_FREE(futex) ) {
+
+        kmp_int32 cond = DYNA_LOCK_STRIP(poll_val) & 1;
+        KA_TRACE( 1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d poll_val = 0x%x cond = 0x%x\n",
+           lck, gtid, poll_val, cond ) );
+
+        //
+        // NOTE: do not rewrite the condition for this branch as
+        //
+        // if ( poll_val & 1 == 0 )
+        //
+        // In C, `==' binds tighter than `&', so that parses as
+        // poll_val & ( 1 == 0 ), which is always 0; the following block
+        // would then always be skipped, regardless of the LSB of poll_val.
+        //
+        if ( ! cond ) {
+            //
+            // Try to set the lsb in the poll to indicate to the owner
+            // thread that they need to wake this thread up.
+            //
+            if ( ! KMP_COMPARE_AND_STORE_REL32( & ( lck->lk.poll ), poll_val, poll_val | DYNA_LOCK_BUSY(1, futex) ) ) {
+                KA_TRACE( 1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d can't set bit 0\n",
+                  lck, lck->lk.poll, gtid ) );
+                continue;
+            }
+            poll_val |= DYNA_LOCK_BUSY(1, futex);
+
+            KA_TRACE( 1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d bit 0 set\n",
+              lck, lck->lk.poll, gtid ) );
+        }
+
+        KA_TRACE( 1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d before futex_wait(0x%x)\n",
+           lck, gtid, poll_val ) );
+
+        kmp_int32 rc;
+        if ( ( rc = syscall( __NR_futex, & ( lck->lk.poll ), FUTEX_WAIT,
+          poll_val, NULL, NULL, 0 ) ) != 0 ) {
+            KA_TRACE( 1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) failed (rc=%d errno=%d)\n",
+               lck, gtid, poll_val, rc, errno ) );
+            continue;
+        }
+
+        KA_TRACE( 1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d after futex_wait(0x%x)\n",
+           lck, gtid, poll_val ) );
+        //
+        // This thread has now done a successful futex wait call and was
+        // entered on the OS futex queue.  We must now perform a futex
+        // wake call when releasing the lock, as we have no idea how many
+        // other threads are in the queue.
+        //
+        gtid_code |= 1;
+    }
+
+    KMP_FSYNC_ACQUIRED( lck );
+    KA_TRACE( 1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d exiting\n",
+      lck, lck->lk.poll, gtid ) );
+}
+
+void
+__kmp_acquire_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    __kmp_acquire_futex_lock_timed_template( lck, gtid );
+}
+
+static void
+__kmp_acquire_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_set_lock";
+    if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE )
+      && __kmp_is_futex_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( ( gtid >= 0 ) && ( __kmp_get_futex_lock_owner( lck ) == gtid ) ) {
+        KMP_FATAL( LockIsAlreadyOwned, func );
+    }
+    __kmp_acquire_futex_lock( lck, gtid );
+}
+
+int
+__kmp_test_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    if ( KMP_COMPARE_AND_STORE_ACQ32( & ( lck->lk.poll ), DYNA_LOCK_FREE(futex), DYNA_LOCK_BUSY(gtid+1, futex) << 1 ) ) {
+        KMP_FSYNC_ACQUIRED( lck );
+        return TRUE;
+    }
+    return FALSE;
+}
+
+static int
+__kmp_test_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_test_lock";
+    if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE )
+      && __kmp_is_futex_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    return __kmp_test_futex_lock( lck, gtid );
+}
+
+int
+__kmp_release_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KA_TRACE( 1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d entering\n",
+      lck, lck->lk.poll, gtid ) );
+
+    KMP_FSYNC_RELEASING(lck);
+
+    kmp_int32 poll_val = KMP_XCHG_FIXED32( & ( lck->lk.poll ), DYNA_LOCK_FREE(futex) );
+
+    KA_TRACE( 1000, ("__kmp_release_futex_lock: lck:%p, T#%d released poll_val = 0x%x\n",
+       lck, gtid, poll_val ) );
+
+    if ( DYNA_LOCK_STRIP(poll_val) & 1 ) {
+        KA_TRACE( 1000, ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n",
+           lck, gtid ) );
+        syscall( __NR_futex, & ( lck->lk.poll ), FUTEX_WAKE, DYNA_LOCK_BUSY(1, futex), NULL, NULL, 0 );
+    }
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KA_TRACE( 1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d exiting\n",
+      lck, lck->lk.poll, gtid ) );
+
+    KMP_YIELD( TCR_4( __kmp_nth ) > ( __kmp_avail_proc ? __kmp_avail_proc :
+      __kmp_xproc ) );
+    return KMP_LOCK_RELEASED;
+}
+
+static int
+__kmp_release_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_unset_lock";
+    KMP_MB();  /* in case another processor initialized lock */
+    if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE )
+      && __kmp_is_futex_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( __kmp_get_futex_lock_owner( lck ) == -1 ) {
+        KMP_FATAL( LockUnsettingFree, func );
+    }
+    if ( ( gtid >= 0 ) && ( __kmp_get_futex_lock_owner( lck ) >= 0 )
+      && ( __kmp_get_futex_lock_owner( lck ) != gtid ) ) {
+        KMP_FATAL( LockUnsettingSetByAnother, func );
+    }
+    return __kmp_release_futex_lock( lck, gtid );
+}
+
+void
+__kmp_init_futex_lock( kmp_futex_lock_t * lck )
+{
+    TCW_4( lck->lk.poll, DYNA_LOCK_FREE(futex) );
+}
+
+static void
+__kmp_init_futex_lock_with_checks( kmp_futex_lock_t * lck )
+{
+    __kmp_init_futex_lock( lck );
+}
+
+void
+__kmp_destroy_futex_lock( kmp_futex_lock_t *lck )
+{
+    lck->lk.poll = 0;
+}
+
+static void
+__kmp_destroy_futex_lock_with_checks( kmp_futex_lock_t *lck )
+{
+    char const * const func = "omp_destroy_lock";
+    if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE )
+      && __kmp_is_futex_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( __kmp_get_futex_lock_owner( lck ) != -1 ) {
+        KMP_FATAL( LockStillOwned, func );
+    }
+    __kmp_destroy_futex_lock( lck );
+}
+
+
+//
+// nested futex locks
+//
+
+void
+__kmp_acquire_nested_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    if ( __kmp_get_futex_lock_owner( lck ) == gtid ) {
+        lck->lk.depth_locked += 1;
+    }
+    else {
+        __kmp_acquire_futex_lock_timed_template( lck, gtid );
+        lck->lk.depth_locked = 1;
+    }
+}
+
+static void
+__kmp_acquire_nested_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_set_nest_lock";
+    if ( ! __kmp_is_futex_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    __kmp_acquire_nested_futex_lock( lck, gtid );
+}
+
+int
+__kmp_test_nested_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    int retval;
+
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    if ( __kmp_get_futex_lock_owner( lck ) == gtid ) {
+        retval = ++lck->lk.depth_locked;
+    }
+    else if ( !__kmp_test_futex_lock( lck, gtid ) ) {
+        retval = 0;
+    }
+    else {
+        KMP_MB();
+        retval = lck->lk.depth_locked = 1;
+    }
+    return retval;
+}
+
+static int
+__kmp_test_nested_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_test_nest_lock";
+    if ( ! __kmp_is_futex_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    return __kmp_test_nested_futex_lock( lck, gtid );
+}
+
+int
+__kmp_release_nested_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    KMP_MB();
+    if ( --(lck->lk.depth_locked) == 0 ) {
+        __kmp_release_futex_lock( lck, gtid );
+        return KMP_LOCK_RELEASED;
+    }
+    return KMP_LOCK_STILL_HELD;
+}
+
+static int
+__kmp_release_nested_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_unset_nest_lock";
+    KMP_MB();  /* in case another processor initialized lock */
+    if ( ! __kmp_is_futex_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    if ( __kmp_get_futex_lock_owner( lck ) == -1 ) {
+        KMP_FATAL( LockUnsettingFree, func );
+    }
+    if ( __kmp_get_futex_lock_owner( lck ) != gtid ) {
+        KMP_FATAL( LockUnsettingSetByAnother, func );
+    }
+    return __kmp_release_nested_futex_lock( lck, gtid );
+}
+
+void
+__kmp_init_nested_futex_lock( kmp_futex_lock_t * lck )
+{
+    __kmp_init_futex_lock( lck );
+    lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
+}
+
+static void
+__kmp_init_nested_futex_lock_with_checks( kmp_futex_lock_t * lck )
+{
+    __kmp_init_nested_futex_lock( lck );
+}
+
+void
+__kmp_destroy_nested_futex_lock( kmp_futex_lock_t *lck )
+{
+    __kmp_destroy_futex_lock( lck );
+    lck->lk.depth_locked = 0;
+}
+
+static void
+__kmp_destroy_nested_futex_lock_with_checks( kmp_futex_lock_t *lck )
+{
+    char const * const func = "omp_destroy_nest_lock";
+    if ( ! __kmp_is_futex_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    if ( __kmp_get_futex_lock_owner( lck ) != -1 ) {
+        KMP_FATAL( LockStillOwned, func );
+    }
+    __kmp_destroy_nested_futex_lock( lck );
+}
+
+#endif // KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+
+
+/* ------------------------------------------------------------------------ */
+/* ticket (bakery) locks */
+
+static kmp_int32
+__kmp_get_ticket_lock_owner( kmp_ticket_lock_t *lck )
+{
+    return TCR_4( lck->lk.owner_id ) - 1;
+}
+
+static inline bool
+__kmp_is_ticket_lock_nestable( kmp_ticket_lock_t *lck )
+{
+    return lck->lk.depth_locked != -1;
+}
+
+static kmp_uint32
+__kmp_bakery_check(kmp_uint value, kmp_uint checker)
+{
+    register kmp_uint32 pause;
+
+    if (value == checker) {
+        return TRUE;
+    }
+    for (pause = checker - value; pause != 0; --pause);
+    return FALSE;
+}
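+
+// The ticket protocol in miniature (a sketch; the real code below adds memory fences,
+// yielding, and lock profiling):
+//
+//     my_ticket = fetch_and_inc( &lck->lk.next_ticket );   // take a number
+//     while ( lck->lk.now_serving != my_ticket )           // wait to be served
+//         ; /* spin / back off, see __kmp_bakery_check */
+//     /* ... critical section ... */
+//     lck->lk.now_serving += 1;                            // serve the next ticket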
+
+__forceinline static void
+__kmp_acquire_ticket_lock_timed_template( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    kmp_uint32 my_ticket;
+    KMP_MB();
+
+    my_ticket = KMP_TEST_THEN_INC32( (kmp_int32 *) &lck->lk.next_ticket );
+
+#ifdef USE_LOCK_PROFILE
+    if ( TCR_4( lck->lk.now_serving ) != my_ticket )
+        __kmp_printf( "LOCK CONTENTION: %p\n", lck );
+    /* else __kmp_printf( "." );*/
+#endif /* USE_LOCK_PROFILE */
+
+    if ( TCR_4( lck->lk.now_serving ) == my_ticket ) {
+        KMP_FSYNC_ACQUIRED(lck);
+        return;
+    }
+    KMP_WAIT_YIELD( &lck->lk.now_serving, my_ticket, __kmp_bakery_check, lck );
+    KMP_FSYNC_ACQUIRED(lck);
+}
+
+void
+__kmp_acquire_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    __kmp_acquire_ticket_lock_timed_template( lck, gtid );
+}
+
+static void
+__kmp_acquire_ticket_lock_with_checks( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_set_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_ticket_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( ( gtid >= 0 ) && ( __kmp_get_ticket_lock_owner( lck ) == gtid ) ) {
+        KMP_FATAL( LockIsAlreadyOwned, func );
+    }
+
+    __kmp_acquire_ticket_lock( lck, gtid );
+
+    lck->lk.owner_id = gtid + 1;
+}
+
+int
+__kmp_test_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    kmp_uint32 my_ticket = TCR_4( lck->lk.next_ticket );
+    if ( TCR_4( lck->lk.now_serving ) == my_ticket ) {
+        kmp_uint32 next_ticket = my_ticket + 1;
+        if ( KMP_COMPARE_AND_STORE_ACQ32( (kmp_int32 *) &lck->lk.next_ticket,
+          my_ticket, next_ticket ) ) {
+            KMP_FSYNC_ACQUIRED( lck );
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
+static int
+__kmp_test_ticket_lock_with_checks( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_test_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_ticket_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+
+    int retval = __kmp_test_ticket_lock( lck, gtid );
+
+    if ( retval ) {
+        lck->lk.owner_id = gtid + 1;
+    }
+    return retval;
+}
+
+int
+__kmp_release_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    kmp_uint32  distance;
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KMP_FSYNC_RELEASING(lck);
+    distance = ( TCR_4( lck->lk.next_ticket ) - TCR_4( lck->lk.now_serving ) );
+
+    KMP_ST_REL32( &(lck->lk.now_serving), lck->lk.now_serving + 1 );
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KMP_YIELD( distance
+      > (kmp_uint32) (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc) );
+    return KMP_LOCK_RELEASED;
+}
+
+static int
+__kmp_release_ticket_lock_with_checks( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_unset_lock";
+    KMP_MB();  /* in case another processor initialized lock */
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_ticket_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( __kmp_get_ticket_lock_owner( lck ) == -1 ) {
+        KMP_FATAL( LockUnsettingFree, func );
+    }
+    if ( ( gtid >= 0 ) && ( __kmp_get_ticket_lock_owner( lck ) >= 0 )
+      && ( __kmp_get_ticket_lock_owner( lck ) != gtid ) ) {
+        KMP_FATAL( LockUnsettingSetByAnother, func );
+    }
+    lck->lk.owner_id = 0;
+    return __kmp_release_ticket_lock( lck, gtid );
+}
+
+void
+__kmp_init_ticket_lock( kmp_ticket_lock_t * lck )
+{
+    lck->lk.location = NULL;
+    TCW_4( lck->lk.next_ticket, 0 );
+    TCW_4( lck->lk.now_serving, 0 );
+    lck->lk.owner_id = 0;      // no thread owns the lock.
+    lck->lk.depth_locked = -1; // -1 => not a nested lock.
+    lck->lk.initialized = (kmp_ticket_lock *)lck;
+}
+
+static void
+__kmp_init_ticket_lock_with_checks( kmp_ticket_lock_t * lck )
+{
+    __kmp_init_ticket_lock( lck );
+}
+
+void
+__kmp_destroy_ticket_lock( kmp_ticket_lock_t *lck )
+{
+    lck->lk.initialized = NULL;
+    lck->lk.location    = NULL;
+    lck->lk.next_ticket = 0;
+    lck->lk.now_serving = 0;
+    lck->lk.owner_id = 0;
+    lck->lk.depth_locked = -1;
+}
+
+static void
+__kmp_destroy_ticket_lock_with_checks( kmp_ticket_lock_t *lck )
+{
+    char const * const func = "omp_destroy_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_ticket_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( __kmp_get_ticket_lock_owner( lck ) != -1 ) {
+        KMP_FATAL( LockStillOwned, func );
+    }
+    __kmp_destroy_ticket_lock( lck );
+}
+
+
+//
+// nested ticket locks
+//
+
+void
+__kmp_acquire_nested_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    if ( __kmp_get_ticket_lock_owner( lck ) == gtid ) {
+        lck->lk.depth_locked += 1;
+    }
+    else {
+        __kmp_acquire_ticket_lock_timed_template( lck, gtid );
+        KMP_MB();
+        lck->lk.depth_locked = 1;
+        KMP_MB();
+        lck->lk.owner_id = gtid + 1;
+    }
+}
+
+static void
+__kmp_acquire_nested_ticket_lock_with_checks( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_set_nest_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_ticket_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    __kmp_acquire_nested_ticket_lock( lck, gtid );
+}
+
+int
+__kmp_test_nested_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    int retval;
+
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    if ( __kmp_get_ticket_lock_owner( lck ) == gtid ) {
+        retval = ++lck->lk.depth_locked;
+    }
+    else if ( !__kmp_test_ticket_lock( lck, gtid ) ) {
+        retval = 0;
+    }
+    else {
+        KMP_MB();
+        retval = lck->lk.depth_locked = 1;
+        KMP_MB();
+        lck->lk.owner_id = gtid + 1;
+    }
+    return retval;
+}
+
+static int
+__kmp_test_nested_ticket_lock_with_checks( kmp_ticket_lock_t *lck,
+  kmp_int32 gtid )
+{
+    char const * const func = "omp_test_nest_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_ticket_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    return __kmp_test_nested_ticket_lock( lck, gtid );
+}
+
+int
+__kmp_release_nested_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    KMP_MB();
+    if ( --(lck->lk.depth_locked) == 0 ) {
+        KMP_MB();
+        lck->lk.owner_id = 0;
+        __kmp_release_ticket_lock( lck, gtid );
+        return KMP_LOCK_RELEASED;
+    }
+    return KMP_LOCK_STILL_HELD;
+}
+
+static int
+__kmp_release_nested_ticket_lock_with_checks( kmp_ticket_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_unset_nest_lock";
+    KMP_MB();  /* in case another processor initialized lock */
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_ticket_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    if ( __kmp_get_ticket_lock_owner( lck ) == -1 ) {
+        KMP_FATAL( LockUnsettingFree, func );
+    }
+    if ( __kmp_get_ticket_lock_owner( lck ) != gtid ) {
+        KMP_FATAL( LockUnsettingSetByAnother, func );
+    }
+    return __kmp_release_nested_ticket_lock( lck, gtid );
+}
+
+void
+__kmp_init_nested_ticket_lock( kmp_ticket_lock_t * lck )
+{
+    __kmp_init_ticket_lock( lck );
+    lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
+}
+
+static void
+__kmp_init_nested_ticket_lock_with_checks( kmp_ticket_lock_t * lck )
+{
+    __kmp_init_nested_ticket_lock( lck );
+}
+
+void
+__kmp_destroy_nested_ticket_lock( kmp_ticket_lock_t *lck )
+{
+    __kmp_destroy_ticket_lock( lck );
+    lck->lk.depth_locked = 0;
+}
+
+static void
+__kmp_destroy_nested_ticket_lock_with_checks( kmp_ticket_lock_t *lck )
+{
+    char const * const func = "omp_destroy_nest_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_ticket_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    if ( __kmp_get_ticket_lock_owner( lck ) != -1 ) {
+        KMP_FATAL( LockStillOwned, func );
+    }
+    __kmp_destroy_nested_ticket_lock( lck );
+}
+
+
+//
+// access functions to fields which don't exist for all lock kinds.
+//
+
+static int
+__kmp_is_ticket_lock_initialized( kmp_ticket_lock_t *lck )
+{
+    return lck == lck->lk.initialized;
+}
+
+static const ident_t *
+__kmp_get_ticket_lock_location( kmp_ticket_lock_t *lck )
+{
+    return lck->lk.location;
+}
+
+static void
+__kmp_set_ticket_lock_location( kmp_ticket_lock_t *lck, const ident_t *loc )
+{
+    lck->lk.location = loc;
+}
+
+static kmp_lock_flags_t
+__kmp_get_ticket_lock_flags( kmp_ticket_lock_t *lck )
+{
+    return lck->lk.flags;
+}
+
+static void
+__kmp_set_ticket_lock_flags( kmp_ticket_lock_t *lck, kmp_lock_flags_t flags )
+{
+    lck->lk.flags = flags;
+}
+
+/* ------------------------------------------------------------------------ */
+/* queuing locks */
+
+/*
+ * First the states
+ * (head,tail) =  0, 0  means lock is unheld, nobody on queue
+ *   UINT_MAX or -1, 0  means lock is held, nobody on queue
+ *                h, h  means lock is held or about to transition, 1 element on queue
+ *                h, t  h <> t, means lock is held or about to transition, >1 elements on queue
+ *
+ * Now the transitions
+ *    Acquire(0,0)  = -1 ,0
+ *    Release(0,0)  = Error
+ *    Acquire(-1,0) =  h ,h    h > 0
+ *    Release(-1,0) =  0 ,0
+ *    Acquire(h,h)  =  h ,t    h > 0, t > 0, h <> t
+ *    Release(h,h)  = -1 ,0    h > 0
+ *    Acquire(h,t)  =  h ,t'   h > 0, t > 0, t' > 0, h <> t, h <> t', t <> t'
+ *    Release(h,t)  =  h',t    h > 0, t > 0, h <> t, h <> h', h' maybe = t
+ *
+ * And pictorially
+ *
+ *
+ *          +-----+
+ *          | 0, 0|------- release -------> Error
+ *          +-----+
+ *            |  ^
+ *     acquire|  |release
+ *            |  |
+ *            |  |
+ *            v  |
+ *          +-----+
+ *          |-1, 0|
+ *          +-----+
+ *            |  ^
+ *     acquire|  |release
+ *            |  |
+ *            |  |
+ *            v  |
+ *          +-----+
+ *          | h, h|
+ *          +-----+
+ *            |  ^
+ *     acquire|  |release
+ *            |  |
+ *            |  |
+ *            v  |
+ *          +-----+
+ *          | h, t|----- acquire, release loopback ---+
+ *          +-----+                                   |
+ *               ^                                    |
+ *               |                                    |
+ *               +------------------------------------+
+ *
+ */
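+
+// Two-thread walkthrough of the transitions above (illustrative; queue ids are gtid+1):
+//
+//     (0,0)    initial: lock free, queue empty
+//     (-1,0)   T1 acquires uncontended:        Acquire(0,0)  -> (-1,0)
+//     (2,2)    T2 (gtid 1) arrives, enqueues:  Acquire(-1,0) -> (h,h) with h = 2
+//     (-1,0)   T1 releases, hands off to T2:   Release(h,h)  -> (-1,0), T2 now owns
+//     (0,0)    T2 releases, nobody waiting:    Release(-1,0) -> (0,0)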
+
+#ifdef DEBUG_QUEUING_LOCKS
+
+/* Stuff for circular trace buffer */
+#define TRACE_BUF_ELE	1024
+static char traces[TRACE_BUF_ELE][128] = { 0 };
+static int tc = 0;
+#define TRACE_LOCK(X,Y)          KMP_SNPRINTF( traces[tc++ % TRACE_BUF_ELE], 128,  "t%d at %s\n", X, Y );
+#define TRACE_LOCK_T(X,Y,Z)      KMP_SNPRINTF( traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s%d\n", X,Y,Z );
+#define TRACE_LOCK_HT(X,Y,Z,Q)   KMP_SNPRINTF( traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s %d,%d\n", X, Y, Z, Q );
+
+static void
+__kmp_dump_queuing_lock( kmp_info_t *this_thr, kmp_int32 gtid,
+  kmp_queuing_lock_t *lck, kmp_int32 head_id, kmp_int32 tail_id )
+{
+    kmp_int32 t, i;
+
+    __kmp_printf_no_lock( "\n__kmp_dump_queuing_lock: TRACE BEGINS HERE! \n" );
+
+    i = tc % TRACE_BUF_ELE;
+    __kmp_printf_no_lock( "%s\n", traces[i] );
+    i = (i+1) % TRACE_BUF_ELE;
+    while ( i != (tc % TRACE_BUF_ELE) ) {
+        __kmp_printf_no_lock( "%s", traces[i] );
+        i = (i+1) % TRACE_BUF_ELE;
+    }
+    __kmp_printf_no_lock( "\n" );
+
+    __kmp_printf_no_lock(
+             "\n__kmp_dump_queuing_lock: gtid+1:%d, spin_here:%d, next_wait:%d, head_id:%d, tail_id:%d\n",
+             gtid+1, this_thr->th.th_spin_here, this_thr->th.th_next_waiting,
+             head_id, tail_id );
+
+    __kmp_printf_no_lock( "\t\thead: %d ", lck->lk.head_id );
+
+    if ( lck->lk.head_id >= 1 ) {
+        t = __kmp_threads[lck->lk.head_id-1]->th.th_next_waiting;
+        while (t > 0) {
+            __kmp_printf_no_lock( "-> %d ", t );
+            t = __kmp_threads[t-1]->th.th_next_waiting;
+        }
+    }
+    __kmp_printf_no_lock( ";  tail: %d ", lck->lk.tail_id );
+    __kmp_printf_no_lock( "\n\n" );
+}
+
+#endif /* DEBUG_QUEUING_LOCKS */
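+
+// How the trace macros above appear in the lock code below (illustrative):
+//
+//     TRACE_LOCK( gtid+1, "acq ent" );                    // plain event
+//     TRACE_LOCK_T( gtid+1, "rel nw=0 for t=", head );    // event plus one integer
+//     TRACE_LOCK_HT( gtid+1, "acq read: ", head, tail );  // event plus head/tail pair
+//
+// tc wraps modulo TRACE_BUF_ELE, so __kmp_dump_queuing_lock replays the circular buffer
+// starting from the oldest surviving entry.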
+
+static kmp_int32
+__kmp_get_queuing_lock_owner( kmp_queuing_lock_t *lck )
+{
+    return TCR_4( lck->lk.owner_id ) - 1;
+}
+
+static inline bool
+__kmp_is_queuing_lock_nestable( kmp_queuing_lock_t *lck )
+{
+    return lck->lk.depth_locked != -1;
+}
+
+/* Acquire a lock using the queuing lock implementation */
+template <bool takeTime>
+/* [TLW] The unused template above is left behind because of what BEB believes is a
+   potential compiler problem with __forceinline. */
+__forceinline static void
+__kmp_acquire_queuing_lock_timed_template( kmp_queuing_lock_t *lck,
+  kmp_int32 gtid )
+{
+    register kmp_info_t *this_thr    = __kmp_thread_from_gtid( gtid );
+    volatile kmp_int32  *head_id_p   = & lck->lk.head_id;
+    volatile kmp_int32  *tail_id_p   = & lck->lk.tail_id;
+    volatile kmp_uint32 *spin_here_p;
+    kmp_int32 need_mf = 1;
+
+#if OMPT_SUPPORT
+    ompt_state_t prev_state = ompt_state_undefined;
+#endif
+
+    KA_TRACE( 1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d entering\n", lck, gtid ));
+
+    KMP_FSYNC_PREPARE( lck );
+    KMP_DEBUG_ASSERT( this_thr != NULL );
+    spin_here_p = & this_thr->th.th_spin_here;
+
+#ifdef DEBUG_QUEUING_LOCKS
+    TRACE_LOCK( gtid+1, "acq ent" );
+    if ( *spin_here_p )
+        __kmp_dump_queuing_lock( this_thr, gtid, lck, *head_id_p, *tail_id_p );
+    if ( this_thr->th.th_next_waiting != 0 )
+        __kmp_dump_queuing_lock( this_thr, gtid, lck, *head_id_p, *tail_id_p );
+#endif
+    KMP_DEBUG_ASSERT( !*spin_here_p );
+    KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
+
+
+    /* The following st.rel to spin_here_p needs to precede the cmpxchg.acq to head_id_p
+       that may follow, not just in execution order, but also in visibility order.  This way,
+       when a releasing thread observes the changes to the queue by this thread, it can
+       rightly assume that spin_here_p has already been set to TRUE, so that when it sets
+       spin_here_p to FALSE, it is not premature.  If the releasing thread sets spin_here_p
+       to FALSE before this thread sets it to TRUE, this thread will hang.
+    */
+    *spin_here_p = TRUE;  /* before enqueuing to prevent race */
+
+    while( 1 ) {
+        kmp_int32 enqueued;
+        kmp_int32 head;
+        kmp_int32 tail;
+
+        head = *head_id_p;
+
+        switch ( head ) {
+
+            case -1:
+            {
+#ifdef DEBUG_QUEUING_LOCKS
+                tail = *tail_id_p;
+                TRACE_LOCK_HT( gtid+1, "acq read: ", head, tail );
+#endif
+                tail = 0;  /* to make sure the next link asynchronously read is not set
+                           accidentally; this assignment prevents us from entering the
+                           if ( tail > 0 ) condition in the enqueued case below, which is
+                           not necessary for this state transition */
+
+                need_mf = 0;
+                /* try (-1,0)->(tid,tid) */
+                enqueued = KMP_COMPARE_AND_STORE_ACQ64( (volatile kmp_int64 *) tail_id_p,
+                  KMP_PACK_64( -1, 0 ),
+                  KMP_PACK_64( gtid+1, gtid+1 ) );
+#ifdef DEBUG_QUEUING_LOCKS
+                  if ( enqueued ) TRACE_LOCK( gtid+1, "acq enq: (-1,0)->(tid,tid)" );
+#endif
+            }
+            break;
+
+            default:
+            {
+                tail = *tail_id_p;
+                KMP_DEBUG_ASSERT( tail != gtid + 1 );
+
+#ifdef DEBUG_QUEUING_LOCKS
+                TRACE_LOCK_HT( gtid+1, "acq read: ", head, tail );
+#endif
+
+                if ( tail == 0 ) {
+                    enqueued = FALSE;
+                }
+                else {
+                    need_mf = 0;
+                    /* try (h,t) or (h,h)->(h,tid) */
+                    enqueued = KMP_COMPARE_AND_STORE_ACQ32( tail_id_p, tail, gtid+1 );
+
+#ifdef DEBUG_QUEUING_LOCKS
+                        if ( enqueued ) TRACE_LOCK( gtid+1, "acq enq: (h,t)->(h,tid)" );
+#endif
+                }
+            }
+            break;
+
+            case 0: /* empty queue */
+            {
+                kmp_int32 grabbed_lock;
+
+#ifdef DEBUG_QUEUING_LOCKS
+                tail = *tail_id_p;
+                TRACE_LOCK_HT( gtid+1, "acq read: ", head, tail );
+#endif
+                /* try (0,0)->(-1,0) */
+
+                /* only legal transition out of head = 0 is head = -1 with no change to tail */
+                grabbed_lock = KMP_COMPARE_AND_STORE_ACQ32( head_id_p, 0, -1 );
+
+                if ( grabbed_lock ) {
+
+                    *spin_here_p = FALSE;
+
+                    KA_TRACE( 1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: no queuing\n",
+                              lck, gtid ));
+#ifdef DEBUG_QUEUING_LOCKS
+                    TRACE_LOCK_HT( gtid+1, "acq exit: ", head, 0 );
+#endif
+
+#if OMPT_SUPPORT
+                    if ((ompt_status & ompt_status_track) &&
+                        prev_state != ompt_state_undefined) {
+                        /* change the state before clearing wait_id */
+                        this_thr->th.ompt_thread_info.state = prev_state;
+                        this_thr->th.ompt_thread_info.wait_id = 0;
+                    }
+#endif
+
+                    KMP_FSYNC_ACQUIRED( lck );
+                    return; /* lock holder cannot be on queue */
+                }
+                enqueued = FALSE;
+            }
+            break;
+        }
+
+#if OMPT_SUPPORT
+        if ((ompt_status & ompt_status_track) &&
+            prev_state == ompt_state_undefined) {
+            /* this thread will spin; set wait_id before entering wait state */
+            prev_state = this_thr->th.ompt_thread_info.state;
+            this_thr->th.ompt_thread_info.wait_id = (uint64_t) lck;
+            this_thr->th.ompt_thread_info.state = ompt_state_wait_lock;
+        }
+#endif
+
+        if ( enqueued ) {
+            if ( tail > 0 ) {
+                kmp_info_t *tail_thr = __kmp_thread_from_gtid( tail - 1 );
+                KMP_ASSERT( tail_thr != NULL );
+                tail_thr->th.th_next_waiting = gtid+1;
+                /* corresponding wait for this write in release code */
+            }
+            KA_TRACE( 1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d waiting for lock\n", lck, gtid ));
+
+
+            /* ToDo: May want to consider using __kmp_wait_sleep  or something that sleeps for
+             *       throughput only here.
+             */
+            KMP_MB();
+            KMP_WAIT_YIELD(spin_here_p, FALSE, KMP_EQ, lck);
+
+#ifdef DEBUG_QUEUING_LOCKS
+            TRACE_LOCK( gtid+1, "acq spin" );
+
+            if ( this_thr->th.th_next_waiting != 0 )
+                __kmp_dump_queuing_lock( this_thr, gtid, lck, *head_id_p, *tail_id_p );
+#endif
+            KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
+            KA_TRACE( 1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: after waiting on queue\n",
+                      lck, gtid ));
+
+#ifdef DEBUG_QUEUING_LOCKS
+            TRACE_LOCK( gtid+1, "acq exit 2" );
+#endif
+
+#if OMPT_SUPPORT
+            /* change the state before clearing wait_id */
+            this_thr->th.ompt_thread_info.state = prev_state;
+            this_thr->th.ompt_thread_info.wait_id = 0;
+#endif
+
+            /* got lock, we were dequeued by the thread that released lock */
+            return;
+        }
+
+        /* Yield if number of threads > number of logical processors */
+        /* ToDo: Not sure why this should only be in oversubscription case,
+           maybe should be traditional YIELD_INIT/YIELD_WHEN loop */
+        KMP_YIELD( TCR_4( __kmp_nth ) > (__kmp_avail_proc ? __kmp_avail_proc :
+          __kmp_xproc ) );
+#ifdef DEBUG_QUEUING_LOCKS
+        TRACE_LOCK( gtid+1, "acq retry" );
+#endif
+
+    }
+    KMP_ASSERT2( 0, "should not get here" );
+}
+
+void
+__kmp_acquire_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    __kmp_acquire_queuing_lock_timed_template<false>( lck, gtid );
+}
+
+static void
+__kmp_acquire_queuing_lock_with_checks( kmp_queuing_lock_t *lck,
+  kmp_int32 gtid )
+{
+    char const * const func = "omp_set_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_queuing_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( __kmp_get_queuing_lock_owner( lck ) == gtid ) {
+        KMP_FATAL( LockIsAlreadyOwned, func );
+    }
+
+    __kmp_acquire_queuing_lock( lck, gtid );
+
+    lck->lk.owner_id = gtid + 1;
+}
+
+int
+__kmp_test_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid )
+{
+    volatile kmp_int32 *head_id_p  = & lck->lk.head_id;
+    kmp_int32 head;
+#ifdef KMP_DEBUG
+    kmp_info_t *this_thr;
+#endif
+
+    KA_TRACE( 1000, ("__kmp_test_queuing_lock: T#%d entering\n", gtid ));
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+#ifdef KMP_DEBUG
+    this_thr = __kmp_thread_from_gtid( gtid );
+    KMP_DEBUG_ASSERT( this_thr != NULL );
+    KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
+#endif
+
+    head = *head_id_p;
+
+    if ( head == 0 ) { /* nobody on queue, nobody holding */
+
+        /* try (0,0)->(-1,0) */
+
+        if ( KMP_COMPARE_AND_STORE_ACQ32( head_id_p, 0, -1 ) ) {
+            KA_TRACE( 1000, ("__kmp_test_queuing_lock: T#%d exiting: holding lock\n", gtid ));
+            KMP_FSYNC_ACQUIRED(lck);
+            return TRUE;
+        }
+    }
+
+    KA_TRACE( 1000, ("__kmp_test_queuing_lock: T#%d exiting: without lock\n", gtid ));
+    return FALSE;
+}
+
+static int
+__kmp_test_queuing_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_test_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_queuing_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+
+    int retval = __kmp_test_queuing_lock( lck, gtid );
+
+    if ( retval ) {
+        lck->lk.owner_id = gtid + 1;
+    }
+    return retval;
+}
+
+int
+__kmp_release_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid )
+{
+    register kmp_info_t *this_thr;
+    volatile kmp_int32 *head_id_p = & lck->lk.head_id;
+    volatile kmp_int32 *tail_id_p = & lck->lk.tail_id;
+
+    KA_TRACE( 1000, ("__kmp_release_queuing_lock: lck:%p, T#%d entering\n", lck, gtid ));
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+    this_thr    = __kmp_thread_from_gtid( gtid );
+    KMP_DEBUG_ASSERT( this_thr != NULL );
+#ifdef DEBUG_QUEUING_LOCKS
+    TRACE_LOCK( gtid+1, "rel ent" );
+
+    if ( this_thr->th.th_spin_here )
+        __kmp_dump_queuing_lock( this_thr, gtid, lck, *head_id_p, *tail_id_p );
+    if ( this_thr->th.th_next_waiting != 0 )
+        __kmp_dump_queuing_lock( this_thr, gtid, lck, *head_id_p, *tail_id_p );
+#endif
+    KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
+    KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
+
+    KMP_FSYNC_RELEASING(lck);
+
+    while( 1 ) {
+        kmp_int32 dequeued;
+        kmp_int32 head;
+        kmp_int32 tail;
+
+        head = *head_id_p;
+
+#ifdef DEBUG_QUEUING_LOCKS
+        tail = *tail_id_p;
+        TRACE_LOCK_HT( gtid+1, "rel read: ", head, tail );
+        if ( head == 0 ) __kmp_dump_queuing_lock( this_thr, gtid, lck, head, tail );
+#endif
+        KMP_DEBUG_ASSERT( head != 0 ); /* holding the lock, head must be -1 or queue head */
+
+        if ( head == -1 ) { /* nobody on queue */
+
+            /* try (-1,0)->(0,0) */
+            if ( KMP_COMPARE_AND_STORE_REL32( head_id_p, -1, 0 ) ) {
+                KA_TRACE( 1000, ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: queue empty\n",
+                          lck, gtid ));
+#ifdef DEBUG_QUEUING_LOCKS
+                TRACE_LOCK_HT( gtid+1, "rel exit: ", 0, 0 );
+#endif
+
+#if OMPT_SUPPORT
+                /* nothing to do - no other thread is trying to shift blame */
+#endif
+
+                return KMP_LOCK_RELEASED;
+            }
+            dequeued = FALSE;
+
+        }
+        else {
+
+            tail = *tail_id_p;
+            if ( head == tail ) {  /* only one thread on the queue */
+
+#ifdef DEBUG_QUEUING_LOCKS
+                if ( head <= 0 ) __kmp_dump_queuing_lock( this_thr, gtid, lck, head, tail );
+#endif
+                KMP_DEBUG_ASSERT( head > 0 );
+
+                /* try (h,h)->(-1,0) */
+                dequeued = KMP_COMPARE_AND_STORE_REL64( (kmp_int64 *) tail_id_p,
+                  KMP_PACK_64( head, head ), KMP_PACK_64( -1, 0 ) );
+#ifdef DEBUG_QUEUING_LOCKS
+                TRACE_LOCK( gtid+1, "rel deq: (h,h)->(-1,0)" );
+#endif
+
+            }
+            else {
+                volatile kmp_int32 *waiting_id_p;
+                kmp_info_t         *head_thr = __kmp_thread_from_gtid( head - 1 );
+                KMP_DEBUG_ASSERT( head_thr != NULL );
+                waiting_id_p = & head_thr->th.th_next_waiting;
+
+                /* Does this require synchronous reads? */
+#ifdef DEBUG_QUEUING_LOCKS
+                if ( head <= 0 || tail <= 0 ) __kmp_dump_queuing_lock( this_thr, gtid, lck, head, tail );
+#endif
+                KMP_DEBUG_ASSERT( head > 0 && tail > 0 );
+
+                /* try (h,t)->(h',t) or (t,t) */
+
+                KMP_MB();
+                /* make sure enqueuing thread has time to update next waiting thread field */
+                *head_id_p = (kmp_int32) KMP_WAIT_YIELD((volatile kmp_uint*) waiting_id_p, 0, KMP_NEQ, NULL);
+#ifdef DEBUG_QUEUING_LOCKS
+                TRACE_LOCK( gtid+1, "rel deq: (h,t)->(h',t)" );
+#endif
+                dequeued = TRUE;
+            }
+        }
+
+        if ( dequeued ) {
+            kmp_info_t *head_thr = __kmp_thread_from_gtid( head - 1 );
+            KMP_DEBUG_ASSERT( head_thr != NULL );
+
+            /* Does this require synchronous reads? */
+#ifdef DEBUG_QUEUING_LOCKS
+            if ( head <= 0 || tail <= 0 ) __kmp_dump_queuing_lock( this_thr, gtid, lck, head, tail );
+#endif
+            KMP_DEBUG_ASSERT( head > 0 && tail > 0 );
+
+            /* For clean code only.
+             * The thread is not released until the next statement, which
+             * prevents a race with the acquire code.
+             */
+            head_thr->th.th_next_waiting = 0;
+#ifdef DEBUG_QUEUING_LOCKS
+            TRACE_LOCK_T( gtid+1, "rel nw=0 for t=", head );
+#endif
+
+            KMP_MB();
+            /* reset spin value */
+            head_thr->th.th_spin_here = FALSE;
+
+            KA_TRACE( 1000, ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: after dequeuing\n",
+                      lck, gtid ));
+#ifdef DEBUG_QUEUING_LOCKS
+            TRACE_LOCK( gtid+1, "rel exit 2" );
+#endif
+            return KMP_LOCK_RELEASED;
+        }
+        /* KMP_CPU_PAUSE( );  don't want to make releasing thread hold up acquiring threads */
+
+#ifdef DEBUG_QUEUING_LOCKS
+        TRACE_LOCK( gtid+1, "rel retry" );
+#endif
+
+    } /* while */
+    KMP_ASSERT2( 0, "should not get here" );
+    return KMP_LOCK_RELEASED;
+}
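+
+/*
+  Packing sketch (illustrative only): head_id and tail_id are adjacent
+  32-bit fields, so the single-waiter dequeue above can update both with
+  one 64-bit CAS.  KMP_PACK_64(h, t) builds the combined word; for
+  head == tail == 5 the CAS attempts (5,5) -> (-1,0) atomically, and
+  fails (retrying the outer loop) if a new waiter enqueued meanwhile.
+*/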
+
+static int
+__kmp_release_queuing_lock_with_checks( kmp_queuing_lock_t *lck,
+  kmp_int32 gtid )
+{
+    char const * const func = "omp_unset_lock";
+    KMP_MB();  /* in case another processor initialized lock */
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_queuing_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( __kmp_get_queuing_lock_owner( lck ) == -1 ) {
+        KMP_FATAL( LockUnsettingFree, func );
+    }
+    if ( __kmp_get_queuing_lock_owner( lck ) != gtid ) {
+        KMP_FATAL( LockUnsettingSetByAnother, func );
+    }
+    lck->lk.owner_id = 0;
+    return __kmp_release_queuing_lock( lck, gtid );
+}
+
+void
+__kmp_init_queuing_lock( kmp_queuing_lock_t *lck )
+{
+    lck->lk.location = NULL;
+    lck->lk.head_id = 0;
+    lck->lk.tail_id = 0;
+    lck->lk.next_ticket = 0;
+    lck->lk.now_serving = 0;
+    lck->lk.owner_id = 0;      // no thread owns the lock.
+    lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
+    lck->lk.initialized = lck;
+
+    KA_TRACE(1000, ("__kmp_init_queuing_lock: lock %p initialized\n", lck));
+}
+
+static void
+__kmp_init_queuing_lock_with_checks( kmp_queuing_lock_t * lck )
+{
+    __kmp_init_queuing_lock( lck );
+}
+
+void
+__kmp_destroy_queuing_lock( kmp_queuing_lock_t *lck )
+{
+    lck->lk.initialized = NULL;
+    lck->lk.location = NULL;
+    lck->lk.head_id = 0;
+    lck->lk.tail_id = 0;
+    lck->lk.next_ticket = 0;
+    lck->lk.now_serving = 0;
+    lck->lk.owner_id = 0;
+    lck->lk.depth_locked = -1;
+}
+
+static void
+__kmp_destroy_queuing_lock_with_checks( kmp_queuing_lock_t *lck )
+{
+    char const * const func = "omp_destroy_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_queuing_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( __kmp_get_queuing_lock_owner( lck ) != -1 ) {
+        KMP_FATAL( LockStillOwned, func );
+    }
+    __kmp_destroy_queuing_lock( lck );
+}
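+
+/*
+  Usage sketch (illustrative only): the *_with_checks variants back the
+  user-visible OpenMP lock API when consistency checking is enabled.
+  Assuming a valid global thread id `gtid`, a typical sequence through
+  this layer is:
+
+      kmp_queuing_lock_t lck;
+      __kmp_init_queuing_lock( &lck );             // (0,0): unlocked
+      __kmp_acquire_queuing_lock( &lck, gtid );    // (0,0) -> (-1,0)
+      // ... critical section ...
+      __kmp_release_queuing_lock( &lck, gtid );    // (-1,0) -> (0,0)
+      __kmp_destroy_queuing_lock( &lck );          // initialized = NULL
+*/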
+
+
+//
+// nested queuing locks
+//
+
+void
+__kmp_acquire_nested_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    if ( __kmp_get_queuing_lock_owner( lck ) == gtid ) {
+        lck->lk.depth_locked += 1;
+    }
+    else {
+        __kmp_acquire_queuing_lock_timed_template<false>( lck, gtid );
+        KMP_MB();
+        lck->lk.depth_locked = 1;
+        KMP_MB();
+        lck->lk.owner_id = gtid + 1;
+    }
+}
+
+static void
+__kmp_acquire_nested_queuing_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_set_nest_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_queuing_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    __kmp_acquire_nested_queuing_lock( lck, gtid );
+}
+
+int
+__kmp_test_nested_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid )
+{
+    int retval;
+
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    if ( __kmp_get_queuing_lock_owner( lck ) == gtid ) {
+        retval = ++lck->lk.depth_locked;
+    }
+    else if ( !__kmp_test_queuing_lock( lck, gtid ) ) {
+        retval = 0;
+    }
+    else {
+        KMP_MB();
+        retval = lck->lk.depth_locked = 1;
+        KMP_MB();
+        lck->lk.owner_id = gtid + 1;
+    }
+    return retval;
+}
+
+static int
+__kmp_test_nested_queuing_lock_with_checks( kmp_queuing_lock_t *lck,
+  kmp_int32 gtid )
+{
+    char const * const func = "omp_test_nest_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_queuing_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    return __kmp_test_nested_queuing_lock( lck, gtid );
+}
+
+int
+__kmp_release_nested_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    KMP_MB();
+    if ( --(lck->lk.depth_locked) == 0 ) {
+        KMP_MB();
+        lck->lk.owner_id = 0;
+        __kmp_release_queuing_lock( lck, gtid );
+        return KMP_LOCK_RELEASED;
+    }
+    return KMP_LOCK_STILL_HELD;
+}
+
+static int
+__kmp_release_nested_queuing_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_unset_nest_lock";
+    KMP_MB();  /* in case another processor initialized lock */
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_queuing_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    if ( __kmp_get_queuing_lock_owner( lck ) == -1 ) {
+        KMP_FATAL( LockUnsettingFree, func );
+    }
+    if ( __kmp_get_queuing_lock_owner( lck ) != gtid ) {
+        KMP_FATAL( LockUnsettingSetByAnother, func );
+    }
+    return __kmp_release_nested_queuing_lock( lck, gtid );
+}
+
+void
+__kmp_init_nested_queuing_lock( kmp_queuing_lock_t * lck )
+{
+    __kmp_init_queuing_lock( lck );
+    lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
+}
+
+static void
+__kmp_init_nested_queuing_lock_with_checks( kmp_queuing_lock_t * lck )
+{
+    __kmp_init_nested_queuing_lock( lck );
+}
+
+void
+__kmp_destroy_nested_queuing_lock( kmp_queuing_lock_t *lck )
+{
+    __kmp_destroy_queuing_lock( lck );
+    lck->lk.depth_locked = 0;
+}
+
+static void
+__kmp_destroy_nested_queuing_lock_with_checks( kmp_queuing_lock_t *lck )
+{
+    char const * const func = "omp_destroy_nest_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_queuing_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    if ( __kmp_get_queuing_lock_owner( lck ) != -1 ) {
+        KMP_FATAL( LockStillOwned, func );
+    }
+    __kmp_destroy_nested_queuing_lock( lck );
+}
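+
+/*
+  Nesting sketch (illustrative only): for nestable locks depth_locked
+  counts recursive acquisitions by the owning thread, so with owner gtid:
+
+      __kmp_acquire_nested_queuing_lock( &lck, gtid );  // depth 1, takes lock
+      __kmp_acquire_nested_queuing_lock( &lck, gtid );  // depth 2, no wait
+      __kmp_release_nested_queuing_lock( &lck, gtid );  // depth 1, STILL_HELD
+      __kmp_release_nested_queuing_lock( &lck, gtid );  // depth 0, RELEASED
+*/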
+
+
+//
+// access functions to fields which don't exist for all lock kinds.
+//
+
+static int
+__kmp_is_queuing_lock_initialized( kmp_queuing_lock_t *lck )
+{
+    return lck == lck->lk.initialized;
+}
+
+static const ident_t *
+__kmp_get_queuing_lock_location( kmp_queuing_lock_t *lck )
+{
+    return lck->lk.location;
+}
+
+static void
+__kmp_set_queuing_lock_location( kmp_queuing_lock_t *lck, const ident_t *loc )
+{
+    lck->lk.location = loc;
+}
+
+static kmp_lock_flags_t
+__kmp_get_queuing_lock_flags( kmp_queuing_lock_t *lck )
+{
+    return lck->lk.flags;
+}
+
+static void
+__kmp_set_queuing_lock_flags( kmp_queuing_lock_t *lck, kmp_lock_flags_t flags )
+{
+    lck->lk.flags = flags;
+}
+
+#if KMP_USE_ADAPTIVE_LOCKS
+
+/*
+    RTM Adaptive locks
+*/
+
+// TODO: Use the header for intrinsics below with the compiler 13.0
+//#include <immintrin.h>
+
+// Values from the status register after failed speculation.
+#define _XBEGIN_STARTED          (~0u)
+#define _XABORT_EXPLICIT         (1 << 0)
+#define _XABORT_RETRY            (1 << 1)
+#define _XABORT_CONFLICT         (1 << 2)
+#define _XABORT_CAPACITY         (1 << 3)
+#define _XABORT_DEBUG            (1 << 4)
+#define _XABORT_NESTED           (1 << 5)
+#define _XABORT_CODE(x)          ((unsigned char)(((x) >> 24) & 0xFF))
+
+// Aborts for which it's worth trying again immediately
+#define SOFT_ABORT_MASK  (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT)
+
+#define STRINGIZE_INTERNAL(arg) #arg
+#define STRINGIZE(arg) STRINGIZE_INTERNAL(arg)
+
+// Access to RTM instructions
+
+/*
+  A version of XBegin which returns -1 on speculation, and the value of EAX on an abort.
+  This is the same definition as the compiler intrinsic that will be supported at some point.
+*/
+static __inline int _xbegin()
+{
+    int res = -1;
+
+#if KMP_OS_WINDOWS
+#if KMP_ARCH_X86_64
+    _asm {
+        _emit 0xC7
+        _emit 0xF8
+        _emit 2
+        _emit 0
+        _emit 0
+        _emit 0
+        jmp   L2
+        mov   res, eax
+    L2:
+    }
+#else /* IA32 */
+    _asm {
+        _emit 0xC7
+        _emit 0xF8
+        _emit 2
+        _emit 0
+        _emit 0
+        _emit 0
+        jmp   L2
+        mov   res, eax
+    L2:
+    }
+#endif // KMP_ARCH_X86_64
+#else
+    /* Note that %eax must be noted as killed (clobbered), because
+     * the XSR is returned in %eax(%rax) on abort.  Other register
+     * values are restored, so don't need to be killed.
+     *
+     * We must also mark 'res' as an input and an output, since otherwise
+     * 'res=-1' may be dropped as being dead, whereas we do need the
+     * assignment on the successful (i.e., non-abort) path.
+     */
+    __asm__ volatile ("1: .byte  0xC7; .byte 0xF8;\n"
+                      "   .long  1f-1b-6\n"
+                      "    jmp   2f\n"
+                      "1:  movl  %%eax,%0\n"
+                      "2:"
+                      :"+r"(res)::"memory","%eax");
+#endif // KMP_OS_WINDOWS
+    return res;
+}
+
+/*
+  Transaction end
+*/
+static __inline void _xend()
+{
+#if KMP_OS_WINDOWS
+    __asm  {
+        _emit 0x0f
+        _emit 0x01
+        _emit 0xd5
+    }
+#else
+    __asm__ volatile (".byte 0x0f; .byte 0x01; .byte 0xd5" :::"memory");
+#endif
+}
+
+/*
+  This is a macro, the argument must be a single byte constant which
+  can be evaluated by the inline assembler, since it is emitted as a
+  byte into the assembly code.
+*/
+#if KMP_OS_WINDOWS
+#define _xabort(ARG)                            \
+    _asm _emit 0xc6                             \
+    _asm _emit 0xf8                             \
+    _asm _emit ARG
+#else
+#define _xabort(ARG) \
+    __asm__ volatile (".byte 0xC6; .byte 0xF8; .byte " STRINGIZE(ARG) :::"memory");
+#endif
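+
+/*
+  RTM usage sketch (illustrative only): the intrinsics above are used by
+  the adaptive lock below in the standard restricted-transactional-memory
+  pattern:
+
+      kmp_uint32 status = _xbegin();
+      if ( status == _XBEGIN_STARTED ) {
+          // transactional path; a conflict aborts back to _xbegin with
+          // an _XABORT_* status, and _xabort(code) aborts explicitly
+          _xend();                          // commit the transaction
+      } else if ( status & SOFT_ABORT_MASK ) {
+          // transient failure: worth retrying the transaction
+      } else {
+          // hard failure: fall back to the real (queuing) lock
+      }
+*/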
+
+//
+//    Statistics are collected for testing purposes
+//
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+
+// We accumulate speculative lock statistics when the lock is destroyed.
+// We keep locks that haven't been destroyed in the liveLocks list
+// so that we can grab their statistics too.
+static kmp_adaptive_lock_statistics_t destroyedStats;
+
+// To hold the list of live locks.
+static kmp_adaptive_lock_info_t liveLocks;
+
+// A lock so we can safely update the list of locks.
+static kmp_bootstrap_lock_t chain_lock;
+
+// Initialize the list of stats.
+void
+__kmp_init_speculative_stats()
+{
+    kmp_adaptive_lock_info_t *lck = &liveLocks;
+
+    memset( ( void * ) & ( lck->stats ), 0, sizeof( lck->stats ) );
+    lck->stats.next = lck;
+    lck->stats.prev = lck;
+
+    KMP_ASSERT( lck->stats.next->stats.prev == lck );
+    KMP_ASSERT( lck->stats.prev->stats.next == lck );
+
+    __kmp_init_bootstrap_lock( &chain_lock );
+
+}
+
+// Insert the lock into the circular list
+static void
+__kmp_remember_lock( kmp_adaptive_lock_info_t * lck )
+{
+    __kmp_acquire_bootstrap_lock( &chain_lock );
+
+    lck->stats.next = liveLocks.stats.next;
+    lck->stats.prev = &liveLocks;
+
+    liveLocks.stats.next = lck;
+    lck->stats.next->stats.prev  = lck;
+
+    KMP_ASSERT( lck->stats.next->stats.prev == lck );
+    KMP_ASSERT( lck->stats.prev->stats.next == lck );
+
+    __kmp_release_bootstrap_lock( &chain_lock );
+}
+
+static void
+__kmp_forget_lock( kmp_adaptive_lock_info_t * lck )
+{
+    KMP_ASSERT( lck->stats.next->stats.prev == lck );
+    KMP_ASSERT( lck->stats.prev->stats.next == lck );
+
+    kmp_adaptive_lock_info_t * n = lck->stats.next;
+    kmp_adaptive_lock_info_t * p = lck->stats.prev;
+
+    n->stats.prev = p;
+    p->stats.next = n;
+}
+
+static void
+__kmp_zero_speculative_stats( kmp_adaptive_lock_info_t * lck )
+{
+    memset( ( void * )&lck->stats, 0, sizeof( lck->stats ) );
+    __kmp_remember_lock( lck );
+}
+
+static void
+__kmp_add_stats( kmp_adaptive_lock_statistics_t * t, kmp_adaptive_lock_info_t * lck )
+{
+    kmp_adaptive_lock_statistics_t volatile *s = &lck->stats;
+
+    t->nonSpeculativeAcquireAttempts += lck->acquire_attempts;
+    t->successfulSpeculations += s->successfulSpeculations;
+    t->hardFailedSpeculations += s->hardFailedSpeculations;
+    t->softFailedSpeculations += s->softFailedSpeculations;
+    t->nonSpeculativeAcquires += s->nonSpeculativeAcquires;
+    t->lemmingYields          += s->lemmingYields;
+}
+
+static void
+__kmp_accumulate_speculative_stats( kmp_adaptive_lock_info_t * lck)
+{
+    kmp_adaptive_lock_statistics_t *t = &destroyedStats;
+
+    __kmp_acquire_bootstrap_lock( &chain_lock );
+
+    __kmp_add_stats( &destroyedStats, lck );
+    __kmp_forget_lock( lck );
+
+    __kmp_release_bootstrap_lock( &chain_lock );
+}
+
+static float
+percent (kmp_uint32 count, kmp_uint32 total)
+{
+    return (total == 0) ? 0.0: (100.0 * count)/total;
+}
+
+static
+FILE * __kmp_open_stats_file()
+{
+    if (strcmp (__kmp_speculative_statsfile, "-") == 0)
+        return stdout;
+
+    size_t buffLen = KMP_STRLEN( __kmp_speculative_statsfile ) + 20;
+    char buffer[buffLen];
+    KMP_SNPRINTF (&buffer[0], buffLen, __kmp_speculative_statsfile,
+      (kmp_int32)getpid());
+    FILE * result = fopen(&buffer[0], "w");
+
+    // Maybe we should issue a warning here...
+    return result ? result : stdout;
+}
+
+void
+__kmp_print_speculative_stats()
+{
+    if (__kmp_user_lock_kind != lk_adaptive)
+        return;
+
+    FILE * statsFile = __kmp_open_stats_file();
+
+    kmp_adaptive_lock_statistics_t total = destroyedStats;
+    kmp_adaptive_lock_info_t *lck;
+
+    for (lck = liveLocks.stats.next; lck != &liveLocks; lck = lck->stats.next) {
+        __kmp_add_stats( &total, lck );
+    }
+    kmp_adaptive_lock_statistics_t *t = &total;
+    kmp_uint32 totalSections     = t->nonSpeculativeAcquires + t->successfulSpeculations;
+    kmp_uint32 totalSpeculations = t->successfulSpeculations + t->hardFailedSpeculations +
+                                   t->softFailedSpeculations;
+
+    fprintf ( statsFile, "Speculative lock statistics (all approximate!)\n");
+    fprintf ( statsFile, " Lock parameters: \n"
+             "   max_soft_retries               : %10d\n"
+             "   max_badness                    : %10d\n",
+             __kmp_adaptive_backoff_params.max_soft_retries,
+             __kmp_adaptive_backoff_params.max_badness);
+    fprintf( statsFile, " Non-speculative acquire attempts : %10d\n", t->nonSpeculativeAcquireAttempts );
+    fprintf( statsFile, " Total critical sections          : %10d\n", totalSections );
+    fprintf( statsFile, " Successful speculations          : %10d (%5.1f%%)\n",
+             t->successfulSpeculations, percent( t->successfulSpeculations, totalSections ) );
+    fprintf( statsFile, " Non-speculative acquires         : %10d (%5.1f%%)\n",
+             t->nonSpeculativeAcquires, percent( t->nonSpeculativeAcquires, totalSections ) );
+    fprintf( statsFile, " Lemming yields                   : %10d\n\n", t->lemmingYields );
+
+    fprintf( statsFile, " Speculative acquire attempts     : %10d\n", totalSpeculations );
+    fprintf( statsFile, " Successes                        : %10d (%5.1f%%)\n",
+             t->successfulSpeculations, percent( t->successfulSpeculations, totalSpeculations ) );
+    fprintf( statsFile, " Soft failures                    : %10d (%5.1f%%)\n",
+             t->softFailedSpeculations, percent( t->softFailedSpeculations, totalSpeculations ) );
+    fprintf( statsFile, " Hard failures                    : %10d (%5.1f%%)\n",
+             t->hardFailedSpeculations, percent( t->hardFailedSpeculations, totalSpeculations ) );
+
+    if (statsFile != stdout)
+        fclose( statsFile );
+}
+
+# define KMP_INC_STAT(lck,stat) ( lck->lk.adaptive.stats.stat++ )
+#else
+# define KMP_INC_STAT(lck,stat)
+
+#endif // KMP_DEBUG_ADAPTIVE_LOCKS
+
+static inline bool
+__kmp_is_unlocked_queuing_lock( kmp_queuing_lock_t *lck )
+{
+    // It is enough to check that head_id is zero;
+    // there is no need to check the tail as well.
+    bool res = lck->lk.head_id == 0;
+
+    // We need a fence here, since we must ensure that no memory operations
+    // from later in this thread float above that read.
+#if KMP_COMPILER_ICC
+    _mm_mfence();
+#else
+    __sync_synchronize();
+#endif
+
+    return res;
+}
+
+// Functions for manipulating the badness
+static __inline void
+__kmp_update_badness_after_success( kmp_adaptive_lock_t *lck )
+{
+    // Reset the badness to zero so we eagerly try to speculate again
+    lck->lk.adaptive.badness = 0;
+    KMP_INC_STAT(lck,successfulSpeculations);
+}
+
+// Create a bit mask with one more set bit.
+static __inline void
+__kmp_step_badness( kmp_adaptive_lock_t *lck )
+{
+    kmp_uint32 newBadness = ( lck->lk.adaptive.badness << 1 ) | 1;
+    if ( newBadness > lck->lk.adaptive.max_badness) {
+        return;
+    } else {
+        lck->lk.adaptive.badness = newBadness;
+    }
+}
+
+// Check whether speculation should be attempted.
+static __inline int
+__kmp_should_speculate( kmp_adaptive_lock_t *lck, kmp_int32 gtid )
+{
+    kmp_uint32 badness = lck->lk.adaptive.badness;
+    kmp_uint32 attempts= lck->lk.adaptive.acquire_attempts;
+    int res = (attempts & badness) == 0;
+    return res;
+}
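+
+/*
+  Worked example (illustrative only): badness is a low-bit mask grown by
+  __kmp_step_badness (0 -> 1 -> 3 -> 7 -> ..., capped at max_badness),
+  while acquire_attempts counts non-speculative acquire attempts.  With
+  badness == 3, (attempts & badness) == 0 holds on every 4th attempt, so
+  after repeated failures speculation is tried only on attempts 0, 4, 8,
+  ...; after a success the badness resets to 0 and every attempt
+  speculates again.
+*/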
+
+// Attempt to acquire only the speculative lock.
+// Does not back off to the non-speculative lock.
+//
+static int
+__kmp_test_adaptive_lock_only( kmp_adaptive_lock_t * lck, kmp_int32 gtid )
+{
+    int retries = lck->lk.adaptive.max_soft_retries;
+
+    // We don't explicitly count the start of speculation; rather, we record
+    // the results (success, hard fail, soft fail). The sum of all of those
+    // is the total number of times we started speculation, since all
+    // speculations must end in one of those ways.
+    do
+    {
+        kmp_uint32 status = _xbegin();
+        // Switch this in to disable actual speculation but exercise
+        // at least some of the rest of the code. Useful for debugging...
+        // kmp_uint32 status = _XABORT_NESTED;
+
+        if (status == _XBEGIN_STARTED )
+        { /* We have successfully started speculation
+           * Check that no-one acquired the lock for real between when we last looked
+           * and now. This also gets the lock cache line into our read-set,
+           * which we need so that we'll abort if anyone later claims it for real.
+           */
+            if (! __kmp_is_unlocked_queuing_lock( GET_QLK_PTR(lck) ) )
+            {
+                // Lock is now visibly acquired, so someone beat us to it.
+                // Abort the transaction so we'll restart from _xbegin with the
+                // failure status.
+                _xabort(0x01)
+                KMP_ASSERT2( 0, "should not get here" );
+            }
+            return 1;   // Lock has been acquired (speculatively)
+        } else {
+            // We have aborted, update the statistics
+            if ( status & SOFT_ABORT_MASK)
+            {
+                KMP_INC_STAT(lck,softFailedSpeculations);
+                // and loop round to retry.
+            }
+            else
+            {
+                KMP_INC_STAT(lck,hardFailedSpeculations);
+                // Give up if we had a hard failure.
+                break;
+            }
+        }
+    }  while( retries-- ); // Loop while we have retries, and didn't fail hard.
+
+    // Either we had a hard failure or we didn't succeed softly after
+    // the full set of attempts, so back off the badness.
+    __kmp_step_badness( lck );
+    return 0;
+}
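+
+/*
+  Note (illustrative): with max_soft_retries == N the do/while above makes
+  at most N+1 _xbegin attempts, stopping early on the first hard failure;
+  every path that returns 0 also steps the badness mask.
+*/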
+
+// Attempt to acquire the speculative lock, or back off to the non-speculative one
+// if the speculative lock cannot be acquired.
+// We can succeed speculatively, non-speculatively, or fail.
+static int
+__kmp_test_adaptive_lock( kmp_adaptive_lock_t *lck, kmp_int32 gtid )
+{
+    // First try to acquire the lock speculatively
+    if ( __kmp_should_speculate( lck, gtid ) && __kmp_test_adaptive_lock_only( lck, gtid ) )
+        return 1;
+
+    // Speculative acquisition failed, so try to acquire it non-speculatively.
+    // Count the non-speculative acquire attempt
+    lck->lk.adaptive.acquire_attempts++;
+
+    // Use base, non-speculative lock.
+    if ( __kmp_test_queuing_lock( GET_QLK_PTR(lck), gtid ) )
+    {
+        KMP_INC_STAT(lck,nonSpeculativeAcquires);
+        return 1;       // Lock is acquired (non-speculatively)
+    }
+    else
+    {
+        return 0;       // Failed to acquire the lock, it's already visibly locked.
+    }
+}
+
+static int
+__kmp_test_adaptive_lock_with_checks( kmp_adaptive_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_test_lock";
+    if ( lck->lk.qlk.initialized != GET_QLK_PTR(lck) ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+
+    int retval = __kmp_test_adaptive_lock( lck, gtid );
+
+    if ( retval ) {
+        lck->lk.qlk.owner_id = gtid + 1;
+    }
+    return retval;
+}
+
+// Block until we can acquire a speculative, adaptive lock.
+// We check whether we should be trying to speculate.
+// If we should be, we check the real lock to see if it is free,
+// and, if not, pause without attempting to acquire it until it is.
+// Then we try the speculative acquire.
+// This means that although we suffer from lemmings a little
+// (because we can't acquire the lock speculatively until the
+// queue of waiting threads has cleared), we don't get into a
+// state where we can never acquire the lock speculatively (because we
+// force the queue to clear by preventing new arrivals from entering the
+// queue).
+// This does mean that when we're trying to break lemmings, the lock
+// is no longer fair. However OpenMP makes no guarantee that its
+// locks are fair, so this isn't a real problem.
+static void
+__kmp_acquire_adaptive_lock( kmp_adaptive_lock_t * lck, kmp_int32 gtid )
+{
+    if ( __kmp_should_speculate( lck, gtid ) )
+    {
+        if ( __kmp_is_unlocked_queuing_lock( GET_QLK_PTR(lck) ) )
+        {
+            if ( __kmp_test_adaptive_lock_only( lck , gtid ) )
+                return;
+            // We tried speculation and failed, so give up.
+        }
+        else
+        {
+            // We can't try speculation until the lock is free, so we
+            // pause here (without suspending on the queuing lock)
+            // to allow it to drain, then try again.
+            // All other threads will also see the same result for
+            // shouldSpeculate, so will be doing the same if they
+            // try to claim the lock from now on.
+            while ( ! __kmp_is_unlocked_queuing_lock( GET_QLK_PTR(lck) ) )
+            {
+                KMP_INC_STAT(lck,lemmingYields);
+                __kmp_yield (TRUE);
+            }
+
+            if ( __kmp_test_adaptive_lock_only( lck, gtid ) )
+                return;
+        }
+    }
+
+    // Speculative acquisition failed, so acquire it non-speculatively.
+    // Count the non-speculative acquire attempt
+    lck->lk.adaptive.acquire_attempts++;
+
+    __kmp_acquire_queuing_lock_timed_template<FALSE>( GET_QLK_PTR(lck), gtid );
+    // We have acquired the base lock, so count that.
+    KMP_INC_STAT(lck,nonSpeculativeAcquires );
+}
+
+static void
+__kmp_acquire_adaptive_lock_with_checks( kmp_adaptive_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_set_lock";
+    if ( lck->lk.qlk.initialized != GET_QLK_PTR(lck) ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_get_queuing_lock_owner( GET_QLK_PTR(lck) ) == gtid ) {
+        KMP_FATAL( LockIsAlreadyOwned, func );
+    }
+
+    __kmp_acquire_adaptive_lock( lck, gtid );
+
+    lck->lk.qlk.owner_id = gtid + 1;
+}
+
+static int
+__kmp_release_adaptive_lock( kmp_adaptive_lock_t *lck, kmp_int32 gtid )
+{
+    if ( __kmp_is_unlocked_queuing_lock( GET_QLK_PTR(lck) ) )
+    {   // If the lock doesn't look claimed we must be speculating.
+        // (Or the user's code is buggy and they're releasing without locking;
+        // if we had XTEST we'd be able to check that case...)
+        _xend();        // Exit speculation
+        __kmp_update_badness_after_success( lck );
+    }
+    else
+    {   // Since the lock *is* visibly locked we're not speculating,
+        // so should use the underlying lock's release scheme.
+        __kmp_release_queuing_lock( GET_QLK_PTR(lck), gtid );
+    }
+    return KMP_LOCK_RELEASED;
+}
+
+static int
+__kmp_release_adaptive_lock_with_checks( kmp_adaptive_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_unset_lock";
+    KMP_MB();  /* in case another processor initialized lock */
+    if ( lck->lk.qlk.initialized != GET_QLK_PTR(lck) ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_get_queuing_lock_owner( GET_QLK_PTR(lck) ) == -1 ) {
+        KMP_FATAL( LockUnsettingFree, func );
+    }
+    if ( __kmp_get_queuing_lock_owner( GET_QLK_PTR(lck) ) != gtid ) {
+        KMP_FATAL( LockUnsettingSetByAnother, func );
+    }
+    lck->lk.qlk.owner_id = 0;
+    __kmp_release_adaptive_lock( lck, gtid );
+    return KMP_LOCK_RELEASED;
+}
+
+static void
+__kmp_init_adaptive_lock( kmp_adaptive_lock_t *lck )
+{
+    __kmp_init_queuing_lock( GET_QLK_PTR(lck) );
+    lck->lk.adaptive.badness = 0;
+    lck->lk.adaptive.acquire_attempts = 0; //nonSpeculativeAcquireAttempts = 0;
+    lck->lk.adaptive.max_soft_retries = __kmp_adaptive_backoff_params.max_soft_retries;
+    lck->lk.adaptive.max_badness      = __kmp_adaptive_backoff_params.max_badness;
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+    __kmp_zero_speculative_stats( &lck->lk.adaptive );
+#endif
+    KA_TRACE(1000, ("__kmp_init_adaptive_lock: lock %p initialized\n", lck));
+}
+
+static void
+__kmp_init_adaptive_lock_with_checks( kmp_adaptive_lock_t * lck )
+{
+    __kmp_init_adaptive_lock( lck );
+}
+
+static void
+__kmp_destroy_adaptive_lock( kmp_adaptive_lock_t *lck )
+{
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+    __kmp_accumulate_speculative_stats( &lck->lk.adaptive );
+#endif
+    __kmp_destroy_queuing_lock (GET_QLK_PTR(lck));
+    // Nothing needed for the speculative part.
+}
+
+static void
+__kmp_destroy_adaptive_lock_with_checks( kmp_adaptive_lock_t *lck )
+{
+    char const * const func = "omp_destroy_lock";
+    if ( lck->lk.qlk.initialized != GET_QLK_PTR(lck) ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_get_queuing_lock_owner( GET_QLK_PTR(lck) ) != -1 ) {
+        KMP_FATAL( LockStillOwned, func );
+    }
+    __kmp_destroy_adaptive_lock( lck );
+}
+
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+
+/* ------------------------------------------------------------------------ */
+/* DRDPA ticket locks                                                */
+/* "DRDPA" means Dynamically Reconfigurable Distributed Polling Area */
+
+static kmp_int32
+__kmp_get_drdpa_lock_owner( kmp_drdpa_lock_t *lck )
+{
+    return TCR_4( lck->lk.owner_id ) - 1;
+}
+
+static inline bool
+__kmp_is_drdpa_lock_nestable( kmp_drdpa_lock_t *lck )
+{
+    return lck->lk.depth_locked != -1;
+}
+
+__forceinline static void
+__kmp_acquire_drdpa_lock_timed_template( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    kmp_uint64 ticket = KMP_TEST_THEN_INC64((kmp_int64 *)&lck->lk.next_ticket);
+    kmp_uint64 mask = TCR_8(lck->lk.mask);              // volatile load
+    volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls
+      = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
+      TCR_PTR(lck->lk.polls);                           // volatile load
+
+#ifdef USE_LOCK_PROFILE
+    if (TCR_8(polls[ticket & mask].poll) != ticket)
+        __kmp_printf("LOCK CONTENTION: %p\n", lck);
+    /* else __kmp_printf( "." );*/
+#endif /* USE_LOCK_PROFILE */
+
+    //
+    // Now spin-wait, but reload the polls pointer and mask, in case the
+    // polling area has been reconfigured.  Unless it is reconfigured, the
+    // reloads stay in L1 cache and are cheap.
+    //
+    // Keep this code in sync with KMP_WAIT_YIELD, in kmp_dispatch.c !!!
+    //
+    // The current implementation of KMP_WAIT_YIELD doesn't allow for mask
+    // and poll to be re-read every spin iteration.
+    //
+    kmp_uint32 spins;
+
+    KMP_FSYNC_PREPARE(lck);
+    KMP_INIT_YIELD(spins);
+    while (TCR_8(polls[ticket & mask]).poll < ticket) { // volatile load
+        // If we are oversubscribed,
+        // or have waited a bit (and KMP_LIBRARY=turnaround), then yield.
+        // CPU Pause is in the macros for yield.
+        //
+        KMP_YIELD(TCR_4(__kmp_nth)
+          > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
+        KMP_YIELD_SPIN(spins);
+
+        // Re-read the mask and the poll pointer from the lock structure.
+        //
+        // Make certain that "mask" is read before "polls" !!!
+        //
+        // If another thread reconfigures the polling area and updates its
+        // values, and we get the new value of mask and the old polls
+        // pointer, we could access memory beyond the end of the old polling
+        // area.
+        //
+        mask = TCR_8(lck->lk.mask);                     // volatile load
+        polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
+          TCR_PTR(lck->lk.polls);                       // volatile load
+    }
+
+    //
+    // Critical section starts here
+    //
+    KMP_FSYNC_ACQUIRED(lck);
+    KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld acquired lock %p\n",
+      ticket, lck));
+    lck->lk.now_serving = ticket;                       // non-volatile store
+
+    //
+    // Deallocate a garbage polling area if we know that we are the last
+    // thread that could possibly access it.
+    //
+    // The >= check is in case __kmp_test_drdpa_lock() allocated the cleanup
+    // ticket.
+    //
+    if ((lck->lk.old_polls != NULL) && (ticket >= lck->lk.cleanup_ticket)) {
+        __kmp_free((void *)lck->lk.old_polls);
+        lck->lk.old_polls = NULL;
+        lck->lk.cleanup_ticket = 0;
+    }
+
+    //
+    // Check to see if we should reconfigure the polling area.
+    // If there is still a garbage polling area to be deallocated from a
+    // previous reconfiguration, let a later thread reconfigure it.
+    //
+    if (lck->lk.old_polls == NULL) {
+        bool reconfigure = false;
+        volatile struct kmp_base_drdpa_lock::kmp_lock_poll *old_polls = polls;
+        kmp_uint32 num_polls = TCR_4(lck->lk.num_polls);
+
+        if (TCR_4(__kmp_nth)
+          > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
+            //
+            // We are in oversubscription mode.  Contract the polling area
+            // down to a single location, if that hasn't been done already.
+            //
+            if (num_polls > 1) {
+                reconfigure = true;
+                num_polls = TCR_4(lck->lk.num_polls);
+                mask = 0;
+                num_polls = 1;
+                polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
+                  __kmp_allocate(num_polls * sizeof(*polls));
+                polls[0].poll = ticket;
+            }
+        }
+        else {
+            //
+            // We are in under/fully subscribed mode.  Check the number of
+            // threads waiting on the lock.  The size of the polling area
+            // should be at least the number of threads waiting.
+            //
+            kmp_uint64 num_waiting = TCR_8(lck->lk.next_ticket) - ticket - 1;
+            if (num_waiting > num_polls) {
+                kmp_uint32 old_num_polls = num_polls;
+                reconfigure = true;
+                do {
+                    mask = (mask << 1) | 1;
+                    num_polls *= 2;
+                } while (num_polls <= num_waiting);
+
+                //
+                // Allocate the new polling area, and copy the relevant portion
+                // of the old polling area to the new area.  __kmp_allocate()
+                // zeroes the memory it allocates, and most of the old area is
+                // just zero padding, so we only copy the release counters.
+                //
+                polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
+                  __kmp_allocate(num_polls * sizeof(*polls));
+                kmp_uint32 i;
+                for (i = 0; i < old_num_polls; i++) {
+                    polls[i].poll = old_polls[i].poll;
+                }
+            }
+        }
+
+        if (reconfigure) {
+            //
+            // Now write the updated fields back to the lock structure.
+            //
+            // Make certain that "polls" is written before "mask" !!!
+            //
+            // If another thread picks up the new value of mask and the old
+            // polls pointer, it could access memory beyond the end of the
+            // old polling area.
+            //
+            // On x86, we need memory fences.
+            //
+            KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld reconfiguring lock %p to %d polls\n",
+              ticket, lck, num_polls));
+
+            lck->lk.old_polls = old_polls;              // non-volatile store
+            lck->lk.polls = polls;                      // volatile store
+
+            KMP_MB();
+
+            lck->lk.num_polls = num_polls;              // non-volatile store
+            lck->lk.mask = mask;                        // volatile store
+
+            KMP_MB();
+
+            //
+            // Only after the new polling area and mask have been flushed
+            // to main memory can we update the cleanup ticket field.
+            //
+            // volatile load / non-volatile store
+            //
+            lck->lk.cleanup_ticket = TCR_8(lck->lk.next_ticket);
+        }
+    }
+}
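+
+/*
+  Sizing sketch (illustrative only): the polling area always holds a
+  power-of-two number of slots with mask == num_polls - 1, so a ticket
+  maps to slot (ticket & mask).  Growing for num_waiting == 5 from
+  num_polls == 1, mask == 0:
+
+      mask = (0 << 1) | 1 = 1;   num_polls = 2;   // 2 <= 5, continue
+      mask = (1 << 1) | 1 = 3;   num_polls = 4;   // 4 <= 5, continue
+      mask = (3 << 1) | 1 = 7;   num_polls = 8;   // 8 > 5, stop
+*/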
+
+void
+__kmp_acquire_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    __kmp_acquire_drdpa_lock_timed_template( lck, gtid );
+}
+
+static void
+__kmp_acquire_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_set_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_drdpa_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( ( gtid >= 0 ) && ( __kmp_get_drdpa_lock_owner( lck ) == gtid ) ) {
+        KMP_FATAL( LockIsAlreadyOwned, func );
+    }
+
+    __kmp_acquire_drdpa_lock( lck, gtid );
+
+    lck->lk.owner_id = gtid + 1;
+}
+
+int
+__kmp_test_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    //
+    // First get a ticket, then read the polls pointer and the mask.
+    // The polls pointer must be read before the mask!!! (See above)
+    //
+    kmp_uint64 ticket = TCR_8(lck->lk.next_ticket);     // volatile load
+    volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls
+      = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
+      TCR_PTR(lck->lk.polls);                           // volatile load
+    kmp_uint64 mask = TCR_8(lck->lk.mask);              // volatile load
+    if (TCR_8(polls[ticket & mask].poll) == ticket) {
+        kmp_uint64 next_ticket = ticket + 1;
+        if (KMP_COMPARE_AND_STORE_ACQ64((kmp_int64 *)&lck->lk.next_ticket,
+          ticket, next_ticket)) {
+            KMP_FSYNC_ACQUIRED(lck);
+            KA_TRACE(1000, ("__kmp_test_drdpa_lock: ticket #%lld acquired lock %p\n",
+               ticket, lck));
+            lck->lk.now_serving = ticket;               // non-volatile store
+
+            //
+            // Since no threads are waiting, there is no possibility that
+            // we would want to reconfigure the polling area.  We might
+            // have the cleanup ticket value (which says that it is now
+            // safe to deallocate old_polls), but we'll let a later thread
+            // which calls __kmp_acquire_lock do that - this routine
+            // isn't supposed to block, and we would risk blocks if we
+            // called __kmp_free() to do the deallocation.
+            //
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
+static int
+__kmp_test_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_test_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_drdpa_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+
+    int retval = __kmp_test_drdpa_lock( lck, gtid );
+
+    if ( retval ) {
+        lck->lk.owner_id = gtid + 1;
+    }
+    return retval;
+}
+
+int
+__kmp_release_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    //
+    // Read the ticket value from the lock data struct, then the polls
+    // pointer and the mask.  The polls pointer must be read before the
+    // mask!!! (See above)
+    //
+    kmp_uint64 ticket = lck->lk.now_serving + 1;        // non-volatile load
+    volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls
+      = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
+      TCR_PTR(lck->lk.polls);                           // volatile load
+    kmp_uint64 mask = TCR_8(lck->lk.mask);              // volatile load
+    KA_TRACE(1000, ("__kmp_release_drdpa_lock: ticket #%lld released lock %p\n",
+       ticket - 1, lck));
+    KMP_FSYNC_RELEASING(lck);
+    KMP_ST_REL64(&(polls[ticket & mask].poll), ticket); // volatile store
+    return KMP_LOCK_RELEASED;
+}
+
+static int
+__kmp_release_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_unset_lock";
+    KMP_MB();  /* in case another processor initialized lock */
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_drdpa_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( __kmp_get_drdpa_lock_owner( lck ) == -1 ) {
+        KMP_FATAL( LockUnsettingFree, func );
+    }
+    if ( ( gtid >= 0 ) && ( __kmp_get_drdpa_lock_owner( lck ) >= 0 )
+      && ( __kmp_get_drdpa_lock_owner( lck ) != gtid ) ) {
+        KMP_FATAL( LockUnsettingSetByAnother, func );
+    }
+    lck->lk.owner_id = 0;
+    return __kmp_release_drdpa_lock( lck, gtid );
+}
+
+void
+__kmp_init_drdpa_lock( kmp_drdpa_lock_t *lck )
+{
+    lck->lk.location = NULL;
+    lck->lk.mask = 0;
+    lck->lk.num_polls = 1;
+    lck->lk.polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
+      __kmp_allocate(lck->lk.num_polls * sizeof(*(lck->lk.polls)));
+    lck->lk.cleanup_ticket = 0;
+    lck->lk.old_polls = NULL;
+    lck->lk.next_ticket = 0;
+    lck->lk.now_serving = 0;
+    lck->lk.owner_id = 0;      // no thread owns the lock.
+    lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
+    lck->lk.initialized = lck;
+
+    KA_TRACE(1000, ("__kmp_init_drdpa_lock: lock %p initialized\n", lck));
+}
+
+static void
+__kmp_init_drdpa_lock_with_checks( kmp_drdpa_lock_t * lck )
+{
+    __kmp_init_drdpa_lock( lck );
+}
+
+void
+__kmp_destroy_drdpa_lock( kmp_drdpa_lock_t *lck )
+{
+    lck->lk.initialized = NULL;
+    lck->lk.location    = NULL;
+    if (lck->lk.polls != NULL) {
+        __kmp_free((void *)lck->lk.polls);
+        lck->lk.polls = NULL;
+    }
+    if (lck->lk.old_polls != NULL) {
+        __kmp_free((void *)lck->lk.old_polls);
+        lck->lk.old_polls = NULL;
+    }
+    lck->lk.mask = 0;
+    lck->lk.num_polls = 0;
+    lck->lk.cleanup_ticket = 0;
+    lck->lk.next_ticket = 0;
+    lck->lk.now_serving = 0;
+    lck->lk.owner_id = 0;
+    lck->lk.depth_locked = -1;
+}
+
+static void
+__kmp_destroy_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck )
+{
+    char const * const func = "omp_destroy_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( __kmp_is_drdpa_lock_nestable( lck ) ) {
+        KMP_FATAL( LockNestableUsedAsSimple, func );
+    }
+    if ( __kmp_get_drdpa_lock_owner( lck ) != -1 ) {
+        KMP_FATAL( LockStillOwned, func );
+    }
+    __kmp_destroy_drdpa_lock( lck );
+}
+
+
+//
+// nested drdpa ticket locks
+//
+
+void
+__kmp_acquire_nested_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    if ( __kmp_get_drdpa_lock_owner( lck ) == gtid ) {
+        lck->lk.depth_locked += 1;
+    }
+    else {
+        __kmp_acquire_drdpa_lock_timed_template( lck, gtid );
+        KMP_MB();
+        lck->lk.depth_locked = 1;
+        KMP_MB();
+        lck->lk.owner_id = gtid + 1;
+    }
+}
+
+static void
+__kmp_acquire_nested_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_set_nest_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_drdpa_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    __kmp_acquire_nested_drdpa_lock( lck, gtid );
+}
+
+int
+__kmp_test_nested_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    int retval;
+
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    if ( __kmp_get_drdpa_lock_owner( lck ) == gtid ) {
+        retval = ++lck->lk.depth_locked;
+    }
+    else if ( !__kmp_test_drdpa_lock( lck, gtid ) ) {
+        retval = 0;
+    }
+    else {
+        KMP_MB();
+        retval = lck->lk.depth_locked = 1;
+        KMP_MB();
+        lck->lk.owner_id = gtid + 1;
+    }
+    return retval;
+}
+
+static int
+__kmp_test_nested_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_test_nest_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_drdpa_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    return __kmp_test_nested_drdpa_lock( lck, gtid );
+}
+
+int
+__kmp_release_nested_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( gtid >= 0 );
+
+    KMP_MB();
+    if ( --(lck->lk.depth_locked) == 0 ) {
+        KMP_MB();
+        lck->lk.owner_id = 0;
+        __kmp_release_drdpa_lock( lck, gtid );
+        return KMP_LOCK_RELEASED;
+    }
+    return KMP_LOCK_STILL_HELD;
+}
+
+static int
+__kmp_release_nested_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid )
+{
+    char const * const func = "omp_unset_nest_lock";
+    KMP_MB();  /* in case another processor initialized lock */
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_drdpa_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    if ( __kmp_get_drdpa_lock_owner( lck ) == -1 ) {
+        KMP_FATAL( LockUnsettingFree, func );
+    }
+    if ( __kmp_get_drdpa_lock_owner( lck ) != gtid ) {
+        KMP_FATAL( LockUnsettingSetByAnother, func );
+    }
+    return __kmp_release_nested_drdpa_lock( lck, gtid );
+}
+
+void
+__kmp_init_nested_drdpa_lock( kmp_drdpa_lock_t * lck )
+{
+    __kmp_init_drdpa_lock( lck );
+    lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
+}
+
+static void
+__kmp_init_nested_drdpa_lock_with_checks( kmp_drdpa_lock_t * lck )
+{
+    __kmp_init_nested_drdpa_lock( lck );
+}
+
+void
+__kmp_destroy_nested_drdpa_lock( kmp_drdpa_lock_t *lck )
+{
+    __kmp_destroy_drdpa_lock( lck );
+    lck->lk.depth_locked = 0;
+}
+
+static void
+__kmp_destroy_nested_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck )
+{
+    char const * const func = "omp_destroy_nest_lock";
+    if ( lck->lk.initialized != lck ) {
+        KMP_FATAL( LockIsUninitialized, func );
+    }
+    if ( ! __kmp_is_drdpa_lock_nestable( lck ) ) {
+        KMP_FATAL( LockSimpleUsedAsNestable, func );
+    }
+    if ( __kmp_get_drdpa_lock_owner( lck ) != -1 ) {
+        KMP_FATAL( LockStillOwned, func );
+    }
+    __kmp_destroy_nested_drdpa_lock( lck );
+}
+
+
+//
+// access functions to fields which don't exist for all lock kinds.
+//
+
+static int
+__kmp_is_drdpa_lock_initialized( kmp_drdpa_lock_t *lck )
+{
+    return lck == lck->lk.initialized;
+}
+
+static const ident_t *
+__kmp_get_drdpa_lock_location( kmp_drdpa_lock_t *lck )
+{
+    return lck->lk.location;
+}
+
+static void
+__kmp_set_drdpa_lock_location( kmp_drdpa_lock_t *lck, const ident_t *loc )
+{
+    lck->lk.location = loc;
+}
+
+static kmp_lock_flags_t
+__kmp_get_drdpa_lock_flags( kmp_drdpa_lock_t *lck )
+{
+    return lck->lk.flags;
+}
+
+static void
+__kmp_set_drdpa_lock_flags( kmp_drdpa_lock_t *lck, kmp_lock_flags_t flags )
+{
+    lck->lk.flags = flags;
+}
+
+#if KMP_USE_DYNAMIC_LOCK
+
+// Definitions of lock hints.
+# ifndef __OMP_H 
+typedef enum kmp_lock_hint_t {
+    kmp_lock_hint_none = 0,
+    kmp_lock_hint_contended,
+    kmp_lock_hint_uncontended,
+    kmp_lock_hint_nonspeculative,
+    kmp_lock_hint_speculative,
+    kmp_lock_hint_adaptive,
+} kmp_lock_hint_t;
+# endif
+
+// Direct lock initializers. It simply writes a tag to the low 8 bits of the lock word.
+#define expand_init_lock(l, a)                                              \
+static void init_##l##_lock(kmp_dyna_lock_t *lck, kmp_dyna_lockseq_t seq) { \
+    *lck = DYNA_LOCK_FREE(l);                                               \
+    KA_TRACE(20, ("Initialized direct lock, tag = %x\n", *lck));            \
+}
+FOREACH_D_LOCK(expand_init_lock, 0)
+#undef expand_init_lock
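+
+/*
+  Expansion sketch (illustrative only): for a direct lock kind `tas`
+  (assuming `tas` is among the kinds enumerated by FOREACH_D_LOCK),
+  expand_init_lock(tas, 0) generates roughly:
+
+      static void init_tas_lock(kmp_dyna_lock_t *lck, kmp_dyna_lockseq_t seq) {
+          *lck = DYNA_LOCK_FREE(tas);
+          KA_TRACE(20, ("Initialized direct lock, tag = %x\n", *lck));
+      }
+
+  The second macro argument is unused by this expander.
+*/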
+
+#if DYNA_HAS_HLE
+
+// HLE lock functions - imported from the testbed runtime.
+#if KMP_MIC
+# define machine_pause() _mm_delay_32(10) // TODO: find the right argument
+#else
+# define machine_pause() _mm_pause()
+#endif
+#define HLE_ACQUIRE ".byte 0xf2;"
+#define HLE_RELEASE ".byte 0xf3;"
+
+static inline kmp_uint32
+swap4(kmp_uint32 volatile *p, kmp_uint32 v)
+{
+    __asm__ volatile(HLE_ACQUIRE "xchg %1,%0"
+                    : "+r"(v), "+m"(*p)
+                    :
+                    : "memory");
+    return v;
+}
+
+static void
+__kmp_destroy_hle_lock(kmp_dyna_lock_t *lck)
+{
+    *lck = 0;
+}
+
+static void
+__kmp_acquire_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid)
+{
+    // Use gtid for DYNA_LOCK_BUSY if necessary
+    if (swap4(lck, DYNA_LOCK_BUSY(1, hle)) != DYNA_LOCK_FREE(hle)) {
+        int delay = 1;
+        do {
+            while (*(kmp_uint32 volatile *)lck != DYNA_LOCK_FREE(hle)) {
+                for (int i = delay; i != 0; --i)
+                    machine_pause();
+                delay = ((delay << 1) | 1) & 7;
+            }
+        } while (swap4(lck, DYNA_LOCK_BUSY(1, hle)) != DYNA_LOCK_FREE(hle));
+    }
+}
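+
+/*
+  Backoff sketch (illustrative only): delay = ((delay << 1) | 1) & 7
+  produces the bounded exponential sequence 1 -> 3 -> 7 -> 7 -> ...,
+  so a waiter pauses at most 7 times between re-reads of the lock word.
+*/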
+
+static void
+__kmp_acquire_hle_lock_with_checks(kmp_dyna_lock_t *lck, kmp_int32 gtid)
+{
+    __kmp_acquire_hle_lock(lck, gtid); // TODO: add checks
+}
+
+static void
+__kmp_release_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid)
+{
+    __asm__ volatile(HLE_RELEASE "movl %1,%0"
+                    : "=m"(*lck)
+                    : "r"(DYNA_LOCK_FREE(hle))
+                    : "memory");
+}
+
+static void
+__kmp_release_hle_lock_with_checks(kmp_dyna_lock_t *lck, kmp_int32 gtid)
+{
+    __kmp_release_hle_lock(lck, gtid); // TODO: add checks
+}
+
+static int
+__kmp_test_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid)
+{
+    return swap4(lck, DYNA_LOCK_BUSY(1, hle)) == DYNA_LOCK_FREE(hle);
+}
+
+static int
+__kmp_test_hle_lock_with_checks(kmp_dyna_lock_t *lck, kmp_int32 gtid)
+{
+    return __kmp_test_hle_lock(lck, gtid); // TODO: add checks
+}
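+
+/*
+  Prefix note (illustrative only): HLE_ACQUIRE (0xf2) and HLE_RELEASE
+  (0xf3) are the XACQUIRE/XRELEASE legacy prefixes.  On hardware without
+  HLE they are ignored and the xchg/mov execute as a plain spin lock;
+  with HLE the processor may elide the lock write and run the critical
+  section transactionally, falling back to the real atomic on abort.
+*/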
+
+#endif // DYNA_HAS_HLE
+
+// Entry functions for indirect locks (first element of direct_*_ops[]).
+static void __kmp_init_indirect_lock(kmp_dyna_lock_t * l, kmp_dyna_lockseq_t tag);
+static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t * lock);
+static void __kmp_set_indirect_lock(kmp_dyna_lock_t * lock, kmp_int32);
+static void __kmp_unset_indirect_lock(kmp_dyna_lock_t * lock, kmp_int32);
+static int  __kmp_test_indirect_lock(kmp_dyna_lock_t * lock, kmp_int32);
+static void __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t * lock, kmp_int32);
+static void __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t * lock, kmp_int32);
+static int  __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t * lock, kmp_int32);
+
+//
+// Jump tables for the indirect lock functions.
+// Only fill in the odd entries; that avoids the need to shift out the low bit.
+//
+#define expand_func0(l, op) 0,op##_##l##_##lock,
+void (*__kmp_direct_init_ops[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t)
+    = { __kmp_init_indirect_lock, 0, FOREACH_D_LOCK(expand_func0, init) };
+
+#define expand_func1(l, op) 0,(void (*)(kmp_dyna_lock_t *))__kmp_##op##_##l##_##lock,
+void (*__kmp_direct_destroy_ops[])(kmp_dyna_lock_t *)
+    = { __kmp_destroy_indirect_lock, 0, FOREACH_D_LOCK(expand_func1, destroy) };
+
+// Differentiates *lock and *lock_with_checks.
+#define expand_func2(l, op)  0,(void (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_##lock,
+#define expand_func2c(l, op) 0,(void (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_##lock_with_checks,
+static void (*direct_set_tab[][DYNA_NUM_D_LOCKS*2+2])(kmp_dyna_lock_t *, kmp_int32)
+    = { { __kmp_set_indirect_lock, 0, FOREACH_D_LOCK(expand_func2, acquire)  },
+        { __kmp_set_indirect_lock_with_checks, 0, FOREACH_D_LOCK(expand_func2c, acquire) } };
+static void (*direct_unset_tab[][DYNA_NUM_D_LOCKS*2+2])(kmp_dyna_lock_t *, kmp_int32)
+    = { { __kmp_unset_indirect_lock, 0, FOREACH_D_LOCK(expand_func2, release)  },
+        { __kmp_unset_indirect_lock_with_checks, 0, FOREACH_D_LOCK(expand_func2c, release) } };
+
+#define expand_func3(l, op)  0,(int  (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_##lock,
+#define expand_func3c(l, op) 0,(int  (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_##lock_with_checks,
+static int  (*direct_test_tab[][DYNA_NUM_D_LOCKS*2+2])(kmp_dyna_lock_t *, kmp_int32)
+    = { { __kmp_test_indirect_lock, 0, FOREACH_D_LOCK(expand_func3, test)  },
+        { __kmp_test_indirect_lock_with_checks, 0, FOREACH_D_LOCK(expand_func3c, test) } };
+
+// Exposes only one set of jump tables (*lock or *lock_with_checks).
+void (*(*__kmp_direct_set_ops))(kmp_dyna_lock_t *, kmp_int32) = 0;
+void (*(*__kmp_direct_unset_ops))(kmp_dyna_lock_t *, kmp_int32) = 0;
+int (*(*__kmp_direct_test_ops))(kmp_dyna_lock_t *, kmp_int32) = 0;
+
+//
+// Jump tables for the indirect lock functions.
+//
+#define expand_func4(l, op) (void (*)(kmp_user_lock_p))__kmp_##op##_##l##_##lock,
+void (*__kmp_indirect_init_ops[])(kmp_user_lock_p)
+    = { FOREACH_I_LOCK(expand_func4, init) };
+void (*__kmp_indirect_destroy_ops[])(kmp_user_lock_p)
+    = { FOREACH_I_LOCK(expand_func4, destroy) };
+
+// Differentiates *lock and *lock_with_checks.
+#define expand_func5(l, op)  (void (*)(kmp_user_lock_p, kmp_int32))__kmp_##op##_##l##_##lock,
+#define expand_func5c(l, op) (void (*)(kmp_user_lock_p, kmp_int32))__kmp_##op##_##l##_##lock_with_checks,
+static void (*indirect_set_tab[][DYNA_NUM_I_LOCKS])(kmp_user_lock_p, kmp_int32)
+    = { { FOREACH_I_LOCK(expand_func5, acquire)  },
+        { FOREACH_I_LOCK(expand_func5c, acquire) } };
+static void (*indirect_unset_tab[][DYNA_NUM_I_LOCKS])(kmp_user_lock_p, kmp_int32)
+    = { { FOREACH_I_LOCK(expand_func5, release)  },
+        { FOREACH_I_LOCK(expand_func5c, release) } };
+
+#define expand_func6(l, op)  (int  (*)(kmp_user_lock_p, kmp_int32))__kmp_##op##_##l##_##lock,
+#define expand_func6c(l, op) (int  (*)(kmp_user_lock_p, kmp_int32))__kmp_##op##_##l##_##lock_with_checks,
+static int  (*indirect_test_tab[][DYNA_NUM_I_LOCKS])(kmp_user_lock_p, kmp_int32)
+    = { { FOREACH_I_LOCK(expand_func6, test)  },
+        { FOREACH_I_LOCK(expand_func6c, test) } };
+
+// Exposes only one set of jump tables (*lock or *lock_with_checks).
+void (*(*__kmp_indirect_set_ops))(kmp_user_lock_p, kmp_int32) = 0;
+void (*(*__kmp_indirect_unset_ops))(kmp_user_lock_p, kmp_int32) = 0;
+int (*(*__kmp_indirect_test_ops))(kmp_user_lock_p, kmp_int32) = 0;
+
+// Lock index table.
+kmp_indirect_lock_t **__kmp_indirect_lock_table;
+kmp_lock_index_t __kmp_indirect_lock_table_size;
+kmp_lock_index_t __kmp_indirect_lock_table_next;
+
+// Size of indirect locks.
+static kmp_uint32 __kmp_indirect_lock_size[DYNA_NUM_I_LOCKS] = {
+    sizeof(kmp_ticket_lock_t),      sizeof(kmp_queuing_lock_t),
+#if KMP_USE_ADAPTIVE_LOCKS
+    sizeof(kmp_adaptive_lock_t),
+#endif
+    sizeof(kmp_drdpa_lock_t),
+    sizeof(kmp_tas_lock_t),
+#if DYNA_HAS_FUTEX
+    sizeof(kmp_futex_lock_t),
+#endif
+    sizeof(kmp_ticket_lock_t),      sizeof(kmp_queuing_lock_t),
+    sizeof(kmp_drdpa_lock_t)
+};
+
+// Jump tables for lock accessor/modifier.
+void (*__kmp_indirect_set_location[DYNA_NUM_I_LOCKS])(kmp_user_lock_p, const ident_t *) = { 0 };
+void (*__kmp_indirect_set_flags[DYNA_NUM_I_LOCKS])(kmp_user_lock_p, kmp_lock_flags_t) = { 0 };
+const ident_t * (*__kmp_indirect_get_location[DYNA_NUM_I_LOCKS])(kmp_user_lock_p) = { 0 };
+kmp_lock_flags_t (*__kmp_indirect_get_flags[DYNA_NUM_I_LOCKS])(kmp_user_lock_p) = { 0 };
+
+// Use different lock pools for different lock types.
+static kmp_indirect_lock_t * __kmp_indirect_lock_pool[DYNA_NUM_I_LOCKS] = { 0 };
+
+// Inserts the given lock ptr into the lock table.
+kmp_lock_index_t 
+__kmp_insert_indirect_lock(kmp_indirect_lock_t *lck)
+{
+    kmp_lock_index_t next = __kmp_indirect_lock_table_next;
+    // Check capacity and double the size if required
+    if (next >= __kmp_indirect_lock_table_size) {
+        kmp_indirect_lock_t **old_table = __kmp_indirect_lock_table;
+        __kmp_indirect_lock_table = (kmp_indirect_lock_t **)__kmp_allocate(2*next*sizeof(kmp_indirect_lock_t *));
+        KMP_MEMCPY(__kmp_indirect_lock_table, old_table, next*sizeof(kmp_indirect_lock_t *));
+        __kmp_free(old_table);
+        __kmp_indirect_lock_table_size = 2*next;
+    }
+    // Insert lck into the table and return its index.
+    __kmp_indirect_lock_table[next] = lck;
+    __kmp_indirect_lock_table_next++;
+    return next;
+}
+
+// User lock allocator for dynamically dispatched locks.
+kmp_indirect_lock_t *
+__kmp_allocate_indirect_lock(void **user_lock, kmp_int32 gtid, kmp_indirect_locktag_t tag)
+{
+    kmp_indirect_lock_t *lck;
+    kmp_lock_index_t idx;
+
+    __kmp_acquire_lock(&__kmp_global_lock, gtid);
+
+    if (__kmp_indirect_lock_pool[tag] != NULL) {
+        lck = __kmp_indirect_lock_pool[tag];
+        if (OMP_LOCK_T_SIZE < sizeof(void *))
+            idx = lck->lock->pool.index;
+        __kmp_indirect_lock_pool[tag] = (kmp_indirect_lock_t *)lck->lock->pool.next;
+    } else {
+        lck = (kmp_indirect_lock_t *)__kmp_allocate(sizeof(kmp_indirect_lock_t));
+        lck->lock = (kmp_user_lock_p)__kmp_allocate(__kmp_indirect_lock_size[tag]);
+        if (OMP_LOCK_T_SIZE < sizeof(void *))
+            idx = __kmp_insert_indirect_lock(lck);
+    }
+
+    __kmp_release_lock(&__kmp_global_lock, gtid);
+
+    lck->type = tag;
+
+    if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+        *((kmp_lock_index_t *)user_lock) = idx << 1; // indirect lock word must be even.
+    } else {
+        *((kmp_indirect_lock_t **)user_lock) = lck;
+    }
+
+    return lck;
+}
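+
+// Encoding sketch (illustrative only) for the branches above: when the lock
+// word is too small for a pointer, it stores the table index shifted left by
+// one, so an indirect lock word is always even and the index is recovered as
+//     idx = DYNA_EXTRACT_I_INDEX(user_lock);    // i.e. lock word >> 1
+// Direct locks are assumed to keep a tag in the low bits of the word instead;
+// the exact direct encoding is defined by the DYNA_* macros in the header.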
+
+// User lock lookup for dynamically dispatched locks.
+static __forceinline
+kmp_indirect_lock_t *
+__kmp_lookup_indirect_lock(void **user_lock, const char *func)
+{
+    if (__kmp_env_consistency_check) {
+        kmp_indirect_lock_t *lck = NULL;
+        if (user_lock == NULL) {
+            KMP_FATAL(LockIsUninitialized, func);
+        }
+        if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+            kmp_lock_index_t idx = DYNA_EXTRACT_I_INDEX(user_lock);
+            if (idx >= __kmp_indirect_lock_table_size) { // idx is unsigned, so only the upper bound is meaningful
+                KMP_FATAL(LockIsUninitialized, func);
+            }
+            lck = __kmp_indirect_lock_table[idx];
+        } else {
+            lck = *((kmp_indirect_lock_t **)user_lock);
+        }
+        if (lck == NULL) {
+            KMP_FATAL(LockIsUninitialized, func);
+        }
+        return lck; 
+    } else {
+        if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+            return __kmp_indirect_lock_table[DYNA_EXTRACT_I_INDEX(user_lock)];
+        } else {
+            return *((kmp_indirect_lock_t **)user_lock);
+        }
+    }
+}
+
+static void
+__kmp_init_indirect_lock(kmp_dyna_lock_t * lock, kmp_dyna_lockseq_t seq)
+{
+#if KMP_USE_ADAPTIVE_LOCKS
+    if (seq == lockseq_adaptive && !__kmp_cpuinfo.rtm) {
+        KMP_WARNING(AdaptiveNotSupported, "kmp_lockseq_t", "adaptive");
+        seq = lockseq_queuing;
+    }
+#endif
+    kmp_indirect_locktag_t tag = DYNA_GET_I_TAG(seq);
+    kmp_indirect_lock_t *l = __kmp_allocate_indirect_lock((void **)lock, __kmp_entry_gtid(), tag);
+    DYNA_I_LOCK_FUNC(l, init)(l->lock);
+    KA_TRACE(20, ("__kmp_init_indirect_lock: initialized indirect lock, tag = %x\n", l->type));
+}
+
+static void
+__kmp_destroy_indirect_lock(kmp_dyna_lock_t * lock)
+{
+    kmp_uint32 gtid = __kmp_entry_gtid();
+    kmp_indirect_lock_t *l = __kmp_lookup_indirect_lock((void **)lock, "omp_destroy_lock");
+    DYNA_I_LOCK_FUNC(l, destroy)(l->lock);
+    kmp_indirect_locktag_t tag = l->type;
+
+    __kmp_acquire_lock(&__kmp_global_lock, gtid);
+
+    // Use the base lock's space to keep the pool chain.
+    l->lock->pool.next = (kmp_user_lock_p)__kmp_indirect_lock_pool[tag];
+    if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+        l->lock->pool.index = DYNA_EXTRACT_I_INDEX(lock);
+    }
+    __kmp_indirect_lock_pool[tag] = l;
+
+    __kmp_release_lock(&__kmp_global_lock, gtid);
+}
+
+static void
+__kmp_set_indirect_lock(kmp_dyna_lock_t * lock, kmp_int32 gtid)
+{
+    kmp_indirect_lock_t *l = DYNA_LOOKUP_I_LOCK(lock);
+    DYNA_I_LOCK_FUNC(l, set)(l->lock, gtid);
+}
+
+static void
+__kmp_unset_indirect_lock(kmp_dyna_lock_t * lock, kmp_int32 gtid)
+{
+    kmp_indirect_lock_t *l = DYNA_LOOKUP_I_LOCK(lock);
+    DYNA_I_LOCK_FUNC(l, unset)(l->lock, gtid);
+}
+
+static int
+__kmp_test_indirect_lock(kmp_dyna_lock_t * lock, kmp_int32 gtid)
+{
+    kmp_indirect_lock_t *l = DYNA_LOOKUP_I_LOCK(lock);
+    return DYNA_I_LOCK_FUNC(l, test)(l->lock, gtid);
+}
+
+static void
+__kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t * lock, kmp_int32 gtid)
+{
+    kmp_indirect_lock_t *l = __kmp_lookup_indirect_lock((void **)lock, "omp_set_lock");
+    DYNA_I_LOCK_FUNC(l, set)(l->lock, gtid);
+}
+
+static void
+__kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t * lock, kmp_int32 gtid)
+{
+    kmp_indirect_lock_t *l = __kmp_lookup_indirect_lock((void **)lock, "omp_unset_lock");
+    DYNA_I_LOCK_FUNC(l, unset)(l->lock, gtid);
+}
+
+static int
+__kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t * lock, kmp_int32 gtid)
+{
+    kmp_indirect_lock_t *l = __kmp_lookup_indirect_lock((void **)lock, "omp_test_lock");
+    return DYNA_I_LOCK_FUNC(l, test)(l->lock, gtid);
+}
+
+kmp_dyna_lockseq_t __kmp_user_lock_seq = lockseq_queuing;
+
+// Initialize a hinted lock.
+void
+__kmp_init_lock_hinted(void **lock, int hint)
+{
+    kmp_dyna_lockseq_t seq;
+    switch (hint) {
+        case kmp_lock_hint_uncontended:
+            seq = lockseq_tas;
+            break;
+        case kmp_lock_hint_speculative:
+#if DYNA_HAS_HLE
+            seq = lockseq_hle;
+#else
+            seq = lockseq_tas;
+#endif
+            break;
+        case kmp_lock_hint_adaptive:
+#if KMP_USE_ADAPTIVE_LOCKS
+            seq = lockseq_adaptive;
+#else
+            seq = lockseq_queuing;
+#endif
+            break;
+        // Defaults to queuing locks.
+        case kmp_lock_hint_contended:
+        case kmp_lock_hint_nonspeculative:
+        default:
+            seq = lockseq_queuing;
+            break;
+    }
+    if (DYNA_IS_D_LOCK(seq)) {
+        DYNA_INIT_D_LOCK(lock, seq);
+#if USE_ITT_BUILD
+        __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
+#endif
+    } else {
+        DYNA_INIT_I_LOCK(lock, seq);
+#if USE_ITT_BUILD
+        kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(lock);
+        __kmp_itt_lock_creating(ilk->lock, NULL);
+#endif
+    }
+}
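+
+// Usage sketch (illustrative; assumes omp_lock_t-sized storage for the lock
+// word): requesting a speculative lock picks HLE when the build and hardware
+// support it, and quietly falls back to TAS otherwise:
+//     omp_lock_t lck;
+//     __kmp_init_lock_hinted((void **)&lck, kmp_lock_hint_speculative);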
+
+// This is used only in kmp_error.c when consistency checking is on.
+kmp_int32
+__kmp_get_user_lock_owner(kmp_user_lock_p lck, kmp_uint32 seq)
+{
+    switch (seq) {
+        case lockseq_tas:
+        case lockseq_nested_tas:
+            return __kmp_get_tas_lock_owner((kmp_tas_lock_t *)lck);
+#if DYNA_HAS_FUTEX
+        case lockseq_futex:
+        case lockseq_nested_futex:
+            return __kmp_get_futex_lock_owner((kmp_futex_lock_t *)lck);
+#endif
+        case lockseq_ticket:
+        case lockseq_nested_ticket:
+            return __kmp_get_ticket_lock_owner((kmp_ticket_lock_t *)lck);
+        case lockseq_queuing:
+        case lockseq_nested_queuing:
+#if KMP_USE_ADAPTIVE_LOCKS
+        case lockseq_adaptive:
+            return __kmp_get_queuing_lock_owner((kmp_queuing_lock_t *)lck);
+#endif
+        case lockseq_drdpa:
+        case lockseq_nested_drdpa:
+            return __kmp_get_drdpa_lock_owner((kmp_drdpa_lock_t *)lck);
+        default:
+            return 0;
+    }
+}
+
+// The value initialized from KMP_LOCK_KIND needs to be translated to its
+// nested version.
+void
+__kmp_init_nest_lock_hinted(void **lock, int hint)
+{
+    kmp_dyna_lockseq_t seq;
+    switch (hint) {
+        case kmp_lock_hint_uncontended:
+            seq = lockseq_nested_tas;
+            break;
+        // Defaults to queuing locks.
+        case kmp_lock_hint_contended:
+        case kmp_lock_hint_nonspeculative:
+        default:
+            seq = lockseq_nested_queuing;
+            break;
+    }
+    DYNA_INIT_I_LOCK(lock, seq);
+#if USE_ITT_BUILD
+    kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(lock);
+    __kmp_itt_lock_creating(ilk->lock, NULL);
+#endif
+}
+
+// Initializes the lock table for indirect locks.
+static void
+__kmp_init_indirect_lock_table()
+{
+    __kmp_indirect_lock_table = (kmp_indirect_lock_t **)__kmp_allocate(sizeof(kmp_indirect_lock_t *)*1024);
+    __kmp_indirect_lock_table_size = 1024;
+    __kmp_indirect_lock_table_next = 0;
+}
+
+#if KMP_USE_ADAPTIVE_LOCKS
+# define init_lock_func(table, expand) {             \
+    table[locktag_ticket]         = expand(ticket);  \
+    table[locktag_queuing]        = expand(queuing); \
+    table[locktag_adaptive]       = expand(queuing); \
+    table[locktag_drdpa]          = expand(drdpa);   \
+    table[locktag_nested_ticket]  = expand(ticket);  \
+    table[locktag_nested_queuing] = expand(queuing); \
+    table[locktag_nested_drdpa]   = expand(drdpa);   \
+}
+#else
+# define init_lock_func(table, expand) {             \
+    table[locktag_ticket]         = expand(ticket);  \
+    table[locktag_queuing]        = expand(queuing); \
+    table[locktag_drdpa]          = expand(drdpa);   \
+    table[locktag_nested_ticket]  = expand(ticket);  \
+    table[locktag_nested_queuing] = expand(queuing); \
+    table[locktag_nested_drdpa]   = expand(drdpa);   \
+}
+#endif // KMP_USE_ADAPTIVE_LOCKS
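+
+// Illustrative expansion (not compiled): with the expand_func definition used
+// below for set_location, the line table[locktag_ticket] = expand(ticket);
+// becomes
+//     __kmp_indirect_set_location[locktag_ticket] =
+//         (void (*)(kmp_user_lock_p, const ident_t *))__kmp_set_ticket_lock_location;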
+
+// Initializes data for dynamic user locks.
+void
+__kmp_init_dynamic_user_locks()
+{
+    // Initialize jump table location
+    int offset = (__kmp_env_consistency_check)? 1: 0;
+    __kmp_direct_set_ops = direct_set_tab[offset];
+    __kmp_direct_unset_ops = direct_unset_tab[offset];
+    __kmp_direct_test_ops = direct_test_tab[offset];
+    __kmp_indirect_set_ops = indirect_set_tab[offset];
+    __kmp_indirect_unset_ops = indirect_unset_tab[offset];
+    __kmp_indirect_test_ops = indirect_test_tab[offset];
+    __kmp_init_indirect_lock_table();
+
+    // Initialize lock accessor/modifier
+    // Could have used designated initializer, but -TP /Qstd=c99 did not work with icl.exe.
+#define expand_func(l) (void (*)(kmp_user_lock_p, const ident_t *))__kmp_set_##l##_lock_location
+    init_lock_func(__kmp_indirect_set_location, expand_func);
+#undef expand_func
+#define expand_func(l) (void (*)(kmp_user_lock_p, kmp_lock_flags_t))__kmp_set_##l##_lock_flags
+    init_lock_func(__kmp_indirect_set_flags, expand_func);
+#undef expand_func
+#define expand_func(l) (const ident_t * (*)(kmp_user_lock_p))__kmp_get_##l##_lock_location
+    init_lock_func(__kmp_indirect_get_location, expand_func);
+#undef expand_func
+#define expand_func(l) (kmp_lock_flags_t (*)(kmp_user_lock_p))__kmp_get_##l##_lock_flags
+    init_lock_func(__kmp_indirect_get_flags, expand_func);
+#undef expand_func
+
+    __kmp_init_user_locks = TRUE;
+}
+
+// Clean up the lock table.
+void
+__kmp_cleanup_indirect_user_locks()
+{
+    kmp_lock_index_t i;
+    int k;
+
+    // Clean up locks in the pools first (they were already destroyed before going into the pools).
+    for (k = 0; k < DYNA_NUM_I_LOCKS; ++k) {
+        kmp_indirect_lock_t *l = __kmp_indirect_lock_pool[k];
+        while (l != NULL) {
+            kmp_indirect_lock_t *ll = l;
+            l = (kmp_indirect_lock_t *)l->lock->pool.next;
+            if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+                __kmp_indirect_lock_table[ll->lock->pool.index] = NULL;
+            }
+            __kmp_free(ll->lock);
+            __kmp_free(ll);
+        }
+    }
+    // Clean up the remaining undestroyed locks.
+    for (i = 0; i < __kmp_indirect_lock_table_next; i++) {
+        kmp_indirect_lock_t *l = __kmp_indirect_lock_table[i];
+        if (l != NULL) {
+            // Locks not destroyed explicitly need to be destroyed here.
+            DYNA_I_LOCK_FUNC(l, destroy)(l->lock);
+            __kmp_free(l->lock);
+            __kmp_free(l);
+        }
+    }
+    // Free the table
+    __kmp_free(__kmp_indirect_lock_table);
+
+    __kmp_init_user_locks = FALSE;
+}
+
+enum kmp_lock_kind __kmp_user_lock_kind = lk_default;
+int __kmp_num_locks_in_block = 1;             // FIXME - tune this value
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+/* ------------------------------------------------------------------------ */
+/* user locks
+ *
+ * They are implemented as a table of function pointers which are set to the
+ * lock functions of the appropriate kind, once that has been determined.
+ */
+
+enum kmp_lock_kind __kmp_user_lock_kind = lk_default;
+
+size_t __kmp_base_user_lock_size = 0;
+size_t __kmp_user_lock_size = 0;
+
+kmp_int32 ( *__kmp_get_user_lock_owner_ )( kmp_user_lock_p lck ) = NULL;
+void ( *__kmp_acquire_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid ) = NULL;
+
+int ( *__kmp_test_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid ) = NULL;
+int ( *__kmp_release_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid ) = NULL;
+void ( *__kmp_init_user_lock_with_checks_ )( kmp_user_lock_p lck ) = NULL;
+void ( *__kmp_destroy_user_lock_ )( kmp_user_lock_p lck ) = NULL;
+void ( *__kmp_destroy_user_lock_with_checks_ )( kmp_user_lock_p lck ) = NULL;
+void ( *__kmp_acquire_nested_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid ) = NULL;
+
+int ( *__kmp_test_nested_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid ) = NULL;
+int ( *__kmp_release_nested_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid ) = NULL;
+void ( *__kmp_init_nested_user_lock_with_checks_ )( kmp_user_lock_p lck ) = NULL;
+void ( *__kmp_destroy_nested_user_lock_with_checks_ )( kmp_user_lock_p lck ) = NULL;
+
+int ( *__kmp_is_user_lock_initialized_ )( kmp_user_lock_p lck ) = NULL;
+const ident_t * ( *__kmp_get_user_lock_location_ )( kmp_user_lock_p lck ) = NULL;
+void ( *__kmp_set_user_lock_location_ )( kmp_user_lock_p lck, const ident_t *loc ) = NULL;
+kmp_lock_flags_t ( *__kmp_get_user_lock_flags_ )( kmp_user_lock_p lck ) = NULL;
+void ( *__kmp_set_user_lock_flags_ )( kmp_user_lock_p lck, kmp_lock_flags_t flags ) = NULL;
+
+void __kmp_set_user_lock_vptrs( kmp_lock_kind_t user_lock_kind )
+{
+    switch ( user_lock_kind ) {
+        case lk_default:
+        default:
+        KMP_ASSERT( 0 );
+
+        case lk_tas: {
+            __kmp_base_user_lock_size = sizeof( kmp_base_tas_lock_t );
+            __kmp_user_lock_size = sizeof( kmp_tas_lock_t );
+
+            __kmp_get_user_lock_owner_ =
+              ( kmp_int32 ( * )( kmp_user_lock_p ) )
+              ( &__kmp_get_tas_lock_owner );
+
+            if ( __kmp_env_consistency_check ) {
+                KMP_BIND_USER_LOCK_WITH_CHECKS(tas);
+                KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(tas);
+            }
+            else {
+                KMP_BIND_USER_LOCK(tas);
+                KMP_BIND_NESTED_USER_LOCK(tas);
+            }
+
+            __kmp_destroy_user_lock_ =
+              ( void ( * )( kmp_user_lock_p ) )
+              ( &__kmp_destroy_tas_lock );
+
+             __kmp_is_user_lock_initialized_ =
+               ( int ( * )( kmp_user_lock_p ) ) NULL;
+
+             __kmp_get_user_lock_location_ =
+               ( const ident_t * ( * )( kmp_user_lock_p ) ) NULL;
+
+             __kmp_set_user_lock_location_ =
+               ( void ( * )( kmp_user_lock_p, const ident_t * ) ) NULL;
+
+             __kmp_get_user_lock_flags_ =
+               ( kmp_lock_flags_t ( * )( kmp_user_lock_p ) ) NULL;
+
+             __kmp_set_user_lock_flags_ =
+               ( void ( * )( kmp_user_lock_p, kmp_lock_flags_t ) ) NULL;
+        }
+        break;
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
+
+        case lk_futex: {
+            __kmp_base_user_lock_size = sizeof( kmp_base_futex_lock_t );
+            __kmp_user_lock_size = sizeof( kmp_futex_lock_t );
+
+            __kmp_get_user_lock_owner_ =
+              ( kmp_int32 ( * )( kmp_user_lock_p ) )
+              ( &__kmp_get_futex_lock_owner );
+
+            if ( __kmp_env_consistency_check ) {
+                KMP_BIND_USER_LOCK_WITH_CHECKS(futex);
+                KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(futex);
+            }
+            else {
+                KMP_BIND_USER_LOCK(futex);
+                KMP_BIND_NESTED_USER_LOCK(futex);
+            }
+
+            __kmp_destroy_user_lock_ =
+              ( void ( * )( kmp_user_lock_p ) )
+              ( &__kmp_destroy_futex_lock );
+
+             __kmp_is_user_lock_initialized_ =
+               ( int ( * )( kmp_user_lock_p ) ) NULL;
+
+             __kmp_get_user_lock_location_ =
+               ( const ident_t * ( * )( kmp_user_lock_p ) ) NULL;
+
+             __kmp_set_user_lock_location_ =
+               ( void ( * )( kmp_user_lock_p, const ident_t * ) ) NULL;
+
+             __kmp_get_user_lock_flags_ =
+               ( kmp_lock_flags_t ( * )( kmp_user_lock_p ) ) NULL;
+
+             __kmp_set_user_lock_flags_ =
+               ( void ( * )( kmp_user_lock_p, kmp_lock_flags_t ) ) NULL;
+        }
+        break;
+
+#endif // KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
+
+        case lk_ticket: {
+            __kmp_base_user_lock_size = sizeof( kmp_base_ticket_lock_t );
+            __kmp_user_lock_size = sizeof( kmp_ticket_lock_t );
+
+            __kmp_get_user_lock_owner_ =
+              ( kmp_int32 ( * )( kmp_user_lock_p ) )
+              ( &__kmp_get_ticket_lock_owner );
+
+            if ( __kmp_env_consistency_check ) {
+                KMP_BIND_USER_LOCK_WITH_CHECKS(ticket);
+                KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(ticket);
+            }
+            else {
+                KMP_BIND_USER_LOCK(ticket);
+                KMP_BIND_NESTED_USER_LOCK(ticket);
+            }
+
+            __kmp_destroy_user_lock_ =
+              ( void ( * )( kmp_user_lock_p ) )
+              ( &__kmp_destroy_ticket_lock );
+
+             __kmp_is_user_lock_initialized_ =
+               ( int ( * )( kmp_user_lock_p ) )
+               ( &__kmp_is_ticket_lock_initialized );
+
+             __kmp_get_user_lock_location_ =
+               ( const ident_t * ( * )( kmp_user_lock_p ) )
+               ( &__kmp_get_ticket_lock_location );
+
+             __kmp_set_user_lock_location_ =
+               ( void ( * )( kmp_user_lock_p, const ident_t * ) )
+               ( &__kmp_set_ticket_lock_location );
+
+             __kmp_get_user_lock_flags_ =
+               ( kmp_lock_flags_t ( * )( kmp_user_lock_p ) )
+               ( &__kmp_get_ticket_lock_flags );
+
+             __kmp_set_user_lock_flags_ =
+               ( void ( * )( kmp_user_lock_p, kmp_lock_flags_t ) )
+               ( &__kmp_set_ticket_lock_flags );
+        }
+        break;
+
+        case lk_queuing: {
+            __kmp_base_user_lock_size = sizeof( kmp_base_queuing_lock_t );
+            __kmp_user_lock_size = sizeof( kmp_queuing_lock_t );
+
+            __kmp_get_user_lock_owner_ =
+              ( kmp_int32 ( * )( kmp_user_lock_p ) )
+              ( &__kmp_get_queuing_lock_owner );
+
+            if ( __kmp_env_consistency_check ) {
+                KMP_BIND_USER_LOCK_WITH_CHECKS(queuing);
+                KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(queuing);
+            }
+            else {
+                KMP_BIND_USER_LOCK(queuing);
+                KMP_BIND_NESTED_USER_LOCK(queuing);
+            }
+
+            __kmp_destroy_user_lock_ =
+              ( void ( * )( kmp_user_lock_p ) )
+              ( &__kmp_destroy_queuing_lock );
+
+             __kmp_is_user_lock_initialized_ =
+               ( int ( * )( kmp_user_lock_p ) )
+               ( &__kmp_is_queuing_lock_initialized );
+
+             __kmp_get_user_lock_location_ =
+               ( const ident_t * ( * )( kmp_user_lock_p ) )
+               ( &__kmp_get_queuing_lock_location );
+
+             __kmp_set_user_lock_location_ =
+               ( void ( * )( kmp_user_lock_p, const ident_t * ) )
+               ( &__kmp_set_queuing_lock_location );
+
+             __kmp_get_user_lock_flags_ =
+               ( kmp_lock_flags_t ( * )( kmp_user_lock_p ) )
+               ( &__kmp_get_queuing_lock_flags );
+
+             __kmp_set_user_lock_flags_ =
+               ( void ( * )( kmp_user_lock_p, kmp_lock_flags_t ) )
+               ( &__kmp_set_queuing_lock_flags );
+        }
+        break;
+
+#if KMP_USE_ADAPTIVE_LOCKS
+        case lk_adaptive: {
+            __kmp_base_user_lock_size = sizeof( kmp_base_adaptive_lock_t );
+            __kmp_user_lock_size = sizeof( kmp_adaptive_lock_t );
+
+            __kmp_get_user_lock_owner_ =
+              ( kmp_int32 ( * )( kmp_user_lock_p ) )
+              ( &__kmp_get_queuing_lock_owner );
+
+            if ( __kmp_env_consistency_check ) {
+                KMP_BIND_USER_LOCK_WITH_CHECKS(adaptive);
+            }
+            else {
+                KMP_BIND_USER_LOCK(adaptive);
+            }
+
+            __kmp_destroy_user_lock_ =
+              ( void ( * )( kmp_user_lock_p ) )
+              ( &__kmp_destroy_adaptive_lock );
+
+            __kmp_is_user_lock_initialized_ =
+              ( int ( * )( kmp_user_lock_p ) )
+              ( &__kmp_is_queuing_lock_initialized );
+
+            __kmp_get_user_lock_location_ =
+              ( const ident_t * ( * )( kmp_user_lock_p ) )
+              ( &__kmp_get_queuing_lock_location );
+
+            __kmp_set_user_lock_location_ =
+              ( void ( * )( kmp_user_lock_p, const ident_t * ) )
+              ( &__kmp_set_queuing_lock_location );
+
+            __kmp_get_user_lock_flags_ =
+              ( kmp_lock_flags_t ( * )( kmp_user_lock_p ) )
+              ( &__kmp_get_queuing_lock_flags );
+
+            __kmp_set_user_lock_flags_ =
+              ( void ( * )( kmp_user_lock_p, kmp_lock_flags_t ) )
+              ( &__kmp_set_queuing_lock_flags );
+
+        }
+        break;
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+        case lk_drdpa: {
+            __kmp_base_user_lock_size = sizeof( kmp_base_drdpa_lock_t );
+            __kmp_user_lock_size = sizeof( kmp_drdpa_lock_t );
+
+            __kmp_get_user_lock_owner_ =
+              ( kmp_int32 ( * )( kmp_user_lock_p ) )
+              ( &__kmp_get_drdpa_lock_owner );
+
+            if ( __kmp_env_consistency_check ) {
+                KMP_BIND_USER_LOCK_WITH_CHECKS(drdpa);
+                KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(drdpa);
+            }
+            else {
+                KMP_BIND_USER_LOCK(drdpa);
+                KMP_BIND_NESTED_USER_LOCK(drdpa);
+            }
+
+            __kmp_destroy_user_lock_ =
+              ( void ( * )( kmp_user_lock_p ) )
+              ( &__kmp_destroy_drdpa_lock );
+
+             __kmp_is_user_lock_initialized_ =
+               ( int ( * )( kmp_user_lock_p ) )
+               ( &__kmp_is_drdpa_lock_initialized );
+
+             __kmp_get_user_lock_location_ =
+               ( const ident_t * ( * )( kmp_user_lock_p ) )
+               ( &__kmp_get_drdpa_lock_location );
+
+             __kmp_set_user_lock_location_ =
+               ( void ( * )( kmp_user_lock_p, const ident_t * ) )
+               ( &__kmp_set_drdpa_lock_location );
+
+             __kmp_get_user_lock_flags_ =
+               ( kmp_lock_flags_t ( * )( kmp_user_lock_p ) )
+               ( &__kmp_get_drdpa_lock_flags );
+
+             __kmp_set_user_lock_flags_ =
+               ( void ( * )( kmp_user_lock_p, kmp_lock_flags_t ) )
+               ( &__kmp_set_drdpa_lock_flags );
+        }
+        break;
+    }
+}
+
+
+// ----------------------------------------------------------------------------
+// User lock table & lock allocation
+
+kmp_lock_table_t __kmp_user_lock_table = { 1, 0, NULL };
+kmp_user_lock_p __kmp_lock_pool = NULL;
+
+// Lock block-allocation support.
+kmp_block_of_locks* __kmp_lock_blocks = NULL;
+int __kmp_num_locks_in_block = 1;             // FIXME - tune this value
+
+static kmp_lock_index_t
+__kmp_lock_table_insert( kmp_user_lock_p lck )
+{
+    // Assume that kmp_global_lock is held upon entry/exit.
+    kmp_lock_index_t index;
+    if ( __kmp_user_lock_table.used >= __kmp_user_lock_table.allocated ) {
+        kmp_lock_index_t size;
+        kmp_user_lock_p *table;
+        // Reallocate lock table.
+        if ( __kmp_user_lock_table.allocated == 0 ) {
+            size = 1024;
+        }
+        else {
+            size = __kmp_user_lock_table.allocated * 2;
+        }
+        table = (kmp_user_lock_p *)__kmp_allocate( sizeof( kmp_user_lock_p ) * size );
+        KMP_MEMCPY( table + 1, __kmp_user_lock_table.table + 1, sizeof( kmp_user_lock_p ) * ( __kmp_user_lock_table.used - 1 ) );
+        table[ 0 ] = (kmp_user_lock_p)__kmp_user_lock_table.table;
+            // We cannot free the previous table now, since it may be in use by other
+            // threads. So save the pointer to the previous table in the first element of
+            // the new table. All the tables are organized into a list, and can be freed
+            // when the library is shutting down.
+        __kmp_user_lock_table.table = table;
+        __kmp_user_lock_table.allocated = size;
+    }
+    KMP_DEBUG_ASSERT( __kmp_user_lock_table.used < __kmp_user_lock_table.allocated );
+    index = __kmp_user_lock_table.used;
+    __kmp_user_lock_table.table[ index ] = lck;
+    ++ __kmp_user_lock_table.used;
+    return index;
+}
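+
+// Layout sketch of the chained tables described above (illustrative only):
+//
+//     __kmp_user_lock_table.table --> [ prev | lck_1 | lck_2 | ... ]  newest
+//                                        |
+//                                        v
+//                                     [ prev | lck_1 | ... ]          older
+//                                        |
+//                                        v
+//                                       NULL
+//
+// Element [0] never holds a lock; __kmp_cleanup_user_locks() walks this chain
+// to free every generation of the table.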
+
+static kmp_user_lock_p
+__kmp_lock_block_allocate()
+{
+    // Assume that kmp_global_lock is held upon entry/exit.
+    static int last_index = 0;
+    if ( ( last_index >= __kmp_num_locks_in_block )
+      || ( __kmp_lock_blocks == NULL ) ) {
+        // Restart the index.
+        last_index = 0;
+        // Need to allocate a new block.
+        KMP_DEBUG_ASSERT( __kmp_user_lock_size > 0 );
+        size_t space_for_locks = __kmp_user_lock_size * __kmp_num_locks_in_block;
+        char* buffer = (char*)__kmp_allocate( space_for_locks + sizeof( kmp_block_of_locks ) );
+        // Set up the new block.
+        kmp_block_of_locks *new_block = (kmp_block_of_locks *)(& buffer[space_for_locks]);
+        new_block->next_block = __kmp_lock_blocks;
+        new_block->locks = (void *)buffer;
+        // Publish the new block.
+        KMP_MB();
+        __kmp_lock_blocks = new_block;
+    }
+    kmp_user_lock_p ret = (kmp_user_lock_p)(& ( ( (char *)( __kmp_lock_blocks->locks ) )
+      [ last_index * __kmp_user_lock_size ] ) );
+    last_index++;
+    return ret;
+}
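+
+// Memory layout of one allocation made above (illustrative only):
+//
+//     buffer: [ lock_0 | lock_1 | ... | lock_N-1 | kmp_block_of_locks ]
+//             ^                                   ^
+//             new_block->locks                    new_block (header lives at
+//                                                 the end of the vector)
+//
+// where N is __kmp_num_locks_in_block and each slot is __kmp_user_lock_size
+// bytes.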
+
+//
+// Get memory for a lock. It may be freshly allocated memory or reused memory
+// from the lock pool.
+//
+kmp_user_lock_p
+__kmp_user_lock_allocate( void **user_lock, kmp_int32 gtid,
+  kmp_lock_flags_t flags )
+{
+    kmp_user_lock_p lck;
+    kmp_lock_index_t index;
+    KMP_DEBUG_ASSERT( user_lock );
+
+    __kmp_acquire_lock( &__kmp_global_lock, gtid );
+
+    if ( __kmp_lock_pool == NULL ) {
+        // Lock pool is empty. Allocate new memory.
+        if ( __kmp_num_locks_in_block <= 1 ) { // Tune this cutoff point.
+            lck = (kmp_user_lock_p) __kmp_allocate( __kmp_user_lock_size );
+        }
+        else {
+            lck = __kmp_lock_block_allocate();
+        }
+
+        // Insert lock in the table so that it can be freed in __kmp_cleanup,
+        // and debugger has info on all allocated locks.
+        index = __kmp_lock_table_insert( lck );
+    }
+    else {
+        // Pick up lock from pool.
+        lck = __kmp_lock_pool;
+        index = __kmp_lock_pool->pool.index;
+        __kmp_lock_pool = __kmp_lock_pool->pool.next;
+    }
+
+    //
+    // We could potentially differentiate between nested and regular locks
+    // here, and do the lock table lookup for regular locks only.
+    //
+    if ( OMP_LOCK_T_SIZE < sizeof(void *) ) {
+        * ( (kmp_lock_index_t *) user_lock ) = index;
+    }
+    else {
+        * ( (kmp_user_lock_p *) user_lock ) = lck;
+    }
+
+    // Mark the lock if it is a critical section lock.
+    __kmp_set_user_lock_flags( lck, flags );
+
+    __kmp_release_lock( & __kmp_global_lock, gtid ); // AC: TODO: move this line up
+
+    return lck;
+}
+
+// Put lock's memory to pool for reusing.
+void
+__kmp_user_lock_free( void **user_lock, kmp_int32 gtid, kmp_user_lock_p lck )
+{
+    KMP_DEBUG_ASSERT( user_lock != NULL );
+    KMP_DEBUG_ASSERT( lck != NULL );
+
+    __kmp_acquire_lock( & __kmp_global_lock, gtid );
+
+    lck->pool.next = __kmp_lock_pool;
+    __kmp_lock_pool = lck;
+    if ( OMP_LOCK_T_SIZE < sizeof(void *) ) {
+        kmp_lock_index_t index = * ( (kmp_lock_index_t *) user_lock );
+        KMP_DEBUG_ASSERT( 0 < index && index <= __kmp_user_lock_table.used );
+        lck->pool.index = index;
+    }
+
+    __kmp_release_lock( & __kmp_global_lock, gtid );
+}
+
+kmp_user_lock_p
+__kmp_lookup_user_lock( void **user_lock, char const *func )
+{
+    kmp_user_lock_p lck = NULL;
+
+    if ( __kmp_env_consistency_check ) {
+        if ( user_lock == NULL ) {
+            KMP_FATAL( LockIsUninitialized, func );
+        }
+    }
+
+    if ( OMP_LOCK_T_SIZE < sizeof(void *) ) {
+        kmp_lock_index_t index = *( (kmp_lock_index_t *)user_lock );
+        if ( __kmp_env_consistency_check ) {
+            if ( ! ( 0 < index && index < __kmp_user_lock_table.used ) ) {
+                KMP_FATAL( LockIsUninitialized, func );
+            }
+        }
+        KMP_DEBUG_ASSERT( 0 < index && index < __kmp_user_lock_table.used );
+        KMP_DEBUG_ASSERT( __kmp_user_lock_size > 0 );
+        lck = __kmp_user_lock_table.table[index];
+    }
+    else {
+        lck = *( (kmp_user_lock_p *)user_lock );
+    }
+
+    if ( __kmp_env_consistency_check ) {
+        if ( lck == NULL ) {
+            KMP_FATAL( LockIsUninitialized, func );
+        }
+    }
+
+    return lck;
+}
+
+void
+__kmp_cleanup_user_locks( void )
+{
+    //
+    // Reset the lock pool. Do not worry about locks in the pool -- we will free
+    // them while iterating through the lock table (it includes all the locks,
+    // dead or alive).
+    //
+    __kmp_lock_pool = NULL;
+
+#define IS_CRITICAL(lck) \
+        ( ( __kmp_get_user_lock_flags_ != NULL ) && \
+        ( ( *__kmp_get_user_lock_flags_ )( lck ) & kmp_lf_critical_section ) )
+
+    //
+    // Loop through the lock table, freeing all locks.
+    //
+    // Do not free item [0]; it is reserved for the lock tables list.
+    //
+    // FIXME - we are iterating through a list of (pointers to) objects of
+    // type union kmp_user_lock, but we have no way of knowing whether the
+    // base type is currently "pool" or whatever the global user lock type
+    // is.
+    //
+    // We are relying on the fact that for all of the user lock types
+    // (except "tas"), the first field in the lock struct is the "initialized"
+    // field, which is set to the address of the lock object itself when
+    // the lock is initialized.  When the union is of type "pool", the
+    // first field is a pointer to the next object in the free list, which
+    // will not be the same address as the object itself.
+    //
+    // This means that the check ( *__kmp_is_user_lock_initialized_ )( lck )
+    // will fail for "pool" objects on the free list.  This must happen as
+    // the "location" field of real user locks overlaps the "index" field
+    // of "pool" objects.
+    //
+    // It would be better to run through the free list, and remove all "pool"
+    // objects from the lock table before executing this loop.  However,
+    // "pool" objects do not always have their index field set (only on
+    // lin_32e), and I don't want to search the lock table for the address
+    // of every "pool" object on the free list.
+    //
+    while ( __kmp_user_lock_table.used > 1 ) {
+        const ident_t *loc;
+
+        //
+        // Reduce __kmp_user_lock_table.used before freeing the lock,
+        // so that the state of the locks stays consistent.
+        //
+        kmp_user_lock_p lck = __kmp_user_lock_table.table[
+          --__kmp_user_lock_table.used ];
+
+        if ( ( __kmp_is_user_lock_initialized_ != NULL ) &&
+          ( *__kmp_is_user_lock_initialized_ )( lck ) ) {
+            //
+            // Issue a warning if: KMP_CONSISTENCY_CHECK AND lock is
+            // initialized AND it is NOT a critical section (user is not
+            // responsible for destroying criticals) AND we know source
+            // location to report.
+            //
+            if ( __kmp_env_consistency_check && ( ! IS_CRITICAL( lck ) ) &&
+              ( ( loc = __kmp_get_user_lock_location( lck ) ) != NULL ) &&
+              ( loc->psource != NULL ) ) {
+                kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 0 );
+                KMP_WARNING( CnsLockNotDestroyed, str_loc.file, str_loc.line );
+                __kmp_str_loc_free( &str_loc);
+            }
+
+#ifdef KMP_DEBUG
+            if ( IS_CRITICAL( lck ) ) {
+                KA_TRACE( 20, ("__kmp_cleanup_user_locks: free critical section lock %p (%p)\n", lck, *(void**)lck ) );
+            }
+            else {
+                KA_TRACE( 20, ("__kmp_cleanup_user_locks: free lock %p (%p)\n", lck, *(void**)lck ) );
+            }
+#endif // KMP_DEBUG
+
+            //
+            // Cleanup internal lock dynamic resources
+            // (for drdpa locks particularly).
+            //
+            __kmp_destroy_user_lock( lck );
+        }
+
+        //
+        // Free the lock if block allocation of locks is not used.
+        //
+        if ( __kmp_lock_blocks == NULL ) {
+            __kmp_free( lck );
+        }
+    }
+
+#undef IS_CRITICAL
+
+    //
+    // delete lock table(s).
+    //
+    kmp_user_lock_p *table_ptr = __kmp_user_lock_table.table;
+    __kmp_user_lock_table.table = NULL;
+    __kmp_user_lock_table.allocated = 0;
+
+    while ( table_ptr != NULL ) {
+        //
+        // In the first element we saved the pointer to the previous
+        // (smaller) lock table.
+        //
+        kmp_user_lock_p *next = (kmp_user_lock_p *)( table_ptr[ 0 ] );
+        __kmp_free( table_ptr );
+        table_ptr = next;
+    }
+
+    //
+    // Free buffers allocated for blocks of locks.
+    //
+    kmp_block_of_locks_t *block_ptr = __kmp_lock_blocks;
+    __kmp_lock_blocks = NULL;
+
+    while ( block_ptr != NULL ) {
+        kmp_block_of_locks_t *next = block_ptr->next_block;
+        __kmp_free( block_ptr->locks );
+        //
+        // *block_ptr itself was allocated at the end of the locks vector.
+        //
+        block_ptr = next;
+    }
+
+    TCW_4(__kmp_init_user_locks, FALSE);
+}
+
+#endif // KMP_USE_DYNAMIC_LOCK
diff --git a/final/runtime/src/kmp_lock.h b/final/runtime/src/kmp_lock.h
new file mode 100644
index 0000000..29a2e4f
--- /dev/null
+++ b/final/runtime/src/kmp_lock.h
@@ -0,0 +1,1242 @@
+/*
+ * kmp_lock.h -- lock header file
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_LOCK_H
+#define KMP_LOCK_H
+
+#include <limits.h>    // CHAR_BIT
+#include <stddef.h>    // offsetof
+
+#include "kmp_os.h"
+#include "kmp_debug.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// ----------------------------------------------------------------------------
+// Have to copy these definitions from kmp.h because kmp.h cannot be included
+// due to circular dependencies.  Will undef these at end of file.
+
+#define KMP_PAD(type, sz)     (sizeof(type) + (sz - ((sizeof(type) - 1) % (sz)) - 1))
+#define KMP_GTID_DNE (-2)
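+
+// Worked example (illustrative): KMP_PAD rounds sizeof(type) up to the next
+// multiple of sz.  For sizeof(type) == 40 and sz == 64:
+//     40 + (64 - ((40 - 1) % 64) - 1) == 40 + (64 - 39 - 1) == 64
+// so a 40-byte struct padded to a 64-byte cache line occupies exactly 64
+// bytes.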
+
+// Forward declaration of ident and ident_t
+
+struct ident;
+typedef struct ident ident_t;
+
+// End of copied code.
+// ----------------------------------------------------------------------------
+
+//
+// We need to know the size of the area we can assume that the compiler(s)
+// allocated for objects of type omp_lock_t and omp_nest_lock_t.  The Intel
+// compiler always allocates a pointer-sized area, as does Visual Studio.
+//
+// gcc, however, only allocates 4 bytes for regular locks, even on 64-bit
+// Intel archs.  It allocates at least 8 bytes for nested locks (more on
+// recent versions), but we are bounded by the pointer-sized chunks that
+// the Intel compiler allocates.
+//
+
+#if KMP_OS_LINUX && defined(KMP_GOMP_COMPAT)
+# define OMP_LOCK_T_SIZE        sizeof(int)
+# define OMP_NEST_LOCK_T_SIZE   sizeof(void *)
+#else
+# define OMP_LOCK_T_SIZE        sizeof(void *)
+# define OMP_NEST_LOCK_T_SIZE   sizeof(void *)
+#endif
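+
+//
+// Consequence of the sizes above (illustrative): whenever a lock object does
+// not fit in the compiler-allocated word, the runtime stores an index into a
+// lock table there rather than a pointer -- hence the recurring test
+//     if ( OMP_LOCK_T_SIZE < sizeof(void *) ) { /* store/look up an index */ }
+// in the allocation and lookup paths.
+//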
+
+//
+// The Intel compiler allocates a 32-byte chunk for a critical section.
+// Both gcc and Visual Studio only allocate enough space for a pointer.
+// Sometimes we know that the space was allocated by the Intel compiler.
+//
+#define OMP_CRITICAL_SIZE       sizeof(void *)
+#define INTEL_CRITICAL_SIZE     32
+
+//
+// lock flags
+//
+typedef kmp_uint32 kmp_lock_flags_t;
+
+#define kmp_lf_critical_section 1
+
+//
+// When a lock table is used, the indices are of type kmp_lock_index_t.
+//
+typedef kmp_uint32 kmp_lock_index_t;
+
+//
+// When memory allocated for locks is on the lock pool (free list),
+// it is treated as a struct of this type.
+//
+struct kmp_lock_pool {
+    union kmp_user_lock *next;
+    kmp_lock_index_t index;
+};
+
+typedef struct kmp_lock_pool kmp_lock_pool_t;
+
+
+extern void __kmp_validate_locks( void );
+
+
+// ----------------------------------------------------------------------------
+//
+//  There are 5 lock implementations:
+//
+//       1. Test and set locks.
+//       2. futex locks (Linux* OS on x86 and Intel(R) Many Integrated Core architecture)
+//       3. Ticket (Lamport bakery) locks.
+//       4. Queuing locks (with separate spin fields).
+//       5. DRDPA (Dynamically Reconfigurable Distributed Polling Area) locks
+//
+//   and 3 lock purposes:
+//
+//       1. Bootstrap locks -- Used for a few locks available at library startup-shutdown time.
+//          These do not require non-negative global thread IDs.
+//       2. Internal RTL locks -- Used everywhere else in the RTL
+//       3. User locks (includes critical sections)
+//
+// ----------------------------------------------------------------------------
+
+
+// ============================================================================
+// Lock implementations.
+// ============================================================================
+
+
+// ----------------------------------------------------------------------------
+// Test and set locks.
+//
+// Non-nested test and set locks differ from the other lock kinds (except
+// futex) in that we use the memory allocated by the compiler for the lock,
+// rather than a pointer to it.
+//
+// On lin32, lin_32e, and win_32, the space allocated may be as small as 4
+// bytes, so we have to use a lock table for nested locks, and avoid accessing
+// the depth_locked field for non-nested locks.
+//
+// Information normally available to the tools, such as lock location,
+// lock usage (normal lock vs. critical section), etc. is not available with
+// test and set locks.
+// ----------------------------------------------------------------------------
+
+struct kmp_base_tas_lock {
+    volatile kmp_int32 poll;         // 0 => unlocked
+                                     // locked: (gtid+1) of owning thread
+    kmp_int32          depth_locked; // depth locked, for nested locks only
+};
+
+typedef struct kmp_base_tas_lock kmp_base_tas_lock_t;
+
+union kmp_tas_lock {
+    kmp_base_tas_lock_t lk;
+    kmp_lock_pool_t pool;   // make certain struct is large enough
+    double lk_align;        // use worst case alignment
+                            // no cache line padding
+};
+
+typedef union kmp_tas_lock kmp_tas_lock_t;
+
+//
+// Static initializer for test and set lock variables. Usage:
+//    kmp_tas_lock_t xlock = KMP_TAS_LOCK_INITIALIZER( xlock );
+//
+#define KMP_TAS_LOCK_INITIALIZER( lock ) { { 0, 0 } }
+
+extern void __kmp_acquire_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_test_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_release_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid );
+extern void __kmp_init_tas_lock( kmp_tas_lock_t *lck );
+extern void __kmp_destroy_tas_lock( kmp_tas_lock_t *lck );
+
+extern void __kmp_acquire_nested_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_test_nested_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_release_nested_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid );
+extern void __kmp_init_nested_tas_lock( kmp_tas_lock_t *lck );
+extern void __kmp_destroy_nested_tas_lock( kmp_tas_lock_t *lck );
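+
+// Acquire sketch (illustrative only, not the actual implementation): a TAS
+// acquire is conceptually a compare-and-swap loop on the poll word,
+//     while ( ! KMP_COMPARE_AND_STORE_ACQ32( &lck->lk.poll, 0, gtid + 1 ) )
+//         ;    // spin (the real code backs off and yields)
+// storing gtid+1 so that 0 can keep meaning "unlocked".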
+
+#define KMP_LOCK_RELEASED       1
+#define KMP_LOCK_STILL_HELD     0
+
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+
+// ----------------------------------------------------------------------------
+// futex locks.  futex locks are only available on Linux* OS.
+//
+// Like non-nested test and set lock, non-nested futex locks use the memory
+// allocated by the compiler for the lock, rather than a pointer to it.
+//
+// Information normally available to the tools, such as lock location,
+// lock usage (normal lock vs. critical section), etc. is not available with
+// test and set locks.  With non-nested futex locks, the lock owner is not
+// even available.
+// ----------------------------------------------------------------------------
+
+struct kmp_base_futex_lock {
+    volatile kmp_int32 poll;         // 0 => unlocked
+                                     // locked: 2*(gtid+1) of owning thread
+    kmp_int32          depth_locked; // depth locked, for nested locks only
+};
+
+typedef struct kmp_base_futex_lock kmp_base_futex_lock_t;
+
+union kmp_futex_lock {
+    kmp_base_futex_lock_t lk;
+    kmp_lock_pool_t pool;   // make certain struct is large enough
+    double lk_align;        // use worst case alignment
+                            // no cache line padding
+};
+
+typedef union kmp_futex_lock kmp_futex_lock_t;
+
+//
+// Static initializer for futex lock variables. Usage:
+//    kmp_futex_lock_t xlock = KMP_FUTEX_LOCK_INITIALIZER( xlock );
+//
+#define KMP_FUTEX_LOCK_INITIALIZER( lock ) { { 0, 0 } }
+
+extern void __kmp_acquire_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_test_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_release_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid );
+extern void __kmp_init_futex_lock( kmp_futex_lock_t *lck );
+extern void __kmp_destroy_futex_lock( kmp_futex_lock_t *lck );
+
+extern void __kmp_acquire_nested_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_test_nested_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_release_nested_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid );
+extern void __kmp_init_nested_futex_lock( kmp_futex_lock_t *lck );
+extern void __kmp_destroy_nested_futex_lock( kmp_futex_lock_t *lck );
+
+#endif // KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+
+
+// ----------------------------------------------------------------------------
+// Ticket locks.
+// ----------------------------------------------------------------------------
+
+struct kmp_base_ticket_lock {
+    // `initialized' must be the first entry in the lock data structure!
+    volatile union kmp_ticket_lock * initialized;  // points to the lock union if in initialized state
+    ident_t const *     location;     // Source code location of omp_init_lock().
+    volatile kmp_uint32 next_ticket;  // ticket number to give to next thread which acquires
+    volatile kmp_uint32 now_serving;  // ticket number for thread which holds the lock
+    volatile kmp_int32  owner_id;     // (gtid+1) of owning thread, 0 if unlocked
+    kmp_int32           depth_locked; // depth locked, for nested locks only
+    kmp_lock_flags_t    flags;        // lock specifics, e.g. critical section lock
+};
+
+typedef struct kmp_base_ticket_lock kmp_base_ticket_lock_t;
+
+union KMP_ALIGN_CACHE kmp_ticket_lock {
+    kmp_base_ticket_lock_t lk;       // This field must be first to allow static initializing.
+    kmp_lock_pool_t pool;
+    double                 lk_align; // use worst case alignment
+    char                   lk_pad[ KMP_PAD( kmp_base_ticket_lock_t, CACHE_LINE ) ];
+};
+
+typedef union kmp_ticket_lock kmp_ticket_lock_t;
+
+//
+// Static initializer for simple ticket lock variables. Usage:
+//    kmp_ticket_lock_t xlock = KMP_TICKET_LOCK_INITIALIZER( xlock );
+// Note the macro argument. It is needed so that the `initialized' field can point back at the lock variable.
+//
+#define KMP_TICKET_LOCK_INITIALIZER( lock ) { { (kmp_ticket_lock_t *) & (lock), NULL, 0, 0, 0, -1 } }
+
+extern void __kmp_acquire_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_test_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_test_ticket_lock_with_cheks( kmp_ticket_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_release_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid );
+extern void __kmp_init_ticket_lock( kmp_ticket_lock_t *lck );
+extern void __kmp_destroy_ticket_lock( kmp_ticket_lock_t *lck );
+
+extern void __kmp_acquire_nested_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_test_nested_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_release_nested_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid );
+extern void __kmp_init_nested_ticket_lock( kmp_ticket_lock_t *lck );
+extern void __kmp_destroy_nested_ticket_lock( kmp_ticket_lock_t *lck );
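+
+// Protocol sketch (illustrative only): a ticket acquire atomically takes a
+// ticket and spins until served; release advances now_serving:
+//     my_ticket = KMP_TEST_THEN_INC32( (kmp_int32 *)&lck->lk.next_ticket );
+//     while ( lck->lk.now_serving != my_ticket )
+//         ;                                  // spin
+//     /* ... critical section ... */
+//     lck->lk.now_serving += 1;              // release
+// which grants the lock in strict FIFO order of arrival.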
+
+
+// ----------------------------------------------------------------------------
+// Queuing locks.
+// ----------------------------------------------------------------------------
+
+#if KMP_USE_ADAPTIVE_LOCKS
+
+struct kmp_adaptive_lock_info;
+
+typedef struct kmp_adaptive_lock_info kmp_adaptive_lock_info_t;
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+
+struct kmp_adaptive_lock_statistics {
+    /* So we can get stats from locks that haven't been destroyed. */
+    kmp_adaptive_lock_info_t * next;
+    kmp_adaptive_lock_info_t * prev;
+
+    /* Other statistics */
+    kmp_uint32 successfulSpeculations;
+    kmp_uint32 hardFailedSpeculations;
+    kmp_uint32 softFailedSpeculations;
+    kmp_uint32 nonSpeculativeAcquires;
+    kmp_uint32 nonSpeculativeAcquireAttempts;
+    kmp_uint32 lemmingYields;
+};
+
+typedef struct kmp_adaptive_lock_statistics kmp_adaptive_lock_statistics_t;
+
+extern void __kmp_print_speculative_stats();
+extern void __kmp_init_speculative_stats();
+
+#endif // KMP_DEBUG_ADAPTIVE_LOCKS
+
+struct kmp_adaptive_lock_info
+{
+    /* Values used for adaptivity.
+     * Although these are accessed from multiple threads, we don't access them atomically,
+     * because if we miss updates it probably doesn't matter much. (It just affects our
+     * decision about whether to try speculation on the lock.)
+     */
+    kmp_uint32 volatile badness;
+    kmp_uint32 volatile acquire_attempts;
+    /* Parameters of the lock. */
+    kmp_uint32 max_badness;
+    kmp_uint32 max_soft_retries;
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+    kmp_adaptive_lock_statistics_t volatile stats;
+#endif
+};
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+
+struct kmp_base_queuing_lock {
+
+    //  `initialized' must be the first entry in the lock data structure!
+    volatile union kmp_queuing_lock *initialized; // Points to the lock union if in initialized state.
+
+    ident_t const *     location;     // Source code location of omp_init_lock().
+
+    KMP_ALIGN( 8 )                    // tail_id  must be 8-byte aligned!
+
+    volatile kmp_int32  tail_id;      // (gtid+1) of thread at tail of wait queue, 0 if empty
+                                      // Must be no padding here since head/tail used in 8-byte CAS
+    volatile kmp_int32  head_id;      // (gtid+1) of thread at head of wait queue, 0 if empty
+                                      // Decl order assumes little endian
+    // bakery-style lock
+    volatile kmp_uint32 next_ticket;  // ticket number to give to next thread which acquires
+    volatile kmp_uint32 now_serving;  // ticket number for thread which holds the lock
+    volatile kmp_int32  owner_id;     // (gtid+1) of owning thread, 0 if unlocked
+    kmp_int32           depth_locked; // depth locked, for nested locks only
+
+    kmp_lock_flags_t    flags;        // lock specifics, e.g. critical section lock
+};
+
+typedef struct kmp_base_queuing_lock kmp_base_queuing_lock_t;
+
+KMP_BUILD_ASSERT( offsetof( kmp_base_queuing_lock_t, tail_id ) % 8 == 0 );
+
+union KMP_ALIGN_CACHE kmp_queuing_lock {
+    kmp_base_queuing_lock_t lk;       // This field must be first to allow static initializing.
+    kmp_lock_pool_t pool;
+    double                   lk_align; // use worst case alignment
+    char                     lk_pad[ KMP_PAD( kmp_base_queuing_lock_t, CACHE_LINE ) ];
+};
+
+typedef union kmp_queuing_lock kmp_queuing_lock_t;
+
+extern void __kmp_acquire_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_test_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_release_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid );
+extern void __kmp_init_queuing_lock( kmp_queuing_lock_t *lck );
+extern void __kmp_destroy_queuing_lock( kmp_queuing_lock_t *lck );
+
+extern void __kmp_acquire_nested_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_test_nested_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_release_nested_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid );
+extern void __kmp_init_nested_queuing_lock( kmp_queuing_lock_t *lck );
+extern void __kmp_destroy_nested_queuing_lock( kmp_queuing_lock_t *lck );
+
+#if KMP_USE_ADAPTIVE_LOCKS
+
+// ----------------------------------------------------------------------------
+// Adaptive locks.
+// ----------------------------------------------------------------------------
+struct kmp_base_adaptive_lock {
+    kmp_base_queuing_lock qlk;
+    KMP_ALIGN(CACHE_LINE)
+    kmp_adaptive_lock_info_t adaptive;     // Information for the speculative adaptive lock
+};
+
+typedef struct kmp_base_adaptive_lock kmp_base_adaptive_lock_t;
+
+union KMP_ALIGN_CACHE kmp_adaptive_lock {
+    kmp_base_adaptive_lock_t lk;
+    kmp_lock_pool_t pool;
+    double lk_align;
+    char lk_pad[ KMP_PAD(kmp_base_adaptive_lock_t, CACHE_LINE) ];
+};
+typedef union kmp_adaptive_lock kmp_adaptive_lock_t;
+
+# define GET_QLK_PTR(l) ((kmp_queuing_lock_t *) & (l)->lk.qlk)
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+// ----------------------------------------------------------------------------
+// DRDPA ticket locks.
+// ----------------------------------------------------------------------------
+
+struct kmp_base_drdpa_lock {
+    //
+    // All of the fields on the first cache line are only written when
+    // initializing or reconfiguring the lock.  These are relatively rare
+    // operations, so data from the first cache line will usually stay
+    // resident in the cache of each thread trying to acquire the lock.
+    //
+    // initialized must be the first entry in the lock data structure!
+    //
+    KMP_ALIGN_CACHE
+
+    volatile union kmp_drdpa_lock * initialized;    // points to the lock union if in initialized state
+    ident_t const *                 location;       // Source code location of omp_init_lock().
+    volatile struct kmp_lock_poll {
+        kmp_uint64 poll;
+    } * volatile polls;
+    volatile kmp_uint64             mask;           // is 2**num_polls-1 for mod op
+    kmp_uint64                      cleanup_ticket; // thread with cleanup ticket
+    volatile struct kmp_lock_poll * old_polls;      // will deallocate old_polls
+    kmp_uint32                      num_polls;      // must be power of 2
+
+    //
+    // next_ticket needs to exist in a separate cache line, as it is
+    // invalidated every time a thread takes a new ticket.
+    //
+    KMP_ALIGN_CACHE
+
+    volatile kmp_uint64             next_ticket;
+
+    //
+    // now_serving is used to store our ticket value while we hold the lock.
+    // It has a slightly different meaning in the DRDPA ticket locks (where
+    // it is written by the acquiring thread) than it does in the simple
+    // ticket locks (where it is written by the releasing thread).
+    //
+    // Since now_serving is only read and written in the critical section,
+    // it is non-volatile, but it needs to exist on a separate cache line,
+    // as it is invalidated at every lock acquire.
+    //
+    // Likewise, the vars used for nested locks (owner_id and depth_locked)
+    // are only written by the thread owning the lock, so they are put in
+    // this cache line.  owner_id is read by other threads, so it must be
+    // declared volatile.
+    //
+    KMP_ALIGN_CACHE
+
+    kmp_uint64                      now_serving;    // doesn't have to be volatile
+    volatile kmp_uint32             owner_id;       // (gtid+1) of owning thread, 0 if unlocked
+    kmp_int32                       depth_locked;   // depth locked
+    kmp_lock_flags_t                flags;          // lock specifics, e.g. critical section lock
+};
+
+typedef struct kmp_base_drdpa_lock kmp_base_drdpa_lock_t;
+
+union KMP_ALIGN_CACHE kmp_drdpa_lock {
+    kmp_base_drdpa_lock_t lk;       // This field must be first to allow static initializing.
+    kmp_lock_pool_t pool;
+    double                lk_align; // use worst case alignment
+    char                  lk_pad[ KMP_PAD( kmp_base_drdpa_lock_t, CACHE_LINE ) ];
+};
+
+typedef union kmp_drdpa_lock kmp_drdpa_lock_t;
+
+extern void __kmp_acquire_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_test_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_release_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid );
+extern void __kmp_init_drdpa_lock( kmp_drdpa_lock_t *lck );
+extern void __kmp_destroy_drdpa_lock( kmp_drdpa_lock_t *lck );
+
+extern void __kmp_acquire_nested_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_test_nested_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid );
+extern int __kmp_release_nested_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid );
+extern void __kmp_init_nested_drdpa_lock( kmp_drdpa_lock_t *lck );
+extern void __kmp_destroy_nested_drdpa_lock( kmp_drdpa_lock_t *lck );
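+
+// Polling sketch (illustrative only): unlike a simple ticket lock, each DRDPA
+// waiter spins on its own slot of the polls array, so a release invalidates
+// only one cache line:
+//     ticket = next_ticket++;                      // atomically
+//     while ( polls[ticket & mask].poll < ticket )
+//         ;                                        // spin on a private slot
+//     now_serving = ticket;                        // lock is held
+// Release then writes ticket + 1 into polls[(ticket + 1) & mask].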
+
+
+// ============================================================================
+// Lock purposes.
+// ============================================================================
+
+
+// ----------------------------------------------------------------------------
+// Bootstrap locks.
+// ----------------------------------------------------------------------------
+
+// Bootstrap locks -- very few locks used at library initialization time.
+// Bootstrap locks are currently implemented as ticket locks.
+// They could also be implemented as test and set locks, but cannot be
+// implemented with the other lock kinds, as those require gtids which are not
+// available at initialization time.
+
+typedef kmp_ticket_lock_t kmp_bootstrap_lock_t;
+
+#define KMP_BOOTSTRAP_LOCK_INITIALIZER( lock ) KMP_TICKET_LOCK_INITIALIZER( (lock) )
+
+static inline void
+__kmp_acquire_bootstrap_lock( kmp_bootstrap_lock_t *lck )
+{
+    __kmp_acquire_ticket_lock( lck, KMP_GTID_DNE );
+}
+
+static inline int
+__kmp_test_bootstrap_lock( kmp_bootstrap_lock_t *lck )
+{
+    return __kmp_test_ticket_lock( lck, KMP_GTID_DNE );
+}
+
+static inline void
+__kmp_release_bootstrap_lock( kmp_bootstrap_lock_t *lck )
+{
+    __kmp_release_ticket_lock( lck, KMP_GTID_DNE );
+}
+
+static inline void
+__kmp_init_bootstrap_lock( kmp_bootstrap_lock_t *lck )
+{
+    __kmp_init_ticket_lock( lck );
+}
+
+static inline void
+__kmp_destroy_bootstrap_lock( kmp_bootstrap_lock_t *lck )
+{
+    __kmp_destroy_ticket_lock( lck );
+}
+
+
+// ----------------------------------------------------------------------------
+// Internal RTL locks.
+// ----------------------------------------------------------------------------
+
+//
+// Internal RTL locks are also implemented as ticket locks, for now.
+//
+// FIXME - We should go through and figure out which lock kind works best for
+// each internal lock, and use the type declaration and function calls for
+// that explicit lock kind (and get rid of this section).
+//
+
+typedef kmp_ticket_lock_t kmp_lock_t;
+
+static inline void
+__kmp_acquire_lock( kmp_lock_t *lck, kmp_int32 gtid )
+{
+    __kmp_acquire_ticket_lock( lck, gtid );
+}
+
+static inline int
+__kmp_test_lock( kmp_lock_t *lck, kmp_int32 gtid )
+{
+    return __kmp_test_ticket_lock( lck, gtid );
+}
+
+static inline void
+__kmp_release_lock( kmp_lock_t *lck, kmp_int32 gtid )
+{
+    __kmp_release_ticket_lock( lck, gtid );
+}
+
+static inline void
+__kmp_init_lock( kmp_lock_t *lck )
+{
+    __kmp_init_ticket_lock( lck );
+}
+
+static inline void
+__kmp_destroy_lock( kmp_lock_t *lck )
+{
+    __kmp_destroy_ticket_lock( lck );
+}
+
+
+// ----------------------------------------------------------------------------
+// User locks.
+// ----------------------------------------------------------------------------
+
+//
+// Do not allocate objects of type union kmp_user_lock!!!
+// This will waste space unless __kmp_user_lock_kind == lk_drdpa.
+// Instead, check the value of __kmp_user_lock_kind and allocate objects of
+// the type of the appropriate union member, and cast their addresses to
+// kmp_user_lock_p.
+//
+
+enum kmp_lock_kind {
+    lk_default = 0,
+    lk_tas,
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    lk_futex,
+#endif
+    lk_ticket,
+    lk_queuing,
+    lk_drdpa,
+#if KMP_USE_ADAPTIVE_LOCKS
+    lk_adaptive
+#endif // KMP_USE_ADAPTIVE_LOCKS
+};
+
+typedef enum kmp_lock_kind kmp_lock_kind_t;
+
+extern kmp_lock_kind_t __kmp_user_lock_kind;
+
+union kmp_user_lock {
+    kmp_tas_lock_t     tas;
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    kmp_futex_lock_t   futex;
+#endif
+    kmp_ticket_lock_t  ticket;
+    kmp_queuing_lock_t queuing;
+    kmp_drdpa_lock_t   drdpa;
+#if KMP_USE_ADAPTIVE_LOCKS
+    kmp_adaptive_lock_t     adaptive;
+#endif // KMP_USE_ADAPTIVE_LOCKS
+    kmp_lock_pool_t    pool;
+};
+
+typedef union kmp_user_lock *kmp_user_lock_p;
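+
+// Allocation sketch (illustrative only): allocate the concrete member type
+// selected by __kmp_user_lock_kind, never the whole union, and view the result
+// through kmp_user_lock_p.  The runtime's own allocator achieves the same
+// effect via __kmp_user_lock_size (declared below):
+//
+//     kmp_user_lock_p lck;
+//     if ( __kmp_user_lock_kind == lk_ticket ) {
+//         lck = (kmp_user_lock_p) malloc( sizeof( kmp_ticket_lock_t ) );
+//     } else if ( __kmp_user_lock_kind == lk_queuing ) {
+//         lck = (kmp_user_lock_p) malloc( sizeof( kmp_queuing_lock_t ) );
+//     } // ... and similarly for the other union members.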
+
+#if ! KMP_USE_DYNAMIC_LOCK
+
+extern size_t __kmp_base_user_lock_size;
+extern size_t __kmp_user_lock_size;
+
+extern kmp_int32 ( *__kmp_get_user_lock_owner_ )( kmp_user_lock_p lck );
+
+static inline kmp_int32
+__kmp_get_user_lock_owner( kmp_user_lock_p lck )
+{
+    KMP_DEBUG_ASSERT( __kmp_get_user_lock_owner_ != NULL );
+    return ( *__kmp_get_user_lock_owner_ )( lck );
+}
+
+extern void ( *__kmp_acquire_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid );
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+
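+// Fast path: when __kmp_user_lock_kind == lk_tas the acquire is open-coded as
+// a macro -- spin on the poll word with a CAS of gtid + 1, yielding via
+// KMP_YIELD / KMP_YIELD_SPIN depending on oversubscription -- and every other
+// lock kind falls back to the bound function pointer.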
+#define __kmp_acquire_user_lock_with_checks(lck,gtid)                                           \
+    if (__kmp_user_lock_kind == lk_tas) {                                                       \
+        if ( __kmp_env_consistency_check ) {                                                    \
+            char const * const func = "omp_set_lock";                                           \
+            if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE )                               \
+                && lck->tas.lk.depth_locked != -1 ) {                                           \
+                KMP_FATAL( LockNestableUsedAsSimple, func );                                    \
+            }                                                                                   \
+            if ( ( gtid >= 0 ) && ( lck->tas.lk.poll - 1 == gtid ) ) {                          \
+                KMP_FATAL( LockIsAlreadyOwned, func );                                          \
+            }                                                                                   \
+        }                                                                                       \
+        if ( ( lck->tas.lk.poll != 0 ) ||                                                       \
+          ( ! KMP_COMPARE_AND_STORE_ACQ32( &(lck->tas.lk.poll), 0, gtid + 1 ) )  ) {            \
+            kmp_uint32 spins;                                                                   \
+            KMP_FSYNC_PREPARE( lck );                                                           \
+            KMP_INIT_YIELD( spins );                                                            \
+            if ( TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc) ) {     \
+                KMP_YIELD( TRUE );                                                              \
+            } else {                                                                            \
+                KMP_YIELD_SPIN( spins );                                                        \
+            }                                                                                   \
+            while ( ( lck->tas.lk.poll != 0 ) ||                                                \
+              ( ! KMP_COMPARE_AND_STORE_ACQ32( &(lck->tas.lk.poll), 0, gtid + 1 ) )  )  {       \
+                if ( TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc) ) { \
+                    KMP_YIELD( TRUE );                                                          \
+                } else {                                                                        \
+                    KMP_YIELD_SPIN( spins );                                                    \
+                }                                                                               \
+            }                                                                                   \
+        }                                                                                       \
+        KMP_FSYNC_ACQUIRED( lck );                                                              \
+    } else {                                                                                    \
+        KMP_DEBUG_ASSERT( __kmp_acquire_user_lock_with_checks_ != NULL );                       \
+        ( *__kmp_acquire_user_lock_with_checks_ )( lck, gtid );                                 \
+    }
+
+#else
+static inline void
+__kmp_acquire_user_lock_with_checks( kmp_user_lock_p lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( __kmp_acquire_user_lock_with_checks_ != NULL );
+    ( *__kmp_acquire_user_lock_with_checks_ )( lck, gtid );
+}
+#endif
+
+extern int ( *__kmp_test_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid );
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+
+#include "kmp_i18n.h"                       /* AC: KMP_FATAL definition */
+extern int __kmp_env_consistency_check;     /* AC: copy from kmp.h here */
+static inline int
+__kmp_test_user_lock_with_checks( kmp_user_lock_p lck, kmp_int32 gtid )
+{
+    if ( __kmp_user_lock_kind == lk_tas ) {
+        if ( __kmp_env_consistency_check ) {
+            char const * const func = "omp_test_lock";
+            if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE )
+                && lck->tas.lk.depth_locked != -1 ) {
+                KMP_FATAL( LockNestableUsedAsSimple, func );
+            }
+        }
+        return ( ( lck->tas.lk.poll == 0 ) &&
+          KMP_COMPARE_AND_STORE_ACQ32( &(lck->tas.lk.poll), 0, gtid + 1 ) );
+    } else {
+        KMP_DEBUG_ASSERT( __kmp_test_user_lock_with_checks_ != NULL );
+        return ( *__kmp_test_user_lock_with_checks_ )( lck, gtid );
+    }
+}
+#else
+static inline int
+__kmp_test_user_lock_with_checks( kmp_user_lock_p lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( __kmp_test_user_lock_with_checks_ != NULL );
+    return ( *__kmp_test_user_lock_with_checks_ )( lck, gtid );
+}
+#endif
+
+extern int ( *__kmp_release_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid );
+
+static inline void
+__kmp_release_user_lock_with_checks( kmp_user_lock_p lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( __kmp_release_user_lock_with_checks_ != NULL );
+    ( *__kmp_release_user_lock_with_checks_ ) ( lck, gtid );
+}
+
+extern void ( *__kmp_init_user_lock_with_checks_ )( kmp_user_lock_p lck );
+
+static inline void
+__kmp_init_user_lock_with_checks( kmp_user_lock_p lck )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_user_lock_with_checks_ != NULL );
+    ( *__kmp_init_user_lock_with_checks_ )( lck );
+}
+
+//
+// We need a non-checking version of destroy lock for when the RTL is
+// doing the cleanup as it can't always tell if the lock is nested or not.
+//
+extern void ( *__kmp_destroy_user_lock_ )( kmp_user_lock_p lck );
+
+static inline void
+__kmp_destroy_user_lock( kmp_user_lock_p lck )
+{
+    KMP_DEBUG_ASSERT( __kmp_destroy_user_lock_ != NULL );
+    ( *__kmp_destroy_user_lock_ )( lck );
+}
+
+extern void ( *__kmp_destroy_user_lock_with_checks_ )( kmp_user_lock_p lck );
+
+static inline void
+__kmp_destroy_user_lock_with_checks( kmp_user_lock_p lck )
+{
+    KMP_DEBUG_ASSERT( __kmp_destroy_user_lock_with_checks_ != NULL );
+    ( *__kmp_destroy_user_lock_with_checks_ )( lck );
+}
+
+extern void ( *__kmp_acquire_nested_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid );
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+
+#define __kmp_acquire_nested_user_lock_with_checks(lck,gtid)                                        \
+    if (__kmp_user_lock_kind == lk_tas) {                                                           \
+        if ( __kmp_env_consistency_check ) {                                                        \
+            char const * const func = "omp_set_nest_lock";                                          \
+            if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_NEST_LOCK_T_SIZE )                              \
+                && lck->tas.lk.depth_locked == -1 ) {                                               \
+                KMP_FATAL( LockSimpleUsedAsNestable, func );                                        \
+            }                                                                                       \
+        }                                                                                           \
+        if ( lck->tas.lk.poll - 1 == gtid ) {                                                       \
+            lck->tas.lk.depth_locked += 1;                                                          \
+        } else {                                                                                    \
+            if ( ( lck->tas.lk.poll != 0 ) ||                                                       \
+              ( ! KMP_COMPARE_AND_STORE_ACQ32( &(lck->tas.lk.poll), 0, gtid + 1 ) )  ) {            \
+                kmp_uint32 spins;                                                                   \
+                KMP_FSYNC_PREPARE( lck );                                                           \
+                KMP_INIT_YIELD( spins );                                                            \
+                if ( TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc) ) {     \
+                    KMP_YIELD( TRUE );                                                              \
+                } else {                                                                            \
+                    KMP_YIELD_SPIN( spins );                                                        \
+                }                                                                                   \
+                while ( ( lck->tas.lk.poll != 0 ) ||                                                \
+                  ( ! KMP_COMPARE_AND_STORE_ACQ32( &(lck->tas.lk.poll), 0, gtid + 1 ) )  ) {        \
+                    if ( TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc) ) { \
+                        KMP_YIELD( TRUE );                                                          \
+                    } else {                                                                        \
+                        KMP_YIELD_SPIN( spins );                                                    \
+                    }                                                                               \
+                }                                                                                   \
+            }                                                                                       \
+            lck->tas.lk.depth_locked = 1;                                                           \
+        }                                                                                           \
+        KMP_FSYNC_ACQUIRED( lck );                                                                  \
+    } else {                                                                                        \
+        KMP_DEBUG_ASSERT( __kmp_acquire_nested_user_lock_with_checks_ != NULL );                    \
+        ( *__kmp_acquire_nested_user_lock_with_checks_ )( lck, gtid );                              \
+    }
+
+#else
+static inline void
+__kmp_acquire_nested_user_lock_with_checks( kmp_user_lock_p lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( __kmp_acquire_nested_user_lock_with_checks_ != NULL );
+    ( *__kmp_acquire_nested_user_lock_with_checks_ )( lck, gtid );
+}
+#endif
+
+extern int ( *__kmp_test_nested_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid );
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+static inline int
+__kmp_test_nested_user_lock_with_checks( kmp_user_lock_p lck, kmp_int32 gtid )
+{
+    if ( __kmp_user_lock_kind == lk_tas ) {
+        int retval;
+        if ( __kmp_env_consistency_check ) {
+            char const * const func = "omp_test_nest_lock";
+            if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_NEST_LOCK_T_SIZE )
+                && lck->tas.lk.depth_locked == -1 ) {
+                KMP_FATAL( LockSimpleUsedAsNestable, func );
+            }
+        }
+        KMP_DEBUG_ASSERT( gtid >= 0 );
+        if ( lck->tas.lk.poll - 1 == gtid ) {   /* __kmp_get_tas_lock_owner( lck ) == gtid */
+            return ++lck->tas.lk.depth_locked;  /* same owner, depth increased */
+        }
+        retval = ( ( lck->tas.lk.poll == 0 ) &&
+          KMP_COMPARE_AND_STORE_ACQ32( &(lck->tas.lk.poll), 0, gtid + 1 ) );
+        if ( retval ) {
+            KMP_MB();
+            lck->tas.lk.depth_locked = 1;
+        }
+        return retval;
+    } else {
+        KMP_DEBUG_ASSERT( __kmp_test_nested_user_lock_with_checks_ != NULL );
+        return ( *__kmp_test_nested_user_lock_with_checks_ )( lck, gtid );
+    }
+}
+#else
+static inline int
+__kmp_test_nested_user_lock_with_checks( kmp_user_lock_p lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( __kmp_test_nested_user_lock_with_checks_ != NULL );
+    return ( *__kmp_test_nested_user_lock_with_checks_ )( lck, gtid );
+}
+#endif
+
+extern int ( *__kmp_release_nested_user_lock_with_checks_ )( kmp_user_lock_p lck, kmp_int32 gtid );
+
+static inline int
+__kmp_release_nested_user_lock_with_checks( kmp_user_lock_p lck, kmp_int32 gtid )
+{
+    KMP_DEBUG_ASSERT( __kmp_release_nested_user_lock_with_checks_ != NULL );
+    return ( *__kmp_release_nested_user_lock_with_checks_ )( lck, gtid );
+}
+
+extern void ( *__kmp_init_nested_user_lock_with_checks_ )( kmp_user_lock_p lck );
+
+static inline void __kmp_init_nested_user_lock_with_checks( kmp_user_lock_p lck )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_nested_user_lock_with_checks_ != NULL );
+    ( *__kmp_init_nested_user_lock_with_checks_ )( lck );
+}
+
+extern void ( *__kmp_destroy_nested_user_lock_with_checks_ )( kmp_user_lock_p lck );
+
+static inline void
+__kmp_destroy_nested_user_lock_with_checks( kmp_user_lock_p lck )
+{
+    KMP_DEBUG_ASSERT( __kmp_destroy_nested_user_lock_with_checks_ != NULL );
+    ( *__kmp_destroy_nested_user_lock_with_checks_ )( lck );
+}
+
+//
+// user lock functions which do not necessarily exist for all lock kinds.
+//
+// The "set" functions usually have wrapper routines that check for a NULL set
+// function pointer and call it if non-NULL.
+//
+// In some cases, it makes sense to have a "get" wrapper function check for a
+// NULL get function pointer and return NULL / invalid value / error code if
+// the function pointer is NULL.
+//
+// In other cases, the calling code really should differentiate between an
+// unimplemented function and one that is implemented but returns NULL / an
+// invalid value.  In those cases, no get function wrapper exists.
+//
+
+extern int ( *__kmp_is_user_lock_initialized_ )( kmp_user_lock_p lck );
+
+// no set function; fields are set during local allocation
+
+extern const ident_t * ( *__kmp_get_user_lock_location_ )( kmp_user_lock_p lck );
+
+static inline const ident_t *
+__kmp_get_user_lock_location( kmp_user_lock_p lck )
+{
+    if ( __kmp_get_user_lock_location_  != NULL ) {
+        return ( *__kmp_get_user_lock_location_ )( lck );
+    }
+    else {
+        return NULL;
+    }
+}
+
+extern void ( *__kmp_set_user_lock_location_ )( kmp_user_lock_p lck, const ident_t *loc );
+
+static inline void
+__kmp_set_user_lock_location( kmp_user_lock_p lck, const ident_t *loc )
+{
+    if ( __kmp_set_user_lock_location_  != NULL ) {
+        ( *__kmp_set_user_lock_location_ )( lck, loc );
+    }
+}
+
+extern kmp_lock_flags_t ( *__kmp_get_user_lock_flags_ )( kmp_user_lock_p lck );
+
+extern void ( *__kmp_set_user_lock_flags_ )( kmp_user_lock_p lck, kmp_lock_flags_t flags );
+
+static inline void
+__kmp_set_user_lock_flags( kmp_user_lock_p lck, kmp_lock_flags_t flags )
+{
+    if ( __kmp_set_user_lock_flags_  != NULL ) {
+        ( *__kmp_set_user_lock_flags_ )( lck, flags );
+    }
+}
+
+//
+// The function which sets up all of the vtbl pointers for kmp_user_lock_t.
+//
+extern void __kmp_set_user_lock_vptrs( kmp_lock_kind_t user_lock_kind );
+
+//
+// Macros for binding user lock functions.
+//
+#define KMP_BIND_USER_LOCK_TEMPLATE(nest, kind, suffix) {                                       \
+    __kmp_acquire##nest##user_lock_with_checks_ = ( void (*)( kmp_user_lock_p, kmp_int32 ) )    \
+                                                  __kmp_acquire##nest##kind##_##suffix;         \
+    __kmp_release##nest##user_lock_with_checks_ = ( int (*)( kmp_user_lock_p, kmp_int32 ) )     \
+                                                  __kmp_release##nest##kind##_##suffix;         \
+    __kmp_test##nest##user_lock_with_checks_    = ( int (*)( kmp_user_lock_p, kmp_int32 ) )     \
+                                                  __kmp_test##nest##kind##_##suffix;            \
+    __kmp_init##nest##user_lock_with_checks_    = ( void (*)( kmp_user_lock_p ) )               \
+                                                  __kmp_init##nest##kind##_##suffix;            \
+    __kmp_destroy##nest##user_lock_with_checks_ = ( void (*)( kmp_user_lock_p ) )               \
+                                                  __kmp_destroy##nest##kind##_##suffix;         \
+}
+
+#define KMP_BIND_USER_LOCK(kind)                    KMP_BIND_USER_LOCK_TEMPLATE(_, kind, lock)
+#define KMP_BIND_USER_LOCK_WITH_CHECKS(kind)        KMP_BIND_USER_LOCK_TEMPLATE(_, kind, lock_with_checks)
+#define KMP_BIND_NESTED_USER_LOCK(kind)             KMP_BIND_USER_LOCK_TEMPLATE(_nested_, kind, lock)
+#define KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(kind) KMP_BIND_USER_LOCK_TEMPLATE(_nested_, kind, lock_with_checks)
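+
+// Expansion example: KMP_BIND_USER_LOCK_WITH_CHECKS(ticket) instantiates the
+// template with nest == "_", kind == "ticket", suffix == "lock_with_checks",
+// so token pasting produces assignments such as
+//
+//     __kmp_acquire_user_lock_with_checks_ =
+//         ( void (*)( kmp_user_lock_p, kmp_int32 ) )
+//         __kmp_acquire_ticket_lock_with_checks;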
+
+// ----------------------------------------------------------------------------
+// User lock table & lock allocation
+// ----------------------------------------------------------------------------
+
+/*
+    On 64-bit Linux* OS (and OS X*) the GNU compiler allocates only 4 bytes of
+    memory for a lock variable, which is not enough to store a pointer, so we
+    have to use lock indexes instead of pointers and maintain a lock table to
+    map indexes to pointers.
+
+    Note: The first element of the table is not a pointer to a lock! It is a
+    pointer to the previously allocated table (or NULL if it is the first table).
+
+    Usage:
+
+        if ( OMP_LOCK_T_SIZE < sizeof( <lock> ) ) { // or OMP_NEST_LOCK_T_SIZE
+            The lock table is fully utilized.  User locks are indexes, so the
+            table is consulted on every user lock operation.
+            Note: it may be the case (lin_32) that we don't need to use a lock
+            table for regular locks, but do need the table for nested locks.
+        }
+        else {
+            The lock table is initialized but not actually used.
+        }
+*/
+
+struct kmp_lock_table {
+    kmp_lock_index_t  used;      // Number of used elements
+    kmp_lock_index_t  allocated; // Number of allocated elements
+    kmp_user_lock_p * table;     // Lock table.
+};
+
+typedef struct kmp_lock_table kmp_lock_table_t;
+
+extern kmp_lock_table_t __kmp_user_lock_table;
+extern kmp_user_lock_p __kmp_lock_pool;
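+
+// Lookup sketch (illustrative; __kmp_lookup_user_lock below adds the
+// consistency checking): when the user's lock variable is too small to hold a
+// pointer, it holds a kmp_lock_index_t instead, and the actual lock is reached
+// through the table.  Element 0 is reserved -- it links to the previously
+// allocated table.
+//
+//     kmp_lock_index_t idx = *( (kmp_lock_index_t *) user_lock );
+//     kmp_user_lock_p  lck = __kmp_user_lock_table.table[ idx ];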
+
+struct kmp_block_of_locks {
+    struct kmp_block_of_locks * next_block;
+    void *                      locks;
+};
+
+typedef struct kmp_block_of_locks kmp_block_of_locks_t;
+
+extern kmp_block_of_locks_t *__kmp_lock_blocks;
+extern int __kmp_num_locks_in_block;
+
+extern kmp_user_lock_p __kmp_user_lock_allocate( void **user_lock, kmp_int32 gtid, kmp_lock_flags_t flags );
+extern void __kmp_user_lock_free( void **user_lock, kmp_int32 gtid, kmp_user_lock_p lck );
+extern kmp_user_lock_p __kmp_lookup_user_lock( void **user_lock, char const *func );
+extern void __kmp_cleanup_user_locks();
+
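+// One-time, double-checked initialization of the user lock infrastructure:
+// __kmp_init_user_locks is re-read under the __kmp_initz_lock bootstrap lock
+// so that threads racing on first use initialize it exactly once (TCR_4 /
+// TCW_4 are the RTL's coherent 4-byte read/write macros).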
+#define KMP_CHECK_USER_LOCK_INIT() \
+        {                                                               \
+            if ( ! TCR_4( __kmp_init_user_locks ) ) {                   \
+                __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );      \
+                if ( ! TCR_4( __kmp_init_user_locks ) ) {               \
+                    TCW_4( __kmp_init_user_locks, TRUE );               \
+                }                                                       \
+                __kmp_release_bootstrap_lock( &__kmp_initz_lock );      \
+            }                                                           \
+        }
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+#undef KMP_PAD
+#undef KMP_GTID_DNE
+
+#if KMP_USE_DYNAMIC_LOCK
+
+#define DYNA_HAS_FUTEX          (KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM))
+#define DYNA_HAS_HLE            (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC)
+#define DYNA_USE_FAST_FUTEX     (0 && DYNA_HAS_FUTEX)
+#define DYNA_USE_FAST_TAS       (1 && DYNA_HAS_FUTEX)
+
+// List of lock definitions; all nested locks are indirect lock types.
+// The hle lock is an xchg lock prefixed with XACQUIRE/XRELEASE.
+#if DYNA_HAS_FUTEX
+# if DYNA_HAS_HLE
+#  define FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a) m(hle, a)
+#  define DYNA_LAST_D_LOCK_SEQ lockseq_hle
+# else
+#  define FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a)
+#  define DYNA_LAST_D_LOCK_SEQ lockseq_futex
+# endif // DYNA_HAS_HLE
+# if KMP_USE_ADAPTIVE_LOCKS
+#  define FOREACH_I_LOCK(m, a) m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a)   \
+                               m(nested_tas, a) m(nested_futex, a) m(nested_ticket, a) \
+                               m(nested_queuing, a) m(nested_drdpa, a)
+# else
+#  define FOREACH_I_LOCK(m, a) m(ticket, a) m(queuing, a)                m(drdpa, a)   \
+                               m(nested_tas, a) m(nested_futex, a) m(nested_ticket, a) \
+                               m(nested_queuing, a) m(nested_drdpa, a)
+# endif // KMP_USE_ADAPTIVE_LOCKS
+#else
+# if DYNA_HAS_HLE
+#  define FOREACH_D_LOCK(m, a) m(tas, a)             m(hle, a)
+#  define DYNA_LAST_D_LOCK_SEQ lockseq_hle
+# else
+#  define FOREACH_D_LOCK(m, a) m(tas, a)
+#  define DYNA_LAST_D_LOCK_SEQ lockseq_tas
+# endif // DYNA_HAS_HLE
+# if KMP_USE_ADAPTIVE_LOCKS
+#  define FOREACH_I_LOCK(m, a) m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a)   \
+                               m(nested_tas, a)                    m(nested_ticket, a) \
+                               m(nested_queuing, a) m(nested_drdpa, a)
+# else
+#  define FOREACH_I_LOCK(m, a) m(ticket, a) m(queuing, a)                m(drdpa, a)   \
+                               m(nested_tas, a)                    m(nested_ticket, a) \
+                               m(nested_queuing, a) m(nested_drdpa, a)
+# endif // KMP_USE_ADAPTIVE_LOCKS
+#endif // DYNA_HAS_FUTEX
+
+// Information used in dynamic dispatch
+#define DYNA_LOCK_VALUE_SHIFT 8
+#define DYNA_LOCK_TYPE_MASK   ((1<<DYNA_LOCK_VALUE_SHIFT)-1)
+#define DYNA_NUM_D_LOCKS      DYNA_LAST_D_LOCK_SEQ
+#define DYNA_NUM_I_LOCKS      (locktag_nested_drdpa+1)
+
+// Base type for dynamic locks.
+typedef kmp_uint32 kmp_dyna_lock_t;
+
+// Lock sequence that enumerates all lock kinds.
+// Always make this enumeration consistent with kmp_lockseq_t in the include directory.
+typedef enum {
+    lockseq_indirect = 0,
+#define expand_seq(l,a) lockseq_##l,
+    FOREACH_D_LOCK(expand_seq, 0)
+    FOREACH_I_LOCK(expand_seq, 0)
+#undef expand_seq
+} kmp_dyna_lockseq_t;
+
+// Enumerates indirect lock tags.
+typedef enum {
+#define expand_tag(l,a) locktag_##l,
+    FOREACH_I_LOCK(expand_tag, 0)
+#undef expand_tag
+} kmp_indirect_locktag_t;
+
+// Utility macros that extract information from lock sequences.
+#define DYNA_IS_D_LOCK(seq) (seq >= lockseq_tas && seq <= DYNA_LAST_D_LOCK_SEQ)
+#define DYNA_IS_I_LOCK(seq) (seq >= lockseq_ticket && seq <= lockseq_nested_drdpa)
+#define DYNA_GET_I_TAG(seq) (kmp_indirect_locktag_t)(seq - lockseq_ticket)
+#define DYNA_GET_D_TAG(seq) (seq<<1 | 1)
+
+// Enumerates direct lock tags starting from indirect tag.
+typedef enum {
+#define expand_tag(l,a) locktag_##l = DYNA_GET_D_TAG(lockseq_##l),
+    FOREACH_D_LOCK(expand_tag, 0)
+#undef expand_tag
+} kmp_direct_locktag_t;
+
+// Indirect lock type
+typedef struct {
+    kmp_user_lock_p lock;
+    kmp_indirect_locktag_t type;
+} kmp_indirect_lock_t;
+
+// Function tables for direct locks. The set/unset/test entries have separate variants with and without consistency checking.
+extern void (*__kmp_direct_init_ops[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t);
+extern void (*__kmp_direct_destroy_ops[])(kmp_dyna_lock_t *);
+extern void (*(*__kmp_direct_set_ops))(kmp_dyna_lock_t *, kmp_int32);
+extern void (*(*__kmp_direct_unset_ops))(kmp_dyna_lock_t *, kmp_int32);
+extern int  (*(*__kmp_direct_test_ops))(kmp_dyna_lock_t *, kmp_int32);
+
+// Function tables for indirect locks. The set/unset/test entries have separate variants with and without consistency checking.
+extern void (*__kmp_indirect_init_ops[])(kmp_user_lock_p);
+extern void (*__kmp_indirect_destroy_ops[])(kmp_user_lock_p);
+extern void (*(*__kmp_indirect_set_ops))(kmp_user_lock_p, kmp_int32);
+extern void (*(*__kmp_indirect_unset_ops))(kmp_user_lock_p, kmp_int32);
+extern int  (*(*__kmp_indirect_test_ops))(kmp_user_lock_p, kmp_int32);
+
+// Extracts direct lock tag from a user lock pointer
+#define DYNA_EXTRACT_D_TAG(l)   (*((kmp_dyna_lock_t *)(l)) & DYNA_LOCK_TYPE_MASK & -(*((kmp_dyna_lock_t *)(l)) & 1))
+
+// Extracts indirect lock index from a user lock pointer
+#define DYNA_EXTRACT_I_INDEX(l) (*(kmp_lock_index_t *)(l) >> 1)
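+
+// Worked example: lockseq_tas == 1 (the first entry after lockseq_indirect),
+// so locktag_tas == DYNA_GET_D_TAG(lockseq_tas) == (1<<1)|1 == 3.  Direct lock
+// tags are therefore always odd, while an indirect lock variable stores an
+// even value (its table index shifted left by one); for an indirect lock,
+// DYNA_EXTRACT_D_TAG() yields 0, dispatching through slot 0 of the direct
+// function tables -- the indirect entry (cf. DYNA_INIT_I_LOCK below).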
+
+// Returns the function pointer for operation op (init/destroy/set/unset/test) on the direct lock l (kmp_dyna_lock_t *).
+#define DYNA_D_LOCK_FUNC(l, op) __kmp_direct_##op##_ops[DYNA_EXTRACT_D_TAG(l)]
+
+// Returns the function pointer for operation op (init/destroy/set/unset/test) on the indirect lock l (kmp_indirect_lock_t *).
+#define DYNA_I_LOCK_FUNC(l, op) __kmp_indirect_##op##_ops[((kmp_indirect_lock_t *)(l))->type]
+
+// Initializes a direct lock with the given lock pointer and lock sequence.
+#define DYNA_INIT_D_LOCK(l, seq) __kmp_direct_init_ops[DYNA_GET_D_TAG(seq)]((kmp_dyna_lock_t *)l, seq)
+
+// Initializes an indirect lock with the given lock pointer and lock sequence.
+#define DYNA_INIT_I_LOCK(l, seq) __kmp_direct_init_ops[0]((kmp_dyna_lock_t *)(l), seq)
+
+// Returns "free" lock value for the given lock type.
+#define DYNA_LOCK_FREE(type)      (locktag_##type)
+
+// Returns "busy" lock value for the given lock teyp.
+#define DYNA_LOCK_BUSY(v, type)   ((v)<<DYNA_LOCK_VALUE_SHIFT | locktag_##type)
+
+// Returns lock value after removing (shifting) lock tag.
+#define DYNA_LOCK_STRIP(v)        ((v)>>DYNA_LOCK_VALUE_SHIFT)
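+
+// Worked example (illustrative): a direct tas lock at rest holds
+// DYNA_LOCK_FREE(tas) == locktag_tas.  A thread with gtid g can acquire it by
+// installing DYNA_LOCK_BUSY(g + 1, tas) ==
+// ((g + 1) << DYNA_LOCK_VALUE_SHIFT) | locktag_tas, and DYNA_LOCK_STRIP()
+// recovers g + 1 from the stored value.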
+
+// Updates __kmp_user_lock_seq with the given lock type.
+#define DYNA_STORE_LOCK_SEQ(type) (__kmp_user_lock_seq = lockseq_##type)
+
+// Internal entries for hinted lock initializers.
+extern void __kmp_init_lock_hinted(void **, int);
+extern void __kmp_init_nest_lock_hinted(void **, int);
+
+// Initializes global states and data structures for managing dynamic user locks.
+extern void __kmp_init_dynamic_user_locks();
+
+// Allocates and returns an indirect lock with the given indirect lock tag.
+extern kmp_indirect_lock_t * __kmp_allocate_indirect_lock(void **, kmp_int32, kmp_indirect_locktag_t);
+
+// Cleans up global states and data structures for managing dynamic user locks.
+extern void __kmp_cleanup_indirect_user_locks();
+
+// Default user lock sequence when not using hinted locks. 
+extern kmp_dyna_lockseq_t __kmp_user_lock_seq;
+
+// Jump table for "set lock location", available only for indirect locks.
+extern void (*__kmp_indirect_set_location[DYNA_NUM_I_LOCKS])(kmp_user_lock_p, const ident_t *);
+#define DYNA_SET_I_LOCK_LOCATION(lck, loc) {                        \
+    if (__kmp_indirect_set_location[(lck)->type] != NULL)           \
+        __kmp_indirect_set_location[(lck)->type]((lck)->lock, loc); \
+}
+
+// Jump table for "set lock flags", available only for indirect locks.
+extern void (*__kmp_indirect_set_flags[DYNA_NUM_I_LOCKS])(kmp_user_lock_p, kmp_lock_flags_t);
+#define DYNA_SET_I_LOCK_FLAGS(lck, flag) {                        \
+    if (__kmp_indirect_set_flags[(lck)->type] != NULL)            \
+        __kmp_indirect_set_flags[(lck)->type]((lck)->lock, flag); \
+}
+
+// Jump table for "get lock location", available only for indirect locks.
+extern const ident_t * (*__kmp_indirect_get_location[DYNA_NUM_I_LOCKS])(kmp_user_lock_p);
+#define DYNA_GET_I_LOCK_LOCATION(lck) ( __kmp_indirect_get_location[(lck)->type] != NULL      \
+                                      ? __kmp_indirect_get_location[(lck)->type]((lck)->lock) \
+                                      : NULL )
+
+// Jump table for "get lock flags", available only for indirect locks.
+extern kmp_lock_flags_t (*__kmp_indirect_get_flags[DYNA_NUM_I_LOCKS])(kmp_user_lock_p);
+#define DYNA_GET_I_LOCK_FLAGS(lck) ( __kmp_indirect_get_flags[(lck)->type] != NULL      \
+                                   ? __kmp_indirect_get_flags[(lck)->type]((lck)->lock) \
+                                   : NULL )
+
+//
+// Lock table for indirect locks.
+//
+// A simple linear structure is used to keep pointers to allocated indirect locks.
+extern kmp_indirect_lock_t **__kmp_indirect_lock_table;
+// Current size of the lock table; it may increase but never shrink.
+extern kmp_lock_index_t __kmp_indirect_lock_table_size;
+// Next index to be used for a new indirect lock (= number of indirect locks allocated).
+extern kmp_lock_index_t __kmp_indirect_lock_table_next;
+// Number of locks in a lock block, which is fixed to "1" now.
+// TODO: There is no lock block implementation yet. If we add one, we will need
+// to manage a lock block data structure for each indirect lock type.
+extern int __kmp_num_locks_in_block;
+
+// Fast lock table lookup without consistency checking
+#define DYNA_LOOKUP_I_LOCK(l) ( (OMP_LOCK_T_SIZE < sizeof(void *)) \
+                              ? __kmp_indirect_lock_table[DYNA_EXTRACT_I_INDEX(l)] \
+                              : *((kmp_indirect_lock_t **)l) )
+
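+// Usage sketch (illustrative): resolve the user's lock variable to the
+// indirect lock object, then dispatch through the per-type function table:
+//
+//     kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK( user_lock );
+//     DYNA_I_LOCK_FUNC( ilk, set )( ilk->lock, gtid );
+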
+// Used once in kmp_error.c
+extern kmp_int32
+__kmp_get_user_lock_owner(kmp_user_lock_p, kmp_uint32);
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+# define DYNA_LOCK_BUSY(v, type)    (v)
+# define DYNA_LOCK_FREE(type)       0
+# define DYNA_LOCK_STRIP(v)         (v)
+# define DYNA_STORE_LOCK_SEQ(seq)
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif /* KMP_LOCK_H */
+
diff --git a/final/runtime/src/kmp_omp.h b/final/runtime/src/kmp_omp.h
new file mode 100644
index 0000000..5a9419f
--- /dev/null
+++ b/final/runtime/src/kmp_omp.h
@@ -0,0 +1,227 @@
+#if USE_DEBUGGER
+/*
+ * kmp_omp.h -- OpenMP definition for kmp_omp_struct_info_t.
+ *              This is for information about runtime library structures.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/* THIS FILE SHOULD NOT BE MODIFIED IN IDB INTERFACE LIBRARY CODE.
+ * It should instead be modified in the OpenMP runtime and copied
+ * to the interface library code.  This way we can minimize the
+ * problems that having two copies of the same file is sure to cause.
+ *
+ * The files live in libomp and libomp_db/src/include.
+ */
+
+/* CHANGE THIS WHEN THE STRUCTURES BELOW CHANGE
+ * Before we release this to a customer, please don't change this value.  After
+ * it is released and stable, any new update to the structures or the data
+ * structure traversal algorithms must bump this value.
+ */
+#define KMP_OMP_VERSION 9
+
+typedef struct {
+    kmp_int32  offset;
+    kmp_int32  size;
+} offset_and_size_t;
+
+typedef struct {
+    kmp_uint64    addr;
+    kmp_int32     size;
+    kmp_int32     padding;
+} addr_and_size_t;
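+
+/* Debugger-side sketch (illustrative; read_target() stands for whatever
+ * target-memory primitive the debugger provides): a field described by an
+ * offset_and_size_t is read relative to its enclosing structure's address:
+ *
+ *     value = read_target( struct_addr + desc.offset, desc.size );
+ */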
+
+typedef struct {
+    kmp_uint64   flags;         // Flags for future extensions.
+    kmp_uint64   file;          // Pointer to name of source file where the parallel region is.
+    kmp_uint64   func;          // Pointer to name of routine where the parallel region is.
+    kmp_int32    begin;         // Beginning of source line range.
+    kmp_int32    end;           // End of source line range.
+    kmp_int32    num_threads;   // Specified number of threads.
+} kmp_omp_nthr_item_t;
+
+typedef struct {
+    kmp_int32     num;          // Number of items in the array.
+    kmp_uint64    array;        // Address of array of kmp_omp_nthr_item_t.
+} kmp_omp_nthr_info_t;
+
+
+/* This structure is known to the idb interface library */
+typedef struct {
+
+    /* Change this only if you make a fundamental data structure change here */
+    kmp_int32          lib_version;
+
+    /* Sanity check.  Should only be checked if versions are identical.
+     * This is also used for backward compatibility, to get the runtime
+     * structure size if the runtime is older than the interface. */
+    kmp_int32          sizeof_this_structure;
+
+    /* OpenMP RTL version info. */
+    addr_and_size_t    major;
+    addr_and_size_t    minor;
+    addr_and_size_t    build;
+    addr_and_size_t    openmp_version;
+    addr_and_size_t    banner;
+
+    /* Various globals. */
+    addr_and_size_t  threads;            // Pointer to __kmp_threads.
+    addr_and_size_t  roots;              // Pointer to __kmp_root.
+    addr_and_size_t  capacity;           // Pointer to __kmp_threads_capacity.
+    addr_and_size_t  monitor;            // Pointer to __kmp_monitor.
+#if ! KMP_USE_DYNAMIC_LOCK
+    addr_and_size_t  lock_table;         // Pointer to __kmp_lock_table.
+#endif
+    addr_and_size_t  func_microtask;
+    addr_and_size_t  func_fork;
+    addr_and_size_t  func_fork_teams;
+    addr_and_size_t  team_counter;
+    addr_and_size_t  task_counter;
+    addr_and_size_t  nthr_info;
+    kmp_int32        address_width;
+    kmp_int32        indexed_locks;
+    kmp_int32        last_barrier;       // The end in enum barrier_type
+    kmp_int32        deque_size;         // TASK_DEQUE_SIZE
+
+    /* thread structure information. */
+    kmp_int32          th_sizeof_struct;
+    offset_and_size_t  th_info;          // descriptor for thread
+    offset_and_size_t  th_team;          // team for this thread
+    offset_and_size_t  th_root;          // root for this thread
+    offset_and_size_t  th_serial_team;   // serial team under this thread
+    offset_and_size_t  th_ident;         // location for this thread (if available)
+    offset_and_size_t  th_spin_here;     // is thread waiting for lock (if available)
+    offset_and_size_t  th_next_waiting;  // next thread waiting for lock (if available)
+    offset_and_size_t  th_task_team;     // task team struct
+    offset_and_size_t  th_current_task;  // innermost task being executed
+    offset_and_size_t  th_task_state;    // alternating 0/1 for task team identification
+    offset_and_size_t  th_bar;
+    offset_and_size_t  th_b_worker_arrived; // incremented by 1 when the worker arrives at the barrier
+
+    /* teams information */
+    offset_and_size_t th_teams_microtask;// entry address for teams construct
+    offset_and_size_t th_teams_level;    // initial level of teams construct
+    offset_and_size_t th_teams_nteams;   // number of teams in a league
+    offset_and_size_t th_teams_nth;      // number of threads in each team of the league
+
+    /* kmp_desc structure (for info field above) */
+    kmp_int32          ds_sizeof_struct;
+    offset_and_size_t  ds_tid;           // team thread id
+    offset_and_size_t  ds_gtid;          // global thread id
+    offset_and_size_t  ds_thread;        // native thread id
+
+    /* team structure information */
+    kmp_int32          t_sizeof_struct;
+    offset_and_size_t  t_master_tid;     // tid of master in parent team
+    offset_and_size_t  t_ident;          // location of parallel region
+    offset_and_size_t  t_parent;         // parent team
+    offset_and_size_t  t_nproc;          // # team threads
+    offset_and_size_t  t_threads;        // array of threads
+    offset_and_size_t  t_serialized;     // # levels of serialized teams
+    offset_and_size_t  t_id;             // unique team id
+    offset_and_size_t  t_pkfn;
+    offset_and_size_t  t_task_team;      // task team structure
+    offset_and_size_t  t_implicit_task;  // taskdata for the thread's implicit task
+    offset_and_size_t  t_cancel_request;
+    offset_and_size_t  t_bar;
+    offset_and_size_t  t_b_master_arrived; // incremented by 1 when the master arrives at a barrier
+    offset_and_size_t  t_b_team_arrived;   // incremented by 1 when all the threads have arrived
+
+    /* root structure information */
+    kmp_int32          r_sizeof_struct;
+    offset_and_size_t  r_root_team;      // team at root
+    offset_and_size_t  r_hot_team;       // hot team for this root
+    offset_and_size_t  r_uber_thread;    // root thread
+    offset_and_size_t  r_root_id;        // unique root id (if available)
+
+    /* ident structure information */
+    kmp_int32          id_sizeof_struct;
+    offset_and_size_t  id_psource;       /* address of string ";file;func;line1;line2;;". */
+    offset_and_size_t  id_flags;
+
+    /* lock structure information */
+    kmp_int32          lk_sizeof_struct;
+    offset_and_size_t  lk_initialized;
+    offset_and_size_t  lk_location;
+    offset_and_size_t  lk_tail_id;
+    offset_and_size_t  lk_head_id;
+    offset_and_size_t  lk_next_ticket;
+    offset_and_size_t  lk_now_serving;
+    offset_and_size_t  lk_owner_id;
+    offset_and_size_t  lk_depth_locked;
+    offset_and_size_t  lk_lock_flags;
+
+#if ! KMP_USE_DYNAMIC_LOCK
+    /* lock_table_t */
+    kmp_int32          lt_size_of_struct;    /* Size and layout of kmp_lock_table_t. */
+    offset_and_size_t  lt_used;
+    offset_and_size_t  lt_allocated;
+    offset_and_size_t  lt_table;
+#endif
+
+    /* task_team_t */
+    kmp_int32          tt_sizeof_struct;
+    offset_and_size_t  tt_threads_data;
+    offset_and_size_t  tt_found_tasks;
+    offset_and_size_t  tt_nproc;
+    offset_and_size_t  tt_unfinished_threads;
+    offset_and_size_t  tt_active;
+
+    /* kmp_taskdata_t */
+    kmp_int32          td_sizeof_struct;
+    offset_and_size_t  td_task_id;                  // task id
+    offset_and_size_t  td_flags;                    // task flags
+    offset_and_size_t  td_team;                     // team for this task
+    offset_and_size_t  td_parent;                   // parent task
+    offset_and_size_t  td_level;                    // task nesting level
+    offset_and_size_t  td_ident;                    // task identifier
+    offset_and_size_t  td_allocated_child_tasks;    // child tasks (+ current task) not yet deallocated
+    offset_and_size_t  td_incomplete_child_tasks;   // child tasks not yet complete
+
+    /* Taskwait */
+    offset_and_size_t  td_taskwait_ident;
+    offset_and_size_t  td_taskwait_counter;
+    offset_and_size_t  td_taskwait_thread;          // gtid + 1 of the thread that encountered the taskwait
+
+    /* Taskgroup */
+    offset_and_size_t  td_taskgroup;                // pointer to the current taskgroup
+    offset_and_size_t  td_task_count;               // number of allocated and not yet complete tasks
+    offset_and_size_t  td_cancel;                   // request for cancellation of this taskgroup
+
+    /* Task dependency */
+    offset_and_size_t  td_depnode;                  // pointer to graph node if the task has dependencies
+    offset_and_size_t  dn_node;
+    offset_and_size_t  dn_next;
+    offset_and_size_t  dn_successors;
+    offset_and_size_t  dn_task;
+    offset_and_size_t  dn_npredecessors;
+    offset_and_size_t  dn_nrefs;
+    offset_and_size_t  dn_routine;
+
+    /* kmp_thread_data_t */
+    kmp_int32          hd_sizeof_struct;
+    offset_and_size_t  hd_deque;
+    offset_and_size_t  hd_deque_head;
+    offset_and_size_t  hd_deque_tail;
+    offset_and_size_t  hd_deque_ntasks;
+    offset_and_size_t  hd_deque_last_stolen;
+
+    // The last field of stable version.
+    kmp_uint64         last_field;
+
+} kmp_omp_struct_info_t;
+
+#endif /* USE_DEBUGGER */
+
+/* end of file */
diff --git a/final/runtime/src/kmp_os.h b/final/runtime/src/kmp_os.h
new file mode 100644
index 0000000..6b82bb4
--- /dev/null
+++ b/final/runtime/src/kmp_os.h
@@ -0,0 +1,728 @@
+/*
+ * kmp_os.h -- KPTS runtime header file.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_OS_H
+#define KMP_OS_H
+
+#include <stdlib.h>
+
+#define KMP_FTN_PLAIN   1
+#define KMP_FTN_APPEND  2
+#define KMP_FTN_UPPER   3
+/*
+#define KMP_FTN_PREPEND 4
+#define KMP_FTN_UAPPEND 5
+*/
+
+#define KMP_PTR_SKIP    (sizeof(void*))
+
+/* -------------------------- Compiler variations ------------------------ */
+
+#define KMP_OFF				0
+#define KMP_ON				1
+
+#define KMP_MEM_CONS_VOLATILE		0
+#define KMP_MEM_CONS_FENCE		1
+
+#ifndef KMP_MEM_CONS_MODEL
+# define KMP_MEM_CONS_MODEL	 KMP_MEM_CONS_VOLATILE
+#endif
+
+/* ------------------------- Compiler recognition ---------------------- */
+#define KMP_COMPILER_ICC 0
+#define KMP_COMPILER_GCC 0
+#define KMP_COMPILER_CLANG 0
+#define KMP_COMPILER_MSVC 0
+
+#if defined( __INTEL_COMPILER )
+# undef KMP_COMPILER_ICC
+# define KMP_COMPILER_ICC 1
+#elif defined( __clang__ )
+# undef KMP_COMPILER_CLANG
+# define KMP_COMPILER_CLANG 1
+#elif defined( __GNUC__ )
+# undef KMP_COMPILER_GCC
+# define KMP_COMPILER_GCC 1
+#elif defined( _MSC_VER )
+# undef KMP_COMPILER_MSVC
+# define KMP_COMPILER_MSVC 1
+#else
+# error Unknown compiler
+#endif
+
+#include "kmp_platform.h"
+
+#if (KMP_OS_LINUX || KMP_OS_WINDOWS) && !KMP_OS_CNK && !KMP_ARCH_PPC64
+# define KMP_AFFINITY_SUPPORTED 1
+# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
+#  define KMP_GROUP_AFFINITY    1
+# else
+#  define KMP_GROUP_AFFINITY    0
+# endif
+#else
+# define KMP_AFFINITY_SUPPORTED 0
+# define KMP_GROUP_AFFINITY     0
+#endif
+
+/* Check for quad-precision extension. */
+#define KMP_HAVE_QUAD 0
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+# if KMP_COMPILER_ICC
+   /* _Quad is already defined for icc */
+#  undef  KMP_HAVE_QUAD
+#  define KMP_HAVE_QUAD 1
+# elif KMP_COMPILER_CLANG
+   /* Clang doesn't support a software-implemented
+      128-bit extended precision type yet */
+   typedef long double _Quad;
+# elif KMP_COMPILER_GCC
+   typedef __float128 _Quad;
+#  undef  KMP_HAVE_QUAD
+#  define KMP_HAVE_QUAD 1
+# elif KMP_COMPILER_MSVC
+   typedef long double _Quad;
+# endif
+#else
+# if __LDBL_MAX_EXP__ >= 16384 && KMP_COMPILER_GCC
+   typedef long double _Quad;
+#  undef  KMP_HAVE_QUAD
+#  define KMP_HAVE_QUAD 1
+# endif
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#if KMP_OS_WINDOWS
+  typedef char              kmp_int8;
+  typedef unsigned char     kmp_uint8;
+  typedef short             kmp_int16;
+  typedef unsigned short    kmp_uint16;
+  typedef int               kmp_int32;
+  typedef unsigned int      kmp_uint32;
+# define KMP_INT32_SPEC     "d"
+# define KMP_UINT32_SPEC    "u"
+# ifndef KMP_STRUCT64
+   typedef __int64 		kmp_int64;
+   typedef unsigned __int64 	kmp_uint64;
+   #define KMP_INT64_SPEC 	"I64d"
+   #define KMP_UINT64_SPEC	"I64u"
+# else
+   struct kmp_struct64 {
+    kmp_int32   a,b;
+   };
+   typedef struct kmp_struct64 kmp_int64;
+   typedef struct kmp_struct64 kmp_uint64;
+   /* Not sure what to use for KMP_[U]INT64_SPEC here */
+# endif
+# if KMP_ARCH_X86_64
+#  define KMP_INTPTR 1
+   typedef __int64         	kmp_intptr_t;
+   typedef unsigned __int64	kmp_uintptr_t;
+#  define KMP_INTPTR_SPEC  	"I64d"
+#  define KMP_UINTPTR_SPEC 	"I64u"
+# endif
+#endif /* KMP_OS_WINDOWS */
+
+#if KMP_OS_UNIX
+  typedef char               kmp_int8;
+  typedef unsigned char      kmp_uint8;
+  typedef short              kmp_int16;
+  typedef unsigned short     kmp_uint16;
+  typedef int                kmp_int32;
+  typedef unsigned int       kmp_uint32;
+  typedef long long          kmp_int64;
+  typedef unsigned long long kmp_uint64;
+# define KMP_INT32_SPEC      "d"
+# define KMP_UINT32_SPEC     "u"
+# define KMP_INT64_SPEC	     "lld"
+# define KMP_UINT64_SPEC     "llu"
+#endif /* KMP_OS_UNIX */
+
+#if KMP_ARCH_X86 || KMP_ARCH_ARM
+# define KMP_SIZE_T_SPEC KMP_UINT32_SPEC
+#elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64
+# define KMP_SIZE_T_SPEC KMP_UINT64_SPEC
+#else
+# error "Can't determine size_t printf format specifier."
+#endif
+
+#if KMP_ARCH_X86
+# define KMP_SIZE_T_MAX (0xFFFFFFFF)
+#else
+# define KMP_SIZE_T_MAX (0xFFFFFFFFFFFFFFFF)
+#endif
+
+typedef size_t  kmp_size_t;
+typedef float   kmp_real32;
+typedef double  kmp_real64;
+
+#ifndef KMP_INTPTR
+# define KMP_INTPTR 1
+  typedef long             kmp_intptr_t;
+  typedef unsigned long    kmp_uintptr_t;
+# define KMP_INTPTR_SPEC   "ld"
+# define KMP_UINTPTR_SPEC  "lu"
+#endif
+
+#ifdef KMP_I8
+  typedef kmp_int64      kmp_int;
+  typedef kmp_uint64     kmp_uint;
+# define  KMP_INT_SPEC	 KMP_INT64_SPEC
+# define  KMP_UINT_SPEC	 KMP_UINT64_SPEC
+# define  KMP_INT_MAX    ((kmp_int64)0x7FFFFFFFFFFFFFFFLL)
+# define  KMP_INT_MIN    ((kmp_int64)0x8000000000000000LL)
+#else
+  typedef kmp_int32      kmp_int;
+  typedef kmp_uint32     kmp_uint;
+# define  KMP_INT_SPEC	 KMP_INT32_SPEC
+# define  KMP_UINT_SPEC	 KMP_UINT32_SPEC
+# define  KMP_INT_MAX    ((kmp_int32)0x7FFFFFFF)
+# define  KMP_INT_MIN    ((kmp_int32)0x80000000)
+#endif /* KMP_I8 */
+
+#ifdef __cplusplus
+    //-------------------------------------------------------------------------
+    // template for debug prints specification ( d, u, lld, llu ), and to obtain
+    // signed/unsigned flavors of a type
+    template< typename T >
+    struct traits_t {
+        typedef T           signed_t;
+        typedef T           unsigned_t;
+        typedef T           floating_t;
+        static char const * spec;
+    };
+    // int
+    template<>
+    struct traits_t< signed int > {
+        typedef signed int    signed_t;
+        typedef unsigned int  unsigned_t;
+        typedef double        floating_t;
+        static char const *   spec;
+    };
+    // unsigned int
+    template<>
+    struct traits_t< unsigned int > {
+        typedef signed int    signed_t;
+        typedef unsigned int  unsigned_t;
+        typedef double        floating_t;
+        static char const *   spec;
+    };
+    // long long
+    template<>
+    struct traits_t< signed long long > {
+        typedef signed long long    signed_t;
+        typedef unsigned long long  unsigned_t;
+        typedef long double         floating_t;
+        static char const *         spec;
+    };
+    // unsigned long long
+    template<>
+    struct traits_t< unsigned long long > {
+        typedef signed long long    signed_t;
+        typedef unsigned long long  unsigned_t;
+        typedef long double         floating_t;
+        static char const *         spec;
+    };
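+    //
+    // Usage sketch (illustrative): the spec string supplies the printf
+    // conversion for a type at run time, e.g.:
+    //
+    //     char format[ 16 ];
+    //     snprintf( format, sizeof( format ), "%%%s", traits_t< kmp_int64 >::spec );
+    //     // format is now "%lld" on Unix-like targets (cf. KMP_INT64_SPEC above).
+    //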
+    //-------------------------------------------------------------------------
+#endif // __cplusplus
+
+#define KMP_EXPORT	extern	/* export declaration in guide libraries */
+
+#if __GNUC__ >= 4
+    #define __forceinline __inline
+#endif
+
+#define PAGE_SIZE                       (0x4000)
+#define PAGE_ALIGNED(_addr)     ( ! ((size_t) _addr & \
+                                     (size_t)(PAGE_SIZE - 1)))
+#define ALIGN_TO_PAGE(x)   (void *)(((size_t)(x)) & ~((size_t)(PAGE_SIZE - 1)))
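+
+// Worked example: with PAGE_SIZE == 0x4000, ALIGN_TO_PAGE( (void *)0x12345 )
+// masks away the low 14 bits and yields 0x10000; PAGE_ALIGNED( 0x10000 ) is
+// true, while PAGE_ALIGNED( 0x12345 ) is false.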
+
+/* ---------------------- Support for cache alignment, padding, etc. -----------------*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#define INTERNODE_CACHE_LINE 4096 /* for multi-node systems */
+
+/* Define the default size of the cache line */
+#ifndef CACHE_LINE
+    #define CACHE_LINE                  128         /* cache line size in bytes */
+#else
+    #if ( CACHE_LINE < 64 ) && ! defined( KMP_OS_DARWIN )
+        // 2006-02-13: This produces too many warnings on OS X*. Disable it for a while...
+        #warning CACHE_LINE is too small.
+    #endif
+#endif /* CACHE_LINE */
+
+#define KMP_CACHE_PREFETCH(ADDR) 	/* nothing */
+
+/* Temporary note: if performance testing of this passes, we can remove
+   all references to KMP_DO_ALIGN and replace with KMP_ALIGN.  */
+#if KMP_OS_UNIX && defined(__GNUC__)
+# define KMP_DO_ALIGN(bytes)  __attribute__((aligned(bytes)))
+# define KMP_ALIGN_CACHE      __attribute__((aligned(CACHE_LINE)))
+# define KMP_ALIGN_CACHE_INTERNODE __attribute__((aligned(INTERNODE_CACHE_LINE)))
+# define KMP_ALIGN(bytes)     __attribute__((aligned(bytes)))
+#else
+# define KMP_DO_ALIGN(bytes)  __declspec( align(bytes) )
+# define KMP_ALIGN_CACHE      __declspec( align(CACHE_LINE) )
+# define KMP_ALIGN_CACHE_INTERNODE      __declspec( align(INTERNODE_CACHE_LINE) )
+# define KMP_ALIGN(bytes)     __declspec( align(bytes) )
+#endif
+
+#if defined(__MIC__) || defined(__MIC2__)
+    #define KMP_MIC  1
+// Intel(R) Composer XE (13.0) defines both __MIC__ and __MIC2__ !
+# if __MIC2__ || __KNC__
+    #define KMP_MIC1 0
+    #define KMP_MIC2 1
+# else
+    #define KMP_MIC1 1
+    #define KMP_MIC2 0
+# endif
+#else
+    #define KMP_MIC  0
+    #define KMP_MIC1 0
+    #define KMP_MIC2 0
+#endif
+
+/* General purpose fence types for memory operations */
+enum kmp_mem_fence_type {
+    kmp_no_fence,         /* No memory fence */
+    kmp_acquire_fence,    /* Acquire (read) memory fence */
+    kmp_release_fence,    /* Release (write) memory fence */
+    kmp_full_fence        /* Full (read+write) memory fence */
+};
+
+
+//
+// Synchronization primitives
+//
+
+#if KMP_ASM_INTRINS && KMP_OS_WINDOWS
+
+#include <Windows.h>
+
+#pragma intrinsic(InterlockedExchangeAdd)
+#pragma intrinsic(InterlockedCompareExchange)
+#pragma intrinsic(InterlockedExchange)
+#pragma intrinsic(InterlockedExchange64)
+
+//
+// Using InterlockedIncrement / InterlockedDecrement causes a library loading
+// ordering problem, so we use InterlockedExchangeAdd instead.
+//
+# define KMP_TEST_THEN_INC32(p)                 InterlockedExchangeAdd( (volatile long *)(p), 1 )
+# define KMP_TEST_THEN_INC_ACQ32(p)             InterlockedExchangeAdd( (volatile long *)(p), 1 )
+# define KMP_TEST_THEN_ADD4_32(p)               InterlockedExchangeAdd( (volatile long *)(p), 4 )
+# define KMP_TEST_THEN_ADD4_ACQ32(p)            InterlockedExchangeAdd( (volatile long *)(p), 4 )
+# define KMP_TEST_THEN_DEC32(p)                 InterlockedExchangeAdd( (volatile long *)(p), -1 )
+# define KMP_TEST_THEN_DEC_ACQ32(p)             InterlockedExchangeAdd( (volatile long *)(p), -1 )
+# define KMP_TEST_THEN_ADD32(p, v)              InterlockedExchangeAdd( (volatile long *)(p), (v) )
+
+extern kmp_int8 __kmp_test_then_add8( volatile kmp_int8 *p, kmp_int8 v );
+extern kmp_int8 __kmp_test_then_or8( volatile kmp_int8 *p, kmp_int8 v );
+extern kmp_int8 __kmp_test_then_and8( volatile kmp_int8 *p, kmp_int8 v );
+# define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) InterlockedCompareExchange( (volatile long *)(p),(long)(sv),(long)(cv) )
+
+# define KMP_XCHG_FIXED32(p, v)                 InterlockedExchange( (volatile long *)(p), (long)(v) )
+# define KMP_XCHG_FIXED64(p, v)                 InterlockedExchange64( (volatile kmp_int64 *)(p), (kmp_int64)(v) )
+
+inline kmp_real32 KMP_XCHG_REAL32( volatile kmp_real32 *p, kmp_real32 v)
+{
+    kmp_int32 tmp = InterlockedExchange( (volatile long *)p, *(long *)&v);
+    return *(kmp_real32*)&tmp;
+}
+
+//
+// Routines that we still need to implement in assembly.
+//
+extern kmp_int32 __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 v );
+extern kmp_int32 __kmp_test_then_or32( volatile kmp_int32 *p, kmp_int32 v );
+extern kmp_int32 __kmp_test_then_and32( volatile kmp_int32 *p, kmp_int32 v );
+extern kmp_int64 __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 v );
+extern kmp_int64 __kmp_test_then_or64( volatile kmp_int64 *p, kmp_int64 v );
+extern kmp_int64 __kmp_test_then_and64( volatile kmp_int64 *p, kmp_int64 v );
+
+extern kmp_int8 __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+extern kmp_int16 __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+extern kmp_int32 __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+extern kmp_int32 __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+extern kmp_int8  __kmp_compare_and_store_ret8(  volatile kmp_int8  *p, kmp_int8  cv, kmp_int8  sv );
+extern kmp_int16 __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+extern kmp_int32 __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+extern kmp_int64 __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+
+extern kmp_int8  __kmp_xchg_fixed8( volatile kmp_int8  *p, kmp_int8  v );
+extern kmp_int16 __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 v );
+extern kmp_int32 __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 v );
+extern kmp_int64 __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 v );
+extern kmp_real32 __kmp_xchg_real32( volatile kmp_real32 *p, kmp_real32 v );
+extern kmp_real64 __kmp_xchg_real64( volatile kmp_real64 *p, kmp_real64 v );
+# define KMP_TEST_THEN_ADD8(p, v)              __kmp_test_then_add8( (p), (v) )
+
+//# define KMP_TEST_THEN_INC32(p)                 __kmp_test_then_add32( (p), 1 )
+# define KMP_TEST_THEN_OR8(p, v)               __kmp_test_then_or8( (p), (v) )
+# define KMP_TEST_THEN_AND8(p, v)              __kmp_test_then_and8( (p), (v) )
+//# define KMP_TEST_THEN_INC_ACQ32(p)             __kmp_test_then_add32( (p), 1 )
+# define KMP_TEST_THEN_INC64(p)                 __kmp_test_then_add64( (p), 1LL )
+# define KMP_TEST_THEN_INC_ACQ64(p)             __kmp_test_then_add64( (p), 1LL )
+//# define KMP_TEST_THEN_ADD4_32(p)               __kmp_test_then_add32( (p), 4 )
+//# define KMP_TEST_THEN_ADD4_ACQ32(p)            __kmp_test_then_add32( (p), 4 )
+# define KMP_TEST_THEN_ADD4_64(p)               __kmp_test_then_add64( (p), 4LL )
+# define KMP_TEST_THEN_ADD4_ACQ64(p)            __kmp_test_then_add64( (p), 4LL )
+//# define KMP_TEST_THEN_DEC32(p)                 __kmp_test_then_add32( (p), -1 )
+//# define KMP_TEST_THEN_DEC_ACQ32(p)             __kmp_test_then_add32( (p), -1 )
+# define KMP_TEST_THEN_DEC64(p)                 __kmp_test_then_add64( (p), -1LL )
+# define KMP_TEST_THEN_DEC_ACQ64(p)             __kmp_test_then_add64( (p), -1LL )
+//# define KMP_TEST_THEN_ADD32(p, v)              __kmp_test_then_add32( (p), (v) )
+# define KMP_TEST_THEN_ADD64(p, v)              __kmp_test_then_add64( (p), (v) )
+
+# define KMP_TEST_THEN_OR32(p, v)               __kmp_test_then_or32( (p), (v) )
+# define KMP_TEST_THEN_AND32(p, v)              __kmp_test_then_and32( (p), (v) )
+# define KMP_TEST_THEN_OR64(p, v)               __kmp_test_then_or64( (p), (v) )
+# define KMP_TEST_THEN_AND64(p, v)              __kmp_test_then_and64( (p), (v) )
+
+# define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv)  __kmp_compare_and_store8( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_REL8(p, cv, sv)  __kmp_compare_and_store8( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) __kmp_compare_and_store16( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) __kmp_compare_and_store16( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) __kmp_compare_and_store32( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) __kmp_compare_and_store32( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) __kmp_compare_and_store64( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) __kmp_compare_and_store64( (p), (cv), (sv) )
+
+# if KMP_ARCH_X86
+#  define KMP_COMPARE_AND_STORE_PTR(p, cv, sv)  __kmp_compare_and_store32( (volatile kmp_int32*)(p), (kmp_int32)(cv), (kmp_int32)(sv) )
+# else /* 64 bit pointers */
+#  define KMP_COMPARE_AND_STORE_PTR(p, cv, sv)  __kmp_compare_and_store64( (volatile kmp_int64*)(p), (kmp_int64)(cv), (kmp_int64)(sv) )
+# endif /* KMP_ARCH_X86 */
+
+# define KMP_COMPARE_AND_STORE_RET8(p, cv, sv)  __kmp_compare_and_store_ret8( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) __kmp_compare_and_store_ret16( (p), (cv), (sv) )
+//# define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) __kmp_compare_and_store_ret32( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) __kmp_compare_and_store_ret64( (p), (cv), (sv) )
+
+// Note: no trailing semicolons here -- these macros must be usable in expressions.
+# define KMP_XCHG_FIXED8(p, v)                  __kmp_xchg_fixed8( (p), (v) )
+# define KMP_XCHG_FIXED16(p, v)                 __kmp_xchg_fixed16( (p), (v) )
+//# define KMP_XCHG_FIXED32(p, v)                 __kmp_xchg_fixed32( (p), (v) )
+//# define KMP_XCHG_FIXED64(p, v)                 __kmp_xchg_fixed64( (p), (v) )
+//# define KMP_XCHG_REAL32(p, v)                  __kmp_xchg_real32( (p), (v) )
+# define KMP_XCHG_REAL64(p, v)                  __kmp_xchg_real64( (p), (v) )
+
+
+#elif (KMP_ASM_INTRINS && KMP_OS_UNIX) || !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
+# define KMP_TEST_THEN_ADD8(p, v)               __sync_fetch_and_add( (kmp_int8 *)(p), (v) )
+
+/* cast p to correct type so that proper intrinsic will be used */
+# define KMP_TEST_THEN_INC32(p)                 __sync_fetch_and_add( (kmp_int32 *)(p), 1 )
+# define KMP_TEST_THEN_OR8(p, v)                __sync_fetch_and_or( (kmp_int8 *)(p), (v) )
+# define KMP_TEST_THEN_AND8(p, v)               __sync_fetch_and_and( (kmp_int8 *)(p), (v) )
+# define KMP_TEST_THEN_INC_ACQ32(p)             __sync_fetch_and_add( (kmp_int32 *)(p), 1 )
+# define KMP_TEST_THEN_INC64(p)                 __sync_fetch_and_add( (kmp_int64 *)(p), 1LL )
+# define KMP_TEST_THEN_INC_ACQ64(p)             __sync_fetch_and_add( (kmp_int64 *)(p), 1LL )
+# define KMP_TEST_THEN_ADD4_32(p)               __sync_fetch_and_add( (kmp_int32 *)(p), 4 )
+# define KMP_TEST_THEN_ADD4_ACQ32(p)            __sync_fetch_and_add( (kmp_int32 *)(p), 4 )
+# define KMP_TEST_THEN_ADD4_64(p)               __sync_fetch_and_add( (kmp_int64 *)(p), 4LL )
+# define KMP_TEST_THEN_ADD4_ACQ64(p)            __sync_fetch_and_add( (kmp_int64 *)(p), 4LL )
+# define KMP_TEST_THEN_DEC32(p)                 __sync_fetch_and_sub( (kmp_int32 *)(p), 1 )
+# define KMP_TEST_THEN_DEC_ACQ32(p)             __sync_fetch_and_sub( (kmp_int32 *)(p), 1 )
+# define KMP_TEST_THEN_DEC64(p)                 __sync_fetch_and_sub( (kmp_int64 *)(p), 1LL )
+# define KMP_TEST_THEN_DEC_ACQ64(p)             __sync_fetch_and_sub( (kmp_int64 *)(p), 1LL )
+# define KMP_TEST_THEN_ADD32(p, v)              __sync_fetch_and_add( (kmp_int32 *)(p), (v) )
+# define KMP_TEST_THEN_ADD64(p, v)              __sync_fetch_and_add( (kmp_int64 *)(p), (v) )
+
+# define KMP_TEST_THEN_OR32(p, v)               __sync_fetch_and_or( (kmp_int32 *)(p), (v) )
+# define KMP_TEST_THEN_AND32(p, v)              __sync_fetch_and_and( (kmp_int32 *)(p), (v) )
+# define KMP_TEST_THEN_OR64(p, v)               __sync_fetch_and_or( (kmp_int64 *)(p), (v) )
+# define KMP_TEST_THEN_AND64(p, v)              __sync_fetch_and_and( (kmp_int64 *)(p), (v) )
+
+# define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv)  __sync_bool_compare_and_swap( (volatile kmp_uint8 *)(p),(kmp_uint8)(cv),(kmp_uint8)(sv) )
+# define KMP_COMPARE_AND_STORE_REL8(p, cv, sv)  __sync_bool_compare_and_swap( (volatile kmp_uint8 *)(p),(kmp_uint8)(cv),(kmp_uint8)(sv) )
+# define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) __sync_bool_compare_and_swap( (volatile kmp_uint16 *)(p),(kmp_uint16)(cv),(kmp_uint16)(sv) )
+# define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) __sync_bool_compare_and_swap( (volatile kmp_uint16 *)(p),(kmp_uint16)(cv),(kmp_uint16)(sv) )
+# define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) __sync_bool_compare_and_swap( (volatile kmp_uint32 *)(p),(kmp_uint32)(cv),(kmp_uint32)(sv) )
+# define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) __sync_bool_compare_and_swap( (volatile kmp_uint32 *)(p),(kmp_uint32)(cv),(kmp_uint32)(sv) )
+# define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) __sync_bool_compare_and_swap( (volatile kmp_uint64 *)(p),(kmp_uint64)(cv),(kmp_uint64)(sv) )
+# define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) __sync_bool_compare_and_swap( (volatile kmp_uint64 *)(p),(kmp_uint64)(cv),(kmp_uint64)(sv) )
+# define KMP_COMPARE_AND_STORE_PTR(p, cv, sv)   __sync_bool_compare_and_swap( (volatile void **)(p),(void *)(cv),(void *)(sv) )
+
+# define KMP_COMPARE_AND_STORE_RET8(p, cv, sv)  __sync_val_compare_and_swap( (volatile kmp_uint8 *)(p),(kmp_uint8)(cv),(kmp_uint8)(sv) )
+# define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) __sync_val_compare_and_swap( (volatile kmp_uint16 *)(p),(kmp_uint16)(cv),(kmp_uint16)(sv) )
+# define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) __sync_val_compare_and_swap( (volatile kmp_uint32 *)(p),(kmp_uint32)(cv),(kmp_uint32)(sv) )
+# define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) __sync_val_compare_and_swap( (volatile kmp_uint64 *)(p),(kmp_uint64)(cv),(kmp_uint64)(sv) )
+
+#define KMP_XCHG_FIXED8(p, v)                   __sync_lock_test_and_set( (volatile kmp_uint8 *)(p), (kmp_uint8)(v) )
+#define KMP_XCHG_FIXED16(p, v)                  __sync_lock_test_and_set( (volatile kmp_uint16 *)(p), (kmp_uint16)(v) )
+#define KMP_XCHG_FIXED32(p, v)                  __sync_lock_test_and_set( (volatile kmp_uint32 *)(p), (kmp_uint32)(v) )
+#define KMP_XCHG_FIXED64(p, v)                  __sync_lock_test_and_set( (volatile kmp_uint64 *)(p), (kmp_uint64)(v) )
+
+extern kmp_int8 __kmp_test_then_add8( volatile kmp_int8 *p, kmp_int8 v );
+extern kmp_int8 __kmp_test_then_or8( volatile kmp_int8 *p, kmp_int8 v );
+extern kmp_int8 __kmp_test_then_and8( volatile kmp_int8 *p, kmp_int8 v );
+inline kmp_real32 KMP_XCHG_REAL32( volatile kmp_real32 *p, kmp_real32 v)
+{
+    kmp_int32 tmp = __sync_lock_test_and_set( (kmp_int32*)p, *(kmp_int32*)&v);
+    return *(kmp_real32*)&tmp;
+}
+
+inline kmp_real64 KMP_XCHG_REAL64( volatile kmp_real64 *p, kmp_real64 v)
+{
+    kmp_int64 tmp = __sync_lock_test_and_set( (kmp_int64*)p, *(kmp_int64*)&v);
+    return *(kmp_real64*)&tmp;
+}
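+
+// Note: the casts above reinterpret the float's bit pattern through an
+// integer pointer, which formally violates C strict-aliasing rules. A
+// strictly conforming sketch of the same bit swap (not the shipped code)
+// would round-trip through memcpy instead:
+//
+//     kmp_int32 bits, old;
+//     memcpy( &bits, &v, sizeof(bits) );                       // real32 bits -> int32
+//     old = __sync_lock_test_and_set( (kmp_int32 *)p, bits );
+//     memcpy( &v, &old, sizeof(v) );                           // int32 bits -> real32
+//     return v;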
+
+#else
+
+extern kmp_int32 __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 v );
+extern kmp_int32 __kmp_test_then_or32( volatile kmp_int32 *p, kmp_int32 v );
+extern kmp_int32 __kmp_test_then_and32( volatile kmp_int32 *p, kmp_int32 v );
+extern kmp_int64 __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 v );
+extern kmp_int64 __kmp_test_then_or64( volatile kmp_int64 *p, kmp_int64 v );
+extern kmp_int64 __kmp_test_then_and64( volatile kmp_int64 *p, kmp_int64 v );
+
+extern kmp_int8 __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+extern kmp_int16 __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+extern kmp_int32 __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+extern kmp_int32 __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+extern kmp_int8  __kmp_compare_and_store_ret8(  volatile kmp_int8  *p, kmp_int8  cv, kmp_int8  sv );
+extern kmp_int16 __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+extern kmp_int32 __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+extern kmp_int64 __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+
+extern kmp_int8  __kmp_xchg_fixed8( volatile kmp_int8  *p, kmp_int8  v );
+extern kmp_int16 __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 v );
+extern kmp_int32 __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 v );
+extern kmp_int64 __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 v );
+extern kmp_real32 __kmp_xchg_real32( volatile kmp_real32 *p, kmp_real32 v );
+extern kmp_real64 __kmp_xchg_real64( volatile kmp_real64 *p, kmp_real64 v );
+
+extern kmp_int8 __kmp_test_then_add8( volatile kmp_int8 *p, kmp_int8 v );
+extern kmp_int8 __kmp_test_then_or8( volatile kmp_int8 *p, kmp_int8 v );
+extern kmp_int8 __kmp_test_then_and8( volatile kmp_int8 *p, kmp_int8 v );
+
+# define KMP_TEST_THEN_ADD8(p, v)               __kmp_test_then_add8( (p), (v) )
+# define KMP_TEST_THEN_INC32(p)                 __kmp_test_then_add32( (p), 1 )
+# define KMP_TEST_THEN_OR8(p, v)                __kmp_test_then_or8( (p), (v) )
+# define KMP_TEST_THEN_AND8(p, v)               __kmp_test_then_and8( (p), (v) )
+# define KMP_TEST_THEN_INC_ACQ32(p)             __kmp_test_then_add32( (p), 1 )
+# define KMP_TEST_THEN_INC64(p)                 __kmp_test_then_add64( (p), 1LL )
+# define KMP_TEST_THEN_INC_ACQ64(p)             __kmp_test_then_add64( (p), 1LL )
+# define KMP_TEST_THEN_ADD4_32(p)               __kmp_test_then_add32( (p), 4 )
+# define KMP_TEST_THEN_ADD4_ACQ32(p)            __kmp_test_then_add32( (p), 4 )
+# define KMP_TEST_THEN_ADD4_64(p)               __kmp_test_then_add64( (p), 4LL )
+# define KMP_TEST_THEN_ADD4_ACQ64(p)            __kmp_test_then_add64( (p), 4LL )
+# define KMP_TEST_THEN_DEC32(p)                 __kmp_test_then_add32( (p), -1 )
+# define KMP_TEST_THEN_DEC_ACQ32(p)             __kmp_test_then_add32( (p), -1 )
+# define KMP_TEST_THEN_DEC64(p)                 __kmp_test_then_add64( (p), -1LL )
+# define KMP_TEST_THEN_DEC_ACQ64(p)             __kmp_test_then_add64( (p), -1LL )
+# define KMP_TEST_THEN_ADD32(p, v)              __kmp_test_then_add32( (p), (v) )
+# define KMP_TEST_THEN_ADD64(p, v)              __kmp_test_then_add64( (p), (v) )
+
+# define KMP_TEST_THEN_OR32(p, v)               __kmp_test_then_or32( (p), (v) )
+# define KMP_TEST_THEN_AND32(p, v)              __kmp_test_then_and32( (p), (v) )
+# define KMP_TEST_THEN_OR64(p, v)               __kmp_test_then_or64( (p), (v) )
+# define KMP_TEST_THEN_AND64(p, v)              __kmp_test_then_and64( (p), (v) )
+
+# define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv)  __kmp_compare_and_store8( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_REL8(p, cv, sv)  __kmp_compare_and_store8( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) __kmp_compare_and_store16( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) __kmp_compare_and_store16( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) __kmp_compare_and_store32( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) __kmp_compare_and_store32( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) __kmp_compare_and_store64( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) __kmp_compare_and_store64( (p), (cv), (sv) )
+
+# if KMP_ARCH_X86
+#  define KMP_COMPARE_AND_STORE_PTR(p, cv, sv)  __kmp_compare_and_store32( (volatile kmp_int32*)(p), (kmp_int32)(cv), (kmp_int32)(sv) )
+# else /* 64 bit pointers */
+#  define KMP_COMPARE_AND_STORE_PTR(p, cv, sv)  __kmp_compare_and_store64( (volatile kmp_int64*)(p), (kmp_int64)(cv), (kmp_int64)(sv) )
+# endif /* KMP_ARCH_X86 */
+
+# define KMP_COMPARE_AND_STORE_RET8(p, cv, sv)  __kmp_compare_and_store_ret8( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) __kmp_compare_and_store_ret16( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) __kmp_compare_and_store_ret32( (p), (cv), (sv) )
+# define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) __kmp_compare_and_store_ret64( (p), (cv), (sv) )
+
+// Note: no trailing semicolons here -- these macros must be usable in expressions.
+# define KMP_XCHG_FIXED8(p, v)                  __kmp_xchg_fixed8( (p), (v) )
+# define KMP_XCHG_FIXED16(p, v)                 __kmp_xchg_fixed16( (p), (v) )
+# define KMP_XCHG_FIXED32(p, v)                 __kmp_xchg_fixed32( (p), (v) )
+# define KMP_XCHG_FIXED64(p, v)                 __kmp_xchg_fixed64( (p), (v) )
+# define KMP_XCHG_REAL32(p, v)                  __kmp_xchg_real32( (p), (v) )
+# define KMP_XCHG_REAL64(p, v)                  __kmp_xchg_real64( (p), (v) )
+
+#endif /* KMP_ASM_INTRINS */
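+
+// Usage sketch (hypothetical variable names): whichever branch above was
+// selected, the macros behave uniformly. For example:
+//
+//     kmp_int64 counter = 0;
+//     kmp_int64 old = KMP_TEST_THEN_ADD64( &counter, 1LL );  // fetch-and-add; returns the old value
+//
+//     kmp_int32 flag = 0;
+//     kmp_int32 cur  = flag;
+//     // boolean compare-and-store retry loop:
+//     while ( ! KMP_COMPARE_AND_STORE_REL32( &flag, cur, cur + 1 ) ) {
+//         cur = flag;   // lost the race: re-read and retry
+//     }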
+
+
+/* ------------- relaxed consistency memory model stuff ------------------ */
+
+#if KMP_OS_WINDOWS
+# ifdef __ABSOFT_WIN
+#   define KMP_MB()     asm ("nop")
+#   define KMP_IMB()    asm ("nop")
+# else
+#   define KMP_MB()     /* _asm{ nop } */
+#   define KMP_IMB()    /* _asm{ nop } */
+# endif
+#endif /* KMP_OS_WINDOWS */
+
+#if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64
+# define KMP_MB()       __sync_synchronize()
+#endif
+
+#ifndef KMP_MB
+# define KMP_MB()       /* nothing to do */
+#endif
+
+#ifndef KMP_IMB
+# define KMP_IMB()      /* nothing to do */
+#endif
+
+#ifndef KMP_ST_REL32
+# define KMP_ST_REL32(A,D)      ( *(A) = (D) )
+#endif
+
+#ifndef KMP_ST_REL64
+# define KMP_ST_REL64(A,D)      ( *(A) = (D) )
+#endif
+
+#ifndef KMP_LD_ACQ32
+# define KMP_LD_ACQ32(A)        ( *(A) )
+#endif
+
+#ifndef KMP_LD_ACQ64
+# define KMP_LD_ACQ64(A)        ( *(A) )
+#endif
+
+#define TCR_1(a)            (a)
+#define TCW_1(a,b)          (a) = (b)
+/* ------------------------------------------------------------------------ */
+//
+// FIXME - maybe this should this be
+//
+// #define TCR_4(a)    (*(volatile kmp_int32 *)(&a))
+// #define TCW_4(a,b)  (a) = (*(volatile kmp_int32 *)&(b))
+//
+// #define TCR_8(a)    (*(volatile kmp_int64 *)(a))
+// #define TCW_8(a,b)  (a) = (*(volatile kmp_int64 *)(&b))
+//
+// I'm fairly certain this is the correct thing to do, but I'm afraid
+// of performance regressions.
+//
+
+#define TCR_4(a)            (a)
+#define TCW_4(a,b)          (a) = (b)
+#define TCR_8(a)            (a)
+#define TCW_8(a,b)          (a) = (b)
+#define TCR_SYNC_4(a)       (a)
+#define TCW_SYNC_4(a,b)     (a) = (b)
+#define TCX_SYNC_4(a,b,c)   KMP_COMPARE_AND_STORE_REL32((volatile kmp_int32 *)(volatile void *)&(a), (kmp_int32)(b), (kmp_int32)(c))
+#define TCR_SYNC_8(a)       (a)
+#define TCW_SYNC_8(a,b)     (a) = (b)
+#define TCX_SYNC_8(a,b,c)   KMP_COMPARE_AND_STORE_REL64((volatile kmp_int64 *)(volatile void *)&(a), (kmp_int64)(b), (kmp_int64)(c))
+
+#if KMP_ARCH_X86
+// What about ARM?
+    #define TCR_PTR(a)          ((void *)TCR_4(a))
+    #define TCW_PTR(a,b)        TCW_4((a),(b))
+    #define TCR_SYNC_PTR(a)     ((void *)TCR_SYNC_4(a))
+    #define TCW_SYNC_PTR(a,b)   TCW_SYNC_4((a),(b))
+    #define TCX_SYNC_PTR(a,b,c) ((void *)TCX_SYNC_4((a),(b),(c)))
+
+#else /* 64 bit pointers */
+
+    #define TCR_PTR(a)          ((void *)TCR_8(a))
+    #define TCW_PTR(a,b)        TCW_8((a),(b))
+    #define TCR_SYNC_PTR(a)     ((void *)TCR_SYNC_8(a))
+    #define TCW_SYNC_PTR(a,b)   TCW_SYNC_8((a),(b))
+    #define TCX_SYNC_PTR(a,b,c) ((void *)TCX_SYNC_8((a),(b),(c)))
+
+#endif /* KMP_ARCH_X86 */
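+
+// Usage sketch: the TC* wrappers mark (and make greppable) every racy read or
+// write of a shared location, so they can be strengthened later in one place.
+// For example, as used further down in the runtime:
+//
+//     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR( __kmp_threads[i] );  // shared read
+//     TCW_PTR( thr->th.th_info.ds.ds_stackbase, stack_addr );            // shared write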
+
+/*
+ * If these FTN_{TRUE,FALSE} values change, may need to
+ * change several places where they are used to check that
+ * language is Fortran, not C.
+ */
+
+#ifndef FTN_TRUE
+# define FTN_TRUE       TRUE
+#endif
+
+#ifndef FTN_FALSE
+# define FTN_FALSE      FALSE
+#endif
+
+typedef void    (*microtask_t)( int *gtid, int *npr, ... );
+
+#ifdef USE_VOLATILE_CAST
+# define VOLATILE_CAST(x)        (volatile x)
+#else
+# define VOLATILE_CAST(x)        (x)
+#endif
+
+#ifdef KMP_I8
+# define KMP_WAIT_YIELD           __kmp_wait_yield_8
+# define KMP_EQ                   __kmp_eq_8
+# define KMP_NEQ                  __kmp_neq_8
+# define KMP_LT                   __kmp_lt_8
+# define KMP_GE                   __kmp_ge_8
+# define KMP_LE                   __kmp_le_8
+#else
+# define KMP_WAIT_YIELD           __kmp_wait_yield_4
+# define KMP_EQ                   __kmp_eq_4
+# define KMP_NEQ                  __kmp_neq_4
+# define KMP_LT                   __kmp_lt_4
+# define KMP_GE                   __kmp_ge_4
+# define KMP_LE                   __kmp_le_4
+#endif /* KMP_I8 */
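+
+// Usage sketch: the pieces selected above combine into a yielding spin-wait;
+// e.g. (as in the ordered-section code later in the runtime, with my_tid
+// standing in for the caller's tid):
+//
+//     KMP_WAIT_YIELD( &team->t.t_ordered.dt.t_value, my_tid, KMP_EQ, NULL );
+//
+// blocks (yielding periodically) until the location compares equal to my_tid.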
+
+/* Workaround for Intel(R) 64 code gen bug when taking address of static array (Intel(R) 64 Tracker #138) */
+#if (KMP_ARCH_X86_64 || KMP_ARCH_PPC64) && KMP_OS_LINUX
+# define STATIC_EFI2_WORKAROUND
+#else
+# define STATIC_EFI2_WORKAROUND static
+#endif
+
+// Support of BGET usage
+#ifndef KMP_USE_BGET
+#define KMP_USE_BGET 1
+#endif
+
+
+// Switches for OSS builds
+#ifndef USE_SYSFS_INFO
+# define USE_SYSFS_INFO  0
+#endif
+#ifndef USE_CMPXCHG_FIX
+# define USE_CMPXCHG_FIX 1
+#endif
+
+// Enable dynamic user lock
+#ifndef KMP_USE_DYNAMIC_LOCK
+# define KMP_USE_DYNAMIC_LOCK 0
+#endif
+
+// Warning levels
+enum kmp_warnings_level {
+    kmp_warnings_off = 0,		/* No warnings */
+    kmp_warnings_low,			/* Minimal warnings (default) */
+    kmp_warnings_explicit = 6,		/* Explicitly set to ON - more warnings */
+    kmp_warnings_verbose		/* reserved */
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+// Safe C API
+#include "kmp_safe_c_api.h"
+
+#endif /* KMP_OS_H */
+
diff --git a/final/runtime/src/kmp_platform.h b/final/runtime/src/kmp_platform.h
new file mode 100644
index 0000000..9f6b0c4
--- /dev/null
+++ b/final/runtime/src/kmp_platform.h
@@ -0,0 +1,147 @@
+/*
+ * kmp_platform.h -- header for determining operating system and architecture
+ */
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_PLATFORM_H
+#define KMP_PLATFORM_H
+
+/* ---------------------- Operating system recognition ------------------- */
+
+#define KMP_OS_LINUX    0
+#define KMP_OS_FREEBSD  0
+#define KMP_OS_DARWIN   0
+#define KMP_OS_WINDOWS  0
+#define KMP_OS_CNK      0
+#define KMP_OS_UNIX     0  /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. */
+
+
+#ifdef _WIN32
+# undef KMP_OS_WINDOWS
+# define KMP_OS_WINDOWS 1
+#endif
+
+#if ( defined __APPLE__ && defined __MACH__ )
+# undef KMP_OS_DARWIN
+# define KMP_OS_DARWIN 1
+#endif
+
+// in some ppc64 Linux installations only __linux__ is defined, not __linux
+#if ( defined __linux ) || ( defined __linux__ )
+# undef KMP_OS_LINUX
+# define KMP_OS_LINUX 1
+#endif
+
+#if ( defined __FreeBSD__ )
+# undef KMP_OS_FREEBSD
+# define KMP_OS_FREEBSD 1
+#endif
+
+#if ( defined __bgq__ )
+# undef KMP_OS_CNK
+# define KMP_OS_CNK 1
+#endif
+
+#if (1 != KMP_OS_LINUX + KMP_OS_FREEBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS)
+# error Unknown OS
+#endif
+
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_DARWIN
+# undef KMP_OS_UNIX
+# define KMP_OS_UNIX 1
+#endif
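+
+// These macros are always defined to 0 or 1, so consumers can test them with
+// plain #if; a hypothetical example:
+//
+//     #if KMP_OS_UNIX && !KMP_OS_DARWIN
+//     /* POSIX-specific code that excludes Darwin */
+//     #endif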
+
+/* ---------------------- Architecture recognition ------------------- */
+
+#define KMP_ARCH_X86        0
+#define KMP_ARCH_X86_64     0
+#define KMP_ARCH_AARCH64    0
+#define KMP_ARCH_PPC64_BE   0
+#define KMP_ARCH_PPC64_LE   0
+#define KMP_ARCH_PPC64 (KMP_ARCH_PPC64_LE || KMP_ARCH_PPC64_BE)
+
+#if KMP_OS_WINDOWS
+# if defined _M_AMD64
+#  undef KMP_ARCH_X86_64
+#  define KMP_ARCH_X86_64 1
+# else
+#  undef KMP_ARCH_X86
+#  define KMP_ARCH_X86 1
+# endif
+#endif
+
+#if KMP_OS_UNIX
+# if defined __x86_64
+#  undef KMP_ARCH_X86_64
+#  define KMP_ARCH_X86_64 1
+# elif defined __i386
+#  undef KMP_ARCH_X86
+#  define KMP_ARCH_X86 1
+# elif defined __powerpc64__
+#  if defined __LITTLE_ENDIAN__
+#   undef KMP_ARCH_PPC64_LE
+#   define KMP_ARCH_PPC64_LE 1
+#  else
+#   undef KMP_ARCH_PPC64_BE
+#   define KMP_ARCH_PPC64_BE 1
+#  endif
+# elif defined __aarch64__
+#  undef KMP_ARCH_AARCH64
+#  define KMP_ARCH_AARCH64 1
+# endif
+#endif
+
+#if defined(__ARM_ARCH_7__)   || defined(__ARM_ARCH_7R__)  || \
+    defined(__ARM_ARCH_7A__)
+# define KMP_ARCH_ARMV7 1
+#endif
+
+#if defined(KMP_ARCH_ARMV7)   || defined(__ARM_ARCH_6__)   || \
+    defined(__ARM_ARCH_6J__)  || defined(__ARM_ARCH_6K__)  || \
+    defined(__ARM_ARCH_6Z__)  || defined(__ARM_ARCH_6T2__) || \
+    defined(__ARM_ARCH_6ZK__)
+# define KMP_ARCH_ARMV6 1
+#endif
+
+#if defined(KMP_ARCH_ARMV6)   || defined(__ARM_ARCH_5T__)  || \
+    defined(__ARM_ARCH_5E__)  || defined(__ARM_ARCH_5TE__) || \
+    defined(__ARM_ARCH_5TEJ__)
+# define KMP_ARCH_ARMV5 1
+#endif
+
+#if defined(KMP_ARCH_ARMV5)   || defined(__ARM_ARCH_4__)   || \
+    defined(__ARM_ARCH_4T__)
+# define KMP_ARCH_ARMV4 1
+#endif
+
+#if defined(KMP_ARCH_ARMV4)   || defined(__ARM_ARCH_3__)   || \
+    defined(__ARM_ARCH_3M__)
+# define KMP_ARCH_ARMV3 1
+#endif
+
+#if defined(KMP_ARCH_ARMV3)   || defined(__ARM_ARCH_2__)
+# define KMP_ARCH_ARMV2 1
+#endif
+
+#if defined(KMP_ARCH_ARMV2)
+# define KMP_ARCH_ARM 1
+#endif
+
+// TODO: Fix me - this is clever, but really ugly
+#if (1 != KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 + KMP_ARCH_AARCH64)
+# error Unknown or unsupported architecture
+#endif
+
+#endif // KMP_PLATFORM_H
diff --git a/final/runtime/src/kmp_runtime.c b/final/runtime/src/kmp_runtime.c
new file mode 100644
index 0000000..df8ef6a
--- /dev/null
+++ b/final/runtime/src/kmp_runtime.c
@@ -0,0 +1,7675 @@
+/*
+ * kmp_runtime.c -- KPTS runtime support library
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_atomic.h"
+#include "kmp_wrapper_getpid.h"
+#include "kmp_environment.h"
+#include "kmp_itt.h"
+#include "kmp_str.h"
+#include "kmp_settings.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_error.h"
+#include "kmp_stats.h"
+#include "kmp_wait_release.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+/* these are temporary issues to be dealt with */
+#define KMP_USE_PRCTL 0
+#define KMP_USE_POOLED_ALLOC 0
+
+#if KMP_OS_WINDOWS
+#include <process.h>
+#endif
+
+
+#if defined(KMP_GOMP_COMPAT)
+char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
+#endif /* defined(KMP_GOMP_COMPAT) */
+
+char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
+#if OMP_40_ENABLED
+    "4.0 (201307)";
+#else
+    "3.1 (201107)";
+#endif
+
+#ifdef KMP_DEBUG
+char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
+#endif /* KMP_DEBUG */
+
+
+#define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+kmp_info_t __kmp_monitor;
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* Forward declarations */
+
+void __kmp_cleanup( void );
+
+static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
+static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
+#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
+static void __kmp_partition_places( kmp_team_t *team );
+#endif
+static void __kmp_do_serial_initialize( void );
+void __kmp_fork_barrier( int gtid, int tid );
+void __kmp_join_barrier( int gtid );
+void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
+
+
+#ifdef USE_LOAD_BALANCE
+static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
+#endif
+
+static int __kmp_expand_threads(int nWish, int nNeed);
+#if KMP_OS_WINDOWS
+static int __kmp_unregister_root_other_thread( int gtid );
+#endif
+static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
+static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
+static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* Calculate the identifier of the current thread */
+/* fast (and somewhat portable) way to get unique */
+/* identifier of executing thread.                */
+/* returns KMP_GTID_DNE if we haven't been assigned a gtid   */
+
+int
+__kmp_get_global_thread_id( )
+{
+    int i;
+    kmp_info_t   **other_threads;
+    size_t         stack_data;
+    char          *stack_addr;
+    size_t         stack_size;
+    char          *stack_base;
+
+    KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
+                      __kmp_nth, __kmp_all_nth ));
+
+    /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
+             parallel region, made it return KMP_GTID_DNE to force serial_initialize by
+             caller.  Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
+             __kmp_init_gtid for this to work.  */
+
+    if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
+
+#ifdef KMP_TDATA_GTID
+    if ( TCR_4(__kmp_gtid_mode) >= 3) {
+        KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
+        return __kmp_gtid;
+    }
+#endif
+    if ( TCR_4(__kmp_gtid_mode) >= 2) {
+        KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
+        return __kmp_gtid_get_specific();
+    }
+    KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
+
+    stack_addr    = (char*) & stack_data;
+    other_threads = __kmp_threads;
+
+    /*
+        ATT: The code below is a source of potential bugs due to unsynchronized access to
+        __kmp_threads array. For example:
+            1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
+            2. Current thread is suspended by OS.
+            3. Another thread unregisters and finishes (debug versions of free() may fill memory
+               with something like 0xEF).
+            4. Current thread is resumed.
+            5. Current thread reads junk from *thr.
+        TODO: Fix it.
+        --ln
+    */
+
+    for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
+
+        kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
+        if( !thr ) continue;
+
+        stack_size =  (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
+        stack_base =  (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
+
+        /* stack grows down -- search through all of the active threads */
+
+        if( stack_addr <= stack_base ) {
+            size_t stack_diff = stack_base - stack_addr;
+
+            if( stack_diff <= stack_size ) {
+                /* The only way we can be closer than the allocated */
+                /* stack size is if we are running on this thread. */
+                KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
+                return i;
+            }
+        }
+    }
+
+    /* get specific to try and determine our gtid */
+    KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
+                      "thread, using TLS\n" ));
+    i = __kmp_gtid_get_specific();
+
+    /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
+
+    /* if we haven't been assigned a gtid, return the error code */
+    if( i<0 ) return i;
+
+    /* dynamically updated stack window for uber threads to avoid get_specific call */
+    if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
+        KMP_FATAL( StackOverflow, i );
+    }
+
+    stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
+    if( stack_addr > stack_base ) {
+        TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
+        TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
+          other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
+    } else {
+        TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
+    }
+
+    /* Reprint stack bounds for ubermaster since they have been refined */
+    if ( __kmp_storage_map ) {
+        char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
+        char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
+        __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
+                                      other_threads[i]->th.th_info.ds.ds_stacksize,
+                                      "th_%d stack (refinement)", i );
+    }
+    return i;
+}
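+
+// The containment test above is an interval check: with a downward-growing
+// stack, thread i owns the address window [ stack_base - stack_size, stack_base ],
+// and `stack_addr <= stack_base && stack_base - stack_addr <= stack_size`
+// holds exactly when stack_addr falls inside that window.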
+
+int
+__kmp_get_global_thread_id_reg( )
+{
+    int gtid;
+
+    if ( !__kmp_init_serial ) {
+        gtid = KMP_GTID_DNE;
+    } else
+#ifdef KMP_TDATA_GTID
+    if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
+        KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
+        gtid = __kmp_gtid;
+    } else
+#endif
+    if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
+        KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
+        gtid = __kmp_gtid_get_specific();
+    } else {
+        KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
+        gtid = __kmp_get_global_thread_id();
+    }
+
+    /* we must be a new uber master sibling thread */
+    if( gtid == KMP_GTID_DNE ) {
+        KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
+                        "Registering a new gtid.\n" ));
+        __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
+        if( !__kmp_init_serial ) {
+            __kmp_do_serial_initialize();
+            gtid = __kmp_gtid_get_specific();
+        } else {
+            gtid = __kmp_register_root(FALSE);
+        }
+        __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+        /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
+    }
+
+    KMP_DEBUG_ASSERT( gtid >=0 );
+
+    return gtid;
+}
+
+/* caller must hold forkjoin_lock */
+void
+__kmp_check_stack_overlap( kmp_info_t *th )
+{
+    int f;
+    char *stack_beg = NULL;
+    char *stack_end = NULL;
+    int gtid;
+
+    KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
+    if ( __kmp_storage_map ) {
+        stack_end = (char *) th->th.th_info.ds.ds_stackbase;
+        stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
+
+        gtid = __kmp_gtid_from_thread( th );
+
+        if (gtid == KMP_GTID_MONITOR) {
+            __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
+                                     "th_%s stack (%s)", "mon",
+                                     ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
+        } else {
+            __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
+                                     "th_%d stack (%s)", gtid,
+                                     ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
+        }
+    }
+
+    /* No point in checking ubermaster threads since they use refinement and cannot overlap */
+    gtid = __kmp_gtid_from_thread( th );
+    if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
+    {
+        KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
+        if ( stack_beg == NULL ) {
+            stack_end = (char *) th->th.th_info.ds.ds_stackbase;
+            stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
+        }
+
+        for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
+            kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
+
+            if( f_th && f_th != th ) {
+                char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
+                char *other_stack_beg = other_stack_end -
+                                        (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
+                if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
+                   (stack_end > other_stack_beg && stack_end < other_stack_end)) {
+
+                    /* Print the other stack values before the abort */
+                    if ( __kmp_storage_map )
+                        __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
+                            (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
+                            "th_%d stack (overlapped)",
+                                                 __kmp_gtid_from_thread( f_th ) );
+
+                    __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
+                }
+            }
+        }
+    }
+    KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
+}
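+
+// The overlap test above fires when either endpoint of this thread's stack
+// falls strictly inside another thread's window. Worked example: stacks
+// [0x1000, 0x2000) and [0x1800, 0x2800) collide because stack_end == 0x2000
+// satisfies 0x1800 < 0x2000 < 0x2800.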
+
+
+/* ------------------------------------------------------------------------ */
+
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_infinite_loop( void )
+{
+    static int done = FALSE;
+
+    while (! done) {
+        KMP_YIELD( 1 );
+    }
+}
+
+#define MAX_MESSAGE     512
+
+void
+__kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
+    char buffer[MAX_MESSAGE];
+    va_list ap;
+
+    va_start( ap, format);
+    KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
+    __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
+    __kmp_vprintf( kmp_err, buffer, ap );
+#if KMP_PRINT_DATA_PLACEMENT
+    int node;
+    if(gtid >= 0) {
+        if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
+            if( __kmp_storage_map_verbose ) {
+                node = __kmp_get_host_node(p1);
+                if(node < 0)  /* doesn't work, so don't try this next time */
+                    __kmp_storage_map_verbose = FALSE;
+                else {
+                    char *last;
+                    int lastNode;
+                    int localProc = __kmp_get_cpu_from_gtid(gtid);
+
+                    p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
+                    p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
+                    if(localProc >= 0)
+                        __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid, localProc>>1);
+                    else
+                        __kmp_printf_no_lock("  GTID %d\n", gtid);
+# if KMP_USE_PRCTL
+/* The more elaborate format is disabled for now because of the prctl hanging bug. */
+                    do {
+                        last = p1;
+                        lastNode = node;
+                        /* This loop collates adjacent pages with the same host node. */
+                        do {
+                            p1 = (char*)p1 + PAGE_SIZE;
+                        } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
+                        __kmp_printf_no_lock("    %p-%p memNode %d\n", last,
+                                             (char*)p1 - 1, lastNode);
+                    } while(p1 <= p2);
+# else
+                    __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
+                                         (char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
+                    if(p1 < p2)  {
+                        __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
+                                             (char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
+                    }
+# endif
+                }
+            }
+        } else
+            __kmp_printf_no_lock("  %s\n", KMP_I18N_STR( StorageMapWarning ) );
+    }
+#endif /* KMP_PRINT_DATA_PLACEMENT */
+    __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
+}
+
+void
+__kmp_warn( char const * format, ... )
+{
+    char buffer[MAX_MESSAGE];
+    va_list ap;
+
+    if ( __kmp_generate_warnings == kmp_warnings_off ) {
+        return;
+    }
+
+    va_start( ap, format );
+
+    KMP_SNPRINTF( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
+    __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
+    __kmp_vprintf( kmp_err, buffer, ap );
+    __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
+
+    va_end( ap );
+}
+
+void
+__kmp_abort_process()
+{
+
+    // Later threads may stall here, but that's ok because abort() will kill them.
+    __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
+
+    if ( __kmp_debug_buf ) {
+        __kmp_dump_debug_buffer();
+    }; // if
+
+    if ( KMP_OS_WINDOWS ) {
+        // Let other threads know of abnormal termination and prevent deadlock
+        // if abort happened during library initialization or shutdown
+        __kmp_global.g.g_abort = SIGABRT;
+
+        /*
+            On Windows* OS, by default abort() causes a pop-up error box, which stalls nightly testing.
+            Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
+            works well, but this function is not available in VS7 (this is not a problem for a DLL, but
+            it is a problem for a static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does
+            not help, at least in some versions of the MS C RTL.
+
+            It seems the following sequence is the only way to simulate abort() and avoid the pop-up
+            error box.
+        */
+        raise( SIGABRT );
+        _exit( 3 );    // Just in case, if signal ignored, exit anyway.
+    } else {
+        abort();
+    }; // if
+
+    __kmp_infinite_loop();
+    __kmp_release_bootstrap_lock( & __kmp_exit_lock );
+
+} // __kmp_abort_process
+
+void
+__kmp_abort_thread( void )
+{
+    // TODO: Eliminate g_abort global variable and this function.
+    // In case of abort just call abort(), it will kill all the threads.
+    __kmp_infinite_loop();
+} // __kmp_abort_thread
+
+/* ------------------------------------------------------------------------ */
+
+/*
+ * Print out the storage map for the major kmp_info_t thread data structures
+ * that are allocated together.
+ */
+
+static void
+__kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
+{
+    __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
+
+    __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
+                             "th_%d.th_info", gtid );
+
+    __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
+                             "th_%d.th_local", gtid );
+
+    __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
+                             sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
+
+    __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
+                             &thr->th.th_bar[bs_plain_barrier+1],
+                             sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
+
+    __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
+                             &thr->th.th_bar[bs_forkjoin_barrier+1],
+                             sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
+
+    #if KMP_FAST_REDUCTION_BARRIER
+        __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
+                             &thr->th.th_bar[bs_reduction_barrier+1],
+                             sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
+    #endif // KMP_FAST_REDUCTION_BARRIER
+}
+
+/*
+ * Print out the storage map for the major kmp_team_t team data structures
+ * that are allocated together.
+ */
+
+static void
+__kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
+{
+    int num_disp_buff = team->t.t_max_nproc > 1 ? KMP_MAX_DISP_BUF : 2;
+    __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
+                             header, team_id );
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
+                             sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
+
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
+                             sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
+                             sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
+
+    #if KMP_FAST_REDUCTION_BARRIER
+        __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
+                             sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
+    #endif // KMP_FAST_REDUCTION_BARRIER
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
+                             sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
+                             sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
+                             sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
+                             header, team_id );
+
+    /*
+    __kmp_print_storage_map_gtid( -1, &team->t.t_set_nproc[0], &team->t.t_set_nproc[num_thr],
+                             sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_set_dynamic[0], &team->t.t_set_dynamic[num_thr],
+                             sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_set_nested[0], &team->t.t_set_nested[num_thr],
+                             sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_set_blocktime[0], &team->t.t_set_blocktime[num_thr],
+                             sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_intervals[0], &team->t.t_set_bt_intervals[num_thr],
+                             sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_set[0], &team->t.t_set_bt_set[num_thr],
+                             sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
+
+    //__kmp_print_storage_map_gtid( -1, &team->t.t_set_max_active_levels[0], &team->t.t_set_max_active_levels[num_thr],
+    //                        sizeof(int) * num_thr, "%s_%d.t_set_max_active_levels", header, team_id );
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_set_sched[0], &team->t.t_set_sched[num_thr],
+                             sizeof(kmp_r_sched_t) * num_thr, "%s_%d.t_set_sched", header, team_id );
+#if OMP_40_ENABLED
+    __kmp_print_storage_map_gtid( -1, &team->t.t_set_proc_bind[0], &team->t.t_set_proc_bind[num_thr],
+                             sizeof(kmp_proc_bind_t) * num_thr, "%s_%d.t_set_proc_bind", header, team_id );
+#endif
+    */
+
+    __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
+                             sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
+}
+
+static void __kmp_init_allocator() {}
+static void __kmp_fini_allocator() {}
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef KMP_DYNAMIC_LIB
+# if KMP_OS_WINDOWS
+
+
+static void
+__kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
+    // TODO: Change to __kmp_break_bootstrap_lock().
+    __kmp_init_bootstrap_lock( lck ); // make the lock released
+}
+
+static void
+__kmp_reset_locks_on_process_detach( int gtid_req ) {
+    int i;
+    int thread_count;
+
+    // PROCESS_DETACH is expected to be called by a thread
+    // that executes ProcessExit() or FreeLibrary().
+    // The OS terminates the other threads (except the one calling ProcessExit or FreeLibrary),
+    // so it might seem safe to access __kmp_threads[] without taking the forkjoin_lock.
+    // In fact, however, some threads can still be alive here, although they are about to be terminated.
+    // The entries in the array with ds_thread==0 are the most suspect.
+    // So it may actually not be safe to access __kmp_threads[].
+
+    // TODO: does it make sense to check __kmp_roots[] ?
+
+    // Let's check that there are no other alive threads registered with the OMP lib.
+    while( 1 ) {
+        thread_count = 0;
+        for( i = 0; i < __kmp_threads_capacity; ++i ) {
+            if( !__kmp_threads ) continue;
+            kmp_info_t* th = __kmp_threads[ i ];
+            if( th == NULL ) continue;
+            int gtid = th->th.th_info.ds.ds_gtid;
+            if( gtid == gtid_req ) continue;
+            if( gtid < 0 ) continue;
+            DWORD exit_val;
+            int alive = __kmp_is_thread_alive( th, &exit_val );
+            if( alive ) {
+                ++thread_count;
+            }
+        }
+        if( thread_count == 0 ) break; // success
+    }
+
+    // Assume that I'm alone.
+
+    // Now it is probably safe to check and reset the locks.
+    // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
+    __kmp_reset_lock( &__kmp_forkjoin_lock );
+    #ifdef KMP_DEBUG
+    __kmp_reset_lock( &__kmp_stdio_lock );
+    #endif // KMP_DEBUG
+
+
+}
+
+BOOL WINAPI
+DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
+    //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
+
+    switch( fdwReason ) {
+
+        case DLL_PROCESS_ATTACH:
+            KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
+
+            return TRUE;
+
+        case DLL_PROCESS_DETACH:
+            KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
+                        __kmp_gtid_get_specific() ));
+
+            if( lpReserved != NULL )
+            {
+                // lpReserved is used for telling the difference:
+                //  lpReserved == NULL when FreeLibrary() was called,
+                //  lpReserved != NULL when the process terminates.
+                // When FreeLibrary() is called, worker threads remain alive.
+                // So they will release the forkjoin lock by themselves.
+                // When the process terminates, worker threads disappear triggering
+                // the problem of unreleased forkjoin lock as described below.
+
+                // A worker thread can take the forkjoin lock
+                // in __kmp_suspend_template()->__kmp_rml_decrease_load_before_sleep().
+                // The problem comes up if that worker thread becomes dead
+                // before it releases the forkjoin lock.
+                // The forkjoin lock remains taken, while the thread
+                // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
+                // will try to take the forkjoin lock and will always fail,
+                // so that the application will never finish [normally].
+                // This scenario is possible if __kmpc_end() has not been executed.
+                // This is not a corner case; it arises in common situations:
+                // - the main function was compiled by an alternative compiler;
+                // - the main function was compiled by icl but without /Qopenmp (application with plugins);
+                // - the application terminates by calling C exit(), Fortran CALL EXIT(), or Fortran STOP;
+                // - an alive foreign thread prevented __kmpc_end from doing the cleanup.
+
+                // This is a hack to work around the problem.
+                // TODO: !!! figure out something better.
+                __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
+            }
+
+            __kmp_internal_end_library( __kmp_gtid_get_specific() );
+
+            return TRUE;
+
+        case DLL_THREAD_ATTACH:
+            KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
+
+            /* If we wanted to register new siblings all the time,
+             * we would call __kmp_get_gtid() here. */
+            return TRUE;
+
+        case DLL_THREAD_DETACH:
+            KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
+                        __kmp_gtid_get_specific() ));
+
+            __kmp_internal_end_thread( __kmp_gtid_get_specific() );
+            return TRUE;
+    }
+
+    return TRUE;
+}
+
+# endif /* KMP_OS_WINDOWS */
+#endif /* KMP_DYNAMIC_LIB */
+
+
+/* ------------------------------------------------------------------------ */
+
+/* Change the library type to "status" and return the old type */
+/* called from within initialization routines where __kmp_initz_lock is held */
+int
+__kmp_change_library( int status )
+{
+    int old_status;
+
+    old_status = __kmp_yield_init & 1;  // check whether KMP_LIBRARY=throughput (even init count)
+
+    if (status) {
+        __kmp_yield_init |= 1;  // throughput => turnaround (odd init count)
+    }
+    else {
+        __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
+    }
+
+    return old_status;  // previous mode: 1 if turnaround (odd init count), 0 if throughput (even)
+}
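+
+// Hypothetical call sequence: because the low bit of __kmp_yield_init carries
+// the mode, the returned value restores it exactly:
+//
+//     int old = __kmp_change_library( 1 );   // switch to turnaround (odd init count)
+//     /* ... code that should spin rather than yield ... */
+//     __kmp_change_library( old );           // restore the previous mode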
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* __kmp_parallel_deo --
+ * Wait until it's our turn.
+ */
+void
+__kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
+{
+    int gtid = *gtid_ref;
+#ifdef BUILD_PARALLEL_ORDERED
+    kmp_team_t *team = __kmp_team_from_gtid( gtid );
+#endif /* BUILD_PARALLEL_ORDERED */
+
+    if( __kmp_env_consistency_check ) {
+        if( __kmp_threads[gtid]->th.th_root->r.r_active )
+#if KMP_USE_DYNAMIC_LOCK
+            __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
+#else
+            __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
+#endif
+    }
+#ifdef BUILD_PARALLEL_ORDERED
+    if( !team->t.t_serialized ) {
+        KMP_MB();
+        KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
+        KMP_MB();
+    }
+#endif /* BUILD_PARALLEL_ORDERED */
+}
+
+/* __kmp_parallel_dxo --
+ * Signal the next task.
+ */
+
+void
+__kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
+{
+    int gtid = *gtid_ref;
+#ifdef BUILD_PARALLEL_ORDERED
+    int tid =  __kmp_tid_from_gtid( gtid );
+    kmp_team_t *team = __kmp_team_from_gtid( gtid );
+#endif /* BUILD_PARALLEL_ORDERED */
+
+    if( __kmp_env_consistency_check ) {
+        if( __kmp_threads[gtid]->th.th_root->r.r_active )
+            __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
+    }
+#ifdef BUILD_PARALLEL_ORDERED
+    if ( ! team->t.t_serialized ) {
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+        /* use the tid of the next thread in this team */
+        /* TODO: replace with a general release procedure */
+        team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
+
+#if OMPT_SUPPORT && OMPT_BLAME
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
+            /* accept blame for "ordered" waiting */
+            kmp_info_t *this_thread = __kmp_threads[gtid];
+            ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
+                this_thread->th.ompt_thread_info.wait_id);
+        }
+#endif
+
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+    }
+#endif /* BUILD_PARALLEL_ORDERED */
+}
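+
+// Taken together, __kmp_parallel_deo and __kmp_parallel_dxo implement a
+// token-passing protocol on team->t.t_ordered.dt.t_value: a thread spins in
+// deo until the token equals its own tid, runs the ordered body, and dxo then
+// hands the token to (tid + 1) % nproc. A sketch of the call shape
+// (hypothetical surrounding loop):
+//
+//     __kmp_parallel_deo( &gtid, &cid, loc );   // wait until token == my tid
+//     /* ... body of the ordered region ... */
+//     __kmp_parallel_dxo( &gtid, &cid, loc );   // pass the token to the next tid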
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* The BARRIER for a SINGLE process section is always explicit   */
+
+int
+__kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
+{
+    int status;
+    kmp_info_t *th;
+    kmp_team_t *team;
+
+    if( ! TCR_4(__kmp_init_parallel) )
+        __kmp_parallel_initialize();
+
+    th   = __kmp_threads[ gtid ];
+    team = th->th.th_team;
+    status = 0;
+
+    th->th.th_ident = id_ref;
+
+    if ( team->t.t_serialized ) {
+        status = 1;
+    } else {
+        kmp_int32 old_this = th->th.th_local.this_construct;
+
+        ++th->th.th_local.this_construct;
+        /* try to set team count to thread count--success means thread got the
+           single block
+        */
+        /* TODO: Should this be acquire or release? */
+        status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
+                                             th->th.th_local.this_construct);
+#if USE_ITT_BUILD
+        if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
+#if OMP_40_ENABLED
+            th->th.th_teams_microtask == NULL &&
+#endif
+            team->t.t_active_level == 1 )
+        {   // Only report metadata by master of active team at level 1
+            __kmp_itt_metadata_single( id_ref );
+        }
+#endif /* USE_ITT_BUILD */
+    }
+
+    if( __kmp_env_consistency_check ) {
+        if (status && push_ws) {
+            __kmp_push_workshare( gtid, ct_psingle, id_ref );
+        } else {
+            __kmp_check_workshare( gtid, ct_psingle, id_ref );
+        }
+    }
+#if USE_ITT_BUILD
+    if ( status ) {
+        __kmp_itt_single_start( gtid );
+    }
+#endif /* USE_ITT_BUILD */
+    return status;
+}
+
+void
+__kmp_exit_single( int gtid )
+{
+#if USE_ITT_BUILD
+    __kmp_itt_single_end( gtid );
+#endif /* USE_ITT_BUILD */
+    if( __kmp_env_consistency_check )
+        __kmp_pop_workshare( gtid, ct_psingle, NULL );
+}
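+
+// Call-site sketch (hypothetical): exactly one thread of the team wins the
+// compare-and-store inside __kmp_enter_single and executes the block, and only
+// that winner calls __kmp_exit_single:
+//
+//     if ( __kmp_enter_single( gtid, loc, TRUE ) ) {
+//         /* ... body of the single construct ... */
+//         __kmp_exit_single( gtid );
+//     }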
+
+
+/*
+ * Determine whether we can go parallel or must use a serialized parallel region,
+ * and how many threads we can use.
+ * set_nthreads is the number of threads requested for the team.
+ * Returns 1 if we should serialize or only use one thread,
+ * otherwise the number of threads to use.
+ * The forkjoin lock is held by the caller.
+ */
+static int
+__kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
+   int master_tid, int set_nthreads
+#if OMP_40_ENABLED
+  , int enter_teams
+#endif /* OMP_40_ENABLED */
+)
+{
+    int capacity;
+    int new_nthreads;
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    KMP_DEBUG_ASSERT( root && parent_team );
+
+    //
+    // Initial check to see if we should use a serialized team.
+    //
+    if ( set_nthreads == 1 ) {
+        KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d reserving 1 thread; requested %d threads\n",
+                        __kmp_get_gtid(), set_nthreads ));
+        return 1;
+    }
+    if ( ( !get__nested_2(parent_team,master_tid) && (root->r.r_in_parallel
+#if OMP_40_ENABLED
+       && !enter_teams
+#endif /* OMP_40_ENABLED */
+       ) ) || ( __kmp_library == library_serial ) ) {
+        KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team; requested %d threads\n",
+                        __kmp_get_gtid(), set_nthreads ));
+        return 1;
+    }
+
+    //
+    // If dyn-var is set, dynamically adjust the number of desired threads,
+    // according to the method specified by dynamic_mode.
+    //
+    new_nthreads = set_nthreads;
+    if ( ! get__dynamic_2( parent_team, master_tid ) ) {
+        ;
+    }
+#ifdef USE_LOAD_BALANCE
+    else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
+        new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
+        if ( new_nthreads == 1 ) {
+            KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
+              master_tid ));
+            return 1;
+        }
+        if ( new_nthreads < set_nthreads ) {
+            KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
+              master_tid, new_nthreads ));
+        }
+    }
+#endif /* USE_LOAD_BALANCE */
+    else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
+        new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
+          : root->r.r_hot_team->t.t_nproc);
+        if ( new_nthreads <= 1 ) {
+            KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
+              master_tid ));
+            return 1;
+        }
+        if ( new_nthreads < set_nthreads ) {
+            KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
+              master_tid, new_nthreads ));
+        }
+        else {
+            new_nthreads = set_nthreads;
+        }
+    }
+    else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
+        if ( set_nthreads > 2 ) {
+            new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
+            new_nthreads = ( new_nthreads % set_nthreads ) + 1;
+            if ( new_nthreads == 1 ) {
+                KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
+                  master_tid ));
+                return 1;
+            }
+            if ( new_nthreads < set_nthreads ) {
+                KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
+                  master_tid, new_nthreads ));
+            }
+        }
+    }
+    else {
+        KMP_ASSERT( 0 );
+    }
+
+    //
+    // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
+    //
+    if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
+      root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
+        int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
+          root->r.r_hot_team->t.t_nproc );
+        if ( tl_nthreads <= 0 ) {
+            tl_nthreads = 1;
+        }
+
+        //
+        // If dyn-var is false, emit a 1-time warning.
+        //
+        if ( ! get__dynamic_2( parent_team, master_tid )
+          && ( ! __kmp_reserve_warn ) ) {
+            __kmp_reserve_warn = 1;
+            __kmp_msg(
+                kmp_ms_warning,
+                KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
+                KMP_HNT( Unset_ALL_THREADS ),
+                __kmp_msg_null
+            );
+        }
+        if ( tl_nthreads == 1 ) {
+            KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
+              master_tid ));
+            return 1;
+        }
+        KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
+          master_tid, tl_nthreads ));
+        new_nthreads = tl_nthreads;
+    }
+
+
+    //
+    // Check if the threads array is large enough, or needs expanding.
+    //
+    // See comment in __kmp_register_root() about the adjustment if
+    // __kmp_threads[0] == NULL.
+    //
+    capacity = __kmp_threads_capacity;
+    if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
+        --capacity;
+    }
+    if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
+      root->r.r_hot_team->t.t_nproc ) > capacity ) {
+        //
+        // Expand the threads array.
+        //
+        int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
+          root->r.r_hot_team->t.t_nproc ) - capacity;
+        int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
+        if ( slotsAdded < slotsRequired ) {
+            //
+            // The threads array was not expanded enough.
+            //
+            new_nthreads -= ( slotsRequired - slotsAdded );
+            KMP_ASSERT( new_nthreads >= 1 );
+
+            //
+            // If dyn-var is false, emit a 1-time warning.
+            //
+            if ( ! get__dynamic_2( parent_team, master_tid )
+              && ( ! __kmp_reserve_warn ) ) {
+                __kmp_reserve_warn = 1;
+                if ( __kmp_tp_cached ) {
+                    __kmp_msg(
+                        kmp_ms_warning,
+                        KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
+                        KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
+                        KMP_HNT( PossibleSystemLimitOnThreads ),
+                        __kmp_msg_null
+                    );
+                }
+                else {
+                    __kmp_msg(
+                        kmp_ms_warning,
+                        KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
+                        KMP_HNT( SystemLimitOnThreads ),
+                        __kmp_msg_null
+                    );
+                }
+            }
+        }
+    }
+
+    if ( new_nthreads == 1 ) {
+        KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
+                        __kmp_get_gtid(), set_nthreads ) );
+        return 1;
+    }
+
+    KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
+                    __kmp_get_gtid(), new_nthreads, set_nthreads ));
+    return new_nthreads;
+}
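+
+/* Illustrative sketch (commentary only): a request can be trimmed by several
+ * of the checks above in sequence.  For example, assuming 8 available procs,
+ * dyn-var set via OMP_DYNAMIC=true, and dynamic_mode == dynamic_thread_limit:
+ *
+ *     omp_set_num_threads( 16 );
+ *     #pragma omp parallel        // set_nthreads == 16
+ *     { ... }
+ *
+ * new_nthreads is first clipped to the free processors, then to __kmp_max_nth,
+ * and finally to the remaining capacity of __kmp_threads before the team is
+ * formed. */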
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* allocate threads from the thread pool and assign them to the new team */
+/* we are assured that there are enough threads available, because we
+ * checked earlier while holding the forkjoin lock */
+
+static void
+__kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
+                         kmp_info_t *master_th, int master_gtid )
+{
+    int         i;
+    int use_hot_team;
+
+    KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
+    KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
+    KMP_MB();
+
+    /* first, let's setup the master thread */
+    master_th->th.th_info.ds.ds_tid  = 0;
+    master_th->th.th_team            = team;
+    master_th->th.th_team_nproc      = team->t.t_nproc;
+    master_th->th.th_team_master     = master_th;
+    master_th->th.th_team_serialized = FALSE;
+    master_th->th.th_dispatch        = & team->t.t_dispatch[ 0 ];
+
+    /* make sure we are not the optimized hot team */
+#if KMP_NESTED_HOT_TEAMS
+    use_hot_team = 0;
+    kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
+    if( hot_teams ) {  // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
+        int level = team->t.t_active_level - 1;    // index in array of hot teams
+        if( master_th->th.th_teams_microtask ) {    // are we inside the teams?
+            if( master_th->th.th_teams_size.nteams > 1 ) {
+                ++level; // level was not increased in teams construct for team_of_masters
+            }
+            if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
+                master_th->th.th_teams_level == team->t.t_level ) {
+                ++level; // level was not increased in teams construct for team_of_workers before the parallel
+            }            // team->t.t_level will be increased inside parallel
+        }
+        if( level < __kmp_hot_teams_max_level ) {
+            if( hot_teams[level].hot_team ) {
+                // hot team has already been allocated for given level
+                KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
+                use_hot_team = 1; // the team is ready to use
+            } else {
+                use_hot_team = 0; // AC: threads are not allocated yet
+                hot_teams[level].hot_team = team; // remember new hot team
+                hot_teams[level].hot_team_nth = team->t.t_nproc;
+            }
+        } else {
+            use_hot_team = 0;
+        }
+    }
+#else
+    use_hot_team = team == root->r.r_hot_team;
+#endif
+    if ( !use_hot_team ) {
+
+        /* install the master thread */
+        team->t.t_threads[ 0 ]    = master_th;
+        __kmp_initialize_info( master_th, team, 0, master_gtid );
+
+        /* now, install the worker threads */
+        for ( i=1 ;  i < team->t.t_nproc ; i++ ) {
+
+            /* fork or reallocate a new thread and install it in team */
+            kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
+            team->t.t_threads[ i ] = thr;
+            KMP_DEBUG_ASSERT( thr );
+            KMP_DEBUG_ASSERT( thr->th.th_team == team );
+            /* align team and thread arrived states */
+            KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%u, plain=%u\n",
+                            __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
+                            __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
+                            team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
+                            team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
+#if OMP_40_ENABLED
+            thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
+            thr->th.th_teams_level     = master_th->th.th_teams_level;
+            thr->th.th_teams_size      = master_th->th.th_teams_size;
+#endif
+            { // Initialize threads' barrier data.
+                int b;
+                kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
+                for ( b = 0; b < bs_last_barrier; ++ b ) {
+                    balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
+                    KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
+#if USE_DEBUGGER
+                    balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
+#endif
+                } // for b
+            }
+        }
+
+#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
+        __kmp_partition_places( team );
+#endif
+
+    }
+
+    KMP_MB();
+}
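+
+/* Illustrative note (commentary only): with KMP_NESTED_HOT_TEAMS, setting
+ * e.g. KMP_HOT_TEAMS_MAX_LEVEL=2 (the env var behind __kmp_hot_teams_max_level)
+ * keeps teams at nesting levels 0 and 1 "hot", so a repeatedly executed
+ * nested region such as
+ *
+ *     #pragma omp parallel num_threads(4)
+ *     #pragma omp parallel num_threads(2)
+ *     { ... }
+ *
+ * takes the use_hot_team path above on later invocations instead of
+ * re-allocating its worker threads each time. */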
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+//
+// Propagate any changes to the floating point control registers out to the team.
+// We try to avoid unnecessary writes to the relevant cache line in the team structure,
+// so we don't make changes unless they are needed.
+//
+inline static void
+propagateFPControl(kmp_team_t * team)
+{
+    if ( __kmp_inherit_fp_control ) {
+        kmp_int16 x87_fpu_control_word;
+        kmp_uint32 mxcsr;
+
+        // Get master values of FPU control flags (both X87 and vector)
+        __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
+        __kmp_store_mxcsr( &mxcsr );
+        mxcsr &= KMP_X86_MXCSR_MASK;
+
+        // There is no point looking at t_fp_control_saved here.
+        // If it is TRUE, we still have to update the values if they are different from those we now have.
+        // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
+        // that the values in the team are the same as those we have.
+        // So, this code achieves what we need whether or not t_fp_control_saved is true.
+        // By checking whether the value needs updating we avoid unnecessary writes that would put the
+        // cache-line into a written state, causing all threads in the team to have to read it again.
+        if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
+            team->t.t_x87_fpu_control_word = x87_fpu_control_word;
+        }
+        if ( team->t.t_mxcsr != mxcsr ) {
+            team->t.t_mxcsr = mxcsr;
+        }
+        // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
+        // So we must ensure it is correct.
+        if (!team->t.t_fp_control_saved) {
+            team->t.t_fp_control_saved = TRUE;
+        }
+    }
+    else {
+        // Similarly here. Don't write to this cache-line in the team structure unless we have to.
+        if (team->t.t_fp_control_saved)
+            team->t.t_fp_control_saved = FALSE;
+    }
+}
+
+// Do the opposite, setting the hardware registers to the updated values from the team.
+inline static void
+updateHWFPControl(kmp_team_t * team)
+{
+    if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
+        //
+        // Only reset the fp control regs if they have been changed in the team
+        // during the parallel region that we are exiting.
+        //
+        kmp_int16 x87_fpu_control_word;
+        kmp_uint32 mxcsr;
+        __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
+        __kmp_store_mxcsr( &mxcsr );
+        mxcsr &= KMP_X86_MXCSR_MASK;
+
+        if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
+            __kmp_clear_x87_fpu_status_word();
+            __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
+        }
+
+        if ( team->t.t_mxcsr != mxcsr ) {
+            __kmp_load_mxcsr( &team->t.t_mxcsr );
+        }
+    }
+}
+#else
+# define propagateFPControl(x) ((void)0)
+# define updateHWFPControl(x)  ((void)0)
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
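+
+/* Illustrative sketch (commentary only): when FP control inheritance is
+ * enabled (KMP_INHERIT_FP_CONTROL), a master that alters the FP environment
+ * expects the team to observe the same settings, e.g.:
+ *
+ *     #include <fenv.h>
+ *     fesetround( FE_TOWARDZERO );   // master changes the rounding mode
+ *     #pragma omp parallel
+ *     { ... }
+ *
+ * propagateFPControl() records the master's x87/MXCSR state in the team, and
+ * updateHWFPControl() loads it back into the FP hardware where needed. */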
+
+static void
+__kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
+
+/*
+ * Run a parallel region that has been serialized, so it runs in a team consisting of only the single master thread.
+ */
+void
+__kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
+{
+    kmp_info_t *this_thr;
+    kmp_team_t *serial_team;
+
+    KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
+
+    /* Skip all this code for autopar serialized loops since it results in
+       unacceptable overhead */
+    if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
+        return;
+
+    if( ! TCR_4( __kmp_init_parallel ) )
+        __kmp_parallel_initialize();
+
+    this_thr     = __kmp_threads[ global_tid ];
+    serial_team  = this_thr->th.th_serial_team;
+
+    /* utilize the serialized team held by this thread */
+    KMP_DEBUG_ASSERT( serial_team );
+    KMP_MB();
+
+    if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+        KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
+        KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
+        KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
+                        global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
+        this_thr->th.th_task_team = NULL;
+    }
+
+#if OMP_40_ENABLED
+    kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
+    if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
+        proc_bind = proc_bind_false;
+    }
+    else if ( proc_bind == proc_bind_default ) {
+        //
+        // No proc_bind clause was specified, so use the current value
+        // of proc-bind-var for this parallel region.
+        //
+        proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
+    }
+    //
+    // Reset for next parallel region
+    //
+    this_thr->th.th_set_proc_bind = proc_bind_default;
+#endif /* OMP_40_ENABLED */
+
+    if( this_thr->th.th_team != serial_team ) {
+        // Nested level will be an index in the nested nthreads array
+        int level = this_thr->th.th_team->t.t_level;
+
+        if( serial_team->t.t_serialized ) {
+            /* this serial team was already used
+             * TODO increase performance by making these locks more specific */
+            kmp_team_t *new_team;
+
+            __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
+
+#if OMPT_SUPPORT
+            ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
+#endif
+
+            new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
+#if OMPT_SUPPORT
+                                           ompt_parallel_id,
+#endif
+#if OMP_40_ENABLED
+                                           proc_bind,
+#endif
+                                           & this_thr->th.th_current_task->td_icvs,
+                                           0 USE_NESTED_HOT_ARG(NULL) );
+            __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
+            KMP_ASSERT( new_team );
+
+            /* setup new serialized team and install it */
+            new_team->t.t_threads[0] = this_thr;
+            new_team->t.t_parent = this_thr->th.th_team;
+            serial_team = new_team;
+            this_thr->th.th_serial_team = serial_team;
+
+            KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
+                            global_tid, serial_team ) );
+
+
+            /* TODO the above breaks the requirement that if we run out of
+             * resources, then we can still guarantee that serialized teams
+             * are ok, since we may need to allocate a new one */
+        } else {
+            KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
+                            global_tid, serial_team ) );
+        }
+
+        /* we have to initialize this serial team */
+        KMP_DEBUG_ASSERT( serial_team->t.t_threads );
+        KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
+        KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
+        serial_team->t.t_ident         = loc;
+        serial_team->t.t_serialized    = 1;
+        serial_team->t.t_nproc         = 1;
+        serial_team->t.t_parent        = this_thr->th.th_team;
+        serial_team->t.t_sched         = this_thr->th.th_team->t.t_sched;
+        this_thr->th.th_team           = serial_team;
+        serial_team->t.t_master_tid    = this_thr->th.th_info.ds.ds_tid;
+
+        KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
+                        global_tid, this_thr->th.th_current_task ) );
+        KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
+        this_thr->th.th_current_task->td_flags.executing = 0;
+
+        __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
+
+        /* TODO: GEH: do the ICVs work for nested serialized teams?  Don't we need an implicit task for
+           each serialized task represented by team->t.t_serialized? */
+        copy_icvs(
+                  & this_thr->th.th_current_task->td_icvs,
+                  & this_thr->th.th_current_task->td_parent->td_icvs );
+
+        // Thread value exists in the nested nthreads array for the next nested level
+        if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
+            this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
+        }
+
+#if OMP_40_ENABLED
+        if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
+            this_thr->th.th_current_task->td_icvs.proc_bind
+                = __kmp_nested_proc_bind.bind_types[ level + 1 ];
+        }
+#endif /* OMP_40_ENABLED */
+
+#if USE_DEBUGGER
+        serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
+#endif
+        this_thr->th.th_info.ds.ds_tid = 0;
+
+        /* set thread cache values */
+        this_thr->th.th_team_nproc     = 1;
+        this_thr->th.th_team_master    = this_thr;
+        this_thr->th.th_team_serialized = 1;
+
+        serial_team->t.t_level        = serial_team->t.t_parent->t.t_level + 1;
+        serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
+
+        propagateFPControl (serial_team);
+
+        /* check if we need to allocate dispatch buffers stack */
+        KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
+        if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
+            serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
+                __kmp_allocate( sizeof( dispatch_private_info_t ) );
+        }
+        this_thr->th.th_dispatch = serial_team->t.t_dispatch;
+
+#if OMPT_SUPPORT
+        ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
+        __ompt_team_assign_id(serial_team, ompt_parallel_id);
+#endif
+
+        KMP_MB();
+
+    } else {
+        /* this serialized team is already being used,
+         * that's fine, just add another nested level */
+        KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
+        KMP_DEBUG_ASSERT( serial_team->t.t_threads );
+        KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
+        ++ serial_team->t.t_serialized;
+        this_thr->th.th_team_serialized = serial_team->t.t_serialized;
+
+        // Nested level will be an index in the nested nthreads array
+        int level = this_thr->th.th_team->t.t_level;
+        // Thread value exists in the nested nthreads array for the next nested level
+        if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
+            this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
+        }
+        serial_team->t.t_level++;
+        KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
+                        global_tid, serial_team, serial_team->t.t_level ) );
+
+        /* allocate/push dispatch buffers stack */
+        KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
+        {
+            dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
+                __kmp_allocate( sizeof( dispatch_private_info_t ) );
+            disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
+            serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
+        }
+        this_thr->th.th_dispatch = serial_team->t.t_dispatch;
+
+        KMP_MB();
+    }
+
+    if ( __kmp_env_consistency_check )
+        __kmp_push_parallel( global_tid, NULL );
+
+#if USE_ITT_BUILD
+    // Mark the start of the "parallel" region for VTune. Only one frame notification scheme is used at a time.
+    if ( serial_team->t.t_level == 1
+#if OMP_40_ENABLED
+        && this_thr->th.th_teams_microtask == NULL
+#endif
+    ) {
+#if USE_ITT_NOTIFY
+        // Save the start of the "parallel" region for VTune; this timestamp also marks the frame begin.
+        if ( ( __itt_get_timestamp_ptr || KMP_ITT_DEBUG ) &&
+            ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
+        {
+             serial_team->t.t_region_time = this_thr->th.th_frame_time_serialized = __itt_get_timestamp();
+        } else // only one notification scheme (either "submit" or "forking/joined", not both)
+#endif
+        if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
+             __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode )
+        {
+            this_thr->th.th_ident = loc;
+            // 0 - no barriers; 1 - serialized parallel
+            __kmp_itt_region_forking( global_tid, this_thr->th.th_team_nproc, 0, 1 );
+        }
+    }
+#endif /* USE_ITT_BUILD */
+}
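+
+/* Illustrative sketch (commentary only): serialized regions nest by reusing
+ * one serial team per thread and counting levels in t_serialized.  With
+ * nested parallelism disabled:
+ *
+ *     #pragma omp parallel        // real team
+ *     #pragma omp parallel        // serialized: t_serialized == 1
+ *     #pragma omp parallel        // serialized again: t_serialized == 2
+ *     { ... }
+ *
+ * Each extra level pushes a dispatch buffer, which
+ * __kmpc_end_serialized_parallel pops again on the way out. */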
+
+/* most of the work for a fork */
+/* return true if we really went parallel, false if serialized */
+int
+__kmp_fork_call(
+    ident_t   * loc,
+    int         gtid,
+    enum fork_context_e  call_context, // Intel, GNU, ...
+    kmp_int32   argc,
+#if OMPT_SUPPORT
+    void       *unwrapped_task,
+#endif
+    microtask_t microtask,
+    launch_t    invoker,
+/* TODO: revert workaround for Intel(R) 64 tracker #96 */
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+    va_list   * ap
+#else
+    va_list     ap
+#endif
+    )
+{
+    void          **argv;
+    int             i;
+    int             master_tid;
+    int             master_this_cons;
+    kmp_team_t     *team;
+    kmp_team_t     *parent_team;
+    kmp_info_t     *master_th;
+    kmp_root_t     *root;
+    int             nthreads;
+    int             master_active;
+    int             master_set_numthreads;
+    int             level;
+#if OMP_40_ENABLED
+    int             active_level;
+    int             teams_level;
+#endif
+#if KMP_NESTED_HOT_TEAMS
+    kmp_hot_team_ptr_t **p_hot_teams;
+#endif
+    { // KMP_TIME_BLOCK
+    KMP_TIME_BLOCK(KMP_fork_call);
+
+    KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
+    if ( __kmp_stkpadding > 0 &&  __kmp_root[gtid] != NULL ) {
+        /* Some systems prefer the stack for the root thread(s) to start with */
+        /* some gap from the parent stack to prevent false sharing. */
+        void *dummy = KMP_ALLOCA(__kmp_stkpadding);
+        /* These 2 lines below are so this does not get optimized out */
+        if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
+            __kmp_stkpadding += (short)((kmp_int64)dummy);
+    }
+
+    /* initialize if needed */
+    KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
+    if( ! TCR_4(__kmp_init_parallel) )
+        __kmp_parallel_initialize();
+
+    /* setup current data */
+    master_th     = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
+    parent_team   = master_th->th.th_team;
+    master_tid    = master_th->th.th_info.ds.ds_tid;
+    master_this_cons = master_th->th.th_local.this_construct;
+    root          = master_th->th.th_root;
+    master_active = root->r.r_active;
+    master_set_numthreads = master_th->th.th_set_nproc;
+
+#if OMPT_SUPPORT
+    ompt_parallel_id_t ompt_parallel_id;
+    ompt_task_id_t ompt_task_id;
+    ompt_frame_t *ompt_frame;
+    ompt_task_id_t my_task_id;
+    ompt_parallel_id_t my_parallel_id;
+
+    if (ompt_status & ompt_status_track) {
+        ompt_parallel_id = __ompt_parallel_id_new(gtid);
+        ompt_task_id = __ompt_get_task_id_internal(0);
+        ompt_frame = __ompt_get_task_frame_internal(0);
+    }
+#endif
+
+    // Nested level will be an index in the nested nthreads array
+    level         = parent_team->t.t_level;
+#if OMP_40_ENABLED
+    active_level  = parent_team->t.t_active_level; // is used to launch non-serial teams even if nested is not allowed
+    teams_level   = master_th->th.th_teams_level;  // needed to check nesting inside the teams
+#endif
+#if KMP_NESTED_HOT_TEAMS
+    p_hot_teams   = &master_th->th.th_hot_teams;
+    if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
+        *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
+                sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
+        (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
+        (*p_hot_teams)[0].hot_team_nth = 1; // either the actual value, or not needed (when active_level > 0)
+    }
+#endif
+
+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
+        int team_size = master_set_numthreads;
+
+        ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
+            ompt_task_id, ompt_frame, ompt_parallel_id,
+            team_size, unwrapped_task);
+    }
+#endif
+
+    master_th->th.th_ident = loc;
+
+#if OMP_40_ENABLED
+    if ( master_th->th.th_teams_microtask &&
+         ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
+        // AC: This is start of parallel that is nested inside teams construct.
+        //     The team is actual (hot), all workers are ready at the fork barrier.
+        //     No lock needed to initialize the team a bit, then free workers.
+        parent_team->t.t_ident = loc;
+        parent_team->t.t_argc  = argc;
+        argv = (void**)parent_team->t.t_argv;
+        for( i=argc-1; i >= 0; --i )
+/* TODO: revert workaround for Intel(R) 64 tracker #96 */
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+            *argv++ = va_arg( *ap, void * );
+#else
+            *argv++ = va_arg( ap, void * );
+#endif
+        /* Increment our nested depth levels, but do not increase the serialization count */
+        if ( parent_team == master_th->th.th_serial_team ) {
+            // AC: we are in serialized parallel
+            __kmpc_serialized_parallel(loc, gtid);
+            KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
+            parent_team->t.t_serialized--; // AC: needed so that enquiry functions
+                                           //     work correctly; will restore at join time
+
+#if OMPT_SUPPORT
+            void *dummy;
+            void **exit_runtime_p;
+
+            ompt_lw_taskteam_t lw_taskteam;
+
+            if (ompt_status & ompt_status_track) {
+                __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                    unwrapped_task, ompt_parallel_id);
+                lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
+                exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
+
+                __ompt_lw_taskteam_link(&lw_taskteam, master_th);
+
+#if OMPT_TRACE
+                /* OMPT implicit task begin */
+                my_task_id = lw_taskteam.ompt_task_info.task_id;
+                my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                        my_parallel_id, my_task_id);
+                }
+#endif
+
+                /* OMPT state */
+                master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+            } else {
+                exit_runtime_p = &dummy;
+            }
+#endif
+
+            KMP_TIME_BLOCK(OMP_work);
+            __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
+#if OMPT_SUPPORT
+                , exit_runtime_p
+#endif
+                );
+
+#if OMPT_SUPPORT
+            if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+                lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
+
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                        ompt_parallel_id, ompt_task_id);
+                }
+
+                __ompt_lw_taskteam_unlink(master_th);
+                // clear the task id only after unlinking the task
+                lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
+#endif
+
+                if ((ompt_status == ompt_status_track_callback) &&
+                    ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                    ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                        ompt_parallel_id, ompt_task_id);
+                }
+                master_th->th.ompt_thread_info.state = ompt_state_overhead;
+            }
+#endif
+            return TRUE;
+        }
+
+        parent_team->t.t_pkfn  = microtask;
+#if OMPT_SUPPORT
+        parent_team->t.ompt_team_info.microtask = unwrapped_task;
+#endif
+        parent_team->t.t_invoke = invoker;
+        KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
+        parent_team->t.t_active_level ++;
+        parent_team->t.t_level ++;
+
+        /* Change number of threads in the team if requested */
+        if ( master_set_numthreads ) {   // The parallel has num_threads clause
+            if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
+                // AC: can only reduce the number of threads dynamically, cannot increase
+                kmp_info_t **other_threads = parent_team->t.t_threads;
+                parent_team->t.t_nproc = master_set_numthreads;
+                for ( i = 0; i < master_set_numthreads; ++i ) {
+                    other_threads[i]->th.th_team_nproc = master_set_numthreads;
+                }
+                // Keep extra threads hot in the team for possible next parallels
+            }
+            master_th->th.th_set_nproc = 0;
+        }
+
+#if USE_DEBUGGER
+        if ( __kmp_debugging ) {    // Let debugger override number of threads.
+            int nth = __kmp_omp_num_threads( loc );
+            if ( nth > 0 ) {        // 0 means debugger does not want to change number of threads.
+                master_set_numthreads = nth;
+            } // if
+        } // if
+#endif
+
+        KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
+        __kmp_internal_fork( loc, gtid, parent_team );
+        KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
+
+        /* Invoke microtask for MASTER thread */
+        KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
+                    gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
+
+        {
+            KMP_TIME_BLOCK(OMP_work);
+            if (! parent_team->t.t_invoke( gtid )) {
+                KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
+            }
+        }
+        KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
+            gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+        KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
+
+        return TRUE;
+    } // Parallel closely nested in teams construct
+#endif /* OMP_40_ENABLED */
+
+#if KMP_DEBUG
+    if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+        KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
+    }
+#endif
+
+    /* determine how many new threads we can use */
+    __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
+
+    if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
+        nthreads = 1;
+    } else {
+        nthreads = master_set_numthreads ?
+            master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
+        nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
+#if OMP_40_ENABLED
+/* AC: If we execute teams from a parallel region (on the host), then teams should be created,
+   but each can have only 1 thread if nesting is disabled. If teams is called from a serial region,
+   then teams and their threads should be created regardless of the nesting setting. */
+                                         , ((ap==NULL && active_level==0) ||
+                                            (ap && teams_level>0 && teams_level==level))
+#endif /* OMP_40_ENABLED */
+                                         );
+    }
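+    /* Illustrative note (commentary only): the max_active_levels check above
+     * is what serializes over-nested regions, e.g. after
+     *
+     *     omp_set_max_active_levels( 1 );
+     *
+     * any parallel region started from inside an active region arrives here
+     * with t_active_level >= 1 and gets nthreads == 1. */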
+    KMP_DEBUG_ASSERT( nthreads > 0 );
+
+    /* If we temporarily changed the set number of threads then restore it now */
+    master_th->th.th_set_nproc = 0;
+
+
+    /* create a serialized parallel region? */
+    if ( nthreads == 1 ) {
+        /* josh todo: hypothetical question: what do we do for OS X*? */
+#if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+        void *   args[ argc ];
+#else
+        void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
+#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
+
+        __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
+        KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
+
+        __kmpc_serialized_parallel(loc, gtid);
+
+        if ( call_context == fork_context_intel ) {
+            /* TODO this sucks, use the compiler itself to pass args! :) */
+            master_th->th.th_serial_team->t.t_ident = loc;
+#if OMP_40_ENABLED
+            if ( !ap ) {
+                // revert change made in __kmpc_serialized_parallel()
+                master_th->th.th_serial_team->t.t_level--;
+                // Get args from parent team for teams construct
+
+#if OMPT_SUPPORT
+                void *dummy;
+                void **exit_runtime_p;
+
+                ompt_lw_taskteam_t lw_taskteam;
+
+                if (ompt_status & ompt_status_track) {
+                    __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                        unwrapped_task, ompt_parallel_id);
+                    lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
+                    exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
+
+                    __ompt_lw_taskteam_link(&lw_taskteam, master_th);
+
+#if OMPT_TRACE
+                    my_task_id = lw_taskteam.ompt_task_info.task_id;
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                            ompt_parallel_id, my_task_id);
+                    }
+#endif
+
+                    /* OMPT state */
+                    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+                } else {
+                    exit_runtime_p = &dummy;
+                }
+#endif
+
+                {
+                    KMP_TIME_BLOCK(OMP_work);
+                    __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
+#if OMPT_SUPPORT
+                        , exit_runtime_p
+#endif
+                    );
+                }
+
+#if OMPT_SUPPORT
+                if (ompt_status & ompt_status_track) {
+                    lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
+
+#if OMPT_TRACE
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                            ompt_parallel_id, ompt_task_id);
+                    }
+#endif
+
+                    __ompt_lw_taskteam_unlink(master_th);
+                    // clear the task id only after unlinking the task
+                    lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
+
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                        ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                            ompt_parallel_id, ompt_task_id);
+                    }
+                    master_th->th.ompt_thread_info.state = ompt_state_overhead;
+                }
+#endif
+            } else if ( microtask == (microtask_t)__kmp_teams_master ) {
+                KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
+                team = master_th->th.th_team;
+                //team->t.t_pkfn = microtask;
+                team->t.t_invoke = invoker;
+                __kmp_alloc_argv_entries( argc, team, TRUE );
+                team->t.t_argc = argc;
+                argv = (void**) team->t.t_argv;
+                if ( ap ) {
+                    for( i=argc-1; i >= 0; --i )
+// TODO: revert workaround for Intel(R) 64 tracker #96
+# if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+                        *argv++ = va_arg( *ap, void * );
+# else
+                        *argv++ = va_arg( ap, void * );
+# endif
+                } else {
+                    for( i=0; i < argc; ++i )
+                        // Get args from parent team for teams construct
+                        argv[i] = parent_team->t.t_argv[i];
+                }
+                // AC: revert change made in __kmpc_serialized_parallel()
+                //     because initial code in teams should have level=0
+                team->t.t_level--;
+                // AC: call special invoker for outer "parallel" of the teams construct
+                {
+                    KMP_TIME_BLOCK(OMP_work);
+                    invoker(gtid);
+                }
+            } else {
+#endif /* OMP_40_ENABLED */
+                argv = args;
+                for( i=argc-1; i >= 0; --i )
+// TODO: revert workaround for Intel(R) 64 tracker #96
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+                    *argv++ = va_arg( *ap, void * );
+#else
+                    *argv++ = va_arg( ap, void * );
+#endif
+                KMP_MB();
+
+#if OMPT_SUPPORT
+                void *dummy;
+                void **exit_runtime_p;
+
+                ompt_lw_taskteam_t lw_taskteam;
+
+                if (ompt_status & ompt_status_track) {
+                    __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                        unwrapped_task, ompt_parallel_id);
+                    lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
+                    exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
+
+                    __ompt_lw_taskteam_link(&lw_taskteam, master_th);
+
+#if OMPT_TRACE
+                    /* OMPT implicit task begin */
+                    my_task_id = lw_taskteam.ompt_task_info.task_id;
+                    my_parallel_id = ompt_parallel_id;
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+                            my_parallel_id, my_task_id);
+                    }
+#endif
+
+                    /* OMPT state */
+                    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+                } else {
+                    exit_runtime_p = &dummy;
+                }
+#endif
+
+                {
+                    KMP_TIME_BLOCK(OMP_work);
+                    __kmp_invoke_microtask( microtask, gtid, 0, argc, args
+#if OMPT_SUPPORT
+                        , exit_runtime_p
+#endif
+                    );
+                }
+
+#if OMPT_SUPPORT
+                if (ompt_status & ompt_status_track) {
+#if OMPT_TRACE
+                    lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
+
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+                        ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                            my_parallel_id, my_task_id);
+                    }
+#endif
+
+                    __ompt_lw_taskteam_unlink(master_th);
+                    // clear the task id only after unlinking the task
+                    lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
+
+                    if ((ompt_status == ompt_status_track_callback) &&
+                        ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+                        ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+                            ompt_parallel_id, ompt_task_id);
+                    }
+                    master_th->th.ompt_thread_info.state = ompt_state_overhead;
+                }
+#endif
+#if OMP_40_ENABLED
+            }
+#endif /* OMP_40_ENABLED */
+        }
+        else if ( call_context == fork_context_gnu ) {
+#if OMPT_SUPPORT
+            ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
+                __kmp_allocate(sizeof(ompt_lw_taskteam_t));
+            __ompt_lw_taskteam_init(lwt, master_th, gtid,
+                unwrapped_task, ompt_parallel_id);
+
+            lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
+            lwt->ompt_task_info.frame.exit_runtime_frame = 0;
+            __ompt_lw_taskteam_link(lwt, master_th);
+#endif
+
+            // we were called from GNU native code
+            KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
+            return FALSE;
+        }
+        else {
+            KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
+        }
+
+
+        KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
+        KMP_MB();
+        return FALSE;
+    }
+
+    // GEH: only modify the executing flag in the case when not serialized
+    //      serialized case is handled in kmpc_serialized_parallel
+    KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
+                  parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
+                  master_th->th.th_current_task->td_icvs.max_active_levels ) );
+    // TODO: GEH - cannot do this assertion because root thread not set up as executing
+    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
+    master_th->th.th_current_task->td_flags.executing = 0;
+
+#if OMP_40_ENABLED
+    if ( !master_th->th.th_teams_microtask || level > teams_level )
+#endif /* OMP_40_ENABLED */
+    {
+        /* Increment our nested depth level */
+        KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
+    }
+
+    // See if we need to make a copy of the ICVs.
+    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
+    if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
+        nthreads_icv = __kmp_nested_nth.nth[level+1];
+    }
+    else {
+        nthreads_icv = 0;  // don't update
+    }
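+    /* Illustrative note (commentary only): __kmp_nested_nth is typically
+     * populated from a list such as OMP_NUM_THREADS=4,2; at an outermost fork
+     * (level == 0) the check above picks nth[1] == 2 as the nproc ICV handed
+     * to the new team, so inner regions default to 2 threads. */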
+
+#if OMP_40_ENABLED
+    // Figure out the proc_bind_policy for the new team.
+    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
+    kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
+    if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
+        proc_bind = proc_bind_false;
+    }
+    else {
+        if (proc_bind == proc_bind_default) {
+            // No proc_bind clause specified; use current proc-bind-var for this parallel region
+            proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
+        }
+        /* else: The proc_bind policy was specified explicitly on parallel clause. This
+           overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
+        // Figure the value of proc-bind-var for the child threads.
+        if ((level+1 < __kmp_nested_proc_bind.used)
+            && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
+            proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
+        }
+    }
+
+    // Reset for next parallel region
+    master_th->th.th_set_proc_bind = proc_bind_default;
+#endif /* OMP_40_ENABLED */
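+
+    /* Illustrative note (commentary only): with a list such as
+     * OMP_PROC_BIND=spread,close, this region binds with "spread" while
+     * proc_bind_icv hands "close" down as proc-bind-var for the next nesting
+     * level; an explicit proc_bind clause overrides the binding for this
+     * region only. */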
+
+    if ((nthreads_icv > 0)
+#if OMP_40_ENABLED
+        || (proc_bind_icv != proc_bind_default)
+#endif /* OMP_40_ENABLED */
+        ) {
+        kmp_internal_control_t new_icvs;
+        copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
+        new_icvs.next = NULL;
+        if (nthreads_icv > 0) {
+            new_icvs.nproc = nthreads_icv;
+        }
+
+#if OMP_40_ENABLED
+        if (proc_bind_icv != proc_bind_default) {
+            new_icvs.proc_bind = proc_bind_icv;
+        }
+#endif /* OMP_40_ENABLED */
+
+        /* allocate a new parallel team */
+        KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
+        team = __kmp_allocate_team(root, nthreads, nthreads,
+#if OMPT_SUPPORT
+                                   ompt_parallel_id,
+#endif
+#if OMP_40_ENABLED
+                                   proc_bind,
+#endif
+                                   &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
+    } else {
+        /* allocate a new parallel team */
+        KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
+        team = __kmp_allocate_team(root, nthreads, nthreads,
+#if OMPT_SUPPORT
+                                   ompt_parallel_id,
+#endif
+#if OMP_40_ENABLED
+                                   proc_bind,
+#endif
+                                   &master_th->th.th_current_task->td_icvs, argc
+                                   USE_NESTED_HOT_ARG(master_th) );
+    }
+    KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
+
+    /* setup the new team */
+    team->t.t_master_tid = master_tid;
+    team->t.t_master_this_cons = master_this_cons;
+    team->t.t_ident      = loc;
+    team->t.t_parent     = parent_team;
+    TCW_SYNC_PTR(team->t.t_pkfn, microtask);
+#if OMPT_SUPPORT
+    TCW_SYNC_PTR(team->t.ompt_team_info.microtask, unwrapped_task);
+#endif
+    team->t.t_invoke     = invoker;  /* TODO move this to root, maybe */
+    // TODO: parent_team->t.t_level == INT_MAX ???
+#if OMP_40_ENABLED
+    if ( !master_th->th.th_teams_microtask || level > teams_level ) {
+#endif /* OMP_40_ENABLED */
+        team->t.t_level        = parent_team->t.t_level + 1;
+        team->t.t_active_level = parent_team->t.t_active_level + 1;
+#if OMP_40_ENABLED
+    } else {
+        // AC: Do not increase parallel level at start of the teams construct
+        team->t.t_level        = parent_team->t.t_level;
+        team->t.t_active_level = parent_team->t.t_active_level;
+    }
+#endif /* OMP_40_ENABLED */
+    team->t.t_sched      = get__sched_2(parent_team, master_tid); // set master's schedule as new run-time schedule
+
+    // Update the floating point rounding in the team if required.
+    propagateFPControl(team);
+
+    if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+        // Set master's task team to team's task team. Unless this is hot team, it should be NULL.
+        KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
+        KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
+                      __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
+                      parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
+        if (level) {
+            // Save master's task_state on the memo stack
+            KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
+            if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
+                kmp_uint8 *old_stack, *new_stack = (kmp_uint8 *) __kmp_allocate( 2*master_th->th.th_task_state_stack_sz );
+                kmp_uint32 i;
+                for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
+                    new_stack[i] = master_th->th.th_task_state_memo_stack[i];
+                }
+                old_stack = master_th->th.th_task_state_memo_stack;
+                master_th->th.th_task_state_memo_stack = new_stack;
+                master_th->th.th_task_state_stack_sz *= 2;
+                __kmp_free(old_stack);
+            }
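+            // Doubling the memo stack keeps total copy cost linear
+            // (amortized O(1) per push) across any sequence of nested forks.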
+            // Store master's task_state on stack
+            master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
+            master_th->th.th_task_state_top++;
+            master_th->th.th_task_state = 0;
+        }
+        master_th->th.th_task_team = team->t.t_task_team[master_th->th.th_task_state];
+
+#if !KMP_NESTED_HOT_TEAMS
+        KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
+#endif
+    }
+
+    KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
+                gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
+    KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
+                      ( team->t.t_master_tid == 0 &&
+                        ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
+    KMP_MB();
+
+    /* now, setup the arguments */
+    argv = (void**)team->t.t_argv;
+#if OMP_40_ENABLED
+    if ( ap ) {
+#endif /* OMP_40_ENABLED */
+        for ( i=argc-1; i >= 0; --i )
+// TODO: revert workaround for Intel(R) 64 tracker #96
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+            *argv++ = va_arg( *ap, void * );
+#else
+            *argv++ = va_arg( ap, void * );
+#endif
+#if OMP_40_ENABLED
+    } else {
+        for ( i=0; i < argc; ++i )
+            // Get args from parent team for teams construct
+            argv[i] = team->t.t_parent->t.t_argv[i];
+    }
+#endif /* OMP_40_ENABLED */
+
+    /* now actually fork the threads */
+    team->t.t_master_active = master_active;
+    if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
+        root->r.r_active = TRUE;
+
+    __kmp_fork_team_threads( root, team, master_th, gtid );
+    __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
+
+#if OMPT_SUPPORT
+    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+#endif
+
+    __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
+
+
+#if USE_ITT_BUILD
+    if ( team->t.t_active_level == 1 // only report frames at level 1
+# if OMP_40_ENABLED
+        && !master_th->th.th_teams_microtask // not in teams construct
+# endif /* OMP_40_ENABLED */
+    ) {
+#if USE_ITT_NOTIFY
+        if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
+             ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
+        {
+            kmp_uint64 tmp_time = 0;
+            if ( __itt_get_timestamp_ptr )
+                tmp_time = __itt_get_timestamp();
+            // Internal fork - report frame begin
+            master_th->th.th_frame_time  = tmp_time;
+            if ( __kmp_forkjoin_frames_mode == 3 )
+                team->t.t_region_time = tmp_time;
+        } else // only one notification scheme (either "submit" or "forking/joined", not both)
+#endif /* USE_ITT_NOTIFY */
+        if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
+             __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
+        { // Mark start of "parallel" region for VTune.
+            __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
+        }
+    }
+#endif /* USE_ITT_BUILD */
+
+    /* now go on and do the work */
+    KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
+    KMP_MB();
+    KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
+                  root, team, master_th, gtid));
+
+#if USE_ITT_BUILD
+    if ( __itt_stack_caller_create_ptr ) {
+        team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
+    }
+#endif /* USE_ITT_BUILD */
+
+#if OMP_40_ENABLED
+    if ( ap )   // AC: skip __kmp_internal_fork at teams construct, let only master threads execute
+#endif /* OMP_40_ENABLED */
+    {
+        __kmp_internal_fork( loc, gtid, team );
+        KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
+                      root, team, master_th, gtid));
+    }
+
+    if (call_context == fork_context_gnu) {
+        KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
+        return TRUE;
+    }
+
+    /* Invoke microtask for MASTER thread */
+    KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
+                gtid, team->t.t_id, team->t.t_pkfn ) );
+    }  // END of timer KMP_fork_call block
+
+    {
+        //KMP_TIME_BLOCK(OMP_work);
+        KMP_TIME_BLOCK(USER_master_invoke);
+        if (! team->t.t_invoke( gtid )) {
+            KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
+        }
+    }
+    KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
+        gtid, team->t.t_id, team->t.t_pkfn ) );
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        master_th->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+#endif
+
+    return TRUE;
+}
+
+#if OMPT_SUPPORT
+static inline void
+__kmp_join_restore_state(
+    kmp_info_t *thread,
+    kmp_team_t *team)
+{
+    // restore state outside the region
+    thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
+        ompt_state_work_serial : ompt_state_work_parallel);
+}
+
+static inline void
+__kmp_join_ompt(
+    kmp_info_t *thread,
+    kmp_team_t *team,
+    ompt_parallel_id_t parallel_id)
+{
+    if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
+        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+        ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
+            parallel_id, task_info->task_id);
+    }
+
+    __kmp_join_restore_state(thread,team);
+}
+#endif
+
+void
+__kmp_join_call(ident_t *loc, int gtid
+#if OMP_40_ENABLED
+               , int exit_teams
+#endif /* OMP_40_ENABLED */
+)
+{
+    KMP_TIME_BLOCK(KMP_join_call);
+    kmp_team_t     *team;
+    kmp_team_t     *parent_team;
+    kmp_info_t     *master_th;
+    kmp_root_t     *root;
+    int             master_active;
+    int             i;
+
+    KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
+
+    /* setup current data */
+    master_th     = __kmp_threads[ gtid ];
+    root          = master_th->th.th_root;
+    team          = master_th->th.th_team;
+    parent_team   = team->t.t_parent;
+
+    master_th->th.th_ident = loc;
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        master_th->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+#endif
+
+#if KMP_DEBUG
+    if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+        KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
+                         __kmp_gtid_from_thread( master_th ), team,
+                         team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
+        KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
+    }
+#endif
+
+    if( team->t.t_serialized ) {
+#if OMP_40_ENABLED
+        if ( master_th->th.th_teams_microtask ) {
+            // We are in teams construct
+            int level = team->t.t_level;
+            int tlevel = master_th->th.th_teams_level;
+            if ( level == tlevel ) {
+                // AC: we didn't increment it earlier at the start of the teams construct,
+                //     so do it here, at the end of the teams construct
+                team->t.t_level++;
+            } else if ( level == tlevel + 1 ) {
+                // AC: we are exiting parallel inside teams, need to increment serialization
+                //     in order to restore it in the next call to __kmpc_end_serialized_parallel
+                team->t.t_serialized++;
+            }
+        }
+#endif /* OMP_40_ENABLED */
+        __kmpc_end_serialized_parallel( loc, gtid );
+
+#if OMPT_SUPPORT
+        if (ompt_status == ompt_status_track_callback) {
+            __kmp_join_restore_state(master_th, parent_team);
+        }
+#endif
+
+        return;
+    }
+
+    master_active = team->t.t_master_active;
+
+#if OMP_40_ENABLED
+    if (!exit_teams)
+#endif /* OMP_40_ENABLED */
+    {
+        // AC: No barrier for internal teams at exit from the teams construct,
+        //     but there is a barrier for the external team (league).
+        __kmp_internal_join( loc, gtid, team );
+    }
+    else {
+        master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel)
+    }
+
+    KMP_MB();
+
+#if OMPT_SUPPORT
+    ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
+#endif
+
+#if USE_ITT_BUILD
+    if ( __itt_stack_caller_create_ptr ) {
+        __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
+    }
+
+    // Mark end of "parallel" region for VTune.
+    if ( team->t.t_active_level == 1
+# if OMP_40_ENABLED
+        && !master_th->th.th_teams_microtask /* not in teams construct */
+# endif /* OMP_40_ENABLED */
+    ) {
+        master_th->th.th_ident = loc;
+        // only one notification scheme (either "submit" or "forking/joined", not both)
+        if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
+            __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
+                                    0, loc, master_th->th.th_team_nproc, 1 );
+        else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
+            ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
+            __kmp_itt_region_joined( gtid );
+    } // active_level == 1
+#endif /* USE_ITT_BUILD */
+
+#if OMP_40_ENABLED
+    if ( master_th->th.th_teams_microtask &&
+         !exit_teams &&
+         team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
+         team->t.t_level == master_th->th.th_teams_level + 1 ) {
+        // AC: We need to leave the team structure intact at the end of a
+        //     parallel inside the teams construct, so that the same (hot)
+        //     team is reused at the next parallel; only adjust nesting levels
+
+        /* Decrement our nested depth level */
+        team->t.t_level --;
+        team->t.t_active_level --;
+        KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
+
+        /* Restore number of threads in the team if needed */
+        if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
+            int old_num = master_th->th.th_team_nproc;
+            int new_num = master_th->th.th_teams_size.nth;
+            kmp_info_t **other_threads = team->t.t_threads;
+            kmp_task_team_t * task_team = master_th->th.th_task_team;
+            team->t.t_nproc = new_num;
+            if ( task_team ) { // the task team's counters might hold smaller values
+                task_team->tt.tt_ref_ct = new_num - 1;
+                task_team->tt.tt_unfinished_threads = new_num;
+            }
+            for ( i = 0; i < old_num; ++i ) {
+                other_threads[i]->th.th_team_nproc = new_num;
+            }
+            // Adjust the states of the currently unused threads of the team
+            for ( i = old_num; i < new_num; ++i ) {
+                // Re-initialize thread's barrier data.
+                int b;
+                kmp_balign_t * balign = other_threads[i]->th.th_bar;
+                for ( b = 0; b < bs_last_barrier; ++ b ) {
+                    balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
+                    KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
+#if USE_DEBUGGER
+                    balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
+#endif
+                }
+                if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+                    // Synchronize thread's task state
+                    other_threads[i]->th.th_task_state = master_th->th.th_task_state;
+                }
+            }
+        }
+
+#if OMPT_SUPPORT
+        if (ompt_status == ompt_status_track_callback) {
+            __kmp_join_ompt(master_th, parent_team, parallel_id);
+        }
+#endif
+
+        return;
+    }
+#endif /* OMP_40_ENABLED */
+
+    /* do cleanup and restore the parent team */
+    master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
+    master_th->th.th_local.this_construct = team->t.t_master_this_cons;
+
+    master_th->th.th_dispatch =
+                & parent_team->t.t_dispatch[ team->t.t_master_tid ];
+
+    /* jc: The following lock has instructions with REL and ACQ semantics,
+       separating the parallel user code called in this parallel region
+       from the serial user code called after this function returns.
+    */
+    __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
+
+#if OMP_40_ENABLED
+    if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
+#endif /* OMP_40_ENABLED */
+    {
+        /* Decrement our nested depth level */
+        KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
+    }
+    KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
+
+    KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
+                   0, master_th, team ) );
+    __kmp_pop_current_task_from_thread( master_th );
+
+#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
+    //
+    // Restore master thread's partition.
+    //
+    master_th->th.th_first_place = team->t.t_first_place;
+    master_th->th.th_last_place = team->t.t_last_place;
+#endif /* OMP_40_ENABLED */
+
+    updateHWFPControl (team);
+
+    if ( root->r.r_active != master_active )
+        root->r.r_active = master_active;
+
+    __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
+
+    /* this race was fun to find.  make sure the following is in the critical
+     * region otherwise assertions may fail occasionally since the old team
+     * may be reallocated and the hierarchy appears inconsistent.  it is
+     * actually safe to run and won't cause any bugs, but will cause those
+     * assertion failures.  it's only one deref&assign so might as well put this
+     * in the critical region */
+    master_th->th.th_team        =   parent_team;
+    master_th->th.th_team_nproc  =   parent_team->t.t_nproc;
+    master_th->th.th_team_master =   parent_team->t.t_threads[0];
+    master_th->th.th_team_serialized = parent_team->t.t_serialized;
+
+    /* restore serialized team, if need be */
+    if( parent_team->t.t_serialized &&
+        parent_team != master_th->th.th_serial_team &&
+        parent_team != root->r.r_root_team ) {
+            __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
+            master_th->th.th_serial_team = parent_team;
+    }
+
+    if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+        // Restore task state from memo stack
+        KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
+        if (master_th->th.th_task_state_top > 0) {
+            --master_th->th.th_task_state_top; // pop
+            master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
+        }
+        // Copy the first task team from the new child / old parent team to the thread and reset state flag.
+        master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
+
+        KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
+                        __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
+                        parent_team ) );
+    }
+
+    // TODO: GEH - cannot do this assertion because the root thread is not set up as executing
+    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
+    master_th->th.th_current_task->td_flags.executing = 1;
+
+    __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
+
+#if OMPT_SUPPORT
+    if (ompt_status == ompt_status_track_callback) {
+        __kmp_join_ompt(master_th, parent_team, parallel_id);
+    }
+#endif
+
+    KMP_MB();
+    KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* Check whether we should push an internal control record onto the
+   serial team stack.  If so, do it.  */
+void
+__kmp_save_internal_controls ( kmp_info_t * thread )
+{
+
+    if ( thread->th.th_team != thread->th.th_serial_team ) {
+        return;
+    }
+    if (thread->th.th_team->t.t_serialized > 1) {
+        int push = 0;
+
+        if (thread->th.th_team->t.t_control_stack_top == NULL) {
+            push = 1;
+        } else {
+            if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
+                 thread->th.th_team->t.t_serialized ) {
+                push = 1;
+            }
+        }
+        if (push) {  /* push a record on the serial team's stack */
+            kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
+
+            copy_icvs( control, & thread->th.th_current_task->td_icvs );
+
+            control->serial_nesting_level = thread->th.th_team->t.t_serialized;
+
+            control->next = thread->th.th_team->t.t_control_stack_top;
+            thread->th.th_team->t.t_control_stack_top = control;
+        }
+    }
+}
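+
+/* Illustrative trigger (editorial sketch, not part of the runtime): a record
+   is pushed only while the thread runs on its serial team with
+   t_serialized > 1, and at most once per serialization level.  Under that
+   reading, nested serialized regions behave roughly like:
+
+       #pragma omp parallel if(0)          // serialized, level goes up
+       {
+           #pragma omp parallel if(0)      // serialized again, level > 1
+           {
+               omp_set_num_threads(2);     // first ICV change here: push
+               omp_set_num_threads(3);     // same level on stack: no push
+           }
+       }
+*/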
+
+/* Changes set_nproc */
+void
+__kmp_set_num_threads( int new_nth, int gtid )
+{
+    kmp_info_t *thread;
+    kmp_root_t *root;
+
+    KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    if (new_nth < 1)
+        new_nth = 1;
+    else if (new_nth > __kmp_max_nth)
+        new_nth = __kmp_max_nth;
+
+    thread = __kmp_threads[gtid];
+
+    __kmp_save_internal_controls( thread );
+
+    set__nproc( thread, new_nth );
+
+    //
+    // If this omp_set_num_threads() call will cause the hot team size to be
+    // reduced (in the absence of a num_threads clause), then reduce it now,
+    // rather than waiting for the next parallel region.
+    //
+    root = thread->th.th_root;
+    if ( __kmp_init_parallel && ( ! root->r.r_active )
+      && ( root->r.r_hot_team->t.t_nproc > new_nth )
+#if KMP_NESTED_HOT_TEAMS
+      && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
+#endif
+    ) {
+        kmp_team_t *hot_team = root->r.r_hot_team;
+        int f;
+
+        __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
+
+
+        if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+            int tt_idx;
+            for (tt_idx=0; tt_idx<2; ++tt_idx) {
+                kmp_task_team_t *task_team = hot_team->t.t_task_team[tt_idx];
+                if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
+                    // Signal worker threads (esp. the extra ones) to stop looking for tasks while spin waiting.
+                    // The task teams are reference counted and will be deallocated by the last worker thread.
+                    KMP_DEBUG_ASSERT( hot_team->t.t_nproc > 1 );
+                    TCW_SYNC_4( task_team->tt.tt_active, FALSE );
+                    KMP_MB();
+                    KA_TRACE( 20, ( "__kmp_set_num_threads: setting task_team %p to NULL\n",
+                                    &hot_team->t.t_task_team[tt_idx] ) );
+                    hot_team->t.t_task_team[tt_idx] = NULL;
+                }
+                else {
+                    KMP_DEBUG_ASSERT( task_team == NULL );
+                }
+            }
+        }
+
+        //
+        // Release the extra threads we don't need any more.
+        //
+        for ( f = new_nth;  f < hot_team->t.t_nproc; f++ ) {
+            KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
+            __kmp_free_thread( hot_team->t.t_threads[f] );
+            hot_team->t.t_threads[f] =  NULL;
+        }
+        hot_team->t.t_nproc = new_nth;
+#if KMP_NESTED_HOT_TEAMS
+        if( thread->th.th_hot_teams ) {
+            KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
+            thread->th.th_hot_teams[0].hot_team_nth = new_nth;
+        }
+#endif
+
+
+        __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
+
+        //
+        // Update the t_nproc field in the threads that are still active.
+        //
+        for( f=0 ; f < new_nth; f++ ) {
+            KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
+            hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
+        }
+        // Special flag in case omp_set_num_threads() call
+        hot_team->t.t_size_changed = -1;
+    }
+
+}
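+
+/* Illustrative usage (editorial sketch, not part of the runtime): when the
+   root is idle between regions, the shrink happens here eagerly instead of
+   at the next fork:
+
+       omp_set_num_threads(8);
+       #pragma omp parallel            // hot team grows to 8 threads
+       { ... }
+       omp_set_num_threads(2);         // 6 workers freed immediately;
+                                       // t_size_changed is set to -1
+*/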
+
+/* Changes max_active_levels */
+void
+__kmp_set_max_active_levels( int gtid, int max_active_levels )
+{
+    kmp_info_t *thread;
+
+    KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    // validate max_active_levels
+    if( max_active_levels < 0 ) {
+        KMP_WARNING( ActiveLevelsNegative, max_active_levels );
+        // We ignore this call if the user has specified a negative value.
+        // The current setting won't be changed. The last valid setting will be used.
+        // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
+        KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
+        return;
+    }
+    if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
+        // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
+        // We allow a zero value. (implementation defined behavior)
+    } else {
+        KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT  );
+        max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
+        // Current upper limit is MAX_INT. (implementation defined behavior)
+        // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
+        // In practice, control should never reach this branch while the upper limit is MAX_INT.
+    }
+    KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
+
+    thread = __kmp_threads[ gtid ];
+
+    __kmp_save_internal_controls( thread );
+
+    set__max_active_levels( thread, max_active_levels );
+
+}
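+
+/* Examples (editorial sketch): __kmp_set_max_active_levels( gtid, -3 ) warns
+   and leaves the ICV unchanged; a value above KMP_MAX_ACTIVE_LEVELS_LIMIT
+   warns and stores the limit instead; 0 is accepted as-is. */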
+
+/* Gets max_active_levels */
+int
+__kmp_get_max_active_levels( int gtid )
+{
+    kmp_info_t *thread;
+
+    KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    thread = __kmp_threads[ gtid ];
+    KMP_DEBUG_ASSERT( thread->th.th_current_task );
+    KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
+        gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
+    return thread->th.th_current_task->td_icvs.max_active_levels;
+}
+
+/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
+void
+__kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
+{
+    kmp_info_t *thread;
+//    kmp_team_t *team;
+
+    KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    // Check if the kind parameter is valid, correct if needed.
+    // Valid parameters should fit in one of two intervals - standard or extended:
+    //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
+    // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
+    if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
+       ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
+    {
+        // TODO: Hint needs attention in case we change the default schedule.
+        __kmp_msg(
+            kmp_ms_warning,
+            KMP_MSG( ScheduleKindOutOfRange, kind ),
+            KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
+            __kmp_msg_null
+        );
+        kind = kmp_sched_default;
+        chunk = 0;         // ignore chunk value in case of bad kind
+    }
+
+    thread = __kmp_threads[ gtid ];
+
+    __kmp_save_internal_controls( thread );
+
+    if ( kind < kmp_sched_upper_std ) {
+        if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
+            // distinguish static chunked from unchunked:
+            // an invalid chunk value indicates the unchunked schedule (which is the default)
+            thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
+        } else {
+            thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
+        }
+    } else {
+        //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
+        thread->th.th_current_task->td_icvs.sched.r_sched_type =
+            __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
+    }
+    if ( kind == kmp_sched_auto ) {
+        // ignore parameter chunk for schedule auto
+        thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
+    } else {
+        thread->th.th_current_task->td_icvs.sched.chunk = chunk;
+    }
+}
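+
+/* Illustrative calls (editorial sketch, not part of the runtime), following
+   the interval layout documented above:
+
+       __kmp_set_schedule( gtid, kmp_sched_static,  0 );  // unchunked static
+       __kmp_set_schedule( gtid, kmp_sched_static,  4 );  // chunked static, chunk 4
+       __kmp_set_schedule( gtid, kmp_sched_auto,   16 );  // chunk ignored
+       __kmp_set_schedule( gtid, (kmp_sched_t)42,   8 );  // out of range: warning,
+                                                          // falls back to default
+*/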
+
+/* Gets def_sched_var ICV values */
+void
+__kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
+{
+    kmp_info_t     *thread;
+    enum sched_type th_type;
+
+    KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    thread = __kmp_threads[ gtid ];
+
+    //th_type = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].r_sched_type;
+    th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
+
+    switch ( th_type ) {
+    case kmp_sch_static:
+    case kmp_sch_static_greedy:
+    case kmp_sch_static_balanced:
+        *kind = kmp_sched_static;
+        *chunk = 0;   // chunk was not set; signal this fact via a zero value
+        return;
+    case kmp_sch_static_chunked:
+        *kind = kmp_sched_static;
+        break;
+    case kmp_sch_dynamic_chunked:
+        *kind = kmp_sched_dynamic;
+        break;
+    case kmp_sch_guided_chunked:
+    case kmp_sch_guided_iterative_chunked:
+    case kmp_sch_guided_analytical_chunked:
+        *kind = kmp_sched_guided;
+        break;
+    case kmp_sch_auto:
+        *kind = kmp_sched_auto;
+        break;
+    case kmp_sch_trapezoidal:
+        *kind = kmp_sched_trapezoidal;
+        break;
+/*
+    case kmp_sch_static_steal:
+        *kind = kmp_sched_static_steal;
+        break;
+*/
+    default:
+        KMP_FATAL( UnknownSchedulingType, th_type );
+    }
+
+    //*chunk = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].chunk;
+    *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
+}
+
+int
+__kmp_get_ancestor_thread_num( int gtid, int level ) {
+
+    int ii, dd;
+    kmp_team_t *team;
+    kmp_info_t *thr;
+
+    KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    // validate level
+    if( level == 0 ) return 0;
+    if( level < 0 ) return -1;
+    thr = __kmp_threads[ gtid ];
+    team = thr->th.th_team;
+    ii = team->t.t_level;
+    if( level > ii ) return -1;
+
+#if OMP_40_ENABLED
+    if( thr->th.th_teams_microtask ) {
+        // AC: we are in a teams region where multiple nested teams have the same level
+        int tlevel = thr->th.th_teams_level; // the level of the teams construct
+        if( level <= tlevel ) { // otherwise the usual algorithm works (it will not touch the teams)
+            KMP_DEBUG_ASSERT( ii >= tlevel );
+            // AC: since we need to pass through the teams league, artificially increase ii
+            if ( ii == tlevel ) {
+                ii += 2; // three teams have the same level
+            } else {
+                ii ++;   // two teams have the same level
+            }
+        }
+    }
+#endif
+
+    if( ii == level ) return __kmp_tid_from_gtid( gtid );
+
+    dd = team->t.t_serialized;
+    level++;
+    while( ii > level )
+    {
+        for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
+        {
+        }
+        if( ( team->t.t_serialized ) && ( !dd ) ) {
+            team = team->t.t_parent;
+            continue;
+        }
+        if( ii > level ) {
+            team = team->t.t_parent;
+            dd = team->t.t_serialized;
+            ii--;
+        }
+    }
+
+    return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
+}
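+
+/* Worked example (editorial sketch, not part of the runtime): for a thread
+   inside
+
+       #pragma omp parallel num_threads(2)     // level 1
+       #pragma omp parallel num_threads(3)     // level 2
+
+   level 2 returns the caller's own tid, level 1 the tid of its master in
+   the outer team, level 0 always returns 0 (the initial thread), and any
+   level greater than the current nesting depth returns -1. */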
+
+int
+__kmp_get_team_size( int gtid, int level ) {
+
+    int ii, dd;
+    kmp_team_t *team;
+    kmp_info_t *thr;
+
+    KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+    // validate level
+    if( level == 0 ) return 1;
+    if( level < 0 ) return -1;
+    thr = __kmp_threads[ gtid ];
+    team = thr->th.th_team;
+    ii = team->t.t_level;
+    if( level > ii ) return -1;
+
+#if OMP_40_ENABLED
+    if( thr->th.th_teams_microtask ) {
+        // AC: we are in a teams region where multiple nested teams have the same level
+        int tlevel = thr->th.th_teams_level; // the level of the teams construct
+        if( level <= tlevel ) { // otherwise the usual algorithm works (it will not touch the teams)
+            KMP_DEBUG_ASSERT( ii >= tlevel );
+            // AC: since we need to pass through the teams league, artificially increase ii
+            if ( ii == tlevel ) {
+                ii += 2; // three teams have the same level
+            } else {
+                ii ++;   // two teams have the same level
+            }
+        }
+    }
+#endif
+
+    while( ii > level )
+    {
+        for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
+        {
+        }
+        if( team->t.t_serialized && ( !dd ) ) {
+            team = team->t.t_parent;
+            continue;
+        }
+        if( ii > level ) {
+            team = team->t.t_parent;
+            ii--;
+        }
+    }
+
+    return team->t.t_nproc;
+}
+
+kmp_r_sched_t
+__kmp_get_schedule_global() {
+// This routine was created because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
+// may be changed by kmp_set_defaults independently, so the updated schedule can be obtained here.
+
+    kmp_r_sched_t r_sched;
+
+    // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
+    // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
+    // and thus have different run-time schedules in different roots (even in OMP 2.5)
+    if ( __kmp_sched == kmp_sch_static ) {
+        r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
+    } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
+        r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
+    } else {
+        r_sched.r_sched_type = __kmp_sched;  // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
+    }
+
+    if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was never set)
+        r_sched.chunk = KMP_DEFAULT_CHUNK;
+    } else {
+        r_sched.chunk = __kmp_chunk;
+    }
+
+    return r_sched;
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+
+/*
+ * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
+ * at least argc *t_argv entries for the requested team.
+ */
+static void
+__kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
+{
+
+    KMP_DEBUG_ASSERT( team );
+    if( !realloc || argc > team->t.t_max_argc ) {
+
+        KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
+                         team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
+        /* if previously allocated heap space for args, free them */
+        if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
+            __kmp_free( (void *) team->t.t_argv );
+
+        if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
+            /* use unused space in the cache line for arguments */
+            team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
+            KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
+                             team->t.t_id, team->t.t_max_argc ));
+            team->t.t_argv = &team->t.t_inline_argv[0];
+            if ( __kmp_storage_map ) {
+                __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
+                                         &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
+                                         (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
+                                         "team_%d.t_inline_argv",
+                                         team->t.t_id );
+            }
+        } else {
+            /* allocate space for arguments in the heap */
+            team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
+                                     KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
+            KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
+                             team->t.t_id, team->t.t_max_argc ));
+            team->t.t_argv     = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
+            if ( __kmp_storage_map ) {
+                __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
+                                         sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
+                                         team->t.t_id );
+            }
+        }
+    }
+}
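+
+/* Sizing sketch (editorial, not part of the runtime): with
+   I = KMP_INLINE_ARGV_ENTRIES and M = KMP_MIN_MALLOC_ARGV_ENTRIES,
+
+       argc <= I           -> t_argv = t_inline_argv,  t_max_argc = I
+       I < argc <= M/2     -> heap allocation,         t_max_argc = M
+       argc > M/2          -> heap allocation,         t_max_argc = 2 * argc
+
+   so repeated growth at most doubles, and small argument lists stay on the
+   team's cache line. */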
+
+static void
+__kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
+{
+    int i;
+    int num_disp_buff = max_nth > 1 ? KMP_MAX_DISP_BUF : 2;
+#if KMP_USE_POOLED_ALLOC
+    // AC: TODO: fix bug here: size of t_disp_buffer should not be multiplied by max_nth!
+    char *ptr = __kmp_allocate(max_nth *
+                            ( sizeof(kmp_info_t*) + sizeof(dispatch_shared_info_t)*num_disp_buff
+                               + sizeof(kmp_disp_t) + sizeof(int)*6
+                               //+ sizeof(int)
+                               + sizeof(kmp_r_sched_t)
+                               + sizeof(kmp_taskdata_t) ) );
+
+    team->t.t_threads          = (kmp_info_t**) ptr; ptr += sizeof(kmp_info_t*) * max_nth;
+    team->t.t_disp_buffer      = (dispatch_shared_info_t*) ptr;
+                                   ptr += sizeof(dispatch_shared_info_t) * num_disp_buff;
+    team->t.t_dispatch         = (kmp_disp_t*) ptr; ptr += sizeof(kmp_disp_t) * max_nth;
+    team->t.t_set_nproc        = (int*) ptr; ptr += sizeof(int) * max_nth;
+    team->t.t_set_dynamic      = (int*) ptr; ptr += sizeof(int) * max_nth;
+    team->t.t_set_nested       = (int*) ptr; ptr += sizeof(int) * max_nth;
+    team->t.t_set_blocktime    = (int*) ptr; ptr += sizeof(int) * max_nth;
+    team->t.t_set_bt_intervals = (int*) ptr; ptr += sizeof(int) * max_nth;
+    team->t.t_set_bt_set       = (int*) ptr;
+    ptr += sizeof(int) * max_nth;
+    //team->t.t_set_max_active_levels = (int*) ptr; ptr += sizeof(int) * max_nth;
+    team->t.t_set_sched        = (kmp_r_sched_t*) ptr;
+    ptr += sizeof(kmp_r_sched_t) * max_nth;
+    team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) ptr;
+    ptr += sizeof(kmp_taskdata_t) * max_nth;
+#else
+
+    team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
+    team->t.t_disp_buffer = (dispatch_shared_info_t*)
+        __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
+    team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
+    //team->t.t_set_max_active_levels = (int*) __kmp_allocate( sizeof(int) * max_nth );
+    //team->t.t_set_sched = (kmp_r_sched_t*) __kmp_allocate( sizeof(kmp_r_sched_t) * max_nth );
+    team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
+#endif
+    team->t.t_max_nproc = max_nth;
+
+    /* setup dispatch buffers */
+    for(i = 0 ; i < num_disp_buff; ++i)
+        team->t.t_disp_buffer[i].buffer_index = i;
+}
+
+static void
+__kmp_free_team_arrays(kmp_team_t *team) {
+    /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
+    int i;
+    for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
+        if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
+            __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
+            team->t.t_dispatch[ i ].th_disp_buffer = NULL;
+        }; // if
+    }; // for
+    __kmp_free(team->t.t_threads);
+    #if !KMP_USE_POOLED_ALLOC
+        __kmp_free(team->t.t_disp_buffer);
+        __kmp_free(team->t.t_dispatch);
+        //__kmp_free(team->t.t_set_max_active_levels);
+        //__kmp_free(team->t.t_set_sched);
+        __kmp_free(team->t.t_implicit_task_taskdata);
+    #endif
+    team->t.t_threads     = NULL;
+    team->t.t_disp_buffer = NULL;
+    team->t.t_dispatch    = NULL;
+    //team->t.t_set_sched   = 0;
+    //team->t.t_set_max_active_levels = 0;
+    team->t.t_implicit_task_taskdata = 0;
+}
+
+static void
+__kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
+    kmp_info_t **oldThreads = team->t.t_threads;
+
+    #if !KMP_USE_POOLED_ALLOC
+        __kmp_free(team->t.t_disp_buffer);
+        __kmp_free(team->t.t_dispatch);
+        //__kmp_free(team->t.t_set_max_active_levels);
+        //__kmp_free(team->t.t_set_sched);
+        __kmp_free(team->t.t_implicit_task_taskdata);
+    #endif
+    __kmp_allocate_team_arrays(team, max_nth);
+
+    KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
+
+    __kmp_free(oldThreads);
+}
+
+static kmp_internal_control_t
+__kmp_get_global_icvs( void ) {
+
+    kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
+
+#if OMP_40_ENABLED
+    KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
+#endif /* OMP_40_ENABLED */
+
+    kmp_internal_control_t g_icvs = {
+      0,                            //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
+      (kmp_int8)__kmp_dflt_nested,            //int nested;               //internal control for nested parallelism (per thread)
+      (kmp_int8)__kmp_global.g.g_dynamic,                                 //internal control for dynamic adjustment of threads (per thread)
+      (kmp_int8)__kmp_env_blocktime,          //int bt_set;               //internal control for whether blocktime is explicitly set
+      __kmp_dflt_blocktime,         //int blocktime;            //internal control for blocktime
+      __kmp_bt_intervals,           //int bt_intervals;         //internal control for blocktime intervals
+      __kmp_dflt_team_nth,          //int nproc;                //internal control for # of threads for next parallel region (per thread)
+                                    // (use a max ub on value if __kmp_parallel_initialize not called yet)
+      __kmp_dflt_max_active_levels, //int max_active_levels;    //internal control for max_active_levels
+      r_sched,                      //kmp_r_sched_t sched;      //internal control for runtime schedule {sched,chunk} pair
+#if OMP_40_ENABLED
+      __kmp_nested_proc_bind.bind_types[0],
+#endif /* OMP_40_ENABLED */
+      NULL                          //struct kmp_internal_control *next;
+    };
+
+    return g_icvs;
+}
+
+static kmp_internal_control_t
+__kmp_get_x_global_icvs( const kmp_team_t *team ) {
+
+    kmp_internal_control_t gx_icvs;
+    gx_icvs.serial_nesting_level = 0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
+    copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
+    gx_icvs.next = NULL;
+
+    return gx_icvs;
+}
+
+static void
+__kmp_initialize_root( kmp_root_t *root )
+{
+    int           f;
+    kmp_team_t   *root_team;
+    kmp_team_t   *hot_team;
+    int           hot_team_max_nth;
+    kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
+    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
+    KMP_DEBUG_ASSERT( root );
+    KMP_ASSERT( ! root->r.r_begin );
+
+    /* setup the root state structure */
+    __kmp_init_lock( &root->r.r_begin_lock );
+    root->r.r_begin        = FALSE;
+    root->r.r_active       = FALSE;
+    root->r.r_in_parallel  = 0;
+    root->r.r_blocktime    = __kmp_dflt_blocktime;
+    root->r.r_nested       = __kmp_dflt_nested;
+
+    /* setup the root team for this task */
+    /* allocate the root team structure */
+    KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
+
+    root_team =
+        __kmp_allocate_team(
+            root,
+            1,                                                         // new_nproc
+            1,                                                         // max_nproc
+#if OMPT_SUPPORT
+            0, // root parallel id
+#endif
+#if OMP_40_ENABLED
+            __kmp_nested_proc_bind.bind_types[0],
+#endif
+            &r_icvs,
+            0                                                          // argc
+            USE_NESTED_HOT_ARG(NULL)                                   // master thread is unknown
+        );
+#if USE_DEBUGGER
+    // Non-NULL value should be assigned to make the debugger display the root team.
+    TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
+#endif
+
+    KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
+
+    root->r.r_root_team = root_team;
+    root_team->t.t_control_stack_top = NULL;
+
+    /* initialize root team */
+    root_team->t.t_threads[0] = NULL;
+    root_team->t.t_nproc      = 1;
+    root_team->t.t_serialized = 1;
+    // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
+    root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
+    root_team->t.t_sched.chunk        = r_sched.chunk;
+    KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
+                    root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
+
+    /* setup the hot team for this task */
+    /* allocate the hot team structure */
+    KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
+
+    hot_team =
+        __kmp_allocate_team(
+            root,
+            1,                                                         // new_nproc
+            __kmp_dflt_team_nth_ub * 2,                                // max_nproc
+#if OMPT_SUPPORT
+            0, // root parallel id
+#endif
+#if OMP_40_ENABLED
+            __kmp_nested_proc_bind.bind_types[0],
+#endif
+            &r_icvs,
+            0                                                          // argc
+            USE_NESTED_HOT_ARG(NULL)                                   // master thread is unknown
+        );
+    KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
+
+    root->r.r_hot_team = hot_team;
+    root_team->t.t_control_stack_top = NULL;
+
+    /* first-time initialization */
+    hot_team->t.t_parent = root_team;
+
+    /* initialize hot team */
+    hot_team_max_nth = hot_team->t.t_max_nproc;
+    for ( f = 0; f < hot_team_max_nth; ++ f ) {
+        hot_team->t.t_threads[ f ] = NULL;
+    }; // for
+    hot_team->t.t_nproc = 1;
+    // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
+    hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
+    hot_team->t.t_sched.chunk        = r_sched.chunk;
+    hot_team->t.t_size_changed = 0;
+
+}
+
+#ifdef KMP_DEBUG
+
+
+typedef struct kmp_team_list_item {
+    kmp_team_p const *           entry;
+    struct kmp_team_list_item *  next;
+} kmp_team_list_item_t;
+typedef kmp_team_list_item_t * kmp_team_list_t;
+
+
+static void
+__kmp_print_structure_team_accum(    // Add team to list of teams.
+    kmp_team_list_t     list,        // List of teams.
+    kmp_team_p const *  team         // Team to add.
+) {
+
+    // List must terminate with item where both entry and next are NULL.
+    // Team is added to the list only once.
+    // List is sorted in ascending order by team id.
+    // Team id is *not* a key.
+
+    kmp_team_list_t l;
+
+    KMP_DEBUG_ASSERT( list != NULL );
+    if ( team == NULL ) {
+        return;
+    }; // if
+
+    __kmp_print_structure_team_accum( list, team->t.t_parent );
+    __kmp_print_structure_team_accum( list, team->t.t_next_pool );
+
+    // Search list for the team.
+    l = list;
+    while ( l->next != NULL && l->entry != team ) {
+        l = l->next;
+    }; // while
+    if ( l->next != NULL ) {
+        return;  // Team has been added before, exit.
+    }; // if
+
+    // Team is not found. Search list again for insertion point.
+    l = list;
+    while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
+        l = l->next;
+    }; // while
+
+    // Insert team.
+    {
+        kmp_team_list_item_t * item =
+            (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof(  kmp_team_list_item_t ) );
+        * item = * l;
+        l->entry = team;
+        l->next  = item;
+    }
+
+}
+
+static void
+__kmp_print_structure_team(
+    char const *       title,
+    kmp_team_p const * team
+
+) {
+    __kmp_printf( "%s", title );
+    if ( team != NULL ) {
+        __kmp_printf( "%2x %p\n", team->t.t_id, team );
+    } else {
+        __kmp_printf( " - (nil)\n" );
+    }; // if
+}
+
+static void
+__kmp_print_structure_thread(
+    char const *       title,
+    kmp_info_p const * thread
+
+) {
+    __kmp_printf( "%s", title );
+    if ( thread != NULL ) {
+        __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
+    } else {
+        __kmp_printf( " - (nil)\n" );
+    }; // if
+}
+
+void
+__kmp_print_structure(
+    void
+) {
+
+    kmp_team_list_t list;
+
+    // Initialize list of teams.
+    list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
+    list->entry = NULL;
+    list->next  = NULL;
+
+    __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
+    {
+        int gtid;
+        for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
+            __kmp_printf( "%2d", gtid );
+            if ( __kmp_threads != NULL ) {
+                __kmp_printf( " %p", __kmp_threads[ gtid ] );
+            }; // if
+            if ( __kmp_root != NULL ) {
+                __kmp_printf( " %p", __kmp_root[ gtid ] );
+            }; // if
+            __kmp_printf( "\n" );
+        }; // for gtid
+    }
+
+    // Print out __kmp_threads array.
+    __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
+    if ( __kmp_threads != NULL ) {
+        int gtid;
+        for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
+            kmp_info_t const * thread = __kmp_threads[ gtid ];
+            if ( thread != NULL ) {
+                __kmp_printf( "GTID %2d %p:\n", gtid, thread );
+                __kmp_printf(                 "    Our Root:        %p\n", thread->th.th_root );
+                __kmp_print_structure_team(   "    Our Team:     ",        thread->th.th_team );
+                __kmp_print_structure_team(   "    Serial Team:  ",        thread->th.th_serial_team );
+                __kmp_printf(                 "    Threads:      %2d\n",   thread->th.th_team_nproc );
+                __kmp_print_structure_thread( "    Master:       ",        thread->th.th_team_master );
+                __kmp_printf(                 "    Serialized?:  %2d\n",   thread->th.th_team_serialized );
+                __kmp_printf(                 "    Set NProc:    %2d\n",   thread->th.th_set_nproc );
+#if OMP_40_ENABLED
+                __kmp_printf(                 "    Set Proc Bind: %2d\n",  thread->th.th_set_proc_bind );
+#endif
+                __kmp_print_structure_thread( "    Next in pool: ",        thread->th.th_next_pool );
+                __kmp_printf( "\n" );
+                __kmp_print_structure_team_accum( list, thread->th.th_team );
+                __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
+            }; // if
+        }; // for gtid
+    } else {
+        __kmp_printf( "Threads array is not allocated.\n" );
+    }; // if
+
+    // Print out __kmp_root array.
+    __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
+    if ( __kmp_root != NULL ) {
+        int gtid;
+        for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
+            kmp_root_t const * root = __kmp_root[ gtid ];
+            if ( root != NULL ) {
+                __kmp_printf( "GTID %2d %p:\n", gtid, root );
+                __kmp_print_structure_team(   "    Root Team:    ",      root->r.r_root_team );
+                __kmp_print_structure_team(   "    Hot Team:     ",      root->r.r_hot_team );
+                __kmp_print_structure_thread( "    Uber Thread:  ",      root->r.r_uber_thread );
+                __kmp_printf(                 "    Active?:      %2d\n", root->r.r_active );
+                __kmp_printf(                 "    Nested?:      %2d\n", root->r.r_nested );
+                __kmp_printf(                 "    In Parallel:  %2d\n", root->r.r_in_parallel );
+                __kmp_printf( "\n" );
+                __kmp_print_structure_team_accum( list, root->r.r_root_team );
+                __kmp_print_structure_team_accum( list, root->r.r_hot_team );
+            }; // if
+        }; // for gtid
+    } else {
+        __kmp_printf( "Ubers array is not allocated.\n" );
+    }; // if
+
+    __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
+    while ( list->next != NULL ) {
+        kmp_team_p const * team = list->entry;
+        int i;
+        __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
+        __kmp_print_structure_team( "    Parent Team:      ",      team->t.t_parent );
+        __kmp_printf(               "    Master TID:       %2d\n", team->t.t_master_tid );
+        __kmp_printf(               "    Max threads:      %2d\n", team->t.t_max_nproc );
+        __kmp_printf(               "    Levels of serial: %2d\n", team->t.t_serialized );
+        __kmp_printf(               "    Number threads:   %2d\n", team->t.t_nproc );
+        for ( i = 0; i < team->t.t_nproc; ++ i ) {
+            __kmp_printf(           "    Thread %2d:      ", i );
+            __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
+        }; // for i
+        __kmp_print_structure_team( "    Next in pool:     ",      team->t.t_next_pool );
+        __kmp_printf( "\n" );
+        list = list->next;
+    }; // while
+
+    // Print out __kmp_thread_pool and __kmp_team_pool.
+    __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
+    __kmp_print_structure_thread(   "Thread pool:          ", (kmp_info_t *)__kmp_thread_pool );
+    __kmp_print_structure_team(     "Team pool:            ", (kmp_team_t *)__kmp_team_pool );
+    __kmp_printf( "\n" );
+
+    // Free team list.
+    while ( list != NULL ) {
+        kmp_team_list_item_t * item = list;
+        list = list->next;
+        KMP_INTERNAL_FREE( item );
+    }; // while
+
+}
+
+#endif
+
+
+//---------------------------------------------------------------------------
+//  Stuff for per-thread fast random number generator
+//  Table of primes
+
+static const unsigned __kmp_primes[] = {
+  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
+  0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
+  0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
+  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
+  0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
+  0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
+  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
+  0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
+  0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
+  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
+  0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
+  0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
+  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
+  0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
+  0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
+  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
+};
+
+//---------------------------------------------------------------------------
+//  __kmp_get_random: Get a random number using a linear congruential method.
+
+unsigned short
+__kmp_get_random( kmp_info_t * thread )
+{
+  unsigned x = thread->th.th_x;
+  unsigned short r = x>>16;
+
+  thread->th.th_x = x*thread->th.th_a+1;
+
+  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
+         thread->th.th_info.ds.ds_tid, r) );
+
+  return r;
+}
+//--------------------------------------------------------
+// __kmp_init_random: Initialize a random number generator
+
+void
+__kmp_init_random( kmp_info_t * thread )
+{
+  unsigned seed = thread->th.th_info.ds.ds_tid;
+
+  thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
+  thread->th.th_x = (seed+1)*thread->th.th_a+1;
+  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
+}
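+
+/* Standalone sketch of the generator above (editorial, not part of the
+   runtime): a per-thread linear congruential generator x' = a*x + 1 (mod
+   2^32) whose high 16 bits are returned; 'a' is a per-thread prime, so
+   streams differ across threads:
+
+       unsigned a = __kmp_primes[ tid % ( sizeof(__kmp_primes) /
+                                          sizeof(__kmp_primes[0]) ) ];
+       unsigned x = (tid + 1) * a + 1;                  // seeding, as above
+       unsigned short r1 = (unsigned short)(x >> 16);  x = x * a + 1;
+       unsigned short r2 = (unsigned short)(x >> 16);  x = x * a + 1;
+*/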
+
+
+#if KMP_OS_WINDOWS
+/* reclaim array entries for root threads that are already dead, returns number reclaimed */
+static int
+__kmp_reclaim_dead_roots(void) {
+    int i, r = 0;
+
+    for(i = 0; i < __kmp_threads_capacity; ++i) {
+        if( KMP_UBER_GTID( i ) &&
+          !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
+          !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
+            r += __kmp_unregister_root_other_thread(i);
+        }
+    }
+    return r;
+}
+#endif
+
+/*
+   This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
+   free entries generated.
+
+   For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
+   already dead.
+
+   On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with the appropriate
+   update to __kmp_threads_capacity.  Array capacity is increased by doubling, with clipping to
+   __kmp_tp_capacity if a threadprivate cache array has been created.
+   Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
+
+   After any dead root reclamation, if the clipping value allows array expansion to result in the generation
+   of a total of nWish free slots, the function does that expansion.  If not, but the clipping value allows
+   array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
+   Otherwise, nothing is done beyond the possible initial root thread reclamation.  However, if nNeed is zero,
+   a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
+   as many free slots as possible up to nWish.
+
+   If any argument is negative, the behavior is undefined.
+*/
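+
+/* Worked example (editorial sketch, ignoring the Windows* OS dead-root
+   reclamation described above): with __kmp_threads_capacity = 4 and an
+   effective maximum of 8, __kmp_expand_threads(6, 2) cannot create 6 free
+   slots (4 + 6 > 8), falls back to nNeed = 2, doubles capacity 4 -> 8, and
+   returns 4.  With nWish = 6, nNeed = 0 the call is best-effort and also
+   returns 4; with nNeed = 6 it gives up and returns 0. */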
+static int
+__kmp_expand_threads(int nWish, int nNeed) {
+    int added = 0;
+    int old_tp_cached;
+    int __kmp_actual_max_nth;
+
+    if(nNeed > nWish) /* normalize the arguments */
+        nWish = nNeed;
+#if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
+/* only for Windows static library */
+    /* reclaim array entries for root threads that are already dead */
+    added = __kmp_reclaim_dead_roots();
+
+    if(nNeed) {
+        nNeed -= added;
+        if(nNeed < 0)
+            nNeed = 0;
+    }
+    if(nWish) {
+        nWish -= added;
+        if(nWish < 0)
+            nWish = 0;
+    }
+#endif
+    if(nWish <= 0)
+        return added;
+
+    while(1) {
+        int nTarget;
+        int minimumRequiredCapacity;
+        int newCapacity;
+        kmp_info_t **newThreads;
+        kmp_root_t **newRoot;
+
+        //
+        // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
+        // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
+        // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
+        // become > __kmp_max_nth in one of two ways:
+        //
+        // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
+        //    may not be reused by another thread, so we may need to increase
+        //    __kmp_threads_capacity to __kmp_max_threads + 1.
+        //
+        // 2) New foreign root(s) are encountered.  We always register new
+        //    foreign roots.  This may cause a smaller # of threads to be
+        //    allocated at subsequent parallel regions, but the worker threads
+        //    hang around (and eventually go to sleep) and need slots in the
+        //    __kmp_threads[] array.
+        //
+        // Anyway, that is the reason for moving the check to see if
+        // __kmp_max_threads was exceeded into __kmp_reserve_threads()
+        // instead of having it performed here. -BB
+        //
+        old_tp_cached = __kmp_tp_cached;
+        __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
+        KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
+
+        /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
+        nTarget = nWish;
+        if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
+            /* can't fulfil nWish, so try nNeed */
+            if(nNeed) {
+                nTarget = nNeed;
+                if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
+                    /* possible expansion too small -- give up */
+                    break;
+                }
+            } else {
+                /* best-effort */
+                nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
+                if(!nTarget) {
+                    /* can't expand at all -- give up */
+                    break;
+                }
+            }
+        }
+        minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
+
+        newCapacity = __kmp_threads_capacity;
+        do{
+            newCapacity =
+                newCapacity <= (__kmp_actual_max_nth >> 1) ?
+                (newCapacity << 1) :
+                __kmp_actual_max_nth;
+        } while(newCapacity < minimumRequiredCapacity);
+        newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
+        newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
+        KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
+        KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
+        memset(newThreads + __kmp_threads_capacity, 0,
+               (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
+        memset(newRoot + __kmp_threads_capacity, 0,
+               (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
+
+        if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
+            /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
+               while we were allocating the expanded array, and our new capacity is larger than the threadprivate
+               cache capacity, so we should deallocate the expanded arrays and try again.  This is the first check
+               of a double-check pair.
+            */
+            __kmp_free(newThreads);
+            continue; /* start over and try again */
+        }
+        __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
+        if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
+            /* Same check as above, but this time with the lock held so we can be sure the decision is safe. */
+            __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
+            __kmp_free(newThreads);
+            continue; /* start over and try again */
+        } else {
+            /* success */
+            // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be investigated.
+            //
+            *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
+            *(kmp_root_t**volatile*)&__kmp_root = newRoot;
+            added += newCapacity - __kmp_threads_capacity;
+            *(volatile int*)&__kmp_threads_capacity = newCapacity;
+            __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
+            break; /* succeeded, so we can exit the loop */
+        }
+    }
+    return added;
+}
+
+/* register the current thread as a root thread and obtain our gtid */
+/* we must have the __kmp_initz_lock held at this point */
+/* Argument TRUE only if this is the thread that calls from __kmp_do_serial_initialize() */
+int
+__kmp_register_root( int initial_thread )
+{
+    kmp_info_t *root_thread;
+    kmp_root_t *root;
+    int         gtid;
+    int         capacity;
+    __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
+    KA_TRACE( 20, ("__kmp_register_root: entered\n"));
+    KMP_MB();
+
+
+    /*
+        2007-03-02:
+
+        If the initial thread did not invoke the OpenMP RTL yet, and this thread is not an initial
+        one, the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it
+        may return false (meaning there is at least one empty slot in the __kmp_threads array), but
+        it is possible that the only free slot is #0, which is reserved for the initial thread and
+        so cannot be used for this one. The following code works around this bug.
+
+        However, the right solution seems to be not to reserve slot #0 for the initial thread,
+        because:
+            (1) there is no magic in slot #0,
+            (2) we cannot detect the initial thread reliably (the first thread that performs serial
+                initialization may not be a real initial thread).
+    */
+    capacity = __kmp_threads_capacity;
+    if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
+        -- capacity;
+    }; // if
+
+    /* see if there are too many threads */
+    if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
+        if ( __kmp_tp_cached ) {
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( CantRegisterNewThread ),
+                KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
+                KMP_HNT( PossibleSystemLimitOnThreads ),
+                __kmp_msg_null
+            );
+        }
+        else {
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( CantRegisterNewThread ),
+                KMP_HNT( SystemLimitOnThreads ),
+                __kmp_msg_null
+            );
+        }
+    }; // if
+
+    /* find an available thread slot */
+    /* Don't reassign the zero slot since we need that to only be used by the
+       initial thread */
+    for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
+        ;
+    KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
+    KMP_ASSERT( gtid < __kmp_threads_capacity );
+
+    /* update global accounting */
+    __kmp_all_nth ++;
+    TCW_4(__kmp_nth, __kmp_nth + 1);
+
+    //
+    // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
+    // for low numbers of procs, and method #2 (keyed API call) for higher
+    // numbers of procs.
+    //
+    if ( __kmp_adjust_gtid_mode ) {
+        if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
+            if ( TCR_4(__kmp_gtid_mode) != 2) {
+                TCW_4(__kmp_gtid_mode, 2);
+            }
+        }
+        else {
+            if (TCR_4(__kmp_gtid_mode) != 1 ) {
+                TCW_4(__kmp_gtid_mode, 1);
+            }
+        }
+    }
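+    /* For example (illustrative; the actual threshold is __kmp_tls_gtid_min):
+       while fewer threads than the threshold are registered, gtid lookup
+       stays on method #1 (stack-pointer search, mode 1); once the count
+       reaches the threshold it switches to method #2 (keyed TLS API call,
+       mode 2).  The same selection is repeated in __kmp_allocate_thread()
+       below. */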
+
+#ifdef KMP_ADJUST_BLOCKTIME
+    /* Adjust blocktime to zero if necessary            */
+    /* Middle initialization might not have occurred yet */
+    if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
+        if ( __kmp_nth > __kmp_avail_proc ) {
+            __kmp_zero_bt = TRUE;
+        }
+    }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+    /* setup this new hierarchy */
+    if( ! ( root = __kmp_root[gtid] )) {
+        root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
+        KMP_DEBUG_ASSERT( ! root->r.r_root_team );
+    }
+
+    __kmp_initialize_root( root );
+
+    /* setup new root thread structure */
+    if( root->r.r_uber_thread ) {
+        root_thread = root->r.r_uber_thread;
+    } else {
+        root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
+        if ( __kmp_storage_map ) {
+            __kmp_print_thread_storage_map( root_thread, gtid );
+        }
+        root_thread->th.th_info .ds.ds_gtid = gtid;
+        root_thread->th.th_root =  root;
+        if( __kmp_env_consistency_check ) {
+            root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
+        }
+        #if USE_FAST_MEMORY
+            __kmp_initialize_fast_memory( root_thread );
+        #endif /* USE_FAST_MEMORY */
+
+        #if KMP_USE_BGET
+            KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
+            __kmp_initialize_bget( root_thread );
+        #endif
+        __kmp_init_random( root_thread );  // Initialize random number generator
+    }
+
+    /* setup the serial team held in reserve by the root thread */
+    if( ! root_thread->th.th_serial_team ) {
+        kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
+        KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
+
+        root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
+#if OMPT_SUPPORT
+          0, // root parallel id
+#endif
+#if OMP_40_ENABLED
+          proc_bind_default,
+#endif
+          &r_icvs,
+          0 USE_NESTED_HOT_ARG(NULL) );
+    }
+    KMP_ASSERT( root_thread->th.th_serial_team );
+    KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
+      root_thread->th.th_serial_team ) );
+
+    /* drop root_thread into place */
+    TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
+
+    root->r.r_root_team->t.t_threads[0] = root_thread;
+    root->r.r_hot_team ->t.t_threads[0] = root_thread;
+    root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
+    root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for execution (it is unused for now).
+    root->r.r_uber_thread = root_thread;
+
+    /* initialize the thread, get it ready to go */
+    __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
+
+    /* prepare the master thread for get_gtid() */
+    __kmp_gtid_set_specific( gtid );
+
+    __kmp_itt_thread_name( gtid );
+
+    #ifdef KMP_TDATA_GTID
+        __kmp_gtid = gtid;
+    #endif
+    __kmp_create_worker( gtid, root_thread, __kmp_stksize );
+    KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
+    TCW_4(__kmp_init_gtid, TRUE);
+
+    KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
+                    gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
+                    root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
+                    KMP_INIT_BARRIER_STATE ) );
+    { // Initialize barrier data.
+        int b;
+        for ( b = 0; b < bs_last_barrier; ++ b ) {
+            root_thread->th.th_bar[ b ].bb.b_arrived        = KMP_INIT_BARRIER_STATE;
+#if USE_DEBUGGER
+            root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
+#endif
+        }; // for
+    }
+    KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
+
+
+#if KMP_AFFINITY_SUPPORTED
+    if ( TCR_4(__kmp_init_middle) ) {
+        __kmp_affinity_set_init_mask( gtid, TRUE );
+    }
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+    __kmp_root_counter ++;
+
+    KMP_MB();
+    __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
+
+    return gtid;
+}
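+
+/* A sketch of the expected call pattern (not an additional entry point):
+   the serial initialization path registers the initial thread with
+   initial_thread == TRUE so it may claim the reserved slot #0, e.g.
+
+       __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
+       gtid = __kmp_register_root( TRUE );  // from __kmp_do_serial_initialize()
+       __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+
+   Any other root registers with initial_thread == FALSE and receives the
+   first free slot at gtid >= 1. */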
+
+#if KMP_NESTED_HOT_TEAMS
+static int
+__kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
+{
+    int i, n, nth;
+    kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
+    if( !hot_teams || !hot_teams[level].hot_team ) {
+        return 0;
+    }
+    KMP_DEBUG_ASSERT( level < max_level );
+    kmp_team_t *team = hot_teams[level].hot_team;
+    nth = hot_teams[level].hot_team_nth;
+    n = nth - 1;                   // master is not freed
+    if( level < max_level - 1 ) {
+        for( i = 0; i < nth; ++i ) {
+            kmp_info_t *th = team->t.t_threads[i];
+            n += __kmp_free_hot_teams( root, th, level + 1, max_level );
+            if( i > 0 && th->th.th_hot_teams ) {
+                __kmp_free( th->th.th_hot_teams );
+                th->th.th_hot_teams = NULL;
+            }
+        }
+    }
+    __kmp_free_team( root, team, NULL );
+    return n;
+}
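+
+/* Counting example (assuming __kmp_hot_teams_max_level == 2): a call at
+   level 1 on a hot team with hot_team_nth == 4 frees that team and returns
+   n == 3, the workers only -- the master is not freed here, since it is
+   already counted at the outer level (see hot_team->t.t_nproc in
+   __kmp_reset_root() below). */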
+#endif
+
+/* Resets a root thread and clear its root and hot teams.
+   Returns the number of __kmp_threads entries directly and indirectly freed.
+*/
+static int
+__kmp_reset_root(int gtid, kmp_root_t *root)
+{
+    kmp_team_t * root_team = root->r.r_root_team;
+    kmp_team_t * hot_team  = root->r.r_hot_team;
+    int          n         = hot_team->t.t_nproc;
+    int i;
+
+    KMP_DEBUG_ASSERT( ! root->r.r_active );
+
+    root->r.r_root_team = NULL;
+    root->r.r_hot_team  = NULL;
+        // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
+        // before calling __kmp_free_team().
+    __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
+#if KMP_NESTED_HOT_TEAMS
+    if( __kmp_hot_teams_max_level > 1 ) {  // need to free nested hot teams and their threads if any
+        for( i = 0; i < hot_team->t.t_nproc; ++i ) {
+            kmp_info_t *th = hot_team->t.t_threads[i];
+            n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
+            if( th->th.th_hot_teams ) {
+                __kmp_free( th->th.th_hot_teams );
+                th->th.th_hot_teams = NULL;
+            }
+        }
+    }
+#endif
+    __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
+
+    //
+    // Before we can reap the thread, we need to make certain that all
+    // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
+    //
+    if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+        __kmp_wait_to_unref_task_teams();
+    }
+
+    #if KMP_OS_WINDOWS
+        /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
+        KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
+            (LPVOID)&(root->r.r_uber_thread->th),
+            root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
+        __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
+    #endif /* KMP_OS_WINDOWS */
+
+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
+        int gtid = __kmp_get_gtid();
+        __ompt_thread_end(ompt_thread_initial, gtid);
+    }
+#endif
+
+    TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
+    __kmp_reap_thread( root->r.r_uber_thread, 1 );
+
+        // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
+    root->r.r_uber_thread = NULL;
+    /* mark root as no longer in use */
+    root->r.r_begin = FALSE;
+
+    return n;
+}
+
+void
+__kmp_unregister_root_current_thread( int gtid )
+{
+    KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
+    /* this lock should be ok, since unregister_root_current_thread is never called during
+     * an abort, only during a normal close.  Furthermore, if you have the
+     * forkjoin lock, you should never try to get the initz lock. */
+
+    __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
+    if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
+        KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
+        __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
+        return;
+    }
+    kmp_root_t *root = __kmp_root[gtid];
+
+    KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
+    KMP_ASSERT( KMP_UBER_GTID( gtid ));
+    KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
+    KMP_ASSERT( root->r.r_active == FALSE );
+
+
+    KMP_MB();
+
+#if OMP_41_ENABLED
+    kmp_info_t *thread = __kmp_threads[gtid];
+    kmp_team_t *team = thread->th.th_team;
+    kmp_task_team_t *task_team = thread->th.th_task_team;
+
+    // we need to wait for the proxy tasks before finishing the thread
+    if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
+        __kmp_task_team_wait( thread, team, NULL );
+#endif
+
+    __kmp_reset_root(gtid, root);
+
+    /* free up this thread slot */
+    __kmp_gtid_set_specific( KMP_GTID_DNE );
+#ifdef KMP_TDATA_GTID
+    __kmp_gtid = KMP_GTID_DNE;
+#endif
+
+    KMP_MB();
+    KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
+
+    __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
+}
+
+#if KMP_OS_WINDOWS
+/* __kmp_forkjoin_lock must be already held
+   Unregisters a root thread that is not the current thread.  Returns the number of
+   __kmp_threads entries freed as a result.
+ */
+static int
+__kmp_unregister_root_other_thread( int gtid )
+{
+    kmp_root_t *root = __kmp_root[gtid];
+    int r;
+
+    KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
+    KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
+    KMP_ASSERT( KMP_UBER_GTID( gtid ));
+    KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
+    KMP_ASSERT( root->r.r_active == FALSE );
+
+    r = __kmp_reset_root(gtid, root);
+    KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
+    return r;
+}
+#endif
+
+#if KMP_DEBUG
+void __kmp_task_info() {
+
+    kmp_int32 gtid       = __kmp_entry_gtid();
+    kmp_int32 tid        = __kmp_tid_from_gtid( gtid );
+    kmp_info_t *this_thr = __kmp_threads[ gtid ];
+    kmp_team_t *steam    = this_thr->th.th_serial_team;
+    kmp_team_t *team     = this_thr->th.th_team;
+
+    __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
+        gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
+}
+#endif // KMP_DEBUG
+
+/* TODO optimize with one big memclr, take out what isn't needed,
+ * split responsibility to workers as much as possible, and delay
+ * initialization of features as much as possible  */
+static void
+__kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
+{
+    /* this_thr->th.th_info.ds.ds_gtid is set up in kmp_allocate_thread/create_worker;
+     * this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
+    kmp_info_t *master = team->t.t_threads[0];
+    KMP_DEBUG_ASSERT( this_thr != NULL );
+    KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
+    KMP_DEBUG_ASSERT( team );
+    KMP_DEBUG_ASSERT( team->t.t_threads  );
+    KMP_DEBUG_ASSERT( team->t.t_dispatch );
+    KMP_DEBUG_ASSERT( master );
+    KMP_DEBUG_ASSERT( master->th.th_root );
+
+    KMP_MB();
+
+    TCW_SYNC_PTR(this_thr->th.th_team, team);
+
+    this_thr->th.th_info.ds.ds_tid  = tid;
+    this_thr->th.th_set_nproc       = 0;
+#if OMP_40_ENABLED
+    this_thr->th.th_set_proc_bind   = proc_bind_default;
+# if KMP_AFFINITY_SUPPORTED
+    this_thr->th.th_new_place       = this_thr->th.th_current_place;
+# endif
+#endif
+    this_thr->th.th_root            = master->th.th_root;
+
+    /* setup the thread's cache of the team structure */
+    this_thr->th.th_team_nproc      = team->t.t_nproc;
+    this_thr->th.th_team_master     = master;
+    this_thr->th.th_team_serialized = team->t.t_serialized;
+    TCW_PTR(this_thr->th.th_sleep_loc, NULL);
+
+    KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
+    this_thr->th.th_task_state = 0;
+
+    KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
+                    tid, gtid, this_thr, this_thr->th.th_current_task ) );
+
+    __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
+
+    KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
+                    tid, gtid, this_thr, this_thr->th.th_current_task ) );
+    // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
+
+    /* TODO no worksharing in speculative threads */
+    this_thr->th.th_dispatch      = &team->t.t_dispatch[ tid ];
+
+    this_thr->th.th_local.this_construct = 0;
+
+#ifdef BUILD_TV
+    this_thr->th.th_local.tv_data = 0;
+#endif
+
+    if ( ! this_thr->th.th_pri_common ) {
+        this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
+        if ( __kmp_storage_map ) {
+            __kmp_print_storage_map_gtid(
+                gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
+                sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
+            );
+        }; // if
+        this_thr->th.th_pri_head = NULL;
+    }; // if
+
+    /* Initialize dynamic dispatch */
+    {
+        volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
+        /*
+         * Use team max_nproc since this will never change for the team.
+         */
+        size_t disp_size = sizeof( dispatch_private_info_t ) *
+            ( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF );
+        KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
+        KMP_ASSERT( dispatch );
+        KMP_DEBUG_ASSERT( team->t.t_dispatch );
+        KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
+
+        dispatch->th_disp_index = 0;
+
+        if( ! dispatch->th_disp_buffer )  {
+            dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
+
+            if ( __kmp_storage_map ) {
+                __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
+                                         &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF ],
+                                         disp_size, "th_%d.th_dispatch.th_disp_buffer "
+                                         "(team_%d.t_dispatch[%d].th_disp_buffer)",
+                                         gtid, team->t.t_id, gtid );
+            }
+        } else {
+            memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
+        }
+
+        dispatch->th_dispatch_pr_current = 0;
+        dispatch->th_dispatch_sh_current = 0;
+
+        dispatch->th_deo_fcn = 0;             /* ORDERED     */
+        dispatch->th_dxo_fcn = 0;             /* END ORDERED */
+    }
+
+    this_thr->th.th_next_pool = NULL;
+
+    if (!this_thr->th.th_task_state_memo_stack) {
+        this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
+        this_thr->th.th_task_state_top = 0;
+        this_thr->th.th_task_state_stack_sz = 4;
+    }
+
+    KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
+    KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
+
+    KMP_MB();
+}
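+
+/* Sizing note for the dynamic-dispatch setup above: a serial team
+   (t_max_nproc == 1) gets exactly one dispatch_private_info_t per thread,
+   while a parallel team gets KMP_MAX_DISP_BUF of them:
+
+       disp_size = sizeof( dispatch_private_info_t )
+                   * ( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF );
+
+   The multiple buffers presumably allow several consecutive dynamically
+   scheduled loops to be in flight at once, with th_disp_index selecting
+   the buffer for the current one. */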
+
+
+/* allocate a new thread for the requesting team.  this is only called from within a
+ * forkjoin critical section.  we will first try to get an available thread from the
+ * thread pool.  if none is available, we will fork a new one assuming we are able
+ * to create a new one.  this should be assured, as the caller should check on this
+ * first.
+ */
+kmp_info_t *
+__kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
+{
+    kmp_team_t  *serial_team;
+    kmp_info_t  *new_thr;
+    int          new_gtid;
+
+    KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
+    KMP_DEBUG_ASSERT( root && team );
+#if !KMP_NESTED_HOT_TEAMS
+    KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
+#endif
+    KMP_MB();
+
+    /* first, try to get one from the thread pool */
+    if ( __kmp_thread_pool ) {
+
+        new_thr = (kmp_info_t*)__kmp_thread_pool;
+        __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
+        if ( new_thr == __kmp_thread_pool_insert_pt ) {
+            __kmp_thread_pool_insert_pt = NULL;
+        }
+        TCW_4(new_thr->th.th_in_pool, FALSE);
+        //
+        // Don't touch th_active_in_pool or th_active.
+        // The worker thread adjusts those flags as it sleeps/awakens.
+        //
+
+        __kmp_thread_pool_nth--;
+
+        KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
+                    __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
+        KMP_ASSERT(       ! new_thr->th.th_team );
+        KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
+        KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
+
+        /* setup the thread structure */
+        __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
+        KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
+
+        TCW_4(__kmp_nth, __kmp_nth + 1);
+
+        new_thr->th.th_task_state_top = 0;
+        new_thr->th.th_task_state_stack_sz = 4;
+
+#ifdef KMP_ADJUST_BLOCKTIME
+        /* Adjust blocktime back to zero if necessary */
+        /* Middle initialization might not have occurred yet */
+        if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
+            if ( __kmp_nth > __kmp_avail_proc ) {
+                __kmp_zero_bt = TRUE;
+            }
+        }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+#if KMP_DEBUG
+        // If thread entered pool via __kmp_free_thread, wait_flag should != KMP_BARRIER_PARENT_FLAG.
+        int b;
+        kmp_balign_t * balign = new_thr->th.th_bar;
+        for( b = 0; b < bs_last_barrier; ++ b )
+            KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
+#endif
+
+        KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
+                    __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
+
+        KMP_MB();
+        return new_thr;
+    }
+
+
+    /* no, we'll fork a new one */
+    KMP_ASSERT( __kmp_nth    == __kmp_all_nth );
+    KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
+
+    //
+    // If this is the first worker thread the RTL is creating, then also
+    // launch the monitor thread.  We try to do this as early as possible.
+    //
+    if ( ! TCR_4( __kmp_init_monitor ) ) {
+        __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
+        if ( ! TCR_4( __kmp_init_monitor ) ) {
+            KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
+            TCW_4( __kmp_init_monitor, 1 );
+            __kmp_create_monitor( & __kmp_monitor );
+            KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
+            #if KMP_OS_WINDOWS
+                // AC: wait until monitor has started. This is a fix for CQ232808.
+                //     The reason is that if the library is loaded/unloaded in a loop with small (parallel)
+                //     work in between, then there is a high probability that the monitor thread starts after
+                //     the library shutdown. At shutdown it is too late to cope with the problem, because
+                //     when the master is in DllMain (process detach) the monitor has no chance to start
+                //     (it is blocked), and the master has no means to inform the monitor that the library has
+                //     gone, because all the memory which the monitor can access is going to be released/reset.
+                while ( TCR_4(__kmp_init_monitor) < 2 ) {
+                    KMP_YIELD( TRUE );
+                }
+                KF_TRACE( 10, ( "after monitor thread has started\n" ) );
+            #endif
+        }
+        __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
+    }
+
+    KMP_MB();
+    for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
+        KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
+    }
+
+    /* allocate space for it. */
+    new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
+
+    TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
+
+    if ( __kmp_storage_map ) {
+        __kmp_print_thread_storage_map( new_thr, new_gtid );
+    }
+
+    /* add the reserve serialized team, initialized from the team's master thread */
+    {
+    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
+    KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
+
+    new_thr->th.th_serial_team = serial_team =
+        (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
+#if OMPT_SUPPORT
+                                           0, // root parallel id
+#endif
+#if OMP_40_ENABLED
+                                           proc_bind_default,
+#endif
+                                           &r_icvs,
+                                           0 USE_NESTED_HOT_ARG(NULL) );
+    }
+    KMP_ASSERT ( serial_team );
+    serial_team->t.t_serialized = 0;   // AC: the team is created in reserve, not for execution (it is unused for now).
+    serial_team->t.t_threads[0] = new_thr;
+    KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
+      new_thr ) );
+
+    /* setup the thread structures */
+    __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
+
+    #if USE_FAST_MEMORY
+        __kmp_initialize_fast_memory( new_thr );
+    #endif /* USE_FAST_MEMORY */
+
+    #if KMP_USE_BGET
+        KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
+        __kmp_initialize_bget( new_thr );
+    #endif
+
+    __kmp_init_random( new_thr );  // Initialize random number generator
+
+    /* Initialize these only once when thread is grabbed for a team allocation */
+    KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
+                    __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
+
+    int b;
+    kmp_balign_t * balign = new_thr->th.th_bar;
+    for(b=0; b<bs_last_barrier; ++b) {
+        balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
+        balign[b].bb.team = NULL;
+        balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
+        balign[b].bb.use_oncore_barrier = 0;
+    }
+
+    new_thr->th.th_spin_here = FALSE;
+    new_thr->th.th_next_waiting = 0;
+
+#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
+    new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
+    new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
+    new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
+    new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
+#endif
+
+    TCW_4(new_thr->th.th_in_pool, FALSE);
+    new_thr->th.th_active_in_pool = FALSE;
+    TCW_4(new_thr->th.th_active, TRUE);
+
+    /* adjust the global counters */
+    __kmp_all_nth ++;
+    __kmp_nth ++;
+
+    //
+    // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
+    // for low numbers of procs, and method #2 (keyed API call) for higher
+    // numbers of procs.
+    //
+    if ( __kmp_adjust_gtid_mode ) {
+        if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
+            if ( TCR_4(__kmp_gtid_mode) != 2) {
+                TCW_4(__kmp_gtid_mode, 2);
+            }
+        }
+        else {
+            if (TCR_4(__kmp_gtid_mode) != 1 ) {
+                TCW_4(__kmp_gtid_mode, 1);
+            }
+        }
+    }
+
+#ifdef KMP_ADJUST_BLOCKTIME
+    /* Adjust blocktime back to zero if necessary       */
+    /* Middle initialization might not have occurred yet */
+    if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
+        if ( __kmp_nth > __kmp_avail_proc ) {
+            __kmp_zero_bt = TRUE;
+        }
+    }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+    /* actually fork it and create the new worker thread */
+    KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
+    __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
+    KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
+
+
+    KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
+    KMP_MB();
+    return new_thr;
+}
+
+/*
+ * reinitialize team for reuse.
+ *
+ * The hot team code calls this routine at every fork barrier, so EPCC barrier
+ * tests are extremely sensitive to changes in it, esp. writes to the team
+ * struct, which cause a cache invalidation in all threads.
+ *
+ * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
+ */
+static void
+__kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
+    KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
+                    team->t.t_threads[0], team ) );
+    KMP_DEBUG_ASSERT( team && new_icvs);
+    KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
+    team->t.t_ident = loc;
+
+    team->t.t_id = KMP_GEN_TEAM_ID();
+
+    // Copy ICVs to the master thread's implicit taskdata
+    __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
+    copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
+
+    KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
+                    team->t.t_threads[0], team ) );
+}
+
+
+/* initialize the team data structure
+ * this assumes the t_threads and t_max_nproc are already set
+ * also, we don't touch the arguments */
+static void
+__kmp_initialize_team(
+    kmp_team_t * team,
+    int          new_nproc,
+    kmp_internal_control_t * new_icvs,
+    ident_t *                loc
+) {
+    KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
+
+    /* verify */
+    KMP_DEBUG_ASSERT( team );
+    KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
+    KMP_DEBUG_ASSERT( team->t.t_threads );
+    KMP_MB();
+
+    team->t.t_master_tid  = 0;    /* not needed */
+    /* team->t.t_master_bar;        not needed */
+    team->t.t_serialized  = new_nproc > 1 ? 0 : 1;
+    team->t.t_nproc       = new_nproc;
+
+    /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
+    team->t.t_next_pool   = NULL;
+    /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
+
+    TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
+    team->t.t_invoke      = NULL; /* not needed */
+
+    // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
+    team->t.t_sched       = new_icvs->sched;
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    team->t.t_fp_control_saved = FALSE; /* not needed */
+    team->t.t_x87_fpu_control_word = 0; /* not needed */
+    team->t.t_mxcsr = 0;                /* not needed */
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+    team->t.t_construct   = 0;
+    __kmp_init_lock( & team->t.t_single_lock );
+
+    team->t.t_ordered .dt.t_value = 0;
+    team->t.t_master_active = FALSE;
+
+    memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
+
+#ifdef KMP_DEBUG
+    team->t.t_copypriv_data = NULL;  /* not necessary, but nice for debugging */
+#endif
+    team->t.t_copyin_counter = 0;    /* for barrier-free copyin implementation */
+
+    team->t.t_control_stack_top = NULL;
+
+    __kmp_reinitialize_team( team, new_icvs, loc );
+
+    KMP_MB();
+    KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
+}
+
+#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+/* Sets the full mask for the thread and returns the old mask via old_mask; no changes to the affinity data structures. */
+static void
+__kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
+{
+    if ( KMP_AFFINITY_CAPABLE() ) {
+        int status;
+        if ( old_mask != NULL ) {
+            status = __kmp_get_system_affinity( old_mask, TRUE );
+            int error = errno;
+            if ( status != 0 ) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG( ChangeThreadAffMaskError ),
+                    KMP_ERR( error ),
+                    __kmp_msg_null
+                );
+            }
+        }
+        __kmp_set_system_affinity( __kmp_affinity_get_fullMask(), TRUE );
+    }
+}
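+
+/* Intended usage (mirroring __kmp_allocate_team() below): save the caller's
+   mask, widen to the full mask while workers are being forked, then restore:
+
+       kmp_affin_mask_t *old_mask;
+       KMP_CPU_ALLOC( old_mask );
+       __kmp_set_thread_affinity_mask_full_tmp( old_mask );
+       ... create the worker threads ...
+       __kmp_set_system_affinity( old_mask, TRUE );
+       KMP_CPU_FREE( old_mask );
+*/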
+#endif
+
+#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
+
+//
+// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
+// It calculates the worker + master thread's partition based upon the parent
+// thread's partition, and binds each worker to a place in its partition.
+// The master thread's partition should already include its current binding.
+//
+static void
+__kmp_partition_places( kmp_team_t *team )
+{
+    //
+    // Copy the master thread's place partition to the team struct
+    //
+    kmp_info_t *master_th = team->t.t_threads[0];
+    KMP_DEBUG_ASSERT( master_th != NULL );
+    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
+    int first_place = master_th->th.th_first_place;
+    int last_place = master_th->th.th_last_place;
+    int masters_place = master_th->th.th_current_place;
+    team->t.t_first_place = first_place;
+    team->t.t_last_place = last_place;
+
+    KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
+       proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
+       masters_place, first_place, last_place ) );
+
+    switch ( proc_bind ) {
+
+        case proc_bind_default:
+        //
+        // serial teams might have the proc_bind policy set to
+        // proc_bind_default.  It doesn't matter, as we don't
+        // rebind the master thread for any proc_bind policy.
+        //
+        KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
+        break;
+
+        case proc_bind_master:
+        {
+            int f;
+            int n_th = team->t.t_nproc;
+            for ( f = 1; f < n_th; f++ ) {
+                kmp_info_t *th = team->t.t_threads[f];
+                KMP_DEBUG_ASSERT( th != NULL );
+                th->th.th_first_place = first_place;
+                th->th.th_last_place = last_place;
+                th->th.th_new_place = masters_place;
+
+                KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
+                  __kmp_gtid_from_thread( team->t.t_threads[f] ),
+                  team->t.t_id, f, masters_place, first_place, last_place ) );
+            }
+        }
+        break;
+
+        case proc_bind_close:
+        {
+            int f;
+            int n_th = team->t.t_nproc;
+            int n_places;
+            if ( first_place <= last_place ) {
+                n_places = last_place - first_place + 1;
+            }
+            else {
+                n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
+            }
+            if ( n_th <= n_places ) {
+                int place = masters_place;
+                for ( f = 1; f < n_th; f++ ) {
+                    kmp_info_t *th = team->t.t_threads[f];
+                    KMP_DEBUG_ASSERT( th != NULL );
+
+                    if ( place == last_place ) {
+                        place = first_place;
+                    }
+                    else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
+                        place = 0;
+                    }
+                    else {
+                        place++;
+                    }
+                    th->th.th_first_place = first_place;
+                    th->th.th_last_place = last_place;
+                    th->th.th_new_place = place;
+
+                    KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
+                       __kmp_gtid_from_thread( team->t.t_threads[f] ),
+                       team->t.t_id, f, place, first_place, last_place ) );
+                }
+            }
+            else {
+                int S, rem, gap, s_count;
+                S = n_th / n_places;
+                s_count = 0;
+                rem = n_th - ( S * n_places );
+                gap = rem > 0 ? n_places/rem : n_places;
+                int place = masters_place;
+                int gap_ct = gap;
+                for ( f = 0; f < n_th; f++ ) {
+                    kmp_info_t *th = team->t.t_threads[f];
+                    KMP_DEBUG_ASSERT( th != NULL );
+
+                    th->th.th_first_place = first_place;
+                    th->th.th_last_place = last_place;
+                    th->th.th_new_place = place;
+                    s_count++;
+
+                    if ( (s_count == S) && rem && (gap_ct == gap) ) {
+                        // do nothing; add an extra thread to this place on the next iteration
+                    }
+                    else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
+                        // we added an extra thread to this place; move to next place
+                        if ( place == last_place ) {
+                            place = first_place;
+                        }
+                        else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
+                            place = 0;
+                        }
+                        else {
+                            place++;
+                        }
+                        s_count = 0;
+                        gap_ct = 1;
+                        rem--;
+                    }
+                    else if (s_count == S) { // place full; don't add extra
+                        if ( place == last_place ) {
+                            place = first_place;
+                        }
+                        else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
+                            place = 0;
+                        }
+                        else {
+                            place++;
+                        }
+                        gap_ct++;
+                        s_count = 0;
+                    }
+
+                    KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
+                      __kmp_gtid_from_thread( team->t.t_threads[f] ),
+                      team->t.t_id, f, th->th.th_new_place, first_place,
+                      last_place ) );
+                }
+                KMP_DEBUG_ASSERT( place == masters_place );
+            }
+        }
+        break;
+
+        case proc_bind_spread:
+        {
+            int f;
+            int n_th = team->t.t_nproc;
+            int n_places;
+            if ( first_place <= last_place ) {
+                n_places = last_place - first_place + 1;
+            }
+            else {
+                n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
+            }
+            if ( n_th <= n_places ) {
+                int place = masters_place;
+                int S = n_places/n_th;
+                int s_count, rem, gap, gap_ct;
+                rem = n_places - n_th*S;
+                gap = rem ? n_th/rem : 1;
+                gap_ct = gap;
+                for ( f = 0; f < n_th; f++ ) {
+                    kmp_info_t *th = team->t.t_threads[f];
+                    KMP_DEBUG_ASSERT( th != NULL );
+
+                    th->th.th_first_place = place;
+                    th->th.th_new_place = place;
+                    s_count = 1;
+                    while (s_count < S) {
+                        if ( place == last_place ) {
+                            place = first_place;
+                        }
+                        else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
+                            place = 0;
+                        }
+                        else {
+                            place++;
+                        }
+                        s_count++;
+                    }
+                    if (rem && (gap_ct == gap)) {
+                        if ( place == last_place ) {
+                            place = first_place;
+                        }
+                        else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
+                            place = 0;
+                        }
+                        else {
+                            place++;
+                        }
+                        rem--;
+                        gap_ct = 0;
+                    }
+                    th->th.th_last_place = place;
+                    gap_ct++;
+
+                    if ( place == last_place ) {
+                        place = first_place;
+                    }
+                    else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
+                        place = 0;
+                    }
+                    else {
+                        place++;
+                    }
+
+                    KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
+                      __kmp_gtid_from_thread( team->t.t_threads[f] ),
+                      team->t.t_id, f, th->th.th_new_place,
+                      th->th.th_first_place, th->th.th_last_place ) );
+                }
+                KMP_DEBUG_ASSERT( place == masters_place );
+            }
+            else {
+                int S, rem, gap, s_count;
+                S = n_th / n_places;
+                s_count = 0;
+                rem = n_th - ( S * n_places );
+                gap = rem > 0 ? n_places/rem : n_places;
+                int place = masters_place;
+                int gap_ct = gap;
+                for ( f = 0; f < n_th; f++ ) {
+                    kmp_info_t *th = team->t.t_threads[f];
+                    KMP_DEBUG_ASSERT( th != NULL );
+
+                    th->th.th_first_place = place;
+                    th->th.th_last_place = place;
+                    th->th.th_new_place = place;
+                    s_count++;
+
+                    if ( (s_count == S) && rem && (gap_ct == gap) ) {
+                        // do nothing; add an extra thread to this place on the next iteration
+                    }
+                    else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
+                        // we added an extra thread to this place; move on to next place
+                        if ( place == last_place ) {
+                            place = first_place;
+                        }
+                        else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
+                            place = 0;
+                        }
+                        else {
+                            place++;
+                        }
+                        s_count = 0;
+                        gap_ct = 1;
+                        rem--;
+                    }
+                    else if (s_count == S) { // place is full; don't add extra thread
+                        if ( place == last_place ) {
+                            place = first_place;
+                        }
+                        else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
+                            place = 0;
+                        }
+                        else {
+                            place++;
+                        }
+                        gap_ct++;
+                        s_count = 0;
+                    }
+
+                    KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
+                       __kmp_gtid_from_thread( team->t.t_threads[f] ),
+                       team->t.t_id, f, th->th.th_new_place,
+                       th->th.th_first_place, th->th.th_last_place) );
+                }
+                KMP_DEBUG_ASSERT( place == masters_place );
+            }
+        }
+        break;
+
+        default:
+        break;
+    }
+
+    KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
+}
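+
+/* Worked example for the oversubscribed (n_th > n_places) branches above
+   (illustrative): with n_th == 5 threads over n_places == 3 places,
+   S = 5/3 = 1, rem = 5 - 3*1 = 2 and gap = 3/2 = 1, so the first two places
+   visited receive S+1 == 2 threads each and the last receives S == 1.  The
+   walk wraps around the partition and ends where it started, which is what
+   KMP_DEBUG_ASSERT( place == masters_place ) verifies. */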
+
+#endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
+
+/* allocate a new team data structure to use.  take one off of the free pool if available */
+kmp_team_t *
+__kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
+#if OMPT_SUPPORT
+    ompt_parallel_id_t ompt_parallel_id,
+#endif
+#if OMP_40_ENABLED
+    kmp_proc_bind_t new_proc_bind,
+#endif
+    kmp_internal_control_t *new_icvs,
+    int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
+{
+    KMP_TIME_BLOCK(KMP_allocate_team);
+    int f;
+    kmp_team_t *team;
+    int use_hot_team = ! root->r.r_active;
+    int level = 0;
+
+    KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
+    KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
+    KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
+    KMP_MB();
+
+#if KMP_NESTED_HOT_TEAMS
+    kmp_hot_team_ptr_t *hot_teams;
+    if( master ) {
+        team = master->th.th_team;
+        level = team->t.t_active_level;
+        if( master->th.th_teams_microtask ) {                         // in teams construct?
+            if( master->th.th_teams_size.nteams > 1 && (             // #teams > 1
+                team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
+                master->th.th_teams_level < team->t.t_level ) ) {    // or nested parallel inside the teams
+                ++level; // don't increment if #teams==1 or for the outer fork of the teams; increment otherwise
+            }
+        }
+        hot_teams = master->th.th_hot_teams;
+        if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
+        {   // hot team has already been allocated for given level
+            use_hot_team = 1;
+        } else {
+            use_hot_team = 0;
+        }
+    }
+#endif
+    // Optimization to use a "hot" team
+    if( use_hot_team && new_nproc > 1 ) {
+        KMP_DEBUG_ASSERT( new_nproc == max_nproc );
+#if KMP_NESTED_HOT_TEAMS
+        team = hot_teams[level].hot_team;
+#else
+        team =  root->r.r_hot_team;
+#endif
+#if KMP_DEBUG
+        if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+            KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
+                           team->t.t_task_team[0], team->t.t_task_team[1] ));
+        }
+#endif
+
+        // Has the number of threads changed?
+        /* Let's assume the most common case is that the number of threads is unchanged, and
+           put that case first. */
+        if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
+            KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
+            // This case can mean that omp_set_num_threads() was called and the hot team size
+            // was already reduced, so we check the special flag
+            if ( team->t.t_size_changed == -1 ) {
+                team->t.t_size_changed = 1;
+            } else {
+                team->t.t_size_changed = 0;
+            }
+
+            // TODO???: team->t.t_max_active_levels = new_max_active_levels;
+            team->t.t_sched =  new_icvs->sched;
+
+            __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
+
+            KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
+                           0, team->t.t_threads[0], team ) );
+            __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
+
+#if OMP_40_ENABLED
+# if KMP_AFFINITY_SUPPORTED
+            if ( team->t.t_proc_bind == new_proc_bind ) {
+                KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
+                  team->t.t_id, new_proc_bind, team->t.t_first_place,
+                  team->t.t_last_place ) );
+            }
+            else {
+                team->t.t_proc_bind = new_proc_bind;
+                __kmp_partition_places( team );
+            }
+# else
+            if ( team->t.t_proc_bind != new_proc_bind ) {
+                team->t.t_proc_bind = new_proc_bind;
+            }
+# endif /* KMP_AFFINITY_SUPPORTED */
+#endif /* OMP_40_ENABLED */
+
+            if (level) {
+                for(f = 0; f < new_nproc; ++f) {
+                    team->t.t_threads[f]->th.th_task_state = 0;
+                }
+            }
+        }
+        else if( team->t.t_nproc > new_nproc ) {
+            KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
+
+            team->t.t_size_changed = 1;
+            if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+                // Signal the worker threads (esp. extra ones) to stop looking for tasks while spin waiting.
+                // The task teams are reference counted and will be deallocated by the last worker thread.
+                int tt_idx;
+                for (tt_idx=0; tt_idx<2; ++tt_idx) {
+                    // We don't know which of the two task teams workers are waiting on, so deactivate both.
+                    kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
+                    if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
+                        KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
+                        TCW_SYNC_4( task_team->tt.tt_active, FALSE );
+                        KMP_MB();
+                        KA_TRACE(20, ("__kmp_allocate_team: setting task_team %p to NULL\n",
+                                      &team->t.t_task_team[tt_idx]));
+                        team->t.t_task_team[tt_idx] = NULL;
+                    }
+                    else {
+                        KMP_DEBUG_ASSERT( task_team == NULL );
+                    }
+                }
+            }
+#if KMP_NESTED_HOT_TEAMS
+            if( __kmp_hot_teams_mode == 0 ) {
+                // AC: the saved number of threads should correspond to the team's value in this
+                // mode; it can be bigger in mode 1, when the hot team keeps some threads in reserve
+                KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
+                hot_teams[level].hot_team_nth = new_nproc;
+#endif // KMP_NESTED_HOT_TEAMS
+                /* release the extra threads we don't need any more */
+                for( f = new_nproc  ;  f < team->t.t_nproc  ;  f++ ) {
+                    KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
+                    __kmp_free_thread( team->t.t_threads[ f ] );
+                    team->t.t_threads[ f ] = NULL;
+                }
+#if KMP_NESTED_HOT_TEAMS
+            } // (__kmp_hot_teams_mode == 0)
+#endif // KMP_NESTED_HOT_TEAMS
+            team->t.t_nproc =  new_nproc;
+            // TODO???: team->t.t_max_active_levels = new_max_active_levels;
+            team->t.t_sched =  new_icvs->sched;
+            __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
+
+            if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+                // Init both task teams
+                int tt_idx;
+                for (tt_idx=0; tt_idx<2; ++tt_idx) {
+                    kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
+                    if ( task_team != NULL ) {
+                        KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
+                        task_team->tt.tt_nproc = new_nproc;
+                        task_team->tt.tt_unfinished_threads = new_nproc;
+                        task_team->tt.tt_ref_ct = new_nproc - 1;
+                    }
+                }
+            }
+
+            /* update the remaining threads */
+            if (level) {
+                for(f = 0; f < new_nproc; ++f) {
+                    team->t.t_threads[f]->th.th_team_nproc = new_nproc;
+                    team->t.t_threads[f]->th.th_task_state = 0;
+                }
+            }
+            else {
+                for(f = 0; f < new_nproc; ++f) {
+                    team->t.t_threads[f]->th.th_team_nproc = new_nproc;
+                }
+            }
+            // restore the current task state of the master thread: should be the implicit task
+            KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
+                       0, team->t.t_threads[0], team ) );
+
+            __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
+
+#ifdef KMP_DEBUG
+            for ( f = 0; f < team->t.t_nproc; f++ ) {
+                KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
+                    team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
+            }
+#endif
+
+#if OMP_40_ENABLED
+            team->t.t_proc_bind = new_proc_bind;
+# if KMP_AFFINITY_SUPPORTED
+            __kmp_partition_places( team );
+# endif
+#endif
+        }
+        else { // team->t.t_nproc < new_nproc
+#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+            kmp_affin_mask_t *old_mask;
+            if ( KMP_AFFINITY_CAPABLE() ) {
+                KMP_CPU_ALLOC(old_mask);
+            }
+#endif
+
+            KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
+
+            team->t.t_size_changed = 1;
+
+
+#if KMP_NESTED_HOT_TEAMS
+            int avail_threads = hot_teams[level].hot_team_nth;
+            if( new_nproc < avail_threads )
+                avail_threads = new_nproc;
+            kmp_info_t **other_threads = team->t.t_threads;
+            for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
+                // Adjust barrier data of reserved threads (if any) of the team
+                // Other data will be set in __kmp_initialize_info() below.
+                int b;
+                kmp_balign_t * balign = other_threads[f]->th.th_bar;
+                for ( b = 0; b < bs_last_barrier; ++ b ) {
+                    balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
+                    KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
+#if USE_DEBUGGER
+                    balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
+#endif
+                }
+            }
+            if( hot_teams[level].hot_team_nth >= new_nproc ) {
+                // we have all needed threads in reserve, no need to allocate any
+                // this is only possible in mode 1; there cannot be reserved threads in mode 0
+                KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
+                team->t.t_nproc = new_nproc;                     // just get reserved threads involved
+            } else {
+                // we may have some threads in reserve, but not enough
+                team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
+                hot_teams[level].hot_team_nth = new_nproc;       // adjust hot team max size
+#endif // KMP_NESTED_HOT_TEAMS
+            if(team->t.t_max_nproc < new_nproc) {
+                /* reallocate larger arrays */
+                __kmp_reallocate_team_arrays(team, new_nproc);
+                __kmp_reinitialize_team( team, new_icvs, NULL );
+            }
+
+#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+            /* Temporarily set the full mask for the master thread before the
+               creation of workers. The reason is that workers inherit the
+               affinity from the master, so if many workers are created on a
+               single core quickly, they don't get a chance to set their own
+               affinity for a long time.
+            */
+            __kmp_set_thread_affinity_mask_full_tmp( old_mask );
+#endif
+
+            /* allocate new threads for the hot team */
+            for( f = team->t.t_nproc  ;  f < new_nproc  ;  f++ ) {
+                kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
+                KMP_DEBUG_ASSERT( new_worker );
+                team->t.t_threads[ f ] = new_worker;
+                new_worker->th.th_team_nproc = team->t.t_nproc;
+
+                KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d arrived: join=%u, plain=%u\n",
+                                team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
+                                team->t.t_bar[bs_forkjoin_barrier].b_arrived,
+                                team->t.t_bar[bs_plain_barrier].b_arrived ) );
+
+                { // Initialize barrier data for new threads.
+                    int b;
+                    kmp_balign_t * balign = new_worker->th.th_bar;
+                    for( b = 0; b < bs_last_barrier; ++ b ) {
+                        balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
+                        KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
+#if USE_DEBUGGER
+                        balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
+#endif
+                    }
+                }
+            }
+
+#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+            if ( KMP_AFFINITY_CAPABLE() ) {
+                /* Restore initial master thread's affinity mask */
+                __kmp_set_system_affinity( old_mask, TRUE );
+                KMP_CPU_FREE(old_mask);
+            }
+#endif
+#if KMP_NESTED_HOT_TEAMS
+            } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
+#endif // KMP_NESTED_HOT_TEAMS
+            /* make sure everyone is synchronized */
+            __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
+
+            if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+                int tt_idx;
+                for (tt_idx=0; tt_idx<2; ++tt_idx) {
+                    kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
+                    if ( task_team != NULL ) {
+                        KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
+                        task_team->tt.tt_nproc = new_nproc;
+                        task_team->tt.tt_unfinished_threads = new_nproc;
+                        task_team->tt.tt_ref_ct = new_nproc - 1;
+                    }
+                }
+            }
+
+            /* reinitialize the old threads */
+            if (level) {
+                for( f = 0  ;  f < team->t.t_nproc  ;  f++ ) {
+                    __kmp_initialize_info( team->t.t_threads[ f ], team, f,
+                                           __kmp_gtid_from_tid( f, team ) );
+                }
+            }
+            else {
+                int old_state = team->t.t_threads[0]->th.th_task_state;
+                for (f=0;  f < team->t.t_nproc; ++f) {
+                    __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
+                    team->t.t_threads[f]->th.th_task_state = old_state;
+                    team->t.t_threads[f]->th.th_task_team = team->t.t_task_team[old_state];
+                }
+            }
+
+#ifdef KMP_DEBUG
+            for ( f = 0; f < team->t.t_nproc; ++ f ) {
+                KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
+                    team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
+            }
+#endif
+
+#if OMP_40_ENABLED
+            team->t.t_proc_bind = new_proc_bind;
+# if KMP_AFFINITY_SUPPORTED
+            __kmp_partition_places( team );
+# endif
+#endif
+        } // Check changes in number of threads
+
+#if OMP_40_ENABLED
+        kmp_info_t *master = team->t.t_threads[0];
+        if( master->th.th_teams_microtask ) {
+            for( f = 1; f < new_nproc; ++f ) {
+                // propagate teams construct specific info to workers
+                kmp_info_t *thr = team->t.t_threads[f];
+                thr->th.th_teams_microtask = master->th.th_teams_microtask;
+                thr->th.th_teams_level     = master->th.th_teams_level;
+                thr->th.th_teams_size      = master->th.th_teams_size;
+            }
+        }
+#endif /* OMP_40_ENABLED */
+#if KMP_NESTED_HOT_TEAMS
+        if( level ) {
+            // Sync task (TODO: and barrier?) state for nested hot teams, not needed for outermost hot team.
+            for( f = 1; f < new_nproc; ++f ) {
+                kmp_info_t *thr = team->t.t_threads[f];
+                thr->th.th_task_state = 0;
+                int b;
+                kmp_balign_t * balign = thr->th.th_bar;
+                for( b = 0; b < bs_last_barrier; ++ b ) {
+                    balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
+                    KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
+#if USE_DEBUGGER
+                    balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
+#endif
+                }
+            }
+        }
+#endif // KMP_NESTED_HOT_TEAMS
+
+        /* reallocate space for arguments if necessary */
+        __kmp_alloc_argv_entries( argc, team, TRUE );
+        team->t.t_argc     = argc;
+        //
+        // The hot team re-uses the previous task team,
+        // if untouched during the previous release->gather phase.
+        //
+
+        KF_TRACE( 10, ( " hot_team = %p\n", team ) );
+
+#if KMP_DEBUG
+        if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+            KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
+                           team->t.t_task_team[0], team->t.t_task_team[1] ));
+        }
+#endif
+
+#if OMPT_SUPPORT
+        __ompt_team_assign_id(team, ompt_parallel_id);
+#endif
+
+        KMP_MB();
+
+        return team;
+    }
+
+    /* next, let's try to take one from the team pool */
+    KMP_MB();
+    for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
+    {
+        /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
+        if ( team->t.t_max_nproc >= max_nproc ) {
+            /* take this team from the team pool */
+            __kmp_team_pool = team->t.t_next_pool;
+
+            /* setup the team for fresh use */
+            __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
+
+            KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
+                            &team->t.t_task_team[0], &team->t.t_task_team[1]) );
+            team->t.t_task_team[0] = NULL;
+            team->t.t_task_team[1] = NULL;
+
+            /* reallocate space for arguments if necessary */
+            __kmp_alloc_argv_entries( argc, team, TRUE );
+            team->t.t_argc     = argc;
+
+            KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
+                            team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
+            { // Initialize barrier data.
+                int b;
+                for ( b = 0; b < bs_last_barrier; ++ b) {
+                    team->t.t_bar[ b ].b_arrived        = KMP_INIT_BARRIER_STATE;
+#if USE_DEBUGGER
+                    team->t.t_bar[ b ].b_master_arrived = 0;
+                    team->t.t_bar[ b ].b_team_arrived   = 0;
+#endif
+                }
+            }
+
+#if OMP_40_ENABLED
+            team->t.t_proc_bind = new_proc_bind;
+#endif
+
+            KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
+
+#if OMPT_SUPPORT
+            __ompt_team_assign_id(team, ompt_parallel_id);
+#endif
+
+            KMP_MB();
+
+            return team;
+        }
+
+        /* reap team if it is too small, then loop back and check the next one */
+        /* not sure if this is wise, but it will be redone during the hot-teams rewrite. */
+        /* TODO: Use technique to find the right size hot-team, don't reap them */
+        team =  __kmp_reap_team( team );
+        __kmp_team_pool = team;
+    }
+
+    /* nothing available in the pool, no matter, make a new team! */
+    KMP_MB();
+    team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
+
+    /* and set it up */
+    team->t.t_max_nproc   = max_nproc;
+    /* NOTE well: for some reason, allocating one big buffer and dividing it
+     * up seems to really hurt performance on the P4, so let's not use
+     * this... */
+    __kmp_allocate_team_arrays( team, max_nproc );
+
+    KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
+    __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
+
+    KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
+                    &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
+    team->t.t_task_team[0] = NULL;    // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
+    team->t.t_task_team[1] = NULL;    // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
+
+    if ( __kmp_storage_map ) {
+        __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
+    }
+
+    /* allocate space for arguments */
+    __kmp_alloc_argv_entries( argc, team, FALSE );
+    team->t.t_argc        = argc;
+
+    KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
+                    team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
+    { // Initialize barrier data.
+        int b;
+        for ( b = 0; b < bs_last_barrier; ++ b ) {
+            team->t.t_bar[ b ].b_arrived        = KMP_INIT_BARRIER_STATE;
+#if USE_DEBUGGER
+            team->t.t_bar[ b ].b_master_arrived = 0;
+            team->t.t_bar[ b ].b_team_arrived   = 0;
+#endif
+        }
+    }
+
+#if OMP_40_ENABLED
+    team->t.t_proc_bind = new_proc_bind;
+#endif
+
+#if OMPT_SUPPORT
+    __ompt_team_assign_id(team, ompt_parallel_id);
+    team->t.ompt_serialized_team_info = NULL;
+#endif
+
+    KMP_MB();
+
+    KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
+
+    return team;
+}
+
+/* TODO implement hot-teams at all levels */
+/* TODO implement lazy thread release on demand (disband request) */
+
+/* free the team.  return it to the team pool.  release all the threads
+ * associated with it */
+void
+__kmp_free_team( kmp_root_t *root, kmp_team_t *team  USE_NESTED_HOT_ARG(kmp_info_t *master) )
+{
+    int f;
+    KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
+
+    /* verify state */
+    KMP_DEBUG_ASSERT( root );
+    KMP_DEBUG_ASSERT( team );
+    KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
+    KMP_DEBUG_ASSERT( team->t.t_threads );
+
+    int use_hot_team = team == root->r.r_hot_team;
+#if KMP_NESTED_HOT_TEAMS
+    int level;
+    kmp_hot_team_ptr_t *hot_teams;
+    if( master ) {
+        level = team->t.t_active_level - 1;
+        if( master->th.th_teams_microtask ) {                         // in teams construct?
+            if( master->th.th_teams_size.nteams > 1 ) {
+               ++level; // level was not increased in teams construct for team_of_masters
+            }
+            if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
+                master->th.th_teams_level == team->t.t_level ) {
+                ++level; // level was not increased in teams construct for team_of_workers before the parallel
+            }            // team->t.t_level will be increased inside parallel
+        }
+        hot_teams = master->th.th_hot_teams;
+        if( level < __kmp_hot_teams_max_level ) {
+            KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
+            use_hot_team = 1;
+        }
+    }
+#endif // KMP_NESTED_HOT_TEAMS
+
+    /* team is done working */
+    TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
+    team->t.t_copyin_counter = 0; // init counter for possible reuse
+    // Do not reset pointer to parent team to NULL for hot teams.
+
+    /* if we are non-hot team, release our threads */
+    if( ! use_hot_team ) {
+        if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+            int tt_idx;
+            for (tt_idx=0; tt_idx<2; ++tt_idx) {
+                // We don't know which of the two task teams workers are waiting on, so deactivate both.
+                kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
+                if ( task_team != NULL ) {
+                    // Signal the worker threads to stop looking for tasks while spin waiting.  The task
+                    // teams are reference counted and will be deallocated by the last worker thread via the
+                    // thread's pointer to the task team.
+                    KA_TRACE( 20, ( "__kmp_free_team: deactivating task_team %p\n", task_team ) );
+                    KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
+                    TCW_SYNC_4( task_team->tt.tt_active, FALSE );
+                    KMP_MB();
+                    team->t.t_task_team[tt_idx] = NULL;
+                }
+            }
+        }
+
+        // Reset pointer to parent team only for non-hot teams.
+        team->t.t_parent = NULL;
+
+
+        /* free the worker threads */
+        for ( f = 1; f < team->t.t_nproc; ++ f ) {
+            KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
+            __kmp_free_thread( team->t.t_threads[ f ] );
+            team->t.t_threads[ f ] = NULL;
+        }
+
+
+        /* put the team back in the team pool */
+        /* TODO limit size of team pool, call reap_team if pool too large */
+        team->t.t_next_pool  = (kmp_team_t*) __kmp_team_pool;
+        __kmp_team_pool        = (volatile kmp_team_t*) team;
+    }
+
+    KMP_MB();
+}
+
+
+/* reap the team.  destroy it, reclaim all its resources and free its memory */
+kmp_team_t *
+__kmp_reap_team( kmp_team_t *team )
+{
+    kmp_team_t *next_pool = team->t.t_next_pool;
+
+    KMP_DEBUG_ASSERT( team );
+    KMP_DEBUG_ASSERT( team->t.t_dispatch    );
+    KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
+    KMP_DEBUG_ASSERT( team->t.t_threads     );
+    KMP_DEBUG_ASSERT( team->t.t_argv        );
+
+    /* TODO clean the threads that are a part of this? */
+
+    /* free stuff */
+
+    __kmp_free_team_arrays( team );
+    if ( team->t.t_argv != &team->t.t_inline_argv[0] )
+        __kmp_free( (void*) team->t.t_argv );
+    __kmp_free( team );
+
+    KMP_MB();
+    return next_pool;
+}
+
+//
+// Free the thread.  Don't reap it, just place it on the pool of available
+// threads.
+//
+// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
+// binding for the affinity mechanism to be useful.
+//
+// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
+// However, we want to avoid a potential performance problem by always
+// scanning through the list to find the correct point at which to insert
+// the thread (potential N**2 behavior).  To do this we keep track of the
+// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
+// With single-level parallelism, threads will always be added to the tail
+// of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
+// parallelism, all bets are off and we may need to scan through the entire
+// free list.
+//
+// This change also has a potentially large performance benefit, for some
+// applications.  Previously, as threads were freed from the hot team, they
+// would be placed back on the free list in inverse order.  If the hot team
+// grew back to its original size, then the freed thread would be placed
+// back on the hot team in reverse order.  This could cause bad cache
+// locality problems on programs where the size of the hot team regularly
+// grew and shrunk.
+//
+// Now, for single-level parallelism, the OMP tid is always == gtid.
+//
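+// For example, if T#3, T#1, and T#2 are freed in that order, the pool still reads
+// 1 -> 2 -> 3, so a hot team growing back to its old size reacquires its threads
+// in gtid order rather than in the reverse order they were released.
+//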
+void
+__kmp_free_thread( kmp_info_t *this_th )
+{
+    int gtid;
+    kmp_info_t **scan;
+
+    KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
+                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
+
+    KMP_DEBUG_ASSERT( this_th );
+
+    // When moving thread to pool, switch thread to wait on own b_go flag, and uninitialized (NULL team).
+    int b;
+    kmp_balign_t *balign = this_th->th.th_bar;
+    for (b=0; b<bs_last_barrier; ++b) {
+        if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
+            balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
+        balign[b].bb.team = NULL;
+    }
+
+
+    /* put thread back on the free pool */
+    TCW_PTR(this_th->th.th_team, NULL);
+    TCW_PTR(this_th->th.th_root, NULL);
+    TCW_PTR(this_th->th.th_dispatch, NULL);               /* NOT NEEDED */
+
+    //
+    // If the __kmp_thread_pool_insert_pt is already past the new insert
+    // point, then we need to re-scan the entire list.
+    //
+    gtid = this_th->th.th_info.ds.ds_gtid;
+    if ( __kmp_thread_pool_insert_pt != NULL ) {
+        KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
+        if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
+             __kmp_thread_pool_insert_pt = NULL;
+        }
+    }
+
+    //
+    // Scan down the list to find the place to insert the thread.
+    // scan is the address of a link in the list, possibly the address of
+    // __kmp_thread_pool itself.
+    //
+    // In the absence of nested parallelism, the for loop will have 0 iterations.
+    //
+    if ( __kmp_thread_pool_insert_pt != NULL ) {
+        scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
+    }
+    else {
+        scan = (kmp_info_t **)&__kmp_thread_pool;
+    }
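+    // The loop below has an intentionally empty body: the entire scan happens in
+    // the loop header, leaving 'scan' pointing at the link to update.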
+    for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
+      scan = &( (*scan)->th.th_next_pool ) );
+
+    //
+    // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
+    // to its address.
+    //
+    TCW_PTR(this_th->th.th_next_pool, *scan);
+    __kmp_thread_pool_insert_pt = *scan = this_th;
+    KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
+      || ( this_th->th.th_info.ds.ds_gtid
+      < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
+    TCW_4(this_th->th.th_in_pool, TRUE);
+    __kmp_thread_pool_nth++;
+
+    TCW_4(__kmp_nth, __kmp_nth - 1);
+
+#ifdef KMP_ADJUST_BLOCKTIME
+    /* Adjust blocktime back to user setting or default if necessary */
+    /* Middle initialization might never have occurred                */
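+    /* Zero blocktime is forced only while the process is oversubscribed; once  */
+    /* __kmp_nth drops to __kmp_avail_proc or below, the user setting applies.  */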
+    if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
+        KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
+        if ( __kmp_nth <= __kmp_avail_proc ) {
+            __kmp_zero_bt = FALSE;
+        }
+    }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+    KMP_MB();
+}
+
+
+/* ------------------------------------------------------------------------ */
+
+void *
+__kmp_launch_thread( kmp_info_t *this_thr )
+{
+    int                   gtid = this_thr->th.th_info.ds.ds_gtid;
+/*    void                 *stack_data;*/
+    kmp_team_t *(*volatile pteam);
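+    // pteam is a volatile pointer to this thread's th_team field: it is re-read
+    // on every pass of the loop below, since the master installs a new team
+    // pointer each time this worker is recruited.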
+
+    KMP_MB();
+    KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
+
+    if( __kmp_env_consistency_check ) {
+        this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid );  // ATT: Memory leak?
+    }
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+        this_thr->th.ompt_thread_info.wait_id = 0;
+        this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
+            __ompt_thread_begin(ompt_thread_worker, gtid);
+        }
+    }
+#endif
+
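+    /* Worker life cycle: sleep in the fork barrier until released with a new
+       team, run the microtask via t_invoke, meet the team at the join barrier,
+       then loop back and wait again until the library shuts down. */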
+    /* This is the place where threads wait for work */
+    while( ! TCR_4(__kmp_global.g.g_done) ) {
+        KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
+        KMP_MB();
+
+        /* wait for work to do */
+        KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
+
+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            this_thr->th.ompt_thread_info.state = ompt_state_idle;
+        }
+#endif
+
+        /* No tid yet since not part of a team */
+        __kmp_fork_barrier( gtid, KMP_GTID_DNE );
+
+#if OMPT_SUPPORT
+        if (ompt_status & ompt_status_track) {
+            this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+        }
+#endif
+
+        pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
+
+        /* have we been allocated? */
+        if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
+            /* we were just woken up, so run our new task */
+            if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
+                int rc;
+                KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
+                              gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
+
+                updateHWFPControl (*pteam);
+
+#if OMPT_SUPPORT
+                if (ompt_status & ompt_status_track) {
+                    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+                    // Initialize OMPT task id for implicit task.
+                    int tid = __kmp_tid_from_gtid(gtid);
+                    (*pteam)->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id = 
+                    __ompt_task_id_new(tid);
+                }
+#endif
+
+                KMP_STOP_EXPLICIT_TIMER(USER_launch_thread_loop);
+                {
+                    KMP_TIME_BLOCK(USER_worker_invoke);
+                    rc = (*pteam)->t.t_invoke( gtid );
+                }
+                KMP_START_EXPLICIT_TIMER(USER_launch_thread_loop);
+                KMP_ASSERT( rc );
+
+#if OMPT_SUPPORT
+                if (ompt_status & ompt_status_track) {
+                    /* no frame set while outside task */
+                    int tid = __kmp_tid_from_gtid(gtid);
+                    (*pteam)->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
+
+                    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+                }
+#endif
+                KMP_MB();
+                KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
+                              gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
+            }
+            /* join barrier after parallel region */
+            __kmp_join_barrier( gtid );
+        }
+    }
+    TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
+
+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
+        __ompt_thread_end(ompt_thread_worker, gtid);
+    }
+#endif
+
+    if ( TCR_PTR( this_thr->th.th_task_team ) != NULL ) {
+        __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
+    }
+    /* run the destructors for the threadprivate data for this thread */
+    __kmp_common_destroy_gtid( gtid );
+
+    KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
+    KMP_MB();
+    return this_thr;
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_internal_end_dest( void *specific_gtid )
+{
+    #if KMP_COMPILER_ICC
+        #pragma warning( push )
+        #pragma warning( disable:  810 ) // conversion from "void *" to "int" may lose significant bits
+    #endif
+    // Make sure no significant bits are lost
+    int gtid = (kmp_intptr_t)specific_gtid - 1;
+    #if KMP_COMPILER_ICC
+        #pragma warning( pop )
+    #endif
+
+    KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
+    /* NOTE: the gtid is stored as gtid+1 in thread-local storage;
+     * this is because 0 is reserved for the nothing-stored case */
+
+    /* josh: One reason for setting the gtid specific data even when it is being
+       destroyed by pthread is to allow gtid lookup through thread specific data
+       (__kmp_gtid_get_specific).  Some of the code, especially stat code,
+       that gets executed in the call to __kmp_internal_end_thread, actually
+       gets the gtid through the thread specific data.  Setting it here seems
+       rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
+       to run smoothly.
+       todo: get rid of this after we remove the dependence on
+       __kmp_gtid_get_specific
+    */
+    if(gtid >= 0 && KMP_UBER_GTID(gtid))
+        __kmp_gtid_set_specific( gtid );
+    #ifdef KMP_TDATA_GTID
+        __kmp_gtid = gtid;
+    #endif
+    __kmp_internal_end_thread( gtid );
+}
+
+#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
+
+// 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
+// perfectly, but in the real libomp.so I have no evidence it is ever called. However, the -fini linker
+// option in makefile.mk works fine.
+
+__attribute__(( destructor ))
+void
+__kmp_internal_end_dtor( void )
+{
+    __kmp_internal_end_atexit();
+}
+
+void
+__kmp_internal_end_fini( void )
+{
+    __kmp_internal_end_atexit();
+}
+
+#endif
+
+/* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
+void
+__kmp_internal_end_atexit( void )
+{
+    KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
+    /* [Windows]
+       josh: ideally, we want to shut down the library completely in this atexit handler, but
+       stat code that depends on thread-specific data for the gtid fails because that data becomes
+       unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
+       instead.  We should eventually remove the dependency on __kmp_get_specific_gtid in the
+       stat code and use __kmp_internal_end_library to shut down the library cleanly.
+
+// TODO: Can some of this comment about GVS be removed?
+       I suspect that the offending stat code is executed when the calling thread tries to
+       clean up a dead root thread's data structures, resulting in GVS code trying to close
+       the GVS structures for that thread, but since the stat code uses
+       __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
+       cleaning up itself instead of another thread, it gets confused.  This happens because
+       allowing a thread to unregister and cleanup another thread is a recent modification for
+       addressing an issue with Maxon Cinema4D.  Based on the current design (20050722), a
+       thread may end up trying to unregister another thread only if thread death does not
+       trigger the calling of __kmp_internal_end_thread.  For Linux* OS, there is the thread
+       specific data destructor function to detect thread death.  For Windows dynamic, there
+       is DllMain(THREAD_DETACH).  For Windows static, there is nothing.  Thus, the
+       workaround is applicable only to the Windows static stat library.
+    */
+    __kmp_internal_end_library( -1 );
+    #if KMP_OS_WINDOWS
+        __kmp_close_console();
+    #endif
+}
+
+static void
+__kmp_reap_thread(
+    kmp_info_t * thread,
+    int is_root
+) {
+
+    // It is assumed __kmp_forkjoin_lock is acquired.
+
+    int gtid;
+
+    KMP_DEBUG_ASSERT( thread != NULL );
+
+    gtid = thread->th.th_info.ds.ds_gtid;
+
+    if ( ! is_root ) {
+
+        if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
+            /* Assume the threads are at the fork barrier here */
+            KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
+            /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
+            kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
+            __kmp_release_64(&flag);
+        }; // if
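+        // The release above wakes the worker from its fork-barrier wait so that it
+        // can proceed to exit, allowing the join in __kmp_reap_worker() to complete.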
+
+
+        // Terminate OS thread.
+        __kmp_reap_worker( thread );
+
+        //
+        // The thread was killed asynchronously.  If it was actively
+        // spinning in the thread pool, decrement the global count.
+        //
+        // There is a small timing hole here - if the worker thread was
+        // just waking up after sleeping in the pool, had reset its
+        // th_active_in_pool flag but not decremented the global counter
+        // __kmp_thread_pool_active_nth yet, then the global counter
+        // might not get updated.
+        //
+        // Currently, this can only happen as the library is unloaded,
+        // so there are no harmful side effects.
+        //
+        if ( thread->th.th_active_in_pool ) {
+            thread->th.th_active_in_pool = FALSE;
+            KMP_TEST_THEN_DEC32(
+              (kmp_int32 *) &__kmp_thread_pool_active_nth );
+            KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
+        }
+
+        // Decrement # of [worker] threads in the pool.
+        KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
+        --__kmp_thread_pool_nth;
+    }; // if
+
+    // Free the fast memory for tasking
+    #if USE_FAST_MEMORY
+        __kmp_free_fast_memory( thread );
+    #endif /* USE_FAST_MEMORY */
+
+    __kmp_suspend_uninitialize_thread( thread );
+
+    KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
+    TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
+
+    -- __kmp_all_nth;
+    // __kmp_nth was decremented when thread is added to the pool.
+
+#ifdef KMP_ADJUST_BLOCKTIME
+    /* Adjust blocktime back to user setting or default if necessary */
+    /* Middle initialization might never have occurred                */
+    if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
+        KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
+        if ( __kmp_nth <= __kmp_avail_proc ) {
+            __kmp_zero_bt = FALSE;
+        }
+    }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+    /* free the memory being used */
+    if( __kmp_env_consistency_check ) {
+        if ( thread->th.th_cons ) {
+            __kmp_free_cons_stack( thread->th.th_cons );
+            thread->th.th_cons = NULL;
+        }; // if
+    }
+
+    if ( thread->th.th_pri_common != NULL ) {
+        __kmp_free( thread->th.th_pri_common );
+        thread->th.th_pri_common = NULL;
+    }; // if
+
+    if (thread->th.th_task_state_memo_stack != NULL) {
+        __kmp_free(thread->th.th_task_state_memo_stack);
+        thread->th.th_task_state_memo_stack = NULL;
+    }
+
+    #if KMP_USE_BGET
+        if ( thread->th.th_local.bget_data != NULL ) {
+            __kmp_finalize_bget( thread );
+        }; // if
+    #endif
+
+#if KMP_AFFINITY_SUPPORTED
+    if ( thread->th.th_affin_mask != NULL ) {
+        KMP_CPU_FREE( thread->th.th_affin_mask );
+        thread->th.th_affin_mask = NULL;
+    }; // if
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+    __kmp_reap_team( thread->th.th_serial_team );
+    thread->th.th_serial_team = NULL;
+    __kmp_free( thread );
+
+    KMP_MB();
+
+} // __kmp_reap_thread
+
+static void
+__kmp_internal_end(void)
+{
+    int i;
+
+    /* First, unregister the library */
+    __kmp_unregister_library();
+
+    #if KMP_OS_WINDOWS
+        /* In Win static library, we can't tell when a root actually dies, so we
+           reclaim the data structures for any root threads that have died but not
+           unregistered themselves, in order to shut down cleanly.
+           In Win dynamic library we also can't tell when a thread dies.
+        */
+        __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
+    #endif
+
+    for( i=0 ; i<__kmp_threads_capacity ; i++ )
+        if( __kmp_root[i] )
+            if( __kmp_root[i]->r.r_active )
+                break;
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+    TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
+
+    if ( i < __kmp_threads_capacity ) {
+        // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+        //
+        // Need to check that monitor was initialized before reaping it.
+        // If we are called from __kmp_atfork_child (which sets
+        // __kmp_init_parallel = 0), then __kmp_monitor will appear to
+        // contain valid data, but it is only valid in the parent process,
+        // not the child.
+        //
+        // One of the possible fixes for CQ138434 / CQ140126
+        // (used in 20091103_dreamworks patch)
+        //
+        // New behavior (201008): instead of keying off of the flag
+        // __kmp_init_parallel, the monitor thread creation is keyed off
+        // of the new flag __kmp_init_monitor.
+        //
+        __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
+        if ( TCR_4( __kmp_init_monitor ) ) {
+            __kmp_reap_monitor( & __kmp_monitor );
+            TCW_4( __kmp_init_monitor, 0 );
+        }
+        __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
+        KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
+    } else {
+        /* TODO move this to cleanup code */
+        #ifdef KMP_DEBUG
+            /* make sure that everything has properly ended */
+            for ( i = 0; i < __kmp_threads_capacity; i++ ) {
+                if( __kmp_root[i] ) {
+//                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC: there can be uber threads alive here
+                    KMP_ASSERT( ! __kmp_root[i]->r.r_active );  // TODO: can they be active?
+                }
+            }
+        #endif
+
+        KMP_MB();
+
+        // Reap the worker threads.
+        // This is valid for now, but be careful if threads are reaped sooner.
+        while ( __kmp_thread_pool != NULL ) {    // Loop through all the threads in the pool.
+            // Get the next thread from the pool.
+            kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
+            __kmp_thread_pool = thread->th.th_next_pool;
+            // Reap it.
+            thread->th.th_next_pool = NULL;
+            thread->th.th_in_pool = FALSE;
+            __kmp_reap_thread( thread, 0 );
+        }; // while
+        __kmp_thread_pool_insert_pt = NULL;
+
+        // Reap teams.
+        while ( __kmp_team_pool != NULL ) {     // Loop through all the teams in the pool.
+            // Get the next team from the pool.
+            kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
+            __kmp_team_pool = team->t.t_next_pool;
+            // Reap it.
+            team->t.t_next_pool = NULL;
+            __kmp_reap_team( team );
+        }; // while
+
+        __kmp_reap_task_teams( );
+
+        for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
+            // TBD: Add some checking...
+            // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
+        }
+
+        /* Make sure all threadprivate destructors get run by joining with all worker
+           threads before resetting this flag */
+        TCW_SYNC_4(__kmp_init_common, FALSE);
+
+        KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
+        KMP_MB();
+
+        //
+        // See note above: One of the possible fixes for CQ138434 / CQ140126
+        //
+        // FIXME: push both code fragments down and CSE them?
+        // push them into __kmp_cleanup() ?
+        //
+        __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
+        if ( TCR_4( __kmp_init_monitor ) ) {
+            __kmp_reap_monitor( & __kmp_monitor );
+            TCW_4( __kmp_init_monitor, 0 );
+        }
+        __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
+        KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
+
+    } /* else !__kmp_global.t_active */
+    TCW_4(__kmp_init_gtid, FALSE);
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+
+    __kmp_cleanup();
+#if OMPT_SUPPORT
+    ompt_fini();
+#endif
+}
+
+void
+__kmp_internal_end_library( int gtid_req )
+{
+    /* if we have already cleaned up, don't try again, it wouldn't be pretty */
+    /* this shouldn't be a race condition because __kmp_internal_end() is the
+     * only place to clear __kmp_serial_init */
+    /* we'll check this later too, after we get the lock */
+    // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
+    // because the next check will work in any case.
+    if( __kmp_global.g.g_abort ) {
+        KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
+        /* TODO abort? */
+        return;
+    }
+    if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
+        KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
+        return;
+    }
+
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    /* find out who we are and what we should do */
+    {
+        int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
+        KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req ));
+        if( gtid == KMP_GTID_SHUTDOWN ) {
+            KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
+            return;
+        } else if( gtid == KMP_GTID_MONITOR ) {
+            KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
+            return;
+        } else if( gtid == KMP_GTID_DNE ) {
+            KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
+            /* we don't know who we are, but we may still shut down the library */
+        } else if( KMP_UBER_GTID( gtid )) {
+            /* unregister ourselves as an uber thread.  gtid is no longer valid */
+            if( __kmp_root[gtid]->r.r_active ) {
+                __kmp_global.g.g_abort = -1;
+                TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
+                KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
+                return;
+            } else {
+                KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
+                __kmp_unregister_root_current_thread( gtid );
+            }
+        } else {
+            /* worker threads may call this function through the atexit handler, if they call exit() */
+            /* For now, skip the usual subsequent processing and just dump the debug buffer.
+               TODO: do a thorough shutdown instead
+            */
+            #ifdef DUMP_DEBUG_ON_EXIT
+                if ( __kmp_debug_buf )
+                    __kmp_dump_debug_buffer( );
+            #endif
+            return;
+        }
+    }
+    /* synchronize the termination process */
+    __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
+
+    /* have we already finished */
+    if( __kmp_global.g.g_abort ) {
+        KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
+        /* TODO abort? */
+        __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+        return;
+    }
+    if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
+        __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+        return;
+    }
+
+    /* We need this lock to enforce mutex between this reading of
+       __kmp_threads_capacity and the writing by __kmp_register_root.
+       Alternatively, we can use a counter of roots that is
+       atomically updated by __kmp_get_global_thread_id_reg,
+       __kmp_do_serial_initialize and __kmp_internal_end_*.
+    */
+    __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
+
+    /* now we can safely conduct the actual termination */
+    __kmp_internal_end();
+
+    __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
+    __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+
+    KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
+
+    #ifdef DUMP_DEBUG_ON_EXIT
+        if ( __kmp_debug_buf )
+            __kmp_dump_debug_buffer();
+    #endif
+
+    #if KMP_OS_WINDOWS
+        __kmp_close_console();
+    #endif
+
+    __kmp_fini_allocator();
+
+} // __kmp_internal_end_library
+
+void
+__kmp_internal_end_thread( int gtid_req )
+{
+    int i;
+
+    /* if we have already cleaned up, don't try again, it wouldn't be pretty */
+    /* this shouldn't be a race condition because __kmp_internal_end() is the
+     * only place to clear __kmp_serial_init */
+    /* we'll check this later too, after we get the lock */
+    // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
+    // because the next check will work in any case.
+    if( __kmp_global.g.g_abort ) {
+        KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
+        /* TODO abort? */
+        return;
+    }
+    if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
+        KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
+        return;
+    }
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    /* find out who we are and what we should do */
+    {
+        int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
+        KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req ));
+        if( gtid == KMP_GTID_SHUTDOWN ) {
+            KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
+            return;
+        } else if( gtid == KMP_GTID_MONITOR ) {
+            KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
+            return;
+        } else if( gtid == KMP_GTID_DNE ) {
+            KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
+            return;
+            /* we don't know who we are */
+        } else if( KMP_UBER_GTID( gtid )) {
+        /* unregister ourselves as an uber thread.  gtid is no longer valid */
+            if( __kmp_root[gtid]->r.r_active ) {
+                __kmp_global.g.g_abort = -1;
+                TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
+                KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
+                return;
+            } else {
+                KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
+                __kmp_unregister_root_current_thread( gtid );
+            }
+        } else {
+            /* just a worker thread, let's leave */
+            KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
+
+            if ( gtid >= 0 ) {
+                kmp_info_t *this_thr = __kmp_threads[ gtid ];
+                if (TCR_PTR(this_thr->th.th_task_team) != NULL) {
+                    __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
+                }
+            }
+
+            KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
+            return;
+        }
+    }
+    #if defined KMP_DYNAMIC_LIB
+    // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
+    //     because it is better to shut down later, in the library destructor.
+    //     The reason for this change is a performance problem when a non-OpenMP thread
+    //     forks and joins many OpenMP threads in a loop. We can save a lot of time by
+    //     keeping the worker threads alive until program shutdown.
+    // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
+    //     Windows(DPD200287443) that occurs when using critical sections from foreign threads.
+        KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
+        return;
+    #endif
+    /* synchronize the termination process */
+    __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
+
+    /* have we already finished */
+    if( __kmp_global.g.g_abort ) {
+        KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
+        /* TODO abort? */
+        __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+        return;
+    }
+    if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
+        __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+        return;
+    }
+
+    /* We need this lock to enforce mutex between this reading of
+       __kmp_threads_capacity and the writing by __kmp_register_root.
+       Alternatively, we can use a counter of roots that is
+       atomically updated by __kmp_get_global_thread_id_reg,
+       __kmp_do_serial_initialize and __kmp_internal_end_*.
+    */
+
+    /* should we finish the run-time?  are all siblings done? */
+    __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
+
+    for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
+        if ( KMP_UBER_GTID( i ) ) {
+            KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
+            __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
+            __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+            return;
+        };
+    }
+
+    /* now we can safely conduct the actual termination */
+
+    __kmp_internal_end();
+
+    __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
+    __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+
+    KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
+
+    #ifdef DUMP_DEBUG_ON_EXIT
+        if ( __kmp_debug_buf )
+            __kmp_dump_debug_buffer();
+    #endif
+} // __kmp_internal_end_thread
+
+// -------------------------------------------------------------------------------------------------
+// Library registration stuff.
+
+static long   __kmp_registration_flag = 0;
+    // Random value used to indicate library initialization.
+static char * __kmp_registration_str  = NULL;
+    // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
+
+
+static inline
+char *
+__kmp_reg_status_name() {
+    /*
+        On RHEL 3u5, if linked statically, getpid() returns different values in each thread.
+        If registration and unregistration happen in different threads (the omp_misc_other_root_exit.cpp
+        test case), the registered_lib_env env var cannot be found, because its name will contain a
+        different pid.
+    */
+    return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
+} // __kmp_reg_status_name
+
+
+void
+__kmp_register_library_startup(
+    void
+) {
+
+    char * name   = __kmp_reg_status_name();  // Name of the environment variable.
+    int    done   = 0;
+    union {
+        double dtime;
+        long   ltime;
+    } time;
+    #if KMP_OS_WINDOWS
+        __kmp_initialize_system_tick();
+    #endif
+    __kmp_read_system_time( & time.dtime );
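+    // Build a quasi-unique, recognizable flag: a fixed 0xCAFE marker in the high
+    // bits combined with the low bits of the current time.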
+    __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
+    __kmp_registration_str =
+        __kmp_str_format(
+            "%p-%lx-%s",
+            & __kmp_registration_flag,
+            __kmp_registration_flag,
+            KMP_LIBRARY_FILE
+        );
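+    // The string encodes the flag's address, its value (in hex), and the library
+    // file name, separated by '-'; the liveness check below parses it back with
+    // __kmp_str_split().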
+
+    KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
+
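+    // Registration handshake: try to publish our string in the env var; if some
+    // other value is already there, decide whether its owner is still alive
+    // (fatal error unless KMP_DUPLICATE_LIB_OK is set) or dead (unset the
+    // variable and retry).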
+    while ( ! done ) {
+
+        char * value  = NULL; // Actual value of the environment variable.
+
+        // Set the environment variable, but do not overwrite it if it already exists.
+        __kmp_env_set( name, __kmp_registration_str, 0 );
+        // Check that the variable was written.
+        value = __kmp_env_get( name );
+        if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
+
+            done = 1;    // Ok, environment variable set successfully, exit the loop.
+
+        } else {
+
+            // Oops, the write failed. Another copy of the OpenMP RTL is in memory.
+            // Check whether it is alive or dead.
+            int    neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
+            char * tail          = value;
+            char * flag_addr_str = NULL;
+            char * flag_val_str  = NULL;
+            char const * file_name     = NULL;
+            __kmp_str_split( tail, '-', & flag_addr_str, & tail );
+            __kmp_str_split( tail, '-', & flag_val_str,  & tail );
+            file_name = tail;
+            if ( tail != NULL ) {
+                long * flag_addr = 0;
+                long   flag_val  = 0;
+                KMP_SSCANF( flag_addr_str, "%p",  & flag_addr );
+                KMP_SSCANF( flag_val_str,  "%lx", & flag_val  );
+                if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
+                    // First, check whether environment-encoded address is mapped into addr space.
+                    // If so, dereference it to see if it still has the right value.
+
+                    if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
+                        neighbor = 1;
+                    } else {
+                        // If not, then we know the other copy of the library is no longer running.
+                        neighbor = 2;
+                    }; // if
+                }; // if
+            }; // if
+            switch ( neighbor ) {
+                case 0 :      // Cannot parse environment variable -- neighbor status unknown.
+                    // Assume it is the incompatible format of a future version of the library.
+                    // Assume the other library is alive.
+                    // WARN( ... ); // TODO: Issue a warning.
+                    file_name = "unknown library";
+                    // Attention! Falling through to the next case is intentional.
+                case 1 : {    // Neighbor is alive.
+                    // Check it is allowed.
+                    char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
+                    if ( ! __kmp_str_match_true( duplicate_ok ) ) {
+                        // That's not allowed. Issue fatal error.
+                        __kmp_msg(
+                            kmp_ms_fatal,
+                            KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
+                            KMP_HNT( DuplicateLibrary ),
+                            __kmp_msg_null
+                        );
+                    }; // if
+                    KMP_INTERNAL_FREE( duplicate_ok );
+                    __kmp_duplicate_library_ok = 1;
+                    done = 1;    // Exit the loop.
+                } break;
+                case 2 : {    // Neighbor is dead.
+                    // Clear the variable and try to register library again.
+                    __kmp_env_unset( name );
+                }  break;
+                default : {
+                    KMP_DEBUG_ASSERT( 0 );
+                } break;
+            }; // switch
+
+        }; // if
+        KMP_INTERNAL_FREE( (void *) value );
+
+    }; // while
+    KMP_INTERNAL_FREE( (void *) name );
+
+} // func __kmp_register_library_startup
+
+
+void
+__kmp_unregister_library( void ) {
+
+    char * name  = __kmp_reg_status_name();
+    char * value = __kmp_env_get( name );
+
+    KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
+    KMP_DEBUG_ASSERT( __kmp_registration_str  != NULL );
+    if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
+        // Ok, this is our variable. Delete it.
+        __kmp_env_unset( name );
+    }; // if
+
+    KMP_INTERNAL_FREE( __kmp_registration_str );
+    KMP_INTERNAL_FREE( value );
+    KMP_INTERNAL_FREE( name );
+
+    __kmp_registration_flag = 0;
+    __kmp_registration_str  = NULL;
+
+} // __kmp_unregister_library
+
+
+// End of Library registration stuff.
+// -------------------------------------------------------------------------------------------------
+
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+
+static void __kmp_check_mic_type()
+{
+    kmp_cpuid_t cpuid_state = {0};
+    kmp_cpuid_t * cs_p = &cpuid_state;
+    __kmp_x86_cpuid(1, 0, cs_p);
+    // We don't support mic1 at the moment
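+    // The masks below select the family/model fields of the CPUID signature in
+    // EAX: 0xB10 matches Knights Corner parts (mic2), 0x50670 Knights Landing (mic3).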
+    if( (cs_p->eax & 0xff0) == 0xB10 ) {
+        __kmp_mic_type = mic2;
+    } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
+        __kmp_mic_type = mic3;
+    } else {
+        __kmp_mic_type = non_mic;
+    }
+}
+
+#endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
+
+static void
+__kmp_do_serial_initialize( void )
+{
+    int i, gtid;
+    int size;
+
+    KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
+
+    KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
+    KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
+    KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
+    KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
+    KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
+
+    __kmp_validate_locks();
+
+    /* Initialize internal memory allocator */
+    __kmp_init_allocator();
+
+    /* Register the library startup via an environment variable
+       and check to see whether another copy of the library is already
+       registered. */
+
+    __kmp_register_library_startup( );
+
+    /* TODO reinitialization of library */
+    if( TCR_4(__kmp_global.g.g_done) ) {
+       KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
+    }
+
+    __kmp_global.g.g_abort = 0;
+    TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
+
+    /* initialize the locks */
+#if KMP_USE_ADAPTIVE_LOCKS
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+    __kmp_init_speculative_stats();
+#endif
+#endif
+    __kmp_init_lock( & __kmp_global_lock     );
+    __kmp_init_queuing_lock( & __kmp_dispatch_lock );
+    __kmp_init_lock( & __kmp_debug_lock      );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock     );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_1i  );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_2i  );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_4i  );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_4r  );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_8i  );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_8r  );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_8c  );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
+    __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
+    __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock  );
+    __kmp_init_bootstrap_lock( & __kmp_exit_lock      );
+    __kmp_init_bootstrap_lock( & __kmp_monitor_lock   );
+    __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
+
+    /* conduct initialization and initial setup of configuration */
+
+    __kmp_runtime_initialize();
+
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+    __kmp_check_mic_type();
+#endif
+
+    // Some global variable initialization moved here from kmp_env_initialize()
+#ifdef KMP_DEBUG
+    kmp_diag = 0;
+#endif
+    __kmp_abort_delay = 0;
+
+    // From __kmp_init_dflt_team_nth()
+    /* assume the entire machine will be used */
+    __kmp_dflt_team_nth_ub = __kmp_xproc;
+    if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
+        __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
+    }
+    if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
+        __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
+    }
+    __kmp_max_nth = __kmp_sys_max_nth;
+
+    // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
+    __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
+    __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
+    __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
+    // From "KMP_LIBRARY" part of __kmp_env_initialize()
+    __kmp_library = library_throughput;
+    // From KMP_SCHEDULE initialization
+    __kmp_static = kmp_sch_static_balanced;
+    // AC: do not use analytical here, because it is non-monotonous
+    //__kmp_guided = kmp_sch_guided_iterative_chunked;
+    //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
+    // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch bit control and barrier method
+    // control parts
+    #if KMP_FAST_REDUCTION_BARRIER
+        #define kmp_reduction_barrier_gather_bb ((int)1)
+        #define kmp_reduction_barrier_release_bb ((int)1)
+        #define kmp_reduction_barrier_gather_pat bp_hyper_bar
+        #define kmp_reduction_barrier_release_pat bp_hyper_bar
+    #endif // KMP_FAST_REDUCTION_BARRIER
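+    // The branch bits give the barrier tree fan-out (2^bits children per node);
+    // the pattern selects the gather/release algorithm (linear, tree, hyper, ...).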
+    for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
+        __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
+        __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
+        __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
+        __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
+        #if KMP_FAST_REDUCTION_BARRIER
+        if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
+            __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
+            __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
+            __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
+            __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
+        }
+        #endif // KMP_FAST_REDUCTION_BARRIER
+    }
+    #if KMP_FAST_REDUCTION_BARRIER
+        #undef kmp_reduction_barrier_release_pat
+        #undef kmp_reduction_barrier_gather_pat
+        #undef kmp_reduction_barrier_release_bb
+        #undef kmp_reduction_barrier_gather_bb
+    #endif // KMP_FAST_REDUCTION_BARRIER
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+    if( __kmp_mic_type != non_mic ) {
+        // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
+        __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3;  // plane gather
+        __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1;  // forkjoin release
+        __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
+        __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
+    }
+#if KMP_FAST_REDUCTION_BARRIER
+    if( __kmp_mic_type != non_mic ) {
+        __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
+        __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
+    }
+#endif
+#endif
+
+    // From KMP_CHECKS initialization
+#ifdef KMP_DEBUG
+    __kmp_env_checks = TRUE;   /* development versions have the extra checks */
+#else
+    __kmp_env_checks = FALSE;  /* port versions do not have the extra checks */
+#endif
+
+    // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
+    __kmp_foreign_tp = TRUE;
+
+    __kmp_global.g.g_dynamic = FALSE;
+    __kmp_global.g.g_dynamic_mode = dynamic_default;
+
+    __kmp_env_initialize( NULL );
+
+    // Print all messages in message catalog for testing purposes.
+    #ifdef KMP_DEBUG
+        char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
+        if ( __kmp_str_match_true( val ) ) {
+            kmp_str_buf_t buffer;
+            __kmp_str_buf_init( & buffer );
+            __kmp_i18n_dump_catalog( & buffer );
+            __kmp_printf( "%s", buffer.str );
+            __kmp_str_buf_free( & buffer );
+        }; // if
+        __kmp_env_free( & val );
+    #endif
+
+    __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
+    // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
+    __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
+
+
+    // If the library is shut down properly, both pools must be NULL. Just in case, set them
+    // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
+    KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
+    KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
+    KMP_DEBUG_ASSERT( __kmp_team_pool   == NULL );
+    __kmp_thread_pool = NULL;
+    __kmp_thread_pool_insert_pt = NULL;
+    __kmp_team_pool   = NULL;
+
+    /* Allocate all of the variable sized records */
+    /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
+    /* Since allocation is cache-aligned, just add extra padding at the end */
+    size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
+    __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
+    __kmp_root    = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
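+    // Note: __kmp_threads and __kmp_root share a single cache-aligned allocation;
+    // __kmp_root simply points just past the __kmp_threads portion.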
+
+    /* init thread counts */
+    KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
+    KMP_DEBUG_ASSERT( __kmp_nth == 0 );     // something was wrong in termination.
+    __kmp_all_nth = 0;
+    __kmp_nth     = 0;
+
+    /* setup the uber master thread and hierarchy */
+    gtid = __kmp_register_root( TRUE );
+    KA_TRACE( 10, ("__kmp_do_serial_initialize  T#%d\n", gtid ));
+    KMP_ASSERT( KMP_UBER_GTID( gtid ) );
+    KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    __kmp_common_initialize();
+
+    #if KMP_OS_UNIX
+        /* invoke the child fork handler */
+        __kmp_register_atfork();
+    #endif
+
+    #if ! defined KMP_DYNAMIC_LIB
+        {
+            /* Invoke the exit handler when the program finishes, only for static library.
+               For dynamic library, we already have _fini and DllMain.
+             */
+            int rc = atexit( __kmp_internal_end_atexit );
+            if ( rc != 0 ) {
+                __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
+            }; // if
+        }
+    #endif
+
+    #if KMP_HANDLE_SIGNALS
+        #if KMP_OS_UNIX
+            /* NOTE: make sure that this is called before the user installs
+             *          their own signal handlers so that the user handlers
+             *          are called first.  this way they can return false,
+             *          not call our handler, avoid terminating the library,
+             *          and continue execution where they left off. */
+            __kmp_install_signals( FALSE );
+        #endif /* KMP_OS_UNIX */
+        #if KMP_OS_WINDOWS
+            __kmp_install_signals( TRUE );
+        #endif /* KMP_OS_WINDOWS */
+    #endif
+
+    /* we have finished the serial initialization */
+    __kmp_init_counter ++;
+
+    __kmp_init_serial = TRUE;
+
+    if (__kmp_settings) {
+        __kmp_env_print();
+    }
+
+#if OMP_40_ENABLED
+    if (__kmp_display_env || __kmp_display_env_verbose) {
+        __kmp_env_print_2();
+    }
+#endif // OMP_40_ENABLED
+
+    KMP_MB();
+
+    KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
+#if OMPT_SUPPORT
+    ompt_init();
+#endif
+}
+
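+// __kmp_serial_initialize(), __kmp_middle_initialize() and
+// __kmp_parallel_initialize() below share a double-checked locking pattern: an
+// unsynchronized read of the init flag serves as the fast path, and the flag
+// is re-checked under __kmp_initz_lock before the corresponding
+// __kmp_do_*_initialize() worker runs, so concurrent first calls initialize
+// exactly once.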
+void
+__kmp_serial_initialize( void )
+{
+    if ( __kmp_init_serial ) {
+        return;
+    }
+    __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
+    if ( __kmp_init_serial ) {
+        __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+        return;
+    }
+    __kmp_do_serial_initialize();
+    __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+}
+
+static void
+__kmp_do_middle_initialize( void )
+{
+    int i, j;
+    int prev_dflt_team_nth;
+
+    if( !__kmp_init_serial ) {
+        __kmp_do_serial_initialize();
+    }
+
+    KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
+
+    //
+    // Save the previous value for the __kmp_dflt_team_nth so that
+    // we can avoid some reinitialization if it hasn't changed.
+    //
+    prev_dflt_team_nth = __kmp_dflt_team_nth;
+
+#if KMP_AFFINITY_SUPPORTED
+    //
+    // __kmp_affinity_initialize() will try to set __kmp_ncores to the
+    // number of cores on the machine.
+    //
+    __kmp_affinity_initialize();
+
+    //
+    // Run through the __kmp_threads array and set the affinity mask
+    // for each root thread that is currently registered with the RTL.
+    //
+    for ( i = 0; i < __kmp_threads_capacity; i++ ) {
+        if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
+            __kmp_affinity_set_init_mask( i, TRUE );
+        }
+    }
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+    KMP_ASSERT( __kmp_xproc > 0 );
+    if ( __kmp_avail_proc == 0 ) {
+        __kmp_avail_proc = __kmp_xproc;
+    }
+
+    // If there were empty slots in the num_threads list (e.g. OMP_NUM_THREADS=,,2,3), fill them in now
+    j = 0;
+    while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
+        __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
+        j++;
+    }
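+    // Illustrative: with OMP_NUM_THREADS=",,2,3" and __kmp_avail_proc == 8, the
+    // two leading empty slots become 8, yielding the list {8, 8, 2, 3}, and the
+    // default team size and its upper bound are set to 8 as well.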
+
+    if ( __kmp_dflt_team_nth == 0 ) {
+#ifdef KMP_DFLT_NTH_CORES
+        //
+        // Default #threads = #cores
+        //
+        __kmp_dflt_team_nth = __kmp_ncores;
+        KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
+          __kmp_dflt_team_nth ) );
+#else
+        //
+        // Default #threads = #available OS procs
+        //
+        __kmp_dflt_team_nth = __kmp_avail_proc;
+        KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
+          __kmp_dflt_team_nth ) );
+#endif /* KMP_DFLT_NTH_CORES */
+    }
+
+    if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
+        __kmp_dflt_team_nth = KMP_MIN_NTH;
+    }
+    if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
+        __kmp_dflt_team_nth = __kmp_sys_max_nth;
+    }
+
+    //
+    // There's no harm in continuing if the following check fails,
+    // but it indicates an error in the previous logic.
+    //
+    KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
+
+    if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
+        //
+        // Run through the __kmp_threads array and set the num threads icv
+        // for each root thread that is currently registered with the RTL
+        // (which has not already explicitly set its nthreads-var with a
+        // call to omp_set_num_threads()).
+        //
+        for ( i = 0; i < __kmp_threads_capacity; i++ ) {
+            kmp_info_t *thread = __kmp_threads[ i ];
+            if ( thread == NULL ) continue;
+            if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
+
+            set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
+        }
+    }
+    KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
+      __kmp_dflt_team_nth) );
+
+#ifdef KMP_ADJUST_BLOCKTIME
+    /* Adjust blocktime to zero if necessary */
+    /* now that __kmp_avail_proc is set      */
+    if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
+        KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
+        if ( __kmp_nth > __kmp_avail_proc ) {
+            __kmp_zero_bt = TRUE;
+        }
+    }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+    /* we have finished middle initialization */
+    TCW_SYNC_4(__kmp_init_middle, TRUE);
+
+    KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
+}
+
+void
+__kmp_middle_initialize( void )
+{
+    if ( __kmp_init_middle ) {
+        return;
+    }
+    __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
+    if ( __kmp_init_middle ) {
+        __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+        return;
+    }
+    __kmp_do_middle_initialize();
+    __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+#if OMPT_SUPPORT
+    ompt_init();
+#endif
+}
+
+void
+__kmp_parallel_initialize( void )
+{
+    int gtid = __kmp_entry_gtid();      // this might be a new root
+
+    /* synchronize parallel initialization (for siblings) */
+    if( TCR_4(__kmp_init_parallel) ) return;
+    __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
+    if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
+
+    /* TODO reinitialization after we have already shut down */
+    if( TCR_4(__kmp_global.g.g_done) ) {
+        KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
+        __kmp_infinite_loop();
+    }
+
+    /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
+           would cause a deadlock.  So we call __kmp_do_serial_initialize directly.
+    */
+    if( !__kmp_init_middle ) {
+        __kmp_do_middle_initialize();
+    }
+
+    /* begin initialization */
+    KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
+    KMP_ASSERT( KMP_UBER_GTID( gtid ) );
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    //
+    // Save the FP control regs.
+    // Worker threads will set theirs to these values at thread startup.
+    //
+    __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
+    __kmp_store_mxcsr( &__kmp_init_mxcsr );
+    __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#if KMP_OS_UNIX
+# if KMP_HANDLE_SIGNALS
+    /*  must be after __kmp_serial_initialize  */
+    __kmp_install_signals( TRUE );
+# endif
+#endif
+
+    __kmp_suspend_initialize();
+
+#if defined(USE_LOAD_BALANCE)
+    if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
+        __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
+    }
+#else
+    if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
+        __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
+    }
+#endif
+
+    if ( __kmp_version ) {
+        __kmp_print_version_2();
+    }
+
+    /* we have finished parallel initialization */
+    TCW_SYNC_4(__kmp_init_parallel, TRUE);
+
+    KMP_MB();
+    KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
+
+    __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+#if OMPT_SUPPORT
+    ompt_init();
+#endif
+}
+
+
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
+  kmp_team_t *team )
+{
+    kmp_disp_t *dispatch;
+
+    KMP_MB();
+
+    /* none of the threads have encountered any constructs, yet. */
+    this_thr->th.th_local.this_construct = 0;
+#if KMP_CACHE_MANAGE
+    KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
+#endif /* KMP_CACHE_MANAGE */
+    dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
+    KMP_DEBUG_ASSERT( dispatch );
+    KMP_DEBUG_ASSERT( team->t.t_dispatch );
+    //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
+
+    dispatch->th_disp_index = 0;    /* reset the dispatch buffer counter */
+
+    if( __kmp_env_consistency_check )
+        __kmp_push_parallel( gtid, team->t.t_ident );
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+}
+
+void
+__kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
+  kmp_team_t *team )
+{
+    if( __kmp_env_consistency_check )
+        __kmp_pop_parallel( gtid, team->t.t_ident );
+}
+
+int
+__kmp_invoke_task_func( int gtid )
+{
+    int          rc;
+    int          tid      = __kmp_tid_from_gtid( gtid );
+    kmp_info_t  *this_thr = __kmp_threads[ gtid ];
+    kmp_team_t  *team     = this_thr->th.th_team;
+
+    __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
+#if USE_ITT_BUILD
+    if ( __itt_stack_caller_create_ptr ) {
+        __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
+    }
+#endif /* USE_ITT_BUILD */
+#if INCLUDE_SSC_MARKS
+    SSC_MARK_INVOKING();
+#endif
+
+#if OMPT_SUPPORT
+    void *dummy;
+    void **exit_runtime_p;
+    ompt_task_id_t my_task_id;
+    ompt_parallel_id_t my_parallel_id;
+
+    if (ompt_status & ompt_status_track) {
+        exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
+            ompt_task_info.frame.exit_runtime_frame);
+    } else {
+        exit_runtime_p = &dummy;
+    }
+
+#if OMPT_TRACE
+    my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
+    my_parallel_id = team->t.ompt_team_info.parallel_id;
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
+        ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
+            my_parallel_id, my_task_id);
+    }
+#endif
+#endif
+
+    rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
+      gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
+#if OMPT_SUPPORT
+      , exit_runtime_p
+#endif
+      );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if (ompt_status & ompt_status_track) {
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
+            ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
+                my_parallel_id, my_task_id);
+        }
+        // the implicit task is not dead yet, so we can't clear its task id here
+        team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_runtime_frame = 0;
+    }
+#endif
+
+#if USE_ITT_BUILD
+    if ( __itt_stack_caller_create_ptr ) {
+        __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
+    }
+#endif /* USE_ITT_BUILD */
+    __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
+
+    return rc;
+}
+
+#if OMP_40_ENABLED
+void
+__kmp_teams_master( int gtid )
+{
+    // This routine is called by all master threads in teams construct
+    kmp_info_t *thr = __kmp_threads[ gtid ];
+    kmp_team_t *team = thr->th.th_team;
+    ident_t     *loc =  team->t.t_ident;
+    thr->th.th_set_nproc = thr->th.th_teams_size.nth;
+    KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
+    KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
+    KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
+                   gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
+    // Launch the league of teams now, but do not let workers execute
+    // (they hang on the fork barrier until the next parallel region)
+#if INCLUDE_SSC_MARKS
+    SSC_MARK_FORKING();
+#endif
+    __kmp_fork_call( loc, gtid, fork_context_intel,
+            team->t.t_argc,
+#if OMPT_SUPPORT
+            (void *)thr->th.th_teams_microtask,      // "unwrapped" task
+#endif
+            (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
+            VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
+            NULL );
+#if INCLUDE_SSC_MARKS
+    SSC_MARK_JOINING();
+#endif
+    __kmp_join_call( loc, gtid, 1 ); // AC: last parameter "1" eliminates join barrier which won't work because
+                                     // worker threads are in a fork barrier waiting for more parallel regions
+}
+
+int
+__kmp_invoke_teams_master( int gtid )
+{
+    kmp_info_t  *this_thr = __kmp_threads[ gtid ];
+    kmp_team_t  *team     = this_thr->th.th_team;
+    #if KMP_DEBUG
+    if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
+        KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
+    #endif
+    __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
+    __kmp_teams_master( gtid );
+    __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
+    return 1;
+}
+#endif /* OMP_40_ENABLED */
+
+/* this sets the requested number of threads for the next parallel region
+ * encountered by this team */
+/* since this should be enclosed in the forkjoin critical section, it
+ * should avoid race conditions with asymmetrical nested parallelism */
+
+void
+__kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
+{
+    kmp_info_t *thr = __kmp_threads[gtid];
+
+    if( num_threads > 0 )
+        thr->th.th_set_nproc = num_threads;
+}
+
+#if OMP_40_ENABLED
+
+/* this sets the requested number of teams for the teams region and/or
+ * the number of threads for the next parallel region encountered  */
+void
+__kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
+{
+    kmp_info_t *thr = __kmp_threads[gtid];
+    KMP_DEBUG_ASSERT(num_teams >= 0);
+    KMP_DEBUG_ASSERT(num_threads >= 0);
+    if( num_teams == 0 ) {
+        num_teams = 1;    // default number of teams is 1.
+    }
+    // Set number of teams (number of threads in the outer "parallel" of the teams)
+    thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
+
+    // Remember the number of threads for inner parallel regions
+    if( num_threads > 0 ) {
+        thr->th.th_teams_size.nth = num_threads;
+    } else {
+        if( !TCR_4(__kmp_init_middle) )
+            __kmp_middle_initialize();  // get __kmp_avail_proc calculated
+        thr->th.th_teams_size.nth = __kmp_avail_proc / num_teams;
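+        // Illustrative: num_teams = 4, num_threads = 0 and __kmp_avail_proc
+        // == 16 gives each team 16 / 4 = 4 threads; a non-zero num_threads is
+        // taken as-is by the branch above.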
+    }
+}
+
+
+//
+// Set the proc_bind var to use in the following parallel region.
+//
+void
+__kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
+{
+    kmp_info_t *thr = __kmp_threads[gtid];
+    thr->th.th_set_proc_bind = proc_bind;
+}
+
+#endif /* OMP_40_ENABLED */
+
+/* Launch the worker threads into the microtask. */
+
+void
+__kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
+{
+    kmp_info_t *this_thr = __kmp_threads[gtid];
+
+#ifdef KMP_DEBUG
+    int f;
+#endif /* KMP_DEBUG */
+
+    KMP_DEBUG_ASSERT( team );
+    KMP_DEBUG_ASSERT( this_thr->th.th_team  ==  team );
+    KMP_ASSERT(       KMP_MASTER_GTID(gtid) );
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    team->t.t_construct = 0;          /* no single directives seen yet */
+    team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
+
+    /* Reset the identifiers on the dispatch buffer */
+    KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
+    if ( team->t.t_max_nproc > 1 ) {
+        int i;
+        for (i = 0; i <  KMP_MAX_DISP_BUF; ++i)
+            team->t.t_disp_buffer[ i ].buffer_index = i;
+    } else {
+        team->t.t_disp_buffer[ 0 ].buffer_index = 0;
+    }
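+    // The team keeps KMP_MAX_DISP_BUF dispatch buffers so several worksharing
+    // constructs can be in flight at once; each thread's th_disp_index (reset
+    // in __kmp_run_before_invoked_task above) is used to select the buffer
+    // for its current construct via these buffer_index values.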
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+    KMP_ASSERT( this_thr->th.th_team  ==  team );
+
+#ifdef KMP_DEBUG
+    for( f=0 ; f<team->t.t_nproc ; f++ ) {
+        KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
+                          team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
+    }
+#endif /* KMP_DEBUG */
+
+    /* release the worker threads so they may begin working */
+    __kmp_fork_barrier( gtid, 0 );
+}
+
+
+void
+__kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
+{
+    kmp_info_t *this_thr = __kmp_threads[gtid];
+
+    KMP_DEBUG_ASSERT( team );
+    KMP_DEBUG_ASSERT( this_thr->th.th_team  ==  team );
+    KMP_ASSERT(       KMP_MASTER_GTID(gtid) );
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    /* Join barrier after fork */
+
+#ifdef KMP_DEBUG
+    if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
+        __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
+        __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
+                     gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
+        __kmp_print_structure();
+    }
+    KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
+                     __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
+#endif /* KMP_DEBUG */
+
+    __kmp_join_barrier( gtid );  /* wait for everyone */
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+    KMP_ASSERT( this_thr->th.th_team  ==  team );
+}
+
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#ifdef USE_LOAD_BALANCE
+
+//
+// Return the number of worker threads actively spinning in the hot team, if we
+// are at the outermost level of parallelism.  Otherwise, return 0.
+//
+static int
+__kmp_active_hot_team_nproc( kmp_root_t *root )
+{
+    int i;
+    int retval;
+    kmp_team_t *hot_team;
+
+    if ( root->r.r_active ) {
+        return 0;
+    }
+    hot_team = root->r.r_hot_team;
+    if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
+        return hot_team->t.t_nproc - 1;  // Don't count master thread
+    }
+
+    //
+    // Skip the master thread - it is accounted for elsewhere.
+    //
+    retval = 0;
+    for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
+        if ( hot_team->t.t_threads[i]->th.th_active ) {
+            retval++;
+        }
+    }
+    return retval;
+}
+
+//
+// Perform an automatic adjustment to the number of
+// threads used by the next parallel region.
+//
+static int
+__kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
+{
+    int retval;
+    int pool_active;
+    int hot_team_active;
+    int team_curr_active;
+    int system_active;
+
+    KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
+                root, set_nproc ) );
+    KMP_DEBUG_ASSERT( root );
+    KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
+    KMP_DEBUG_ASSERT( set_nproc > 1 );
+
+    if ( set_nproc == 1) {
+        KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
+        return 1;
+    }
+
+    //
+    // Threads that are active in the thread pool, active in the hot team
+    // for this particular root (if we are at the outer par level), and
+    // the currently executing thread (to become the master) are available
+    // to add to the new team, but are currently contributing to the system
+    // load, and must be accounted for.
+    //
+    pool_active = TCR_4(__kmp_thread_pool_active_nth);
+    hot_team_active = __kmp_active_hot_team_nproc( root );
+    team_curr_active = pool_active + hot_team_active + 1;
+
+    //
+    // Check the system load.
+    //
+    system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
+    KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
+      system_active, pool_active, hot_team_active ) );
+
+    if ( system_active < 0 ) {
+        //
+        // There was an error reading the necessary info from /proc,
+        // so use the thread limit algorithm instead.  Once we set
+        // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
+        // we shouldn't wind up getting back here.
+        //
+        __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
+        KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
+
+        //
+        // Make this call behave like the thread limit algorithm.
+        //
+        retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
+          : root->r.r_hot_team->t.t_nproc);
+        if ( retval > set_nproc ) {
+            retval = set_nproc;
+        }
+        if ( retval < KMP_MIN_NTH ) {
+            retval = KMP_MIN_NTH;
+        }
+
+        KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
+        return retval;
+    }
+
+    //
+    // There is a slight delay in the load balance algorithm in detecting
+    // new running procs.  The real system load at this instant should be
+    // at least as large as the number of active OpenMP threads that are
+    // available to add to the team.
+    //
+    if ( system_active < team_curr_active ) {
+        system_active = team_curr_active;
+    }
+    retval = __kmp_avail_proc - system_active + team_curr_active;
+    if ( retval > set_nproc ) {
+        retval = set_nproc;
+    }
+    if ( retval < KMP_MIN_NTH ) {
+        retval = KMP_MIN_NTH;
+    }
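+    // Illustrative (hypothetical numbers): with __kmp_avail_proc = 8,
+    // system_active = 10 and team_curr_active = 3, retval = 8 - 10 + 3 = 1;
+    // on an otherwise idle system (system_active == team_curr_active) the
+    // full 8 would be offered. Either way retval has just been clamped to
+    // [KMP_MIN_NTH, set_nproc].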
+
+    KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
+    return retval;
+} // __kmp_load_balance_nproc()
+
+#endif /* USE_LOAD_BALANCE */
+
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* NOTE: this is called with the __kmp_init_lock held */
+void
+__kmp_cleanup( void )
+{
+    int f;
+
+    KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
+
+    if (TCR_4(__kmp_init_parallel)) {
+#if KMP_HANDLE_SIGNALS
+        __kmp_remove_signals();
+#endif
+        TCW_4(__kmp_init_parallel, FALSE);
+    }
+
+    if (TCR_4(__kmp_init_middle)) {
+#if KMP_AFFINITY_SUPPORTED
+        __kmp_affinity_uninitialize();
+#endif /* KMP_AFFINITY_SUPPORTED */
+        TCW_4(__kmp_init_middle, FALSE);
+    }
+
+    KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
+
+    if (__kmp_init_serial) {
+
+        __kmp_runtime_destroy();
+
+        __kmp_init_serial = FALSE;
+    }
+
+    for ( f = 0; f < __kmp_threads_capacity; f++ ) {
+        if ( __kmp_root[ f ] != NULL ) {
+            __kmp_free( __kmp_root[ f ] );
+            __kmp_root[ f ] = NULL;
+        }
+    }
+    __kmp_free( __kmp_threads );
+    // __kmp_threads and __kmp_root were allocated at once, as single block, so there is no need in
+    // freeing __kmp_root.
+    __kmp_threads = NULL;
+    __kmp_root    = NULL;
+    __kmp_threads_capacity = 0;
+
+#if KMP_USE_DYNAMIC_LOCK
+    __kmp_cleanup_indirect_user_locks();
+#else
+    __kmp_cleanup_user_locks();
+#endif
+
+    #if KMP_AFFINITY_SUPPORTED
+        KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
+        __kmp_cpuinfo_file = NULL;
+    #endif /* KMP_AFFINITY_SUPPORTED */
+
+    #if KMP_USE_ADAPTIVE_LOCKS
+    #if KMP_DEBUG_ADAPTIVE_LOCKS
+        __kmp_print_speculative_stats();
+    #endif
+    #endif
+    KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
+    __kmp_nested_nth.nth = NULL;
+    __kmp_nested_nth.size = 0;
+    __kmp_nested_nth.used = 0;
+
+    __kmp_i18n_catclose();
+
+#if KMP_STATS_ENABLED
+    __kmp_accumulate_stats_at_exit();
+    __kmp_stats_list.deallocate();
+#endif
+
+    KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+int
+__kmp_ignore_mppbeg( void )
+{
+    char *env;
+
+    if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
+        if (__kmp_str_match_false( env ))
+            return FALSE;
+    }
+    // By default __kmpc_begin() is no-op.
+    return TRUE;
+}
+
+int
+__kmp_ignore_mppend( void )
+{
+    char *env;
+
+    if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
+        if (__kmp_str_match_false( env ))
+            return FALSE;
+    }
+    // By default __kmpc_end() is no-op.
+    return TRUE;
+}
+
+void
+__kmp_internal_begin( void )
+{
+    int gtid;
+    kmp_root_t *root;
+
+    /* this is a very important step as it will register new sibling threads
+     * and assign each new uber thread a gtid */
+    gtid = __kmp_entry_gtid();
+    root = __kmp_threads[ gtid ]->th.th_root;
+    KMP_ASSERT( KMP_UBER_GTID( gtid ));
+
+    if( root->r.r_begin ) return;
+    __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
+    if( root->r.r_begin ) {
+        __kmp_release_lock( & root->r.r_begin_lock, gtid );
+        return;
+    }
+
+    root->r.r_begin = TRUE;
+
+    __kmp_release_lock( & root->r.r_begin_lock, gtid );
+}
+
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_user_set_library (enum library_type arg)
+{
+    int gtid;
+    kmp_root_t *root;
+    kmp_info_t *thread;
+
+    /* first, make sure we are initialized so we can get our gtid */
+
+    gtid = __kmp_entry_gtid();
+    thread = __kmp_threads[ gtid ];
+
+    root = thread->th.th_root;
+
+    KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
+    if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
+        KMP_WARNING( SetLibraryIncorrectCall );
+        return;
+    }
+
+    switch ( arg ) {
+    case library_serial :
+        thread->th.th_set_nproc = 0;
+        set__nproc( thread, 1 );
+        break;
+    case library_turnaround :
+        thread->th.th_set_nproc = 0;
+        set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
+        break;
+    case library_throughput :
+        thread->th.th_set_nproc = 0;
+        set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
+        break;
+    default:
+        KMP_FATAL( UnknownLibraryType, arg );
+    }
+
+    __kmp_aux_set_library ( arg );
+}
+
+void
+__kmp_aux_set_stacksize( size_t arg )
+{
+    if (! __kmp_init_serial)
+        __kmp_serial_initialize();
+
+#if KMP_OS_DARWIN
+    if (arg & (0x1000 - 1)) {
+        arg &= ~(0x1000 - 1);
+        if(arg + 0x1000) /* check for overflow if we round up */
+            arg += 0x1000;
+    }
+#endif
+    __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
+
+    /* only change the default stacksize before the first parallel region */
+    if (! TCR_4(__kmp_init_parallel)) {
+        size_t value = arg;       /* argument is in bytes */
+
+        if (value < __kmp_sys_min_stksize )
+            value = __kmp_sys_min_stksize ;
+        else if (value > KMP_MAX_STKSIZE)
+            value = KMP_MAX_STKSIZE;
+
+        __kmp_stksize = value;
+
+        __kmp_env_stksize = TRUE;    /* was KMP_STACKSIZE specified? */
+    }
+
+    __kmp_release_bootstrap_lock( &__kmp_initz_lock );
+}
+
+/* set the behaviour of the runtime library */
+/* TODO this can cause some odd behaviour with sibling parallelism... */
+void
+__kmp_aux_set_library (enum library_type arg)
+{
+    __kmp_library = arg;
+
+    switch ( __kmp_library ) {
+    case library_serial :
+        {
+            KMP_INFORM( LibraryIsSerial );
+            (void) __kmp_change_library( TRUE );
+        }
+        break;
+    case library_turnaround :
+        (void) __kmp_change_library( TRUE );
+        break;
+    case library_throughput :
+        (void) __kmp_change_library( FALSE );
+        break;
+    default:
+        KMP_FATAL( UnknownLibraryType, arg );
+    }
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
+{
+    int blocktime = arg;        /* argument is in milliseconds */
+    int bt_intervals;
+    int bt_set;
+
+    __kmp_save_internal_controls( thread );
+
+    /* Normalize and set blocktime for the teams */
+    if (blocktime < KMP_MIN_BLOCKTIME)
+        blocktime = KMP_MIN_BLOCKTIME;
+    else if (blocktime > KMP_MAX_BLOCKTIME)
+        blocktime = KMP_MAX_BLOCKTIME;
+
+    set__blocktime_team( thread->th.th_team, tid, blocktime );
+    set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
+
+    /* Calculate and set blocktime intervals for the teams */
+    bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
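+    // bt_intervals re-expresses the blocktime (in ms) as a number of monitor
+    // wakeup periods; e.g. (illustrative) blocktime = 200 ms with 10 wakeups
+    // per second (a 100 ms period) comes out to 2 intervals.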
+
+    set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
+    set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
+
+    /* Record that blocktime was explicitly set */
+    bt_set = TRUE;
+
+    set__bt_set_team( thread->th.th_team, tid, bt_set );
+    set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
+    KF_TRACE(10, ( "kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, bt_intervals=%d, monitor_updates=%d\n",
+                  __kmp_gtid_from_tid(tid, thread->th.th_team),
+                  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups ) );
+}
+
+void
+__kmp_aux_set_defaults(
+    char const * str,
+    int          len
+) {
+    if ( ! __kmp_init_serial ) {
+        __kmp_serial_initialize();
+    };
+    __kmp_env_initialize( str );
+
+    if (__kmp_settings
+#if OMP_40_ENABLED
+        || __kmp_display_env || __kmp_display_env_verbose
+#endif // OMP_40_ENABLED
+        ) {
+        __kmp_env_print();
+    }
+} // __kmp_aux_set_defaults
+
+/* ------------------------------------------------------------------------ */
+
+/*
+ * internal fast reduction routines
+ */
+
+PACKED_REDUCTION_METHOD_T
+__kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
+        kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+        kmp_critical_name *lck )
+{
+
+    // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
+    // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
+    // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
+    // Finally, it's up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
+
+    PACKED_REDUCTION_METHOD_T retval;
+
+    int team_size;
+
+    KMP_DEBUG_ASSERT( loc );    // it would be nice to test ( loc != 0 )
+    KMP_DEBUG_ASSERT( lck );    // it would be nice to test ( lck != 0 )
+
+    #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
+    #define FAST_REDUCTION_TREE_METHOD_GENERATED   ( ( reduce_data ) && ( reduce_func ) )
+
+    retval = critical_reduce_block;
+
+    team_size = __kmp_get_team_num_threads( global_tid ); // another choice of getting the team size ( with 1 dynamic dereference ) is slower
+
+    if( team_size == 1 ) {
+
+        retval = empty_reduce_block;
+
+    } else {
+
+        int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
+        int tree_available   = FAST_REDUCTION_TREE_METHOD_GENERATED;
+
+        #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64
+
+            #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
+
+                int teamsize_cutoff = 4;
+
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+                if( __kmp_mic_type != non_mic ) {
+                    teamsize_cutoff = 8;
+                }
+#endif
+                if( tree_available ) {
+                    if( team_size <= teamsize_cutoff ) {
+                        if ( atomic_available ) {
+                            retval = atomic_reduce_block;
+                        }
+                    } else {
+                        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
+                    }
+                } else if ( atomic_available ) {
+                    retval = atomic_reduce_block;
+                }
+            #else
+                #error "Unknown or unsupported OS"
+            #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
+
+        #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH
+
+            #if KMP_OS_LINUX || KMP_OS_WINDOWS
+
+                // basic tuning
+
+                if( atomic_available ) {
+                    if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
+                        retval = atomic_reduce_block;
+                    }
+                } // otherwise: use critical section
+
+            #elif KMP_OS_DARWIN
+
+                if( atomic_available && ( num_vars <= 3 ) ) {
+                    retval = atomic_reduce_block;
+                } else if( tree_available ) {
+                    if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
+                        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
+                    }
+                } // otherwise: use critical section
+
+            #else
+                #error "Unknown or unsupported OS"
+            #endif
+
+        #else
+            #error "Unknown or unsupported architecture"
+        #endif
+
+    }
+
+    // KMP_FORCE_REDUCTION
+
+    if( __kmp_force_reduction_method != reduction_method_not_defined ) {
+
+        PACKED_REDUCTION_METHOD_T forced_retval;
+
+        int atomic_available, tree_available;
+
+        switch( ( forced_retval = __kmp_force_reduction_method ) )
+        {
+            case critical_reduce_block:
+                KMP_ASSERT( lck );              // lck should be != 0
+                if( team_size <= 1 ) {
+                    forced_retval = empty_reduce_block;
+                }
+                break;
+
+            case atomic_reduce_block:
+                atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
+                KMP_ASSERT( atomic_available ); // atomic_available should be != 0
+                break;
+
+            case tree_reduce_block:
+                tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
+                KMP_ASSERT( tree_available );   // tree_available should be != 0
+                #if KMP_FAST_REDUCTION_BARRIER
+                forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
+                #endif
+                break;
+
+            default:
+                KMP_ASSERT( 0 ); // "unsupported method specified"
+        }
+
+        retval = forced_retval;
+    }
+
+    KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
+
+    #undef FAST_REDUCTION_TREE_METHOD_GENERATED
+    #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
+
+    return ( retval );
+}
+
+// this function is for testing set/get/determine reduce method
+kmp_int32
+__kmp_get_reduce_method( void ) {
+    return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
+}
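+// The packed value keeps the reduction method in the upper bits and the
+// barrier kind it uses in the low 8 bits, so the test hook above shifts right
+// by 8 to recover just the method.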
+
+/* ------------------------------------------------------------------------ */
diff --git a/final/runtime/src/kmp_safe_c_api.h b/final/runtime/src/kmp_safe_c_api.h
new file mode 100644
index 0000000..c1df64c
--- /dev/null
+++ b/final/runtime/src/kmp_safe_c_api.h
@@ -0,0 +1,62 @@
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_SAFE_C_API_H
+#define KMP_SAFE_C_API_H
+
+//
+// Replacement for banned C API
+//
+
+// Not every unsafe call listed here is handled now, but keeping everything
+// in one place should be handy for future maintenance.
+#if KMP_OS_WINDOWS
+
+# define RSIZE_MAX_STR ( 4UL << 10 ) // 4KB
+
+// _malloca was suggested, but it is not a drop-in replacement for _alloca
+# define KMP_ALLOCA                  _alloca
+
+# define KMP_MEMCPY_S                memcpy_s
+# define KMP_SNPRINTF                sprintf_s
+# define KMP_SSCANF                  sscanf_s
+# define KMP_STRCPY_S                strcpy_s
+# define KMP_STRNCPY_S               strncpy_s
+
+// Use this only when buffer size is unknown
+# define KMP_MEMCPY(dst, src, cnt)   memcpy_s(dst, cnt, src, cnt)
+
+# define KMP_STRLEN(str)             strnlen_s(str, RSIZE_MAX_STR)
+
+// Use this only when buffer size is unknown
+# define KMP_STRNCPY(dst, src, cnt)  strncpy_s(dst, cnt, src, cnt)
+
+// _TRUNCATE tells vsnprintf_s to truncate the output rather than fail when the buffer is too small.
+# define KMP_VSNPRINTF(dst, cnt, fmt, arg)  vsnprintf_s(dst, cnt, _TRUNCATE, fmt, arg)
+
+#else // KMP_OS_WINDOWS
+
+// For now, these macros use the existing API.
+
+# define KMP_ALLOCA                         alloca
+# define KMP_MEMCPY_S(dst, bsz, src, cnt)   memcpy(dst, src, cnt)
+# define KMP_SNPRINTF                       snprintf
+# define KMP_SSCANF                         sscanf
+# define KMP_STRCPY_S(dst, bsz, src)        strcpy(dst, src) 
+# define KMP_STRNCPY_S(dst, bsz, src, cnt)  strncpy(dst, src, cnt)
+# define KMP_VSNPRINTF                      vsnprintf
+# define KMP_STRNCPY                        strncpy
+# define KMP_STRLEN                         strlen
+# define KMP_MEMCPY                         memcpy
+
+#endif // KMP_OS_WINDOWS
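+// Illustrative usage (buf, src and value are hypothetical): the _S variants
+// always take the destination size, so call sites compile identically on
+// both branches:
+//     char buf[64];
+//     KMP_STRCPY_S( buf, sizeof(buf), src );
+//     KMP_SNPRINTF( buf, sizeof(buf), "%d", value );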
+
+#endif // KMP_SAFE_C_API_H
diff --git a/final/runtime/src/kmp_sched.cpp b/final/runtime/src/kmp_sched.cpp
new file mode 100644
index 0000000..0821f38
--- /dev/null
+++ b/final/runtime/src/kmp_sched.cpp
@@ -0,0 +1,929 @@
+/*
+ * kmp_sched.cpp -- static scheduling -- iteration initialization
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+/*
+ * Static scheduling initialization.
+ *
+ * NOTE: team->t.t_nproc is constant inside any dispatch loop, however
+ *       it may change value between parallel regions.  __kmp_max_nth
+ *       is the largest value __kmp_nth may take, and 1 is the smallest.
+ *
+ */
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_str.h"
+#include "kmp_error.h"
+#include "kmp_stats.h"
+#include "kmp_itt.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+// template for type limits
+template< typename T >
+struct i_maxmin {
+    static const T mx;
+    static const T mn;
+};
+template<>
+struct i_maxmin< int > {
+    static const int mx = 0x7fffffff;
+    static const int mn = 0x80000000;
+};
+template<>
+struct i_maxmin< unsigned int > {
+    static const unsigned int mx = 0xffffffff;
+    static const unsigned int mn = 0x00000000;
+};
+template<>
+struct i_maxmin< long long > {
+    static const long long mx = 0x7fffffffffffffffLL;
+    static const long long mn = 0x8000000000000000LL;
+};
+template<>
+struct i_maxmin< unsigned long long > {
+    static const unsigned long long mx = 0xffffffffffffffffULL;
+    static const unsigned long long mn = 0x0000000000000000ULL;
+};
+//-------------------------------------------------------------------------
+#ifdef KMP_DEBUG
+//-------------------------------------------------------------------------
+// template for debug prints specification ( d, u, lld, llu )
+    char const * traits_t< int >::spec = "d";
+    char const * traits_t< unsigned int >::spec = "u";
+    char const * traits_t< long long >::spec = "lld";
+    char const * traits_t< unsigned long long >::spec = "llu";
+//-------------------------------------------------------------------------
+#endif
+
+template< typename T >
+static void
+__kmp_for_static_init(
+    ident_t                          *loc,
+    kmp_int32                         global_tid,
+    kmp_int32                         schedtype,
+    kmp_int32                        *plastiter,
+    T                                *plower,
+    T                                *pupper,
+    typename traits_t< T >::signed_t *pstride,
+    typename traits_t< T >::signed_t  incr,
+    typename traits_t< T >::signed_t  chunk
+) {
+    KMP_COUNT_BLOCK(OMP_FOR_static);
+    typedef typename traits_t< T >::unsigned_t  UT;
+    typedef typename traits_t< T >::signed_t    ST;
+    /*  this all has to be changed back to TID and such.. */
+    register kmp_int32   gtid = global_tid;
+    register kmp_uint32  tid;
+    register kmp_uint32  nth;
+    register UT          trip_count;
+    register kmp_team_t *team;
+    register kmp_info_t *th = __kmp_threads[ gtid ];
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+#endif
+
+    KMP_DEBUG_ASSERT( plastiter && plower && pupper && pstride );
+    KE_TRACE( 10, ("__kmpc_for_static_init called (%d)\n", global_tid));
+    #ifdef KMP_DEBUG
+    {
+        const char * buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmpc_for_static_init: T#%%d sched=%%d liter=%%d iter=(%%%s," \
+            " %%%s, %%%s) incr=%%%s chunk=%%%s signed?<%s>\n",
+            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
+            traits_t< ST >::spec, traits_t< ST >::spec, traits_t< T >::spec );
+        KD_TRACE(100, ( buff, global_tid, schedtype, *plastiter,
+            *plower, *pupper, *pstride, incr, chunk ) );
+        __kmp_str_free( &buff );
+    }
+    #endif
+
+    if ( __kmp_env_consistency_check ) {
+        __kmp_push_workshare( global_tid, ct_pdo, loc );
+        if ( incr == 0 ) {
+            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
+        }
+    }
+    /* special handling for zero-trip loops */
+    if ( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
+        if( plastiter != NULL )
+            *plastiter = FALSE;
+        /* leave pupper and plower set to entire iteration space */
+        *pstride = incr;   /* value should never be used */
+        //  *plower = *pupper - incr;   // let the compiler bypass the illegal loop (like for(i=1;i<10;i--)).
+        //  THIS LINE CAUSED shape2F/h_tests_1.f TO HAVE A FAILURE ON A ZERO-TRIP LOOP (lower=1, upper=0, stride=1) - JPH June 23, 2009.
+        #ifdef KMP_DEBUG
+        {
+            const char * buff;
+            // create format specifiers before the debug output
+            buff = __kmp_str_format(
+                "__kmpc_for_static_init:(ZERO TRIP) liter=%%d lower=%%%s upper=%%%s stride = %%%s signed?<%s>, loc = %%s\n",
+                traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, traits_t< T >::spec );
+            KD_TRACE(100, ( buff, *plastiter, *plower, *pupper, *pstride, loc->psource ) );
+            __kmp_str_free( &buff );
+        }
+        #endif
+        KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+                team_info->parallel_id, task_info->task_id,
+                team_info->microtask);
+        }
+#endif
+        return;
+    }
+
+    #if OMP_40_ENABLED
+    if ( schedtype > kmp_ord_upper ) {
+        // we are in DISTRIBUTE construct
+        schedtype += kmp_sch_static - kmp_distribute_static;      // AC: convert to usual schedule type
+        tid  = th->th.th_team->t.t_master_tid;
+        team = th->th.th_team->t.t_parent;
+    } else
+    #endif
+    {
+        tid  = __kmp_tid_from_gtid( global_tid );
+        team = th->th.th_team;
+    }
+
+    /* determine if "for" loop is an active worksharing construct */
+    if ( team -> t.t_serialized ) {
+        /* serialized parallel, each thread executes whole iteration space */
+        if( plastiter != NULL )
+            *plastiter = TRUE;
+        /* leave pupper and plower set to entire iteration space */
+        *pstride = (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
+
+        #ifdef KMP_DEBUG
+        {
+            const char * buff;
+            // create format specifiers before the debug output
+            buff = __kmp_str_format(
+                "__kmpc_for_static_init: (serial) liter=%%d lower=%%%s upper=%%%s stride = %%%s\n",
+                traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
+            KD_TRACE(100, ( buff, *plastiter, *plower, *pupper, *pstride ) );
+            __kmp_str_free( &buff );
+        }
+        #endif
+        KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+                team_info->parallel_id, task_info->task_id,
+                team_info->microtask);
+        }
+#endif
+        return;
+    }
+    nth = team->t.t_nproc;
+    if ( nth == 1 ) {
+        if( plastiter != NULL )
+            *plastiter = TRUE;
+        *pstride = (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
+        #ifdef KMP_DEBUG
+        {
+            const char * buff;
+            // create format specifiers before the debug output
+            buff = __kmp_str_format(
+                "__kmpc_for_static_init: (serial) liter=%%d lower=%%%s upper=%%%s stride = %%%s\n",
+                traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
+            KD_TRACE(100, ( buff, *plastiter, *plower, *pupper, *pstride ) );
+            __kmp_str_free( &buff );
+        }
+        #endif
+        KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+        if ((ompt_status == ompt_status_track_callback) &&
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+            ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+                team_info->parallel_id, task_info->task_id,
+                team_info->microtask);
+        }
+#endif
+        return;
+    }
+
+    /* compute trip count */
+    if ( incr == 1 ) {
+        trip_count = *pupper - *plower + 1;
+    } else if (incr == -1) {
+        trip_count = *plower - *pupper + 1;
+    } else {
+        if ( incr > 1 ) {  // the check is needed for unsigned division when incr < 0
+            trip_count = (*pupper - *plower) / incr + 1;
+        } else {
+            trip_count = (*plower - *pupper) / ( -incr ) + 1;
+        }
+    }
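+    // Illustrative: lower = 0, upper = 9, incr = 2 gives trip_count =
+    // (9 - 0) / 2 + 1 = 5 (iterations 0,2,4,6,8); the incr == 1 and
+    // incr == -1 fast paths above avoid the division.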
+
+    if ( __kmp_env_consistency_check ) {
+        /* tripcount overflow? */
+        if ( trip_count == 0 && *pupper != *plower ) {
+            __kmp_error_construct( kmp_i18n_msg_CnsIterationRangeTooLarge, ct_pdo, loc );
+        }
+    }
+
+    /* compute remaining parameters */
+    switch ( schedtype ) {
+    case kmp_sch_static:
+        {
+            if ( trip_count < nth ) {
+                KMP_DEBUG_ASSERT(
+                    __kmp_static == kmp_sch_static_greedy || \
+                    __kmp_static == kmp_sch_static_balanced
+                ); // Unknown static scheduling type.
+                if ( tid < trip_count ) {
+                    *pupper = *plower = *plower + tid * incr;
+                } else {
+                    *plower = *pupper + incr;
+                }
+                if( plastiter != NULL )
+                    *plastiter = ( tid == trip_count - 1 );
+            } else {
+                if ( __kmp_static == kmp_sch_static_balanced ) {
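+                    // Illustrative: trip_count = 10, nth = 4 gives
+                    // small_chunk = 2 and extras = 2, so threads 0..3 get
+                    // 3,3,2,2 iterations and thread nth-1 runs the last one.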
+                    register UT small_chunk = trip_count / nth;
+                    register UT extras = trip_count % nth;
+                    *plower += incr * ( tid * small_chunk + ( tid < extras ? tid : extras ) );
+                    *pupper = *plower + small_chunk * incr - ( tid < extras ? 0 : incr );
+                    if( plastiter != NULL )
+                        *plastiter = ( tid == nth - 1 );
+                } else {
+                    register T big_chunk_inc_count = ( trip_count/nth +
+                                                     ( ( trip_count % nth ) ? 1 : 0) ) * incr;
+                    register T old_upper = *pupper;
+
+                    KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
+                        // Unknown static scheduling type.
+
+                    *plower += tid * big_chunk_inc_count;
+                    *pupper = *plower + big_chunk_inc_count - incr;
+                    if ( incr > 0 ) {
+                        if( *pupper < *plower )
+                            *pupper = i_maxmin< T >::mx;
+                        if( plastiter != NULL )
+                            *plastiter = *plower <= old_upper && *pupper > old_upper - incr;
+                        if ( *pupper > old_upper ) *pupper = old_upper; // tracker C73258
+                    } else {
+                        if( *pupper > *plower )
+                            *pupper = i_maxmin< T >::mn;
+                        if( plastiter != NULL )
+                            *plastiter = *plower >= old_upper && *pupper < old_upper - incr;
+                        if ( *pupper < old_upper ) *pupper = old_upper; // tracker C73258
+                    }
+                }
+            }
+            break;
+        }
+    case kmp_sch_static_chunked:
+        {
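+            // Round-robin chunks: thread tid starts at its tid-th chunk and
+            // advances by nth chunks per step. Illustrative: chunk = 4,
+            // incr = 1, nth = 3, tid = 1 gives span = 4, *pstride = 12 and
+            // *plower += 4, so tid 1 covers 4..7, then 16..19, and so on.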
+            register ST span;
+            if ( chunk < 1 ) {
+                chunk = 1;
+            }
+            span = chunk * incr;
+            *pstride = span * nth;
+            *plower = *plower + (span * tid);
+            *pupper = *plower + span - incr;
+            if( plastiter != NULL )
+                *plastiter = (tid == ((trip_count - 1)/( UT )chunk) % nth);
+            break;
+        }
+    default:
+        KMP_ASSERT2( 0, "__kmpc_for_static_init: unknown scheduling type" );
+        break;
+    }
+
+#if USE_ITT_BUILD
+    // Report loop metadata
+    if ( KMP_MASTER_TID(tid) && __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
+#if OMP_40_ENABLED
+        th->th.th_teams_microtask == NULL &&
+#endif
+        team->t.t_active_level == 1 )
+    {
+        kmp_uint64 cur_chunk = chunk;
+        // Calculate chunk in case it was not specified; it is specified for kmp_sch_static_chunked
+        if ( schedtype == kmp_sch_static ) {
+            cur_chunk = trip_count / nth + ( ( trip_count % nth ) ? 1 : 0);
+        }
+        // 0 - "static" schedule
+        __kmp_itt_metadata_loop(loc, 0, trip_count, cur_chunk);
+    }
+#endif
+    #ifdef KMP_DEBUG
+    {
+        const char * buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmpc_for_static_init: liter=%%d lower=%%%s upper=%%%s stride = %%%s signed?<%s>\n",
+            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, traits_t< T >::spec );
+        KD_TRACE(100, ( buff, *plastiter, *plower, *pupper, *pstride ) );
+        __kmp_str_free( &buff );
+    }
+    #endif
+    KE_TRACE( 10, ("__kmpc_for_static_init: T#%d return\n", global_tid ) );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
+        ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
+            team_info->parallel_id, task_info->task_id, team_info->microtask);
+    }
+#endif
+
+    return;
+}
+
+template< typename T >
+static void
+__kmp_dist_for_static_init(
+    ident_t                          *loc,
+    kmp_int32                         gtid,
+    kmp_int32                         schedule,
+    kmp_int32                        *plastiter,
+    T                                *plower,
+    T                                *pupper,
+    T                                *pupperDist,
+    typename traits_t< T >::signed_t *pstride,
+    typename traits_t< T >::signed_t  incr,
+    typename traits_t< T >::signed_t  chunk
+) {
+    KMP_COUNT_BLOCK(OMP_DISTR_FOR_static);
+    typedef typename traits_t< T >::unsigned_t  UT;
+    typedef typename traits_t< T >::signed_t    ST;
+    register kmp_uint32  tid;
+    register kmp_uint32  nth;
+    register kmp_uint32  team_id;
+    register kmp_uint32  nteams;
+    register UT          trip_count;
+    register kmp_team_t *team;
+    kmp_info_t * th;
+
+    KMP_DEBUG_ASSERT( plastiter && plower && pupper && pupperDist && pstride );
+    KE_TRACE( 10, ("__kmpc_dist_for_static_init called (%d)\n", gtid));
+    #ifdef KMP_DEBUG
+    {
+        const char * buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmpc_dist_for_static_init: T#%%d schedLoop=%%d liter=%%d "\
+            "iter=(%%%s, %%%s, %%%s) chunk=%%%s signed?<%s>\n",
+            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
+            traits_t< ST >::spec, traits_t< T >::spec );
+        KD_TRACE(100, ( buff, gtid, schedule, *plastiter,
+                       *plower, *pupper, incr, chunk ) );
+        __kmp_str_free( &buff );
+    }
+    #endif
+
+    if( __kmp_env_consistency_check ) {
+        __kmp_push_workshare( gtid, ct_pdo, loc );
+        if( incr == 0 ) {
+            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
+        }
+        if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
+            // The loop is illegal.
+            // Some zero-trip loops maintained by compiler, e.g.:
+            //   for(i=10;i<0;++i) // lower >= upper - run-time check
+            //   for(i=0;i>10;--i) // lower <= upper - run-time check
+            //   for(i=0;i>10;++i) // incr > 0       - compile-time check
+            //   for(i=10;i<0;--i) // incr < 0       - compile-time check
+            // Compiler does not check the following illegal loops:
+            //   for(i=0;i<10;i+=incr) // where incr<0
+            //   for(i=10;i>0;i-=incr) // where incr<0
+            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
+        }
+    }
+    tid = __kmp_tid_from_gtid( gtid );
+    th = __kmp_threads[gtid];
+    KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
+    nth = th->th.th_team_nproc;
+    team = th->th.th_team;
+    #if OMP_40_ENABLED
+    nteams = th->th.th_teams_size.nteams;
+    #endif
+    team_id = team->t.t_master_tid;
+    KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
+
+    // compute global trip count
+    if( incr == 1 ) {
+        trip_count = *pupper - *plower + 1;
+    } else if(incr == -1) {
+        trip_count = *plower - *pupper + 1;
+    } else {
+        trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
+    }
+    *pstride = *pupper - *plower;  // just in case (can be unused)
+    if( trip_count <= nteams ) {
+        KMP_DEBUG_ASSERT(
+            __kmp_static == kmp_sch_static_greedy || \
+            __kmp_static == kmp_sch_static_balanced
+        ); // Unknown static scheduling type.
+        // only the masters of some teams get a single iteration; other threads get nothing
+        if( team_id < trip_count && tid == 0 ) {
+            *pupper = *pupperDist = *plower = *plower + team_id * incr;
+        } else {
+            *pupperDist = *pupper;
+            *plower = *pupper + incr; // compiler should skip loop body
+        }
+        if( plastiter != NULL )
+            *plastiter = ( tid == 0 && team_id == trip_count - 1 );
+    } else {
+        // Get the team's chunk first (each team gets at most one chunk)
+        if( __kmp_static == kmp_sch_static_balanced ) {
+            register UT chunkD = trip_count / nteams;
+            register UT extras = trip_count % nteams;
+            *plower += incr * ( team_id * chunkD + ( team_id < extras ? team_id : extras ) );
+            *pupperDist = *plower + chunkD * incr - ( team_id < extras ? 0 : incr );
+            if( plastiter != NULL )
+                *plastiter = ( team_id == nteams - 1 );
+        } else {
+            register T chunk_inc_count =
+                ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
+            register T upper = *pupper;
+            KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
+                // Unknown static scheduling type.
+            *plower += team_id * chunk_inc_count;
+            *pupperDist = *plower + chunk_inc_count - incr;
+            // Check/correct bounds if needed
+            if( incr > 0 ) {
+                if( *pupperDist < *plower )
+                    *pupperDist = i_maxmin< T >::mx;
+                if( plastiter != NULL )
+                    *plastiter = *plower <= upper && *pupperDist > upper - incr;
+                if( *pupperDist > upper )
+                    *pupperDist = upper; // tracker C73258
+                if( *plower > *pupperDist ) {
+                    *pupper = *pupperDist;  // no iterations available for the team
+                    goto end;
+                }
+            } else {
+                if( *pupperDist > *plower )
+                    *pupperDist = i_maxmin< T >::mn;
+                if( plastiter != NULL )
+                    *plastiter = *plower >= upper && *pupperDist < upper - incr;
+                if( *pupperDist < upper )
+                    *pupperDist = upper; // tracker C73258
+                if( *plower < *pupperDist ) {
+                    *pupper = *pupperDist;  // no iterations available for the team
+                    goto end;
+                }
+            }
+        }
+        // Get the parallel loop chunk now (for thread)
+        // compute trip count for team's chunk
+        if( incr == 1 ) {
+            trip_count = *pupperDist - *plower + 1;
+        } else if(incr == -1) {
+            trip_count = *plower - *pupperDist + 1;
+        } else {
+            trip_count = (ST)(*pupperDist - *plower) / incr + 1;
+        }
+        KMP_DEBUG_ASSERT( trip_count );
+        switch( schedule ) {
+        case kmp_sch_static:
+        {
+            if( trip_count <= nth ) {
+                KMP_DEBUG_ASSERT(
+                    __kmp_static == kmp_sch_static_greedy || \
+                    __kmp_static == kmp_sch_static_balanced
+                ); // Unknown static scheduling type.
+                if( tid < trip_count )
+                    *pupper = *plower = *plower + tid * incr;
+                else
+                    *plower = *pupper + incr; // no iterations available
+                if( plastiter != NULL )
+                    if( *plastiter != 0 && !( tid == trip_count - 1 ) )
+                        *plastiter = 0;
+            } else {
+                if( __kmp_static == kmp_sch_static_balanced ) {
+                    register UT chunkL = trip_count / nth;
+                    register UT extras = trip_count % nth;
+                    *plower += incr * (tid * chunkL + (tid < extras ? tid : extras));
+                    *pupper = *plower + chunkL * incr - (tid < extras ? 0 : incr);
+                    if( plastiter != NULL )
+                        if( *plastiter != 0 && !( tid == nth - 1 ) )
+                            *plastiter = 0;
+                } else {
+                    register T chunk_inc_count =
+                        ( trip_count / nth + ( ( trip_count % nth ) ? 1 : 0) ) * incr;
+                    register T upper = *pupperDist;
+                    KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
+                        // Unknown static scheduling type.
+                    *plower += tid * chunk_inc_count;
+                    *pupper = *plower + chunk_inc_count - incr;
+                    if( incr > 0 ) {
+                        if( *pupper < *plower )
+                            *pupper = i_maxmin< T >::mx;
+                        if( plastiter != NULL )
+                            if( *plastiter != 0 && !(*plower <= upper && *pupper > upper - incr) )
+                                *plastiter = 0;
+                        if( *pupper > upper )
+                            *pupper = upper;//tracker C73258
+                    } else {
+                        if( *pupper > *plower )
+                            *pupper = i_maxmin< T >::mn;
+                        if( plastiter != NULL )
+                            if( *plastiter != 0 && !(*plower >= upper && *pupper < upper - incr) )
+                                *plastiter = 0;
+                        if( *pupper < upper )
+                            *pupper = upper;//tracker C73258
+                    }
+                }
+            }
+            break;
+        }
+        case kmp_sch_static_chunked:
+        {
+            register ST span;
+            if( chunk < 1 )
+                chunk = 1;
+            span = chunk * incr;
+            *pstride = span * nth;
+            *plower = *plower + (span * tid);
+            *pupper = *plower + span - incr;
+            if( plastiter != NULL )
+                if( *plastiter != 0 && !(tid == ((trip_count - 1) / ( UT )chunk) % nth) )
+                    *plastiter = 0;
+            break;
+        }
+        default:
+            KMP_ASSERT2( 0, "__kmpc_dist_for_static_init: unknown loop scheduling type" );
+            break;
+        }
+    }
+    end:;
+    #ifdef KMP_DEBUG
+    {
+        const char * buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmpc_dist_for_static_init: last=%%d lo=%%%s up=%%%s upDist=%%%s "\
+            "stride=%%%s signed?<%s>\n",
+            traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec,
+            traits_t< ST >::spec, traits_t< T >::spec );
+        KD_TRACE(100, ( buff, *plastiter, *plower, *pupper, *pupperDist, *pstride ) );
+        __kmp_str_free( &buff );
+    }
+    #endif
+    KE_TRACE( 10, ("__kmpc_dist_for_static_init: T#%d return\n", gtid ) );
+    return;
+}
+
+template< typename T >
+static void
+__kmp_team_static_init(
+    ident_t                          *loc,
+    kmp_int32                         gtid,
+    kmp_int32                        *p_last,
+    T                                *p_lb,
+    T                                *p_ub,
+    typename traits_t< T >::signed_t *p_st,
+    typename traits_t< T >::signed_t  incr,
+    typename traits_t< T >::signed_t  chunk
+) {
+    // The routine returns the first chunk distributed to the team and
+    // the stride for calculating subsequent chunks.
+    // The last iteration flag is set for the team that will execute
+    // the last iteration of the loop.
+    // The routine is called for dist_schedule(static, chunk) only.
+    typedef typename traits_t< T >::unsigned_t  UT;
+    typedef typename traits_t< T >::signed_t    ST;
+    kmp_uint32  team_id;
+    kmp_uint32  nteams;
+    UT          trip_count;
+    T           lower;
+    T           upper;
+    ST          span;
+    kmp_team_t *team;
+    kmp_info_t *th;
+
+    KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st );
+    KE_TRACE( 10, ("__kmp_team_static_init called (%d)\n", gtid));
+    #ifdef KMP_DEBUG
+    {
+        const char * buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format( "__kmp_team_static_init enter: T#%%d liter=%%d "\
+            "iter=(%%%s, %%%s, %%%s) chunk %%%s; signed?<%s>\n",
+            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
+            traits_t< ST >::spec, traits_t< T >::spec );
+        KD_TRACE(100, ( buff, gtid, *p_last, *p_lb, *p_ub, *p_st, chunk ) );
+        __kmp_str_free( &buff );
+    }
+    #endif
+
+    lower = *p_lb;
+    upper = *p_ub;
+    if( __kmp_env_consistency_check ) {
+        if( incr == 0 ) {
+            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
+        }
+        if( incr > 0 ? (upper < lower) : (lower < upper) ) {
+            // The loop is illegal.
+            // Some zero-trip loops are maintained by the compiler, e.g.:
+            //   for(i=10;i<0;++i) // lower >= upper - run-time check
+            //   for(i=0;i>10;--i) // lower <= upper - run-time check
+            //   for(i=0;i>10;++i) // incr > 0       - compile-time check
+            //   for(i=10;i<0;--i) // incr < 0       - compile-time check
+            // The compiler does not check the following illegal loops:
+            //   for(i=0;i<10;i+=incr) // where incr<0
+            //   for(i=10;i>0;i-=incr) // where incr<0
+            __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
+        }
+    }
+    th = __kmp_threads[gtid];
+    KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
+    team = th->th.th_team;
+    #if OMP_40_ENABLED
+    nteams = th->th.th_teams_size.nteams;
+    #endif
+    team_id = team->t.t_master_tid;
+    KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
+
+    // compute trip count
+    if( incr == 1 ) {
+        trip_count = upper - lower + 1;
+    } else if(incr == -1) {
+        trip_count = lower - upper + 1;
+    } else {
+        trip_count = (ST)(upper - lower) / incr + 1; // cast to signed to cover incr<0 case
+    }
+    if( chunk < 1 )
+        chunk = 1;
+    span = chunk * incr;
+    *p_st = span * nteams;
+    *p_lb = lower + (span * team_id);
+    *p_ub = *p_lb + span - incr;
+    if ( p_last != NULL )
+        *p_last = (team_id == ((trip_count - 1)/(UT)chunk) % nteams);
+    // Correct upper bound if needed
+    if( incr > 0 ) {
+        if( *p_ub < *p_lb ) // overflow?
+            *p_ub = i_maxmin< T >::mx;
+        if( *p_ub > upper )
+            *p_ub = upper; // tracker C73258
+    } else {   // incr < 0
+        if( *p_ub > *p_lb )
+            *p_ub = i_maxmin< T >::mn;
+        if( *p_ub < upper )
+            *p_ub = upper; // tracker C73258
+    }
+    #ifdef KMP_DEBUG
+    {
+        const char * buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format( "__kmp_team_static_init exit: T#%%d team%%u liter=%%d "\
+            "iter=(%%%s, %%%s, %%%s) chunk %%%s\n",
+            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
+            traits_t< ST >::spec );
+        KD_TRACE(100, ( buff, gtid, team_id, *p_last, *p_lb, *p_ub, *p_st, chunk ) );
+        __kmp_str_free( &buff );
+    }
+    #endif
+}
+
+//--------------------------------------------------------------------------------------
+extern "C" {
+
+/*!
+@ingroup WORK_SHARING
+@param    loc       Source code location
+@param    gtid      Global thread id of this thread
+@param    schedtype Scheduling type
+@param    plastiter Pointer to the "last iteration" flag
+@param    plower    Pointer to the lower bound
+@param    pupper    Pointer to the upper bound
+@param    pstride   Pointer to the stride
+@param    incr      Loop increment
+@param    chunk     The chunk size
+
+Each of the four functions here is identical apart from the argument types.
+
+The functions compute the upper and lower bounds and stride to be used for the set of iterations
+to be executed by the current thread from the statically scheduled loop that is described by the
+initial values of the bounds, stride, increment and chunk size.
+
+@{
+*/
+void
+__kmpc_for_static_init_4( ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, kmp_int32 *plastiter,
+                      kmp_int32 *plower, kmp_int32 *pupper,
+                      kmp_int32 *pstride, kmp_int32 incr, kmp_int32 chunk )
+{
+    __kmp_for_static_init< kmp_int32 >(
+                      loc, gtid, schedtype, plastiter, plower, pupper, pstride, incr, chunk );
+}
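+
+// An illustrative sketch (not part of the runtime; loc, gtid, N and body()
+// stand in for compiler-provided values): for
+//     #pragma omp for schedule(static)
+// over iterations 0..N-1 the compiler is expected to emit roughly
+//
+//     kmp_int32 last = 0, lb = 0, ub = N - 1, st = 1;
+//     __kmpc_for_static_init_4( &loc, gtid, kmp_sch_static, &last,
+//                               &lb, &ub, &st, 1, 1 );
+//     for ( kmp_int32 i = lb; i <= ub; ++i )
+//         body( i );                        // this thread's contiguous chunk
+//     __kmpc_for_static_fini( &loc, gtid );
+//
+// The exact calling sequence is compiler-specific.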
+
+/*!
+ See @ref __kmpc_for_static_init_4
+ */
+void
+__kmpc_for_static_init_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, kmp_int32 *plastiter,
+                      kmp_uint32 *plower, kmp_uint32 *pupper,
+                      kmp_int32 *pstride, kmp_int32 incr, kmp_int32 chunk )
+{
+    __kmp_for_static_init< kmp_uint32 >(
+                      loc, gtid, schedtype, plastiter, plower, pupper, pstride, incr, chunk );
+}
+
+/*!
+ See @ref __kmpc_for_static_init_4
+ */
+void
+__kmpc_for_static_init_8( ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, kmp_int32 *plastiter,
+                      kmp_int64 *plower, kmp_int64 *pupper,
+                      kmp_int64 *pstride, kmp_int64 incr, kmp_int64 chunk )
+{
+    __kmp_for_static_init< kmp_int64 >(
+                      loc, gtid, schedtype, plastiter, plower, pupper, pstride, incr, chunk );
+}
+
+/*!
+ See @ref __kmpc_for_static_init_4
+ */
+void
+__kmpc_for_static_init_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, kmp_int32 *plastiter,
+                      kmp_uint64 *plower, kmp_uint64 *pupper,
+                      kmp_int64 *pstride, kmp_int64 incr, kmp_int64 chunk )
+{
+    __kmp_for_static_init< kmp_uint64 >(
+                      loc, gtid, schedtype, plastiter, plower, pupper, pstride, incr, chunk );
+}
+/*!
+@}
+*/
+
+/*!
+@ingroup WORK_SHARING
+@param    loc       Source code location
+@param    gtid      Global thread id of this thread
+@param    schedule  Scheduling type for the parallel loop
+@param    plastiter Pointer to the "last iteration" flag
+@param    plower    Pointer to the lower bound
+@param    pupper    Pointer to the upper bound of loop chunk
+@param    pupperD   Pointer to the upper bound of dist_chunk
+@param    pstride   Pointer to the stride for parallel loop
+@param    incr      Loop increment
+@param    chunk     The chunk size for the parallel loop
+
+Each of the four functions here is identical apart from the argument types.
+
+The functions compute the upper and lower bounds and strides to be used for the set of iterations
+to be executed by the current thread from the statically scheduled loop that is described by the
+initial values of the bounds, strides, increment and chunks for the parallel loop and distribute
+constructs.
+
+@{
+*/
+void
+__kmpc_dist_for_static_init_4(
+    ident_t *loc, kmp_int32 gtid, kmp_int32 schedule, kmp_int32 *plastiter,
+    kmp_int32 *plower, kmp_int32 *pupper, kmp_int32 *pupperD,
+    kmp_int32 *pstride, kmp_int32 incr, kmp_int32 chunk )
+{
+    __kmp_dist_for_static_init< kmp_int32 >(
+        loc, gtid, schedule, plastiter, plower, pupper, pupperD, pstride, incr, chunk );
+}
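+
+// An illustrative sketch (assumed, compiler-specific; loc, gtid, N and body()
+// are placeholders): for a composite
+//     #pragma omp distribute parallel for dist_schedule(static)
+// over iterations 0..N-1 the expected calling sequence is roughly
+//
+//     kmp_int32 last = 0, lb = 0, ub = N - 1, ubD, st = 1;
+//     __kmpc_dist_for_static_init_4( &loc, gtid, kmp_sch_static, &last,
+//                                    &lb, &ub, &ubD, &st, 1, 1 );
+//     for ( kmp_int32 i = lb; i <= ub; ++i )
+//         body( i );
+//
+// On return, ubD is the upper bound of the team's portion (the distribute
+// chunk), while [lb, ub] is the calling thread's sub-range of it.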
+
+/*!
+ See @ref __kmpc_dist_for_static_init_4
+ */
+void
+__kmpc_dist_for_static_init_4u(
+    ident_t *loc, kmp_int32 gtid, kmp_int32 schedule, kmp_int32 *plastiter,
+    kmp_uint32 *plower, kmp_uint32 *pupper, kmp_uint32 *pupperD,
+    kmp_int32 *pstride, kmp_int32 incr, kmp_int32 chunk )
+{
+    __kmp_dist_for_static_init< kmp_uint32 >(
+        loc, gtid, schedule, plastiter, plower, pupper, pupperD, pstride, incr, chunk );
+}
+
+/*!
+ See @ref __kmpc_dist_for_static_init_4
+ */
+void
+__kmpc_dist_for_static_init_8(
+    ident_t *loc, kmp_int32 gtid, kmp_int32 schedule, kmp_int32 *plastiter,
+    kmp_int64 *plower, kmp_int64 *pupper, kmp_int64 *pupperD,
+    kmp_int64 *pstride, kmp_int64 incr, kmp_int64 chunk )
+{
+    __kmp_dist_for_static_init< kmp_int64 >(
+        loc, gtid, schedule, plastiter, plower, pupper, pupperD, pstride, incr, chunk );
+}
+
+/*!
+ See @ref __kmpc_dist_for_static_init_4
+ */
+void
+__kmpc_dist_for_static_init_8u(
+    ident_t *loc, kmp_int32 gtid, kmp_int32 schedule, kmp_int32 *plastiter,
+    kmp_uint64 *plower, kmp_uint64 *pupper, kmp_uint64 *pupperD,
+    kmp_int64 *pstride, kmp_int64 incr, kmp_int64 chunk )
+{
+    __kmp_dist_for_static_init< kmp_uint64 >(
+        loc, gtid, schedule, plastiter, plower, pupper, pupperD, pstride, incr, chunk );
+}
+/*!
+@}
+*/
+
+//-----------------------------------------------------------------------------------------
+// Auxiliary routines for Distribute Parallel Loop construct implementation
+//    Transfer call to template< type T >
+//    __kmp_team_static_init( ident_t *loc, int gtid,
+//        int *p_last, T *lb, T *ub, ST *st, ST incr, ST chunk )
+
+/*!
+@ingroup WORK_SHARING
+@{
+@param loc Source location
+@param gtid Global thread id
+@param p_last Pointer to the last iteration flag
+@param p_lb   Pointer to the lower bound
+@param p_ub   Pointer to the upper bound
+@param p_st   Step (or increment if you prefer)
+@param incr  Loop increment
+@param chunk The chunk size to block with
+
+The functions compute the upper and lower bounds and stride to be used for the set of iterations
+to be executed by the current team from the statically scheduled loop that is described by the
+initial values of the bounds, stride, increment and chunk for the distribute construct as part of
+the composite distribute parallel loop construct.
+These functions are all identical apart from the types of the arguments.
+*/
+
+void
+__kmpc_team_static_init_4(
+    ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+    kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st, kmp_int32 incr, kmp_int32 chunk )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_team_static_init< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st, incr, chunk );
+}
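+
+// An illustrative sketch (assumed, compiler-specific; loc, gtid, N, chunk and
+// body() are placeholders) of how the distribute part of
+//     #pragma omp distribute parallel for dist_schedule(static, chunk)
+// can drive this entry point:
+//
+//     kmp_int32 last = 0, lb = 0, ub = N - 1, st;
+//     __kmpc_team_static_init_4( &loc, gtid, &last, &lb, &ub, &st, 1, chunk );
+//     while ( lb <= N - 1 ) {                // walk this team's chunks
+//         kmp_int32 hi = ( ub < N - 1 ) ? ub : N - 1;
+//         for ( kmp_int32 i = lb; i <= hi; ++i )
+//             body( i );
+//         lb += st;                          // stride to the team's next chunk
+//         ub += st;
+//     }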
+
+/*!
+ See @ref __kmpc_team_static_init_4
+ */
+void
+__kmpc_team_static_init_4u(
+    ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+    kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st, kmp_int32 incr, kmp_int32 chunk )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_team_static_init< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st, incr, chunk );
+}
+
+/*!
+ See @ref __kmpc_team_static_init_4
+ */
+void
+__kmpc_team_static_init_8(
+    ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+    kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st, kmp_int64 incr, kmp_int64 chunk )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_team_static_init< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st, incr, chunk );
+}
+
+/*!
+ See @ref __kmpc_team_static_init_4
+ */
+void
+__kmpc_team_static_init_8u(
+    ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+    kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st, kmp_int64 incr, kmp_int64 chunk )
+{
+    KMP_DEBUG_ASSERT( __kmp_init_serial );
+    __kmp_team_static_init< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st, incr, chunk );
+}
+/*!
+@}
+*/
+
+} // extern "C"
+
diff --git a/final/runtime/src/kmp_settings.c b/final/runtime/src/kmp_settings.c
new file mode 100644
index 0000000..7356bf8
--- /dev/null
+++ b/final/runtime/src/kmp_settings.c
@@ -0,0 +1,5269 @@
+/*
+ * kmp_settings.c -- Initialize environment variables
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_wrapper_getpid.h"
+#include "kmp_environment.h"
+#include "kmp_atomic.h"
+#include "kmp_itt.h"
+#include "kmp_str.h"
+#include "kmp_settings.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+
+static int __kmp_env_toPrint( char const * name, int flag );
+
+bool __kmp_env_format = 0; // 0 - old format; 1 - new format
+// -------------------------------------------------------------------------------------------------
+// Helper string functions. Subject to move to kmp_str.
+// -------------------------------------------------------------------------------------------------
+
+static double
+__kmp_convert_to_double( char const * s )
+{
+    double result;
+
+    if ( KMP_SSCANF( s, "%lf", &result ) < 1 ) {
+        result = 0.0;
+    }
+
+    return result;
+}
+
+#ifdef KMP_DEBUG
+static unsigned int
+__kmp_readstr_with_sentinel(char *dest, char const * src, size_t len, char sentinel) {
+    unsigned int i;
+    for (i = 0; i < len; i++) {
+        if ((*src == '\0') || (*src == sentinel)) {
+            break;
+        }
+        *(dest++) = *(src++);
+    }
+    *dest = '\0';
+    return i;
+}
+#endif
+
+static int
+__kmp_match_with_sentinel( char const * a, char const * b, size_t len, char sentinel ) {
+    size_t l = 0;
+
+    if(a == NULL)
+        a = "";
+    if(b == NULL)
+        b = "";
+    while(*a && *b && *b != sentinel) {
+        char ca = *a, cb = *b;
+
+        if(ca >= 'a' && ca <= 'z')
+            ca -= 'a' - 'A';
+        if(cb >= 'a' && cb <= 'z')
+            cb -= 'a' - 'A';
+        if(ca != cb)
+            return FALSE;
+        ++l;
+        ++a;
+        ++b;
+    }
+    return l >= len;
+}
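+
+// Illustrative behavior: with sentinel '=' and len 2,
+//     __kmp_match_with_sentinel( "none", "NO=...", 2, '=' )
+// returns TRUE -- the comparison is case-insensitive, stops at the sentinel
+// in the second string, and succeeds because at least len characters matched.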
+
+//
+// Expected usage:
+//     token is the token to check for.
+//     buf is the string being parsed.
+//     *end returns the char after the end of the token.
+//        it is not modified unless a match occurs.
+//
+// Example 1:
+//
+//     if (__kmp_match_str("token", buf, &end)) {
+//         <do something>
+//         buf = end;
+//     }
+//
+// Example 2:
+//
+//     if (__kmp_match_str("token", buf, &end)) {
+//         char save = *end;
+//         *(char *)end = sentinel;   // requires buf to be writable
+//         <use any of the __kmp*_with_sentinel() functions>
+//         *(char *)end = save;
+//         buf = end;
+//     }
+//
+
+static int
+__kmp_match_str( char const *token, char const *buf, const char **end) {
+
+    KMP_ASSERT(token != NULL);
+    KMP_ASSERT(buf != NULL);
+    KMP_ASSERT(end != NULL);
+
+    while (*token && *buf) {
+        char ct = *token, cb = *buf;
+
+        if(ct >= 'a' && ct <= 'z')
+            ct -= 'a' - 'A';
+        if(cb >= 'a' && cb <= 'z')
+            cb -= 'a' - 'A';
+        if (ct != cb)
+            return FALSE;
+        ++token;
+        ++buf;
+    }
+    if (*token) {
+        return FALSE;
+    }
+    *end = buf;
+    return TRUE;
+}
+
+
+static size_t
+__kmp_round4k( size_t size ) {
+    size_t _4k = 4 * 1024;
+    if ( size & ( _4k - 1 ) ) {
+        size &= ~ ( _4k - 1 );
+        if ( size <= KMP_SIZE_T_MAX - _4k ) {
+            size += _4k;    // Round up if there is no overflow.
+        }; // if
+    }; // if
+    return size;
+} // __kmp_round4k
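+
+// For example: __kmp_round4k( 1 ) == 4096, __kmp_round4k( 4096 ) == 4096,
+// and __kmp_round4k( 5000 ) == 8192.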
+
+
+/*
+    Here, the multipliers are as in __kmp_convert_to_seconds, but floating-point
+    values are allowed, and the return value is in milliseconds.  The default
+    multiplier is milliseconds.  Returns INT_MAX only if the value specified
+    matches "infinit*".  Returns -1 if the specified string is invalid.
+*/
+int
+__kmp_convert_to_milliseconds( char const * data )
+{
+    int ret, nvalues, factor;
+    char mult, extra;
+    double value;
+
+    if (data == NULL) return (-1);
+    if ( __kmp_str_match( "infinit", -1, data)) return (INT_MAX);
+    value = (double) 0.0;
+    mult = '\0';
+    nvalues = KMP_SSCANF (data, "%lf%c%c", &value, &mult, &extra);
+    if (nvalues < 1) return (-1);
+    if (nvalues == 1) mult = '\0';
+    if (nvalues == 3) return (-1);
+
+    if (value < 0)    return (-1);
+
+    switch (mult) {
+    case '\0':
+        /*  default is milliseconds  */
+        factor = 1;
+        break;
+    case 's': case 'S':
+        factor = 1000;
+        break;
+    case 'm': case 'M':
+        factor = 1000 * 60;
+        break;
+    case 'h': case 'H':
+        factor = 1000 * 60 * 60;
+        break;
+    case 'd': case 'D':
+        factor = 1000 * 24 * 60 * 60;
+        break;
+    default:
+        return (-1);
+    }
+
+    if ( value >= ( (INT_MAX-1) / factor) )
+        ret = INT_MAX-1;        /* Don't allow infinite value here */
+    else
+        ret = (int) (value * (double) factor);  /* truncate to int  */
+
+    return ret;
+}
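+
+// Illustrative conversions implied by the rules above:
+//     __kmp_convert_to_milliseconds( "200" )      -> 200       (default: ms)
+//     __kmp_convert_to_milliseconds( "2.5s" )     -> 2500
+//     __kmp_convert_to_milliseconds( "1h" )       -> 3600000
+//     __kmp_convert_to_milliseconds( "infinite" ) -> INT_MAX
+//     __kmp_convert_to_milliseconds( "5x" )       -> -1        (bad multiplier)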
+
+
+static int
+__kmp_strcasecmp_with_sentinel( char const * a, char const * b, char sentinel ) {
+    if(a == NULL)
+        a = "";
+    if(b == NULL)
+        b = "";
+    while(*a && *b && *b != sentinel) {
+        char ca = *a, cb = *b;
+
+        if(ca >= 'a' && ca <= 'z')
+            ca -= 'a' - 'A';
+        if(cb >= 'a' && cb <= 'z')
+            cb -= 'a' - 'A';
+        if(ca != cb)
+            return (int)(unsigned char)*a - (int)(unsigned char)*b;
+        ++a;
+        ++b;
+    }
+    return *a ?
+        (*b && *b != sentinel) ? (int)(unsigned char)*a - (int)(unsigned char)*b : 1 :
+        (*b && *b != sentinel) ? -1 : 0;
+}
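+
+// Illustrative: with sentinel '=',
+//     __kmp_strcasecmp_with_sentinel( "routine", "ROUTINE=foo", '=' ) == 0
+// because the comparison is case-insensitive and the sentinel in the second
+// string is treated like a terminating '\0'.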
+
+
+// =================================================================================================
+// Table structures and helper functions.
+// =================================================================================================
+
+typedef struct __kmp_setting        kmp_setting_t;
+typedef struct __kmp_stg_ss_data    kmp_stg_ss_data_t;
+typedef struct __kmp_stg_wp_data    kmp_stg_wp_data_t;
+typedef struct __kmp_stg_fr_data    kmp_stg_fr_data_t;
+
+typedef void ( * kmp_stg_parse_func_t )( char const * name, char const * value, void * data );
+typedef void ( * kmp_stg_print_func_t )( kmp_str_buf_t * buffer, char const * name, void * data );
+
+struct __kmp_setting {
+    char const *         name;        // Name of setting (environment variable).
+    kmp_stg_parse_func_t parse;       // Parser function.
+    kmp_stg_print_func_t print;       // Print function.
+    void *               data;        // Data passed to parser and printer.
+    int                  set;         // Variable set during this "session"
+                                      //     (__kmp_env_initialize() or kmp_set_defaults() call).
+    int                  defined;     // Variable set in any "session".
+}; // struct __kmp_setting
+
+struct __kmp_stg_ss_data {
+    size_t             factor;  // Default factor: 1 for KMP_STACKSIZE, 1024 for others.
+    kmp_setting_t * *  rivals;  // Array of pointers to rivals (including itself).
+}; // struct __kmp_stg_ss_data
+
+struct __kmp_stg_wp_data {
+    int                omp;     // 0 -- KMP_LIBRARY, 1 -- OMP_WAIT_POLICY.
+    kmp_setting_t * *  rivals;  // Array of pointers to rivals (including itself).
+}; // struct __kmp_stg_wp_data
+
+struct __kmp_stg_fr_data {
+    int                force;  // 0 -- KMP_DETERMINISTIC_REDUCTION, 1 -- KMP_FORCE_REDUCTION.
+    kmp_setting_t * *  rivals;  // Array of pointers to rivals (including itself).
+}; // struct __kmp_stg_fr_data
+
+static int
+__kmp_stg_check_rivals(          // 0 -- Ok, 1 -- errors found.
+    char const *       name,     // Name of variable.
+    char const *       value,    // Value of the variable.
+    kmp_setting_t * *  rivals    // List of rival settings (the list must include current one).
+);
+
+
+// -------------------------------------------------------------------------------------------------
+// Helper parse functions.
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_bool(
+    char const * name,
+    char const * value,
+    int *        out
+) {
+    if ( __kmp_str_match_true( value ) ) {
+        * out = TRUE;
+    } else if (__kmp_str_match_false( value ) ) {
+        * out = FALSE;
+    } else {
+        __kmp_msg(
+            kmp_ms_warning,
+            KMP_MSG( BadBoolValue, name, value ),
+            KMP_HNT( ValidBoolValues ),
+            __kmp_msg_null
+        );
+    }; // if
+} // __kmp_stg_parse_bool
+
+static void
+__kmp_stg_parse_size(
+    char const * name,
+    char const * value,
+    size_t       size_min,
+    size_t       size_max,
+    int *        is_specified,
+    size_t *     out,
+    size_t       factor
+) {
+    char const * msg = NULL;
+    #if KMP_OS_DARWIN
+        size_min = __kmp_round4k( size_min );
+        size_max = __kmp_round4k( size_max );
+    #endif // KMP_OS_DARWIN
+    if ( value ) {
+        if ( is_specified != NULL ) {
+            * is_specified = 1;
+        }; // if
+        __kmp_str_to_size( value, out, factor, & msg );
+        if ( msg == NULL ) {
+            if ( * out > size_max ) {
+                * out = size_max;
+                msg = KMP_I18N_STR( ValueTooLarge );
+            } else if ( * out < size_min ) {
+                * out = size_min;
+                msg = KMP_I18N_STR( ValueTooSmall );
+            } else {
+                #if KMP_OS_DARWIN
+                    size_t round4k = __kmp_round4k( * out );
+                    if ( * out != round4k ) {
+                        * out = round4k;
+                        msg = KMP_I18N_STR( NotMultiple4K );
+                    }; // if
+                #endif
+            }; // if
+        } else {
+            // If integer overflow occurred, * out == KMP_SIZE_T_MAX. Cut it to size_max silently.
+            if ( * out < size_min ) {
+                * out = size_max;
+            }
+            else if ( * out >  size_max ) {
+                * out = size_max;
+            }; // if
+        }; // if
+        if ( msg != NULL ) {
+            // Message is not empty. Print warning.
+            kmp_str_buf_t buf;
+            __kmp_str_buf_init( & buf );
+            __kmp_str_buf_print_size( & buf, * out );
+            KMP_WARNING( ParseSizeIntWarn, name, value, msg );
+            KMP_INFORM( Using_str_Value, name, buf.str );
+            __kmp_str_buf_free( & buf );
+        }; // if
+    }; // if
+} // __kmp_stg_parse_size
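+
+// Illustrative (assumed): parsing KMP_STACKSIZE=512k with factor 1 stores
+// 512 * 1024 in *out (the "k"/"m"/"g" suffixes are handled by
+// __kmp_str_to_size); out-of-range values are clamped to [size_min, size_max]
+// and a warning is printed.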
+
+#if KMP_AFFINITY_SUPPORTED
+static void
+__kmp_stg_parse_str(
+    char const *      name,
+    char const *      value,
+    char const * *    out
+) {
+    KMP_INTERNAL_FREE( (void *) * out );
+    * out = __kmp_str_format( "%s", value );
+} // __kmp_stg_parse_str
+#endif
+
+static void
+__kmp_stg_parse_int(
+    char const * name,   // I: Name of environment variable (used in warning messages).
+    char const * value,  // I: Value of environment variable to parse.
+    int          min,    // I: Minimal allowed value.
+    int          max,    // I: Maximum allowed value.
+    int *        out     // O: Output (parsed) value.
+) {
+    char const * msg  = NULL;
+    kmp_uint64   uint = * out;
+    __kmp_str_to_uint( value, & uint, & msg );
+    if ( msg == NULL ) {
+        if ( uint < (unsigned int)min ) {
+            msg = KMP_I18N_STR( ValueTooSmall );
+            uint = min;
+        } else if ( uint > (unsigned int)max ) {
+            msg = KMP_I18N_STR( ValueTooLarge );
+            uint = max;
+        }; // if
+    } else {
+        // If overflow occurred, msg contains the error message and uint is very big.
+        // Clamp it to the allowed range.
+        if ( uint < (unsigned int)min ) {
+            uint = min;
+        }
+        else if ( uint > (unsigned int)max ) {
+            uint = max;
+        }; // if
+    }; // if
+    if ( msg != NULL ) {
+        // Message is not empty. Print warning.
+        kmp_str_buf_t buf;
+        KMP_WARNING( ParseSizeIntWarn, name, value, msg );
+        __kmp_str_buf_init( & buf );
+        __kmp_str_buf_print( &buf, "%" KMP_UINT64_SPEC "", uint );
+        KMP_INFORM( Using_uint64_Value, name, buf.str );
+        __kmp_str_buf_free( &buf );
+    }; // if
+    * out = uint;
+} // __kmp_stg_parse_int
+
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+static void
+__kmp_stg_parse_file(
+    char const * name,
+    char const * value,
+    char *       suffix,
+    char * *     out
+) {
+    char buffer[256];
+    char *t;
+    int hasSuffix;
+    KMP_INTERNAL_FREE( (void *) * out );
+    t = (char *) strrchr(value, '.');
+    hasSuffix = t && __kmp_str_eqf( t, suffix );
+    t = __kmp_str_format( "%s%s", value, hasSuffix ? "" : suffix );
+    __kmp_expand_file_name( buffer, sizeof(buffer), t);
+    KMP_INTERNAL_FREE(t);
+    * out = __kmp_str_format( "%s", buffer );
+} // __kmp_stg_parse_file
+#endif
+
+#ifdef KMP_DEBUG
+static char * par_range_to_print = NULL;
+
+static void
+__kmp_stg_parse_par_range(
+    char const * name,
+    char const * value,
+    int *        out_range,
+    char *       out_routine,
+    char *       out_file,
+    int *        out_lb,
+    int *        out_ub
+) {
+    size_t len = KMP_STRLEN( value ) + 1; // room for the terminating '\0'
+    par_range_to_print = (char *) KMP_INTERNAL_MALLOC( len + 1 );
+    KMP_STRNCPY_S( par_range_to_print, len + 1, value, len + 1 );
+    __kmp_par_range = +1;
+    __kmp_par_range_lb = 0;
+    __kmp_par_range_ub = INT_MAX;
+    for (;;) {
+        unsigned int len;
+        if (( value == NULL ) || ( *value == '\0' )) {
+            break;
+        }
+        if ( ! __kmp_strcasecmp_with_sentinel( "routine", value, '=' )) {
+            value = strchr( value, '=' ) + 1;
+            len = __kmp_readstr_with_sentinel( out_routine,
+              value, KMP_PAR_RANGE_ROUTINE_LEN - 1, ',' );
+            if ( len == 0 ) {
+                goto par_range_error;
+            }
+            value = strchr( value, ',' );
+            if ( value != NULL ) {
+                value++;
+            }
+            continue;
+        }
+        if ( ! __kmp_strcasecmp_with_sentinel( "filename", value, '=' )) {
+            value = strchr( value, '=' ) + 1;
+            len = __kmp_readstr_with_sentinel( out_file,
+              value, KMP_PAR_RANGE_FILENAME_LEN - 1, ',' );
+            if ( len == 0) {
+                goto par_range_error;
+            }
+            value = strchr( value, ',' );
+            if ( value != NULL ) {
+                value++;
+            }
+            continue;
+        }
+        if (( ! __kmp_strcasecmp_with_sentinel( "range", value, '=' ))
+          || ( ! __kmp_strcasecmp_with_sentinel( "incl_range", value, '=' ))) {
+            value = strchr( value, '=' ) + 1;
+            if ( KMP_SSCANF( value, "%d:%d", out_lb, out_ub ) != 2 ) {
+                goto par_range_error;
+            }
+            *out_range = +1;
+            value = strchr( value, ',' );
+            if ( value != NULL ) {
+                value++;
+            }
+            continue;
+        }
+        if ( ! __kmp_strcasecmp_with_sentinel( "excl_range", value, '=' )) {
+            value = strchr( value, '=' ) + 1;
+            if ( KMP_SSCANF( value, "%d:%d", out_lb, out_ub) != 2 ) {
+                goto par_range_error;
+            }
+            *out_range = -1;
+            value = strchr( value, ',' );
+            if ( value != NULL ) {
+                value++;
+            }
+            continue;
+        }
+        par_range_error:
+        KMP_WARNING( ParRangeSyntax, name );
+        __kmp_par_range = 0;
+        break;
+    }
+} // __kmp_stg_parse_par_range
+#endif
+
+int
+__kmp_initial_threads_capacity( int req_nproc )
+{
+    int nth = 32;
+
+    /* MIN( MAX( 32, 4 * $OMP_NUM_THREADS, 4 * omp_get_num_procs() ), __kmp_max_nth) */
+    if (nth < (4 * req_nproc))
+        nth = (4 * req_nproc);
+    if (nth < (4 * __kmp_xproc))
+        nth = (4 * __kmp_xproc);
+
+    if (nth > __kmp_max_nth)
+        nth = __kmp_max_nth;
+
+    return nth;
+}
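+
+// Worked example: with req_nproc = 8 and __kmp_xproc = 16 this yields
+// nth = MAX( 32, 32, 64 ) = 64, which is then clamped to __kmp_max_nth.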
+
+
+int
+__kmp_default_tp_capacity( int req_nproc, int max_nth, int all_threads_specified) {
+    int nth = 128;
+
+    if(all_threads_specified)
+        return max_nth;
+    /* MIN( MAX (128, 4 * $OMP_NUM_THREADS, 4 * omp_get_num_procs() ), __kmp_max_nth ) */
+    if (nth < (4 * req_nproc))
+        nth = (4 * req_nproc);
+    if (nth < (4 * __kmp_xproc))
+        nth = (4 * __kmp_xproc);
+
+    if (nth > __kmp_max_nth)
+        nth = __kmp_max_nth;
+
+    return nth;
+}
+
+
+// -------------------------------------------------------------------------------------------------
+// Helper print functions.
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_print_bool( kmp_str_buf_t * buffer, char const * name, int value ) {
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_BOOL;
+    } else {
+        __kmp_str_buf_print( buffer, "   %s=%s\n", name, value ? "true" : "false" );
+    }
+} // __kmp_stg_print_bool
+
+static void
+__kmp_stg_print_int( kmp_str_buf_t * buffer, char const * name, int value ) {
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_INT;
+    } else {
+        __kmp_str_buf_print( buffer, "   %s=%d\n", name, value );
+    }
+} // __kmp_stg_print_int
+
+static void
+__kmp_stg_print_uint64( kmp_str_buf_t * buffer, char const * name, kmp_uint64 value ) {
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_UINT64;
+    } else {
+        __kmp_str_buf_print( buffer, "   %s=%" KMP_UINT64_SPEC "\n", name, value );
+    }
+} // __kmp_stg_print_uint64
+
+static void
+__kmp_stg_print_str( kmp_str_buf_t * buffer, char const * name, char const * value ) {
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_STR;
+    } else {
+        __kmp_str_buf_print( buffer, "   %s=%s\n", name, value );
+    }
+} // __kmp_stg_print_str
+
+static void
+__kmp_stg_print_size( kmp_str_buf_t * buffer, char const * name, size_t value ) {
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_NAME_EX(name);
+        __kmp_str_buf_print_size( buffer, value );
+        __kmp_str_buf_print( buffer, "'\n" );
+    } else {
+        __kmp_str_buf_print( buffer, "   %s=", name );
+        __kmp_str_buf_print_size( buffer, value );
+        __kmp_str_buf_print( buffer, "\n" );
+    }
+} // __kmp_stg_print_size
+
+
+// =================================================================================================
+// Parse and print functions.
+// =================================================================================================
+
+// -------------------------------------------------------------------------------------------------
+// KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_all_threads( char const * name, char const * value, void * data ) {
+
+    kmp_setting_t * * rivals = (kmp_setting_t * *) data;
+    int               rc;
+    rc = __kmp_stg_check_rivals( name, value, rivals );
+    if ( rc ) {
+        return;
+    }; // if
+    if ( ! __kmp_strcasecmp_with_sentinel( "all", value, 0 ) ) {
+        __kmp_max_nth = __kmp_xproc;
+        __kmp_allThreadsSpecified = 1;
+    } else {
+        __kmp_stg_parse_int( name, value, 1, __kmp_sys_max_nth, & __kmp_max_nth );
+        __kmp_allThreadsSpecified = 0;
+    }
+    K_DIAG( 1, ( "__kmp_max_nth == %d\n", __kmp_max_nth ) );
+
+} // __kmp_stg_parse_all_threads
+
+static void
+__kmp_stg_print_all_threads( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_max_nth );
+} // __kmp_stg_print_all_threads
+
+// -------------------------------------------------------------------------------------------------
+// KMP_BLOCKTIME
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_blocktime( char const * name, char const * value, void * data ) {
+    __kmp_dflt_blocktime = __kmp_convert_to_milliseconds( value );
+    if ( __kmp_dflt_blocktime < 0 ) {
+        __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
+        __kmp_msg( kmp_ms_warning, KMP_MSG( InvalidValue, name, value ), __kmp_msg_null );
+        KMP_INFORM( Using_int_Value, name, __kmp_dflt_blocktime );
+        __kmp_env_blocktime = FALSE;  // Revert to default as if var not set.
+    } else {
+        if ( __kmp_dflt_blocktime < KMP_MIN_BLOCKTIME ) {
+            __kmp_dflt_blocktime = KMP_MIN_BLOCKTIME;
+            __kmp_msg( kmp_ms_warning, KMP_MSG( SmallValue, name, value ), __kmp_msg_null );
+            KMP_INFORM( MinValueUsing, name, __kmp_dflt_blocktime );
+        } else if ( __kmp_dflt_blocktime > KMP_MAX_BLOCKTIME ) {
+            __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME;
+            __kmp_msg( kmp_ms_warning, KMP_MSG( LargeValue, name, value ), __kmp_msg_null );
+            KMP_INFORM( MaxValueUsing, name, __kmp_dflt_blocktime );
+        }; // if
+        __kmp_env_blocktime = TRUE;    // KMP_BLOCKTIME was specified.
+    }; // if
+    // calculate number of monitor thread wakeup intervals corresponding to blocktime.
+    __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
+    __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
+    K_DIAG( 1, ( "__kmp_env_blocktime == %d\n", __kmp_env_blocktime ) );
+    if ( __kmp_env_blocktime ) {
+        K_DIAG( 1, ( "__kmp_dflt_blocktime == %d\n", __kmp_dflt_blocktime ) );
+    }
+} // __kmp_stg_parse_blocktime
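+
+// Illustrative settings: KMP_BLOCKTIME=200 lets threads spin for 200ms before
+// sleeping; KMP_BLOCKTIME=2s gives 2000ms; KMP_BLOCKTIME=infinite gives
+// INT_MAX (threads effectively never sleep). Out-of-range values are clamped
+// to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] with a warning.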
+
+static void
+__kmp_stg_print_blocktime( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_dflt_blocktime );
+} // __kmp_stg_print_blocktime
+
+// -------------------------------------------------------------------------------------------------
+// KMP_DUPLICATE_LIB_OK
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_duplicate_lib_ok( char const * name, char const * value, void * data ) {
+    /* Actually this variable is not supported; it is kept here for compatibility
+       with earlier builds and for the static/dynamic combination. */
+    __kmp_stg_parse_bool( name, value, & __kmp_duplicate_library_ok );
+} // __kmp_stg_parse_duplicate_lib_ok
+
+static void
+__kmp_stg_print_duplicate_lib_ok( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_duplicate_library_ok );
+} // __kmp_stg_print_duplicate_lib_ok
+
+// -------------------------------------------------------------------------------------------------
+// KMP_INHERIT_FP_CONTROL
+// -------------------------------------------------------------------------------------------------
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+static void
+__kmp_stg_parse_inherit_fp_control( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_bool( name, value, & __kmp_inherit_fp_control );
+} // __kmp_stg_parse_inherit_fp_control
+
+static void
+__kmp_stg_print_inherit_fp_control( kmp_str_buf_t * buffer, char const * name, void * data ) {
+#if KMP_DEBUG
+    __kmp_stg_print_bool( buffer, name, __kmp_inherit_fp_control );
+#endif /* KMP_DEBUG */
+} // __kmp_stg_print_inherit_fp_control
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+// -------------------------------------------------------------------------------------------------
+// KMP_LIBRARY, OMP_WAIT_POLICY
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_wait_policy( char const * name, char const * value, void * data ) {
+
+    kmp_stg_wp_data_t * wait = (kmp_stg_wp_data_t *) data;
+    int                 rc;
+
+    rc = __kmp_stg_check_rivals( name, value, wait->rivals );
+    if ( rc ) {
+        return;
+    }; // if
+
+    if ( wait->omp ) {
+        if ( __kmp_str_match( "ACTIVE", 1, value ) ) {
+           __kmp_library = library_turnaround;
+        } else if ( __kmp_str_match( "PASSIVE", 1, value ) ) {
+           __kmp_library = library_throughput;
+        } else {
+            KMP_WARNING( StgInvalidValue, name, value );
+        }; // if
+    } else {
+        if ( __kmp_str_match( "serial", 1, value ) ) {             /* S */
+           __kmp_library = library_serial;
+        } else if ( __kmp_str_match( "throughput", 2, value ) ) {  /* TH */
+           __kmp_library = library_throughput;
+        } else if ( __kmp_str_match( "turnaround", 2, value ) ) {  /* TU */
+           __kmp_library = library_turnaround;
+        } else if ( __kmp_str_match( "dedicated", 1, value ) ) {   /* D */
+           __kmp_library = library_turnaround;
+        } else if ( __kmp_str_match( "multiuser", 1, value ) ) {   /* M */
+           __kmp_library = library_throughput;
+        } else {
+            KMP_WARNING( StgInvalidValue, name, value );
+        }; // if
+    }; // if
+    __kmp_aux_set_library( __kmp_library );
+
+} // __kmp_stg_parse_wait_policy
+
+static void
+__kmp_stg_print_wait_policy( kmp_str_buf_t * buffer, char const * name, void * data ) {
+
+    kmp_stg_wp_data_t * wait = (kmp_stg_wp_data_t *) data;
+    char const *        value = NULL;
+
+    if ( wait->omp ) {
+        switch ( __kmp_library ) {
+            case library_turnaround : {
+                value = "ACTIVE";
+            } break;
+            case library_throughput : {
+                value = "PASSIVE";
+            } break;
+        }; // switch
+    } else {
+        switch ( __kmp_library ) {
+            case library_serial : {
+                value = "serial";
+            } break;
+            case library_turnaround : {
+                value = "turnaround";
+            } break;
+            case library_throughput : {
+                value = "throughput";
+            } break;
+        }; // switch
+    }; // if
+    if ( value != NULL ) {
+        __kmp_stg_print_str( buffer, name, value );
+    }; // if
+
+} // __kmp_stg_print_wait_policy
+
+// -------------------------------------------------------------------------------------------------
+// KMP_MONITOR_STACKSIZE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_monitor_stacksize( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_size(
+        name,
+        value,
+        __kmp_sys_min_stksize,
+        KMP_MAX_STKSIZE,
+        NULL,
+        & __kmp_monitor_stksize,
+        1
+    );
+} // __kmp_stg_parse_monitor_stacksize
+
+static void
+__kmp_stg_print_monitor_stacksize( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    if( __kmp_env_format ) {
+        if (  __kmp_monitor_stksize > 0 )
+            KMP_STR_BUF_PRINT_NAME_EX(name);
+        else
+            KMP_STR_BUF_PRINT_NAME;
+    } else {
+        __kmp_str_buf_print( buffer, "   %s", name );
+    }
+    if (  __kmp_monitor_stksize > 0 ) {
+        __kmp_str_buf_print_size( buffer, __kmp_monitor_stksize );
+    } else {
+        __kmp_str_buf_print( buffer, ": %s\n", KMP_I18N_STR( NotDefined ) );
+    }
+    if( __kmp_env_format && __kmp_monitor_stksize ) {
+            __kmp_str_buf_print( buffer, "'\n");
+    }
+
+} // __kmp_stg_print_monitor_stacksize
+
+// -------------------------------------------------------------------------------------------------
+// KMP_SETTINGS
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_settings( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_bool( name, value, & __kmp_settings );
+} // __kmp_stg_parse_settings
+
+static void
+__kmp_stg_print_settings( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_settings );
+} // __kmp_stg_print_settings
+
+// -------------------------------------------------------------------------------------------------
+// KMP_STACKPAD
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_stackpad( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_int(
+        name,                             // Env var name
+        value,                            // Env var value
+        KMP_MIN_STKPADDING,               // Min value
+        KMP_MAX_STKPADDING,               // Max value
+        & __kmp_stkpadding                // Var to initialize
+    );
+} // __kmp_stg_parse_stackpad
+
+static void
+__kmp_stg_print_stackpad( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_stkpadding );
+} // __kmp_stg_print_stackpad
+
+// -------------------------------------------------------------------------------------------------
+// KMP_STACKOFFSET
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_stackoffset( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_size(
+        name,                             // Env var name
+        value,                            // Env var value
+        KMP_MIN_STKOFFSET,                // Min value
+        KMP_MAX_STKOFFSET,                // Max value
+        NULL,                             //
+        & __kmp_stkoffset,                // Var to initialize
+        1
+    );
+} // __kmp_stg_parse_stackoffset
+
+static void
+__kmp_stg_print_stackoffset( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_size( buffer, name, __kmp_stkoffset );
+} // __kmp_stg_print_stackoffset
+
+// -------------------------------------------------------------------------------------------------
+// KMP_STACKSIZE, OMP_STACKSIZE, GOMP_STACKSIZE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_stacksize( char const * name, char const * value, void * data ) {
+
+    kmp_stg_ss_data_t *  stacksize = (kmp_stg_ss_data_t *) data;
+    int                  rc;
+
+    rc = __kmp_stg_check_rivals( name, value, stacksize->rivals );
+    if ( rc ) {
+        return;
+    }; // if
+    __kmp_stg_parse_size(
+        name,                     // Env var name
+        value,                    // Env var value
+        __kmp_sys_min_stksize,    // Min value
+        KMP_MAX_STKSIZE,          // Max value
+        & __kmp_env_stksize,      //
+        & __kmp_stksize,          // Var to initialize
+        stacksize->factor
+    );
+
+} // __kmp_stg_parse_stacksize
+
+// This function is called for printing both KMP_STACKSIZE (factor is 1) and OMP_STACKSIZE (factor is 1024).
+// Currently it is not possible to print the OMP_STACKSIZE value in bytes. We may consider adding
+// that possibility on customer request in the future.
+static void
+__kmp_stg_print_stacksize( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    kmp_stg_ss_data_t *  stacksize = (kmp_stg_ss_data_t *) data;
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_NAME_EX(name);
+        __kmp_str_buf_print_size( buffer, (__kmp_stksize % 1024) ? __kmp_stksize / stacksize->factor : __kmp_stksize );
+        __kmp_str_buf_print( buffer, "'\n" );
+    } else {
+        __kmp_str_buf_print( buffer, "   %s=", name );
+        __kmp_str_buf_print_size( buffer, (__kmp_stksize % 1024) ? __kmp_stksize / stacksize->factor : __kmp_stksize );
+        __kmp_str_buf_print( buffer, "\n" );
+    }
+} // __kmp_stg_print_stacksize
+
+// -------------------------------------------------------------------------------------------------
+// KMP_VERSION
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_version( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_bool( name, value, & __kmp_version );
+} // __kmp_stg_parse_version
+
+static void
+__kmp_stg_print_version( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_version );
+} // __kmp_stg_print_version
+
+// -------------------------------------------------------------------------------------------------
+// KMP_WARNINGS
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_warnings( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_bool( name, value, & __kmp_generate_warnings );
+    if (__kmp_generate_warnings != kmp_warnings_off) {   // AC: we have only 0/1 values documented,
+        __kmp_generate_warnings = kmp_warnings_explicit; //     so reset it to explicit in order to
+    }                                                    //     distinguish from default setting
+} // __kmp_env_parse_warnings
+
+static void
+__kmp_stg_print_warnings( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_generate_warnings ); // AC: TODO: change to print_int?
+} // __kmp_env_print_warnings                                      //     (needs documentation change)...
+
+// -------------------------------------------------------------------------------------------------
+// OMP_NESTED, OMP_NUM_THREADS
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_nested( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_bool( name, value, & __kmp_dflt_nested );
+} // __kmp_stg_parse_nested
+
+static void
+__kmp_stg_print_nested( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_dflt_nested );
+} // __kmp_stg_print_nested
+
+static void
+__kmp_parse_nested_num_threads( const char *var, const char *env, kmp_nested_nthreads_t *nth_array )
+{
+    const char *next = env;
+    const char *scan = next;
+
+    int total = 0;          // Count elements that were set. It'll be used as an array size
+    int prev_comma = FALSE; // For correct processing of sequential commas
+
+    // Count the number of values in the env. var string
+    for ( ; ; ) {
+        SKIP_WS( next );
+
+        if ( *next == '\0' ) {
+            break;
+        }
+        // Next character is not an integer or not a comma => end of list
+        if ( ( ( *next < '0' ) || ( *next > '9' ) ) && ( *next !=',') ) {
+            KMP_WARNING( NthSyntaxError, var, env );
+            return;
+        }
+        // The next character is ','
+        if ( *next == ',' ) {
+            // ',' is the first character
+            if ( total == 0 || prev_comma ) {
+                total++;
+            }
+            prev_comma = TRUE;
+            next++; //skip ','
+            SKIP_WS( next );
+        }
+        // Next character is a digit
+        if ( *next >= '0' && *next <= '9' ) {
+            prev_comma = FALSE;
+            SKIP_DIGITS( next );
+            total++;
+            const char *tmp = next;
+            SKIP_WS( tmp );
+            if ( ( *next == ' ' || *next == '\t' ) && ( *tmp >= '0' && *tmp <= '9' ) ) {
+                KMP_WARNING( NthSpacesNotAllowed, var, env );
+                return;
+            }
+        }
+    }
+    KMP_DEBUG_ASSERT( total > 0 );
+    if( total <= 0 ) {
+        KMP_WARNING( NthSyntaxError, var, env );
+        return;
+    }
+
+    // Check if the nested nthreads array exists
+    if ( ! nth_array->nth ) {
+        // Allocate an array of double the needed size
+        nth_array->nth = ( int * )KMP_INTERNAL_MALLOC( sizeof( int ) * total * 2 );
+        if ( nth_array->nth == NULL ) {
+            KMP_FATAL( MemoryAllocFailed );
+        }
+        nth_array->size = total * 2;
+    } else {
+        if ( nth_array->size < total ) {
+            // Increase the array size
+            do {
+                nth_array->size *= 2;
+            } while ( nth_array->size < total );
+
+            nth_array->nth = (int *) KMP_INTERNAL_REALLOC(
+                nth_array->nth, sizeof( int ) * nth_array->size );
+            if ( nth_array->nth == NULL ) {
+                KMP_FATAL( MemoryAllocFailed );
+            }
+        }
+    }
+    nth_array->used = total;
+    int i = 0;
+
+    prev_comma = FALSE;
+    total = 0;
+    // Save values in the array
+    for ( ; ; ) {
+        SKIP_WS( scan );
+        if ( *scan == '\0' ) {
+            break;
+        }
+        // The next character is ','
+        if ( *scan == ',' ) {
+            // ',' in the beginning of the list
+            if ( total == 0 ) {
+                // The value is supposed to be equal to __kmp_avail_proc but it is unknown at the moment.
+                // So let's put a placeholder (#threads = 0) to correct it later.
+                nth_array->nth[i++] = 0;
+                total++;
+            } else if ( prev_comma ) {
+                // Num threads is inherited from the previous level
+                nth_array->nth[i] = nth_array->nth[i - 1];
+                i++;
+                total++;
+            }
+            prev_comma = TRUE;
+            scan++; //skip ','
+            SKIP_WS( scan );
+        }
+        // Next character is a digit
+        if ( *scan >= '0' && *scan <= '9' ) {
+            int num;
+            const char *buf = scan;
+            char const * msg  = NULL;
+            prev_comma = FALSE;
+            SKIP_DIGITS( scan );
+            total++;
+
+            num = __kmp_str_to_int( buf, *scan );
+            if ( num < KMP_MIN_NTH ) {
+                msg = KMP_I18N_STR( ValueTooSmall );
+                num = KMP_MIN_NTH;
+            } else if ( num > __kmp_sys_max_nth ) {
+                msg = KMP_I18N_STR( ValueTooLarge );
+                num = __kmp_sys_max_nth;
+            }
+            if ( msg != NULL ) {
+                // Message is not empty. Print warning.
+                KMP_WARNING( ParseSizeIntWarn, var, env, msg );
+                KMP_INFORM( Using_int_Value, var, num );
+            }
+            nth_array->nth[i++] = num;
+        }
+    }
+}
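+
+// Example (an illustration added for clarity, derived from the rules above): parsing
+// the string "4,,2" yields nth = { 4, 4, 2 }: the empty slot between the commas
+// inherits the previous level's value. A leading comma, as in ",2", stores the
+// placeholder 0 in nth[0], to be replaced by __kmp_avail_proc later.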
+
+static void
+__kmp_stg_parse_num_threads( char const * name, char const * value, void * data ) {
+    // TODO: Remove this option. OMP_NUM_THREADS is a list of positive integers!
+    if ( ! __kmp_strcasecmp_with_sentinel( "all", value, 0 ) ) {
+        // The array of 1 element
+        __kmp_nested_nth.nth = ( int* )KMP_INTERNAL_MALLOC( sizeof( int ) );
+        __kmp_nested_nth.size = __kmp_nested_nth.used = 1;
+        __kmp_nested_nth.nth[0] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_xproc;
+    } else {
+        __kmp_parse_nested_num_threads( name, value, & __kmp_nested_nth );
+        if ( __kmp_nested_nth.nth ) {
+            __kmp_dflt_team_nth = __kmp_nested_nth.nth[0];
+            if ( __kmp_dflt_team_nth_ub < __kmp_dflt_team_nth ) {
+                __kmp_dflt_team_nth_ub = __kmp_dflt_team_nth;
+            }
+        }
+    }; // if
+    K_DIAG( 1, ( "__kmp_dflt_team_nth == %d\n", __kmp_dflt_team_nth ) );
+} // __kmp_stg_parse_num_threads
+
+static void
+__kmp_stg_print_num_threads( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_NAME;
+    } else {
+        __kmp_str_buf_print( buffer, "   %s", name );
+    }
+    if ( __kmp_nested_nth.used ) {
+        kmp_str_buf_t buf;
+        __kmp_str_buf_init( &buf );
+        for ( int i = 0; i < __kmp_nested_nth.used; i++) {
+            __kmp_str_buf_print( &buf, "%d", __kmp_nested_nth.nth[i] );
+            if ( i < __kmp_nested_nth.used - 1 ) {
+                __kmp_str_buf_print( &buf, "," );
+            }
+        }
+        __kmp_str_buf_print( buffer, "='%s'\n", buf.str );
+        __kmp_str_buf_free(&buf);
+    } else {
+        __kmp_str_buf_print( buffer, ": %s\n", KMP_I18N_STR( NotDefined ) );
+    }
+} // __kmp_stg_print_num_threads
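+
+// Example (an illustration added for clarity): a value of "all" (presumably via
+// OMP_NUM_THREADS, per the TODO above) sets a one-element array equal to
+// __kmp_xproc, while a list such as "8,4,2" makes 8 the default team size
+// (__kmp_dflt_team_nth) and reserves 4 and 2 for deeper nesting levels.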
+
+// -------------------------------------------------------------------------------------------------
+// OpenMP 3.0: KMP_TASKING, KMP_TASK_STEALING_CONSTRAINT, OMP_MAX_ACTIVE_LEVELS
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_tasking( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_int( name, value, 0, (int)tskm_max, (int *)&__kmp_tasking_mode );
+} // __kmp_stg_parse_tasking
+
+static void
+__kmp_stg_print_tasking( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_tasking_mode );
+} // __kmp_stg_print_tasking
+
+static void
+__kmp_stg_parse_task_stealing( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_int( name, value, 0, 1, (int *)&__kmp_task_stealing_constraint );
+} // __kmp_stg_parse_task_stealing
+
+static void
+__kmp_stg_print_task_stealing( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_task_stealing_constraint );
+} // __kmp_stg_print_task_stealing
+
+static void
+__kmp_stg_parse_max_active_levels( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_int( name, value, 0, KMP_MAX_ACTIVE_LEVELS_LIMIT, & __kmp_dflt_max_active_levels );
+} // __kmp_stg_parse_max_active_levels
+
+static void
+__kmp_stg_print_max_active_levels( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_dflt_max_active_levels );
+} // __kmp_stg_print_max_active_levels
+
+#if KMP_NESTED_HOT_TEAMS
+// -------------------------------------------------------------------------------------------------
+// KMP_HOT_TEAMS_MAX_LEVEL, KMP_HOT_TEAMS_MODE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_hot_teams_level( char const * name, char const * value, void * data ) {
+    if ( TCR_4(__kmp_init_parallel) ) {
+        KMP_WARNING( EnvParallelWarn, name );
+        return;
+    }   // read value before first parallel only
+    __kmp_stg_parse_int( name, value, 0, KMP_MAX_ACTIVE_LEVELS_LIMIT, & __kmp_hot_teams_max_level );
+} // __kmp_stg_parse_hot_teams_level
+
+static void
+__kmp_stg_print_hot_teams_level( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_hot_teams_max_level );
+} // __kmp_stg_print_hot_teams_level
+
+static void
+__kmp_stg_parse_hot_teams_mode( char const * name, char const * value, void * data ) {
+    if ( TCR_4(__kmp_init_parallel) ) {
+        KMP_WARNING( EnvParallelWarn, name );
+        return;
+    }   // read value before first parallel only
+    __kmp_stg_parse_int( name, value, 0, KMP_MAX_ACTIVE_LEVELS_LIMIT, & __kmp_hot_teams_mode );
+} // __kmp_stg_parse_hot_teams_mode
+
+static void
+__kmp_stg_print_hot_teams_mode( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_hot_teams_mode );
+} // __kmp_stg_print_hot_teams_mode
+
+#endif // KMP_NESTED_HOT_TEAMS
+
+// -------------------------------------------------------------------------------------------------
+// KMP_HANDLE_SIGNALS
+// -------------------------------------------------------------------------------------------------
+
+#if KMP_HANDLE_SIGNALS
+
+static void
+__kmp_stg_parse_handle_signals( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_bool( name, value, & __kmp_handle_signals );
+} // __kmp_stg_parse_handle_signals
+
+static void
+__kmp_stg_print_handle_signals( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_handle_signals );
+} // __kmp_stg_print_handle_signals
+
+#endif // KMP_HANDLE_SIGNALS
+
+// -------------------------------------------------------------------------------------------------
+// KMP_X_DEBUG, KMP_DEBUG, KMP_DEBUG_BUF_*, KMP_DIAG
+// -------------------------------------------------------------------------------------------------
+
+#ifdef KMP_DEBUG
+
+#define KMP_STG_X_DEBUG( x )                                                                            \
+    static void __kmp_stg_parse_##x##_debug( char const * name, char const * value, void * data ) {     \
+        __kmp_stg_parse_int( name, value, 0, INT_MAX, & kmp_##x##_debug );                              \
+    } /* __kmp_stg_parse_x_debug */                                                                     \
+    static void __kmp_stg_print_##x##_debug( kmp_str_buf_t * buffer, char const * name, void * data ) { \
+        __kmp_stg_print_int( buffer, name, kmp_##x##_debug );                                           \
+    } /* __kmp_stg_print_x_debug */
+
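+// Note (added for clarity): each invocation below stamps out a parse/print pair; e.g.
+// KMP_STG_X_DEBUG( a ) expands to __kmp_stg_parse_a_debug() and
+// __kmp_stg_print_a_debug(), both operating on kmp_a_debug.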
+KMP_STG_X_DEBUG( a )
+KMP_STG_X_DEBUG( b )
+KMP_STG_X_DEBUG( c )
+KMP_STG_X_DEBUG( d )
+KMP_STG_X_DEBUG( e )
+KMP_STG_X_DEBUG( f )
+
+#undef KMP_STG_X_DEBUG
+
+static void
+__kmp_stg_parse_debug( char const * name, char const * value, void * data ) {
+    int debug = 0;
+    __kmp_stg_parse_int( name, value, 0, INT_MAX, & debug );
+    if ( kmp_a_debug < debug ) {
+        kmp_a_debug = debug;
+    }; // if
+    if ( kmp_b_debug < debug ) {
+        kmp_b_debug = debug;
+    }; // if
+    if ( kmp_c_debug < debug ) {
+        kmp_c_debug = debug;
+    }; // if
+    if ( kmp_d_debug < debug ) {
+        kmp_d_debug = debug;
+    }; // if
+    if ( kmp_e_debug < debug ) {
+        kmp_e_debug = debug;
+    }; // if
+    if ( kmp_f_debug < debug ) {
+        kmp_f_debug = debug;
+    }; // if
+} // __kmp_stg_parse_debug
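+
+// Example (an illustration added for clarity): KMP_DEBUG=3 raises every per-letter
+// level currently below 3, so kmp_a_debug..kmp_f_debug all end up >= 3; a level
+// already set higher (say, via KMP_A_DEBUG=5, per the KMP_X_DEBUG naming above) is
+// left untouched.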
+
+static void
+__kmp_stg_parse_debug_buf( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_bool( name, value, & __kmp_debug_buf );
+    // !!! TODO: Move buffer initialization out of this file! It may work incorrectly if
+    // KMP_DEBUG_BUF is parsed before KMP_DEBUG_BUF_LINES or KMP_DEBUG_BUF_CHARS.
+    if ( __kmp_debug_buf ) {
+        int i;
+        int elements = __kmp_debug_buf_lines * __kmp_debug_buf_chars;
+
+        /* allocate and initialize all entries in debug buffer to empty */
+        __kmp_debug_buffer = (char *) __kmp_page_allocate( elements * sizeof( char ) );
+        for ( i = 0; i < elements; i += __kmp_debug_buf_chars )
+            __kmp_debug_buffer[i] = '\0';
+
+        __kmp_debug_count = 0;
+    }
+    K_DIAG( 1, ( "__kmp_debug_buf = %d\n", __kmp_debug_buf ) );
+} // __kmp_stg_parse_debug_buf
+
+static void
+__kmp_stg_print_debug_buf( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_debug_buf );
+} // __kmp_stg_print_debug_buf
+
+static void
+__kmp_stg_parse_debug_buf_atomic( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_bool( name, value, & __kmp_debug_buf_atomic );
+} // __kmp_stg_parse_debug_buf_atomic
+
+static void
+__kmp_stg_print_debug_buf_atomic( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_debug_buf_atomic );
+} // __kmp_stg_print_debug_buf_atomic
+
+static void
+__kmp_stg_parse_debug_buf_chars( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_int(
+        name,
+        value,
+        KMP_DEBUG_BUF_CHARS_MIN,
+        INT_MAX,
+        & __kmp_debug_buf_chars
+    );
+} // __kmp_stg_parse_debug_buf_chars
+
+static void
+__kmp_stg_print_debug_buf_chars( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_debug_buf_chars );
+} // __kmp_stg_print_debug_buf_chars
+
+static void
+__kmp_stg_parse_debug_buf_lines( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_int(
+        name,
+        value,
+        KMP_DEBUG_BUF_LINES_MIN,
+        INT_MAX,
+        & __kmp_debug_buf_lines
+    );
+} // __kmp_stg_parse_debug_buf_lines
+
+static void
+__kmp_stg_print_debug_buf_lines( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_debug_buf_lines );
+} // __kmp_stg_print_debug_buf_lines
+
+static void
+__kmp_stg_parse_diag( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_int( name, value, 0, INT_MAX, & kmp_diag );
+} // __kmp_stg_parse_diag
+
+static void
+__kmp_stg_print_diag( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, kmp_diag );
+} // __kmp_stg_print_diag
+
+#endif // KMP_DEBUG
+
+// -------------------------------------------------------------------------------------------------
+// KMP_ALIGN_ALLOC
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_align_alloc( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_size(
+        name,
+        value,
+        CACHE_LINE,
+        INT_MAX,
+        NULL,
+        & __kmp_align_alloc,
+        1
+    );
+} // __kmp_stg_parse_align_alloc
+
+static void
+__kmp_stg_print_align_alloc( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_size( buffer, name, __kmp_align_alloc );
+} // __kmp_stg_print_align_alloc
+
+// -------------------------------------------------------------------------------------------------
+// KMP_PLAIN_BARRIER, KMP_FORKJOIN_BARRIER, KMP_REDUCTION_BARRIER
+// -------------------------------------------------------------------------------------------------
+
+// TODO: Remove the __kmp_barrier_branch_bit_env_name variable, remove the loops from the parse and print
+//       functions, pass required info through data argument.
+
+static void
+__kmp_stg_parse_barrier_branch_bit( char const * name, char const * value, void * data ) {
+    const char *var;
+
+    /* ---------- Barrier branch bit control ------------ */
+    for ( int i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
+        var = __kmp_barrier_branch_bit_env_name[ i ];
+        if ( ( strcmp( var, name) == 0 ) && ( value != 0 ) ) {
+            char *comma;
+
+            comma = (char *) strchr( value, ',' );
+            __kmp_barrier_gather_branch_bits[ i ] = ( kmp_uint32 ) __kmp_str_to_int( value, ',' );
+            /* is there a specified release parameter? */
+            if ( comma == NULL ) {
+                __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
+            } else {
+                __kmp_barrier_release_branch_bits[ i ] = (kmp_uint32) __kmp_str_to_int( comma + 1, 0 );
+
+                if ( __kmp_barrier_release_branch_bits[ i ] > KMP_MAX_BRANCH_BITS ) {
+                    __kmp_msg( kmp_ms_warning, KMP_MSG( BarrReleaseValueInvalid, name, comma + 1 ), __kmp_msg_null );
+                    __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
+                }
+            }
+            if ( __kmp_barrier_gather_branch_bits[ i ] > KMP_MAX_BRANCH_BITS ) {
+                KMP_WARNING( BarrGatherValueInvalid, name, value );
+                KMP_INFORM( Using_uint_Value, name, __kmp_barrier_gather_bb_dflt );
+                __kmp_barrier_gather_branch_bits[ i ] = __kmp_barrier_gather_bb_dflt;
+            }
+        }
+        K_DIAG(1, ("%s == %d,%d\n", __kmp_barrier_branch_bit_env_name[ i ], \
+                   __kmp_barrier_gather_branch_bits [ i ], \
+                   __kmp_barrier_release_branch_bits [ i ]))
+    }
+} // __kmp_stg_parse_barrier_branch_bit
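+
+// Example (an illustration added for clarity): KMP_PLAIN_BARRIER="3,2" sets the plain
+// barrier's gather branch bits to 3 and its release branch bits to 2 (fan-in 2^3 and
+// fan-out 2^2, assuming branch bits encode log2 of the branching factor), while
+// KMP_PLAIN_BARRIER="3" leaves the release side at __kmp_barrier_release_bb_dflt.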
+
+static void
+__kmp_stg_print_barrier_branch_bit( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    const char *var;
+    for ( int i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
+        var = __kmp_barrier_branch_bit_env_name[ i ];
+        if ( strcmp( var, name) == 0  ) {
+            if( __kmp_env_format ) {
+                KMP_STR_BUF_PRINT_NAME_EX(__kmp_barrier_branch_bit_env_name[ i ]);
+            } else {
+                __kmp_str_buf_print( buffer, "   %s='", __kmp_barrier_branch_bit_env_name[ i ] );
+            }
+            __kmp_str_buf_print( buffer, "%d,%d'\n", __kmp_barrier_gather_branch_bits [ i ], __kmp_barrier_release_branch_bits [ i ]);
+        }
+    }
+} // __kmp_stg_print_barrier_branch_bit
+
+
+// -------------------------------------------------------------------------------------------------
+// KMP_PLAIN_BARRIER_PATTERN, KMP_FORKJOIN_BARRIER_PATTERN, KMP_REDUCTION_BARRIER_PATTERN
+// -------------------------------------------------------------------------------------------------
+
+// TODO: Remove __kmp_barrier_pattern_name variable, remove loops from parse and print functions,
+//       pass required data to functions through data argument.
+
+static void
+__kmp_stg_parse_barrier_pattern( char const * name, char const * value, void * data ) {
+    const char *var;
+    /* ---------- Barrier method control ------------ */
+
+    for ( int i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
+        var = __kmp_barrier_pattern_env_name[ i ];
+
+        if ( ( strcmp ( var, name ) == 0 ) && ( value != 0 ) ) {
+            int j;
+            char *comma = (char *) strchr( value, ',' );
+
+            /* handle first parameter: gather pattern */
+            for ( j = bp_linear_bar; j<bp_last_bar; j++ ) {
+                if (__kmp_match_with_sentinel( __kmp_barrier_pattern_name[j], value, 1, ',' )) {
+                   __kmp_barrier_gather_pattern[ i ] = (kmp_bar_pat_e) j;
+                   break;
+                }
+            }
+            if ( j == bp_last_bar ) {
+                KMP_WARNING( BarrGatherValueInvalid, name, value );
+                KMP_INFORM( Using_str_Value, name, __kmp_barrier_pattern_name[ bp_linear_bar ] );
+            }
+
+            /* handle second parameter: release pattern */
+            if ( comma != NULL ) {
+                for ( j = bp_linear_bar; j < bp_last_bar; j++ ) {
+                    if ( __kmp_str_match( __kmp_barrier_pattern_name[j], 1, comma + 1 ) ) {
+                       __kmp_barrier_release_pattern[ i ] = (kmp_bar_pat_e) j;
+                       break;
+                    }
+                }
+                if (j == bp_last_bar) {
+                    __kmp_msg( kmp_ms_warning, KMP_MSG( BarrReleaseValueInvalid, name, comma + 1 ), __kmp_msg_null );
+                    KMP_INFORM( Using_str_Value, name, __kmp_barrier_pattern_name[ bp_linear_bar ] );
+                }
+            }
+        }
+    }
+} // __kmp_stg_parse_barrier_pattern
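+
+// Example (an illustration added for clarity): values take the form
+// "<gather>,<release>", e.g. KMP_FORKJOIN_BARRIER_PATTERN="linear,linear" (assuming
+// the bp_linear_bar entry of __kmp_barrier_pattern_name[] is named "linear"); an
+// unrecognized name draws a warning, and the bp_linear_bar name is reported as the
+// value in effect.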
+
+static void
+__kmp_stg_print_barrier_pattern( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    const char *var;
+    for ( int i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
+        var = __kmp_barrier_pattern_env_name[ i ];
+        if ( strcmp ( var, name ) == 0 ) {
+            int j = __kmp_barrier_gather_pattern [ i ];
+            int k = __kmp_barrier_release_pattern [ i ];
+            if( __kmp_env_format ) {
+                KMP_STR_BUF_PRINT_NAME_EX(__kmp_barrier_pattern_env_name[ i ]);
+            } else {
+                __kmp_str_buf_print( buffer, "   %s='", __kmp_barrier_pattern_env_name[ i ] );
+            }
+            __kmp_str_buf_print( buffer, "%s,%s'\n", __kmp_barrier_pattern_name [ j ], __kmp_barrier_pattern_name [ k ]);
+        }
+    }
+} // __kmp_stg_print_barrier_pattern
+
+// -------------------------------------------------------------------------------------------------
+// KMP_ABORT_DELAY
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_abort_delay( char const * name, char const * value, void * data ) {
+    // Units of KMP_ABORT_DELAY are seconds, units of __kmp_abort_delay are milliseconds.
+    int delay = __kmp_abort_delay / 1000;
+    __kmp_stg_parse_int( name, value, 0, INT_MAX / 1000, & delay );
+    __kmp_abort_delay = delay * 1000;
+} // __kmp_stg_parse_abort_delay
+
+static void
+__kmp_stg_print_abort_delay( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_abort_delay );
+} // __kmp_stg_print_abort_delay
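+
+// Example (an illustration added for clarity): KMP_ABORT_DELAY=10 is parsed as 10
+// seconds and stored as __kmp_abort_delay = 10000; note that the print function above
+// emits the raw millisecond value rather than the seconds that were parsed.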
+
+// -------------------------------------------------------------------------------------------------
+// KMP_CPUINFO_FILE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_cpuinfo_file( char const * name, char const * value, void * data ) {
+    #if KMP_AFFINITY_SUPPORTED
+        __kmp_stg_parse_str( name, value, & __kmp_cpuinfo_file );
+        K_DIAG( 1, ( "__kmp_cpuinfo_file == %s\n", __kmp_cpuinfo_file ) );
+    #endif
+} //__kmp_stg_parse_cpuinfo_file
+
+static void
+__kmp_stg_print_cpuinfo_file( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    #if KMP_AFFINITY_SUPPORTED
+        if( __kmp_env_format ) {
+            KMP_STR_BUF_PRINT_NAME;
+        } else {
+            __kmp_str_buf_print( buffer, "   %s", name );
+        }
+        if ( __kmp_cpuinfo_file ) {
+            __kmp_str_buf_print( buffer, "='%s'\n", __kmp_cpuinfo_file );
+        } else {
+            __kmp_str_buf_print( buffer, ": %s\n", KMP_I18N_STR( NotDefined ) );
+        }
+    #endif
+} //__kmp_stg_print_cpuinfo_file
+
+// -------------------------------------------------------------------------------------------------
+// KMP_FORCE_REDUCTION, KMP_DETERMINISTIC_REDUCTION
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_force_reduction( char const * name, char const * value, void * data )
+{
+    kmp_stg_fr_data_t * reduction = (kmp_stg_fr_data_t *) data;
+    int                 rc;
+
+    rc = __kmp_stg_check_rivals( name, value, reduction->rivals );
+    if ( rc ) {
+        return;
+    }; // if
+    if ( reduction->force ) {
+        if( value != 0 ) {
+            if( __kmp_str_match( "critical", 0, value ) )
+               __kmp_force_reduction_method = critical_reduce_block;
+            else if( __kmp_str_match( "atomic", 0, value ) )
+               __kmp_force_reduction_method = atomic_reduce_block;
+            else if( __kmp_str_match( "tree", 0, value ) )
+               __kmp_force_reduction_method = tree_reduce_block;
+            else {
+                KMP_FATAL( UnknownForceReduction, name, value );
+            }
+        }
+    } else {
+        __kmp_stg_parse_bool( name, value, & __kmp_determ_red );
+        if( __kmp_determ_red ) {
+            __kmp_force_reduction_method = tree_reduce_block;
+        } else {
+            __kmp_force_reduction_method = reduction_method_not_defined;
+        }
+    }
+    K_DIAG( 1, ( "__kmp_force_reduction_method == %d\n", __kmp_force_reduction_method ) );
+} // __kmp_stg_parse_force_reduction
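+
+// Example (an illustration added for clarity): KMP_FORCE_REDUCTION=critical forces
+// critical_reduce_block for all reductions. For the boolean flavor of this handler
+// (reduction->force == FALSE, i.e. KMP_DETERMINISTIC_REDUCTION), a "true" value
+// selects tree_reduce_block instead.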
+
+static void
+__kmp_stg_print_force_reduction( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    kmp_stg_fr_data_t * reduction = (kmp_stg_fr_data_t *) data;
+    if ( reduction->force ) {
+        if( __kmp_force_reduction_method == critical_reduce_block) {
+            __kmp_stg_print_str( buffer, name, "critical");
+        } else if ( __kmp_force_reduction_method == atomic_reduce_block ) {
+            __kmp_stg_print_str( buffer, name, "atomic");
+        } else if ( __kmp_force_reduction_method == tree_reduce_block ) {
+            __kmp_stg_print_str( buffer, name, "tree");
+        } else {
+            if( __kmp_env_format ) {
+                KMP_STR_BUF_PRINT_NAME;
+            } else {
+                __kmp_str_buf_print( buffer, "   %s", name );
+            }
+            __kmp_str_buf_print( buffer, ": %s\n", KMP_I18N_STR( NotDefined ) );
+        }
+    } else {
+        __kmp_stg_print_bool( buffer, name, __kmp_determ_red );
+    }
+} // __kmp_stg_print_force_reduction
+
+// -------------------------------------------------------------------------------------------------
+// KMP_STORAGE_MAP
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_storage_map( char const * name, char const * value, void * data ) {
+    if ( __kmp_str_match(  "verbose", 1, value ) ) {
+        __kmp_storage_map         = TRUE;
+        __kmp_storage_map_verbose = TRUE;
+        __kmp_storage_map_verbose_specified = TRUE;
+
+    } else {
+        __kmp_storage_map_verbose = FALSE;
+        __kmp_stg_parse_bool( name, value, & __kmp_storage_map ); // !!!
+    }; // if
+} // __kmp_stg_parse_storage_map
+
+static void
+__kmp_stg_print_storage_map( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    if ( __kmp_storage_map_verbose || __kmp_storage_map_verbose_specified ) {
+        __kmp_stg_print_str( buffer, name, "verbose" );
+    } else {
+        __kmp_stg_print_bool( buffer, name, __kmp_storage_map );
+    }
+} // __kmp_stg_print_storage_map
+
+// -------------------------------------------------------------------------------------------------
+// KMP_ALL_THREADPRIVATE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_all_threadprivate( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_int( name, value, __kmp_allThreadsSpecified ? __kmp_max_nth : 1, __kmp_max_nth,
+        & __kmp_tp_capacity );
+} // __kmp_stg_parse_all_threadprivate
+
+static void
+__kmp_stg_print_all_threadprivate( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_tp_capacity );
+} // __kmp_stg_print_all_threadprivate
+
+// -------------------------------------------------------------------------------------------------
+// KMP_FOREIGN_THREADS_THREADPRIVATE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_foreign_threads_threadprivate( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_bool( name, value, & __kmp_foreign_tp );
+} // __kmp_stg_parse_foreign_threads_threadprivate
+
+static void
+__kmp_stg_print_foreign_threads_threadprivate( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_foreign_tp );
+} // __kmp_stg_print_foreign_threads_threadprivate
+
+
+// -------------------------------------------------------------------------------------------------
+// KMP_AFFINITY, GOMP_CPU_AFFINITY, KMP_TOPOLOGY_METHOD
+// -------------------------------------------------------------------------------------------------
+
+#if KMP_AFFINITY_SUPPORTED
+//
+// Parse the proc id list.  Return TRUE if successful, FALSE otherwise.
+//
+static int
+__kmp_parse_affinity_proc_id_list( const char *var, const char *env,
+    const char **nextEnv, char **proclist )
+{
+    const char *scan = env;
+    const char *next = scan;
+    int empty = TRUE;
+
+    *proclist = NULL;
+
+    for (;;) {
+        int start, end, stride;
+
+        SKIP_WS(scan);
+        next = scan;
+        if (*next == '\0') {
+            break;
+        }
+
+        if (*next == '{') {
+            int num;
+            next++;     // skip '{'
+            SKIP_WS(next);
+            scan = next;
+
+            //
+            // Read the first integer in the set.
+            //
+            if ((*next < '0') || (*next > '9')) {
+                KMP_WARNING( AffSyntaxError, var );
+                return FALSE;
+            }
+            SKIP_DIGITS(next);
+            num = __kmp_str_to_int(scan, *next);
+            KMP_ASSERT(num >= 0);
+
+            for (;;) {
+                //
+                // Check for end of set.
+                //
+                SKIP_WS(next);
+                if (*next == '}') {
+                    next++;     // skip '}'
+                    break;
+                }
+
+                //
+                // Skip optional comma.
+                //
+                if (*next == ',') {
+                    next++;
+                }
+                SKIP_WS(next);
+
+                //
+                // Read the next integer in the set.
+                //
+                scan = next;
+                if ((*next < '0') || (*next > '9')) {
+                    KMP_WARNING( AffSyntaxError, var );
+                    return FALSE;
+                }
+
+                SKIP_DIGITS(next);
+                num = __kmp_str_to_int(scan, *next);
+                KMP_ASSERT(num >= 0);
+            }
+            empty = FALSE;
+
+            SKIP_WS(next);
+            if (*next == ',') {
+                next++;
+            }
+            scan = next;
+            continue;
+        }
+
+        //
+        // Next character is not an integer => end of list
+        //
+        if ((*next < '0') || (*next > '9')) {
+            if (empty) {
+                KMP_WARNING( AffSyntaxError, var );
+                return FALSE;
+            }
+            break;
+        }
+
+        //
+        // Read the first integer.
+        //
+        SKIP_DIGITS(next);
+        start = __kmp_str_to_int(scan, *next);
+        KMP_ASSERT(start >= 0);
+        SKIP_WS(next);
+
+        //
+        // If this isn't a range, then go on.
+        //
+        if (*next != '-') {
+            empty = FALSE;
+
+            //
+            // Skip optional comma.
+            //
+            if (*next == ',') {
+                next++;
+            }
+            scan = next;
+            continue;
+        }
+
+        //
+        // This is a range.  Skip over the '-' and read in the 2nd int.
+        //
+        next++;         // skip '-'
+        SKIP_WS(next);
+        scan = next;
+        if ((*next < '0') || (*next > '9')) {
+            KMP_WARNING( AffSyntaxError, var );
+            return FALSE;
+        }
+        SKIP_DIGITS(next);
+        end = __kmp_str_to_int(scan, *next);
+        KMP_ASSERT(end >= 0);
+
+        //
+        // Check for a stride parameter
+        //
+        stride = 1;
+        SKIP_WS(next);
+        if (*next == ':') {
+            //
+            // A stride is specified.  Skip over the ':' and read the 3rd int.
+            //
+            int sign = +1;
+            next++;         // skip ':'
+            SKIP_WS(next);
+            scan = next;
+            if (*next == '-') {
+                sign = -1;
+                next++;
+                SKIP_WS(next);
+                scan = next;
+            }
+            if ((*next < '0') || (*next > '9')) {
+                KMP_WARNING( AffSyntaxError, var );
+                return FALSE;
+            }
+            SKIP_DIGITS(next);
+            stride = __kmp_str_to_int(scan, *next);
+            KMP_ASSERT(stride >= 0);
+            stride *= sign;
+        }
+
+        //
+        // Do some range checks.
+        //
+        if (stride == 0) {
+            KMP_WARNING( AffZeroStride, var );
+            return FALSE;
+        }
+        if (stride > 0) {
+            if (start > end) {
+                KMP_WARNING( AffStartGreaterEnd, var, start, end );
+                return FALSE;
+            }
+        }
+        else {
+            if (start < end) {
+                KMP_WARNING( AffStrideLessZero, var, start, end );
+                return FALSE;
+            }
+        }
+        if ((end - start) / stride > 65536 ) {
+            KMP_WARNING( AffRangeTooBig, var, end, start, stride );
+            return FALSE;
+        }
+
+        empty = FALSE;
+
+        //
+        // Skip optional comma.
+        //
+        SKIP_WS(next);
+        if (*next == ',') {
+            next++;
+        }
+        scan = next;
+    }
+
+    *nextEnv = next;
+
+    {
+        int len = next - env;
+        char *retlist = (char *)__kmp_allocate((len + 1) * sizeof(char));
+        KMP_MEMCPY_S(retlist, (len+1)*sizeof(char), env, len * sizeof(char));
+        retlist[len] = '\0';
+        *proclist = retlist;
+    }
+    return TRUE;
+}
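+
+// Example (an illustration added for clarity): the parser above accepts lists such as
+// "0,2,4-6" (singletons plus a range), "0-14:2" (a range with stride 2), and
+// "{0,1},{2,3}" (explicit sets in braces); it only validates the syntax and returns a
+// heap-allocated copy of the accepted text through *proclist.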
+
+
+//
+// If KMP_AFFINITY is specified without a type, then
+// __kmp_affinity_notype should point to its setting.
+//
+static kmp_setting_t *__kmp_affinity_notype = NULL;
+
+static void
+__kmp_parse_affinity_env( char const * name, char const * value,
+    enum affinity_type  * out_type,
+    char                ** out_proclist,
+    int                 * out_verbose,
+    int                 * out_warn,
+    int                 * out_respect,
+    enum affinity_gran  * out_gran,
+    int                 * out_gran_levels,
+    int                 * out_dups,
+    int                 * out_compact,
+    int                 * out_offset
+)
+{
+    char * buffer = NULL;    // Copy of env var value.
+    char * buf    = NULL;    // Buffer for strtok_r() function.
+    char * next = NULL;      // end of token / start of next.
+    const char * start;      // start of current token (for err msgs)
+    int    count  = 0;       // Counter of parsed integer numbers.
+    int    number[ 2 ];      // Parsed numbers.
+
+    // Guards.
+    int type         = 0;
+    int proclist     = 0;
+    int max_proclist = 0;
+    int verbose      = 0;
+    int warnings     = 0;
+    int respect      = 0;
+    int gran         = 0;
+    int dups         = 0;
+
+    KMP_ASSERT( value != NULL );
+
+    if ( TCR_4(__kmp_init_middle) ) {
+        KMP_WARNING( EnvMiddleWarn, name );
+        __kmp_env_toPrint( name, 0 );
+        return;
+    }
+    __kmp_env_toPrint( name, 1 );
+
+    buffer = __kmp_str_format( "%s", value );         // Copy env var to keep original intact.
+    buf = buffer;
+    SKIP_WS(buf);
+
+    // Helper macros.
+
+    //
+    // If we see a parse error, emit a warning and scan to the next ",".
+    //
+    // FIXME - there's got to be a better way to print an error
+    // message, hopefully without overwriting pieces of buf.
+    //
+    #define EMIT_WARN(skip,errlist) \
+        {                                                                     \
+            char ch;                                                          \
+            if (skip) {                                                       \
+                SKIP_TO(next, ',');                                           \
+            }                                                                 \
+            ch = *next;                                                       \
+            *next = '\0';                                                     \
+            KMP_WARNING errlist;                                              \
+            *next = ch;                                                       \
+            if (skip) {                                                       \
+                if (ch == ',') next++;                                        \
+            }                                                                 \
+            buf = next;                                                       \
+        }
+
+    #define _set_param(_guard,_var,_val)                                      \
+        {                                                                     \
+            if ( _guard == 0 ) {                                              \
+                _var = _val;                                                  \
+            } else {                                                          \
+                EMIT_WARN( FALSE, ( AffParamDefined, name, start ) );         \
+            };                                                                \
+            ++ _guard;                                                        \
+        }
+
+    #define set_type(val)          _set_param( type,     *out_type,        val )
+    #define set_verbose(val)       _set_param( verbose,  *out_verbose,     val )
+    #define set_warnings(val)      _set_param( warnings, *out_warn,        val )
+    #define set_respect(val)       _set_param( respect,  *out_respect,     val )
+    #define set_dups(val)          _set_param( dups,     *out_dups,        val )
+    #define set_proclist(val)      _set_param( proclist, *out_proclist,    val )
+
+    #define set_gran(val,levels)                                              \
+        {                                                                     \
+            if ( gran == 0 ) {                                                \
+                *out_gran = val;                                              \
+                *out_gran_levels = levels;                                    \
+            } else {                                                          \
+                EMIT_WARN( FALSE, ( AffParamDefined, name, start ) );         \
+            };                                                                \
+            ++ gran;                                                          \
+        }
+
+# if OMP_40_ENABLED
+    KMP_DEBUG_ASSERT( ( __kmp_nested_proc_bind.bind_types != NULL )
+      && ( __kmp_nested_proc_bind.used > 0 ) );
+# endif
+
+    while ( *buf != '\0' ) {
+        start = next = buf;
+
+        if (__kmp_match_str("none", buf, (const char **)&next)) {
+            set_type( affinity_none );
+# if OMP_40_ENABLED
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+# endif
+            buf = next;
+        } else if (__kmp_match_str("scatter", buf, (const char **)&next)) {
+            set_type( affinity_scatter );
+# if OMP_40_ENABLED
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+# endif
+            buf = next;
+        } else if (__kmp_match_str("compact", buf, (const char **)&next)) {
+            set_type( affinity_compact );
+# if OMP_40_ENABLED
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+# endif
+            buf = next;
+        } else if (__kmp_match_str("logical", buf, (const char **)&next)) {
+            set_type( affinity_logical );
+# if OMP_40_ENABLED
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+# endif
+            buf = next;
+        } else if (__kmp_match_str("physical", buf, (const char **)&next)) {
+            set_type( affinity_physical );
+# if OMP_40_ENABLED
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+# endif
+            buf = next;
+        } else if (__kmp_match_str("explicit", buf, (const char **)&next)) {
+            set_type( affinity_explicit );
+# if OMP_40_ENABLED
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+# endif
+            buf = next;
+        } else if (__kmp_match_str("balanced", buf, (const char **)&next)) {
+            set_type( affinity_balanced );
+#  if OMP_40_ENABLED
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+#  endif
+            buf = next;
+        } else if (__kmp_match_str("disabled", buf, (const char **)&next)) {
+            set_type( affinity_disabled );
+# if OMP_40_ENABLED
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+# endif
+            buf = next;
+        } else if (__kmp_match_str("verbose", buf, (const char **)&next)) {
+            set_verbose( TRUE );
+            buf = next;
+        } else if (__kmp_match_str("noverbose", buf, (const char **)&next)) {
+            set_verbose( FALSE );
+            buf = next;
+        } else if (__kmp_match_str("warnings", buf, (const char **)&next)) {
+            set_warnings( TRUE );
+            buf = next;
+        } else if (__kmp_match_str("nowarnings", buf, (const char **)&next)) {
+            set_warnings( FALSE );
+            buf = next;
+        } else if (__kmp_match_str("respect", buf, (const char **)&next)) {
+            set_respect( TRUE );
+            buf = next;
+        } else if (__kmp_match_str("norespect", buf, (const char **)&next)) {
+            set_respect( FALSE );
+            buf = next;
+        } else if (__kmp_match_str("duplicates", buf, (const char **)&next)
+          || __kmp_match_str("dups", buf, (const char **)&next)) {
+            set_dups( TRUE );
+            buf = next;
+        } else if (__kmp_match_str("noduplicates", buf, (const char **)&next)
+          || __kmp_match_str("nodups", buf, (const char **)&next)) {
+            set_dups( FALSE );
+            buf = next;
+        } else if (__kmp_match_str("granularity", buf, (const char **)&next)
+          || __kmp_match_str("gran", buf, (const char **)&next)) {
+            SKIP_WS(next);
+            if (*next != '=') {
+                EMIT_WARN( TRUE, ( AffInvalidParam, name, start ) );
+                continue;
+            }
+            next++;      // skip '='
+            SKIP_WS(next);
+
+            buf = next;
+            if (__kmp_match_str("fine", buf, (const char **)&next)) {
+                set_gran( affinity_gran_fine, -1 );
+                buf = next;
+            } else if (__kmp_match_str("thread", buf, (const char **)&next)) {
+                set_gran( affinity_gran_thread, -1 );
+                buf = next;
+            } else if (__kmp_match_str("core", buf, (const char **)&next)) {
+                set_gran( affinity_gran_core, -1 );
+                buf = next;
+            } else if (__kmp_match_str("package", buf, (const char **)&next)) {
+                set_gran( affinity_gran_package, -1 );
+                buf = next;
+            } else if (__kmp_match_str("node", buf, (const char **)&next)) {
+                set_gran( affinity_gran_node, -1 );
+                buf = next;
+# if KMP_GROUP_AFFINITY
+            } else if (__kmp_match_str("group", buf, (const char **)&next)) {
+                set_gran( affinity_gran_group, -1 );
+                buf = next;
+# endif /* KMP_GROUP_AFFINITY */
+            } else if ((*buf >= '0') && (*buf <= '9')) {
+                int n;
+                next = buf;
+                SKIP_DIGITS(next);
+                n = __kmp_str_to_int( buf, *next );
+                KMP_ASSERT(n >= 0);
+                buf = next;
+                set_gran( affinity_gran_default, n );
+            } else {
+                EMIT_WARN( TRUE, ( AffInvalidParam, name, start ) );
+                continue;
+            }
+        } else if (__kmp_match_str("proclist", buf, (const char **)&next)) {
+            char *temp_proclist;
+
+            SKIP_WS(next);
+            if (*next != '=') {
+                EMIT_WARN( TRUE, ( AffInvalidParam, name, start ) );
+                continue;
+            }
+            next++;      // skip '='
+            SKIP_WS(next);
+            if (*next != '[') {
+                EMIT_WARN( TRUE, ( AffInvalidParam, name, start ) );
+                continue;
+            }
+            next++;      // skip '['
+            buf = next;
+            if (! __kmp_parse_affinity_proc_id_list(name, buf,
+              (const char **)&next, &temp_proclist)) {
+                //
+                // warning already emitted.
+                //
+                SKIP_TO(next, ']');
+                if (*next == ']') next++;
+                SKIP_TO(next, ',');
+                if (*next == ',') next++;
+                buf = next;
+                continue;
+            }
+            if (*next != ']') {
+                EMIT_WARN( TRUE, ( AffInvalidParam, name, start ) );
+                continue;
+            }
+            next++;      // skip ']'
+            set_proclist( temp_proclist );
+        } else if ((*buf >= '0') && (*buf <= '9')) {
+            // Parse integer numbers -- permute and offset.
+            int n;
+            next = buf;
+            SKIP_DIGITS(next);
+            n = __kmp_str_to_int( buf, *next );
+            KMP_ASSERT(n >= 0);
+            buf = next;
+            if ( count < 2 ) {
+                number[ count ] = n;
+            } else {
+                KMP_WARNING( AffManyParams, name, start );
+            }; // if
+            ++ count;
+        } else {
+            EMIT_WARN( TRUE, ( AffInvalidParam, name, start ) );
+            continue;
+        }
+
+        SKIP_WS(next);
+        if (*next == ',') {
+            next++;
+            SKIP_WS(next);
+        }
+        else if (*next != '\0') {
+            const char *temp = next;
+            EMIT_WARN( TRUE, ( ParseExtraCharsWarn, name, temp ) );
+            continue;
+        }
+        buf = next;
+    } // while
+
+    #undef EMIT_WARN
+    #undef _set_param
+    #undef set_type
+    #undef set_verbose
+    #undef set_warnings
+    #undef set_respect
+    #undef set_dups
+    #undef set_proclist
+    #undef set_gran
+
+    KMP_INTERNAL_FREE( buffer );
+
+    if ( proclist ) {
+        if ( ! type ) {
+            KMP_WARNING( AffProcListNoType, name );
+            __kmp_affinity_type = affinity_explicit;
+        }
+        else if ( __kmp_affinity_type != affinity_explicit ) {
+            KMP_WARNING( AffProcListNotExplicit, name );
+            KMP_ASSERT( *out_proclist != NULL );
+            KMP_INTERNAL_FREE( *out_proclist );
+            *out_proclist = NULL;
+        }
+    }
+    switch ( *out_type ) {
+        case affinity_logical:
+        case affinity_physical: {
+            if ( count > 0 ) {
+                *out_offset = number[ 0 ];
+            }; // if
+            if ( count > 1 ) {
+                KMP_WARNING( AffManyParamsForLogic, name, number[ 1 ] );
+            }; // if
+        } break;
+        case affinity_balanced: {
+            if ( count > 0 ) {
+                *out_compact = number[ 0 ];
+            }; // if
+            if ( count > 1 ) {
+                *out_offset = number[ 1 ];
+            }; // if
+
+            if ( __kmp_affinity_gran == affinity_gran_default ) {
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+                if( __kmp_mic_type != non_mic ) {
+                    if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
+                        KMP_WARNING( AffGranUsing, "KMP_AFFINITY", "fine" );
+                    }
+                    __kmp_affinity_gran = affinity_gran_fine;
+                } else
+#endif
+                {
+                    if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
+                        KMP_WARNING( AffGranUsing, "KMP_AFFINITY", "core" );
+                    }
+                    __kmp_affinity_gran = affinity_gran_core;
+                }
+            }
+        } break;
+        case affinity_scatter:
+        case affinity_compact: {
+            if ( count > 0 ) {
+                *out_compact = number[ 0 ];
+            }; // if
+            if ( count > 1 ) {
+                *out_offset = number[ 1 ];
+            }; // if
+        } break;
+        case affinity_explicit: {
+            if ( *out_proclist == NULL ) {
+                KMP_WARNING( AffNoProcList, name );
+                __kmp_affinity_type = affinity_none;
+            }
+            if ( count > 0 ) {
+                KMP_WARNING( AffNoParam, name, "explicit" );
+            }
+        } break;
+        case affinity_none: {
+            if ( count > 0 ) {
+                KMP_WARNING( AffNoParam, name, "none" );
+            }; // if
+        } break;
+        case affinity_disabled: {
+            if ( count > 0 ) {
+                KMP_WARNING( AffNoParam, name, "disabled" );
+            }; // if
+        } break;
+        case affinity_default: {
+            if ( count > 0 ) {
+                KMP_WARNING( AffNoParam, name, "default" );
+            }; // if
+        } break;
+        default: {
+            KMP_ASSERT( 0 );
+        };
+    }; // switch
+} // __kmp_parse_affinity_env
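+
+// Example (an illustration added for clarity): KMP_AFFINITY="granularity=fine,compact,1,0"
+// sets *out_gran = affinity_gran_fine and *out_type = affinity_compact and, per the
+// affinity_compact case above, *out_compact = 1 and *out_offset = 0. Modifiers such
+// as "verbose" or "proclist=[0-3]" may appear anywhere in the comma-separated list.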
+
+static void
+__kmp_stg_parse_affinity( char const * name, char const * value, void * data )
+{
+    kmp_setting_t **rivals = (kmp_setting_t **) data;
+    int rc;
+
+    rc = __kmp_stg_check_rivals( name, value, rivals );
+    if ( rc ) {
+        return;
+    }
+
+    __kmp_parse_affinity_env( name, value, & __kmp_affinity_type,
+      & __kmp_affinity_proclist, & __kmp_affinity_verbose,
+      & __kmp_affinity_warnings, & __kmp_affinity_respect_mask,
+      & __kmp_affinity_gran, & __kmp_affinity_gran_levels,
+      & __kmp_affinity_dups, & __kmp_affinity_compact,
+      & __kmp_affinity_offset );
+
+} // __kmp_stg_parse_affinity
+
+static void
+__kmp_stg_print_affinity( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_NAME_EX(name);
+    } else {
+        __kmp_str_buf_print( buffer, "   %s='", name );
+    }
+    if ( __kmp_affinity_verbose ) {
+        __kmp_str_buf_print( buffer, "%s,", "verbose");
+    } else {
+        __kmp_str_buf_print( buffer, "%s,", "noverbose");
+    }
+    if ( __kmp_affinity_warnings ) {
+        __kmp_str_buf_print( buffer, "%s,", "warnings");
+    } else {
+        __kmp_str_buf_print( buffer, "%s,", "nowarnings");
+    }
+    if ( KMP_AFFINITY_CAPABLE() ) {
+        if ( __kmp_affinity_respect_mask ) {
+            __kmp_str_buf_print( buffer, "%s,", "respect");
+        } else {
+            __kmp_str_buf_print( buffer, "%s,", "norespect");
+        }
+        switch ( __kmp_affinity_gran ) {
+            case affinity_gran_default:
+                __kmp_str_buf_print( buffer, "%s", "granularity=default,");
+                break;
+            case affinity_gran_fine:
+                __kmp_str_buf_print( buffer, "%s", "granularity=fine,");
+                break;
+            case affinity_gran_thread:
+                __kmp_str_buf_print( buffer, "%s", "granularity=thread,");
+                break;
+            case affinity_gran_core:
+                __kmp_str_buf_print( buffer, "%s", "granularity=core,");
+                break;
+            case affinity_gran_package:
+                __kmp_str_buf_print( buffer, "%s", "granularity=package,");
+                break;
+            case affinity_gran_node:
+                __kmp_str_buf_print( buffer, "%s", "granularity=node,");
+                break;
+# if KMP_GROUP_AFFINITY
+            case affinity_gran_group:
+                __kmp_str_buf_print( buffer, "%s", "granularity=group,");
+                break;
+# endif /* KMP_GROUP_AFFINITY */
+        }
+        if ( __kmp_affinity_dups ) {
+            __kmp_str_buf_print( buffer, "%s,", "duplicates");
+        } else {
+            __kmp_str_buf_print( buffer, "%s,", "noduplicates");
+        }
+    }
+    if ( ! KMP_AFFINITY_CAPABLE() ) {
+        __kmp_str_buf_print( buffer, "%s", "disabled" );
+    }
+    else switch ( __kmp_affinity_type ) {
+        case affinity_none:
+            __kmp_str_buf_print( buffer, "%s", "none");
+            break;
+        case affinity_physical:
+            __kmp_str_buf_print( buffer, "%s,%d", "physical",
+              __kmp_affinity_offset );
+            break;
+        case affinity_logical:
+            __kmp_str_buf_print( buffer, "%s,%d", "logical",
+              __kmp_affinity_offset );
+            break;
+        case affinity_compact:
+            __kmp_str_buf_print( buffer, "%s,%d,%d", "compact",
+              __kmp_affinity_compact, __kmp_affinity_offset );
+            break;
+        case affinity_scatter:
+            __kmp_str_buf_print( buffer, "%s,%d,%d", "scatter",
+              __kmp_affinity_compact, __kmp_affinity_offset );
+            break;
+        case affinity_explicit:
+            __kmp_str_buf_print( buffer, "%s=[%s],%s", "proclist",
+              __kmp_affinity_proclist, "explicit" );
+            break;
+        case affinity_balanced:
+            __kmp_str_buf_print( buffer, "%s,%d,%d", "balanced",
+              __kmp_affinity_compact, __kmp_affinity_offset );
+            break;
+        case affinity_disabled:
+            __kmp_str_buf_print( buffer, "%s", "disabled");
+            break;
+        case affinity_default:
+            __kmp_str_buf_print( buffer, "%s", "default");
+            break;
+        default:
+            __kmp_str_buf_print( buffer, "%s", "<unknown>");
+            break;
+    }
+    __kmp_str_buf_print( buffer, "'\n" );
+} //__kmp_stg_print_affinity
+
+# ifdef KMP_GOMP_COMPAT
+
+static void
+__kmp_stg_parse_gomp_cpu_affinity( char const * name, char const * value, void * data )
+{
+    const char * next = NULL;
+    char * temp_proclist;
+    kmp_setting_t **rivals = (kmp_setting_t **) data;
+    int rc;
+
+    rc = __kmp_stg_check_rivals( name, value, rivals );
+    if ( rc ) {
+        return;
+    }
+
+    if ( TCR_4(__kmp_init_middle) ) {
+        KMP_WARNING( EnvMiddleWarn, name );
+        __kmp_env_toPrint( name, 0 );
+        return;
+    }
+
+    __kmp_env_toPrint( name, 1 );
+
+    if ( __kmp_parse_affinity_proc_id_list( name, value, &next,
+      &temp_proclist )) {
+        SKIP_WS(next);
+        if (*next == '\0') {
+            //
+            // GOMP_CPU_AFFINITY => granularity=fine,explicit,proclist=...
+            //
+            __kmp_affinity_proclist = temp_proclist;
+            __kmp_affinity_type = affinity_explicit;
+            __kmp_affinity_gran = affinity_gran_fine;
+# if OMP_40_ENABLED
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+# endif
+        }
+        else {
+            KMP_WARNING( AffSyntaxError, name );
+            if (temp_proclist != NULL) {
+                KMP_INTERNAL_FREE((void *)temp_proclist);
+            }
+        }
+    }
+    else {
+        //
+        // Warning already emitted
+        //
+        __kmp_affinity_type = affinity_none;
+# if OMP_40_ENABLED
+        __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+# endif
+    }
+} // __kmp_stg_parse_gomp_cpu_affinity
+
+# endif /* KMP_GOMP_COMPAT */
+
+
+# if OMP_40_ENABLED
+
+/*-----------------------------------------------------------------------------
+
+The OMP_PLACES proc id list parser. Here is the grammar:
+
+place_list := place
+place_list := place , place_list
+place := num
+place := place : num
+place := place : num : signed
+place := { subplace_list }
+place := ! place                  // (lowest priority)
+subplace_list := subplace
+subplace_list := subplace , subplace_list
+subplace := num
+subplace := num : num
+subplace := num : num : signed
+signed := num
+signed := + signed
+signed := - signed
+
+-----------------------------------------------------------------------------*/
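+
+// Example (an illustration added for clarity): under this grammar,
+// OMP_PLACES="{0,1,2,3},{4,5,6,7}" is a place_list of two explicit places, and
+// "{0:4}:2:4" is a single subplace of 4 consecutive ids replicated 2 times with a
+// stride of 4 (ids 0-3 and 4-7, assuming the usual OpenMP interval expansion).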
+
+static int
+__kmp_parse_subplace_list( const char *var, const char **scan )
+{
+    const char *next;
+
+    for (;;) {
+        int start, count, stride;
+
+        //
+        // Read in the starting proc id
+        //
+        SKIP_WS(*scan);
+        if ((**scan < '0') || (**scan > '9')) {
+            KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+            return FALSE;
+        }
+        next = *scan;
+        SKIP_DIGITS(next);
+        start = __kmp_str_to_int(*scan, *next);
+        KMP_ASSERT(start >= 0);
+        *scan = next;
+
+        //
+        // valid follow sets are ',' ':' and '}'
+        //
+        SKIP_WS(*scan);
+        if (**scan == '}') {
+            break;
+        }
+        if (**scan == ',') {
+            (*scan)++;  // skip ','
+            continue;
+        }
+        if (**scan != ':') {
+            KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+            return FALSE;
+        }
+        (*scan)++;      // skip ':'
+
+        //
+        // Read count parameter
+        //
+        SKIP_WS(*scan);
+        if ((**scan < '0') || (**scan > '9')) {
+            KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+            return FALSE;
+        }
+        next = *scan;
+        SKIP_DIGITS(next);
+        count = __kmp_str_to_int(*scan, *next);
+        KMP_ASSERT(count >= 0);
+        *scan = next;
+
+        //
+        // valid follow sets are ',' ':' and '}'
+        //
+        SKIP_WS(*scan);
+        if (**scan == '}') {
+            break;
+        }
+        if (**scan == ',') {
+            (*scan)++;  // skip ','
+            continue;
+        }
+        if (**scan != ':') {
+            KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+            return FALSE;
+        }
+        (*scan)++;      // skip ':'
+
+        //
+        // Read stride parameter
+        //
+        int sign = +1;
+        for (;;) {
+            SKIP_WS(*scan);
+            if (**scan == '+') {
+                (*scan)++; // skip '+'
+                continue;
+            }
+            if (**scan == '-') {
+                sign *= -1;
+                (*scan)++; // skip '-'
+                continue;
+            }
+            break;
+        }
+        SKIP_WS(*scan);
+        if ((**scan < '0') || (**scan > '9')) {
+            KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+            return FALSE;
+        }
+        next = *scan;
+        SKIP_DIGITS(next);
+        stride = __kmp_str_to_int(*scan, *next);
+        KMP_ASSERT(stride >= 0);
+        *scan = next;
+        stride *= sign;
+
+        //
+        // valid follow sets are ',' and '}'
+        //
+        SKIP_WS(*scan);
+        if (**scan == '}') {
+            break;
+        }
+        if (**scan == ',') {
+            (*scan)++;  // skip ','
+            continue;
+        }
+
+        KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+        return FALSE;
+    }
+    return TRUE;
+}
+
+static int
+__kmp_parse_place( const char *var, const char ** scan )
+{
+    const char *next;
+
+    //
+    // valid follow sets are '{' '!' and num
+    //
+    SKIP_WS(*scan);
+    if (**scan == '{') {
+        (*scan)++;      // skip '{'
+        if (! __kmp_parse_subplace_list(var, scan)) {
+            return FALSE;
+        }
+        if (**scan != '}') {
+            KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+            return FALSE;
+        }
+        (*scan)++;      // skip '}'
+    }
+    else if (**scan == '!') {
+        (*scan)++;      // skip '!'
+        return __kmp_parse_place(var, scan); //'!' has lower precedence than ':'
+    }
+    else if ((**scan >= '0') && (**scan <= '9')) {
+        next = *scan;
+        SKIP_DIGITS(next);
+        int proc = __kmp_str_to_int(*scan, *next);
+        KMP_ASSERT(proc >= 0);
+        *scan = next;
+    }
+    else {
+        KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+        return FALSE;
+    }
+    return TRUE;
+}
+
+static int
+__kmp_parse_place_list( const char *var, const char *env, char **place_list )
+{
+    const char *scan = env;
+    const char *next = scan;
+
+    for (;;) {
+        int start, count, stride;
+
+        if (! __kmp_parse_place(var, &scan)) {
+            return FALSE;
+        }
+
+        //
+        // valid follow sets are ',' ':' and EOL
+        //
+        SKIP_WS(scan);
+        if (*scan == '\0') {
+            break;
+        }
+        if (*scan == ',') {
+            scan++;     // skip ','
+            continue;
+        }
+        if (*scan != ':') {
+            KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+            return FALSE;
+        }
+        scan++;         // skip ':'
+
+        //
+        // Read count parameter
+        //
+        SKIP_WS(scan);
+        if ((*scan < '0') || (*scan > '9')) {
+            KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+            return FALSE;
+        }
+        next = scan;
+        SKIP_DIGITS(next);
+        count = __kmp_str_to_int(scan, *next);
+        KMP_ASSERT(count >= 0);
+        scan = next;
+
+        //
+        // valid follow sets are ',' ':' and EOL
+        //
+        SKIP_WS(scan);
+        if (*scan == '\0') {
+            break;
+        }
+        if (*scan == ',') {
+            scan++;     // skip ','
+            continue;
+        }
+        if (*scan != ':') {
+            KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+            return FALSE;
+        }
+        scan++;         // skip ':'
+
+        //
+        // Read stride parameter
+        //
+        int sign = +1;
+        for (;;) {
+            SKIP_WS(scan);
+            if (*scan == '+') {
+                scan++; // skip '+'
+                continue;
+            }
+            if (*scan == '-') {
+                sign *= -1;
+                scan++; // skip '-'
+                continue;
+            }
+            break;
+        }
+        SKIP_WS(scan);
+        if ((*scan < '0') || (*scan > '9')) {
+            KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+            return FALSE;
+        }
+        next = scan;
+        SKIP_DIGITS(next);
+        stride = __kmp_str_to_int(scan, *next);
+        KMP_ASSERT(stride >= 0);
+        scan = next;
+        stride *= sign;
+
+        //
+        // valid follow sets are ',' and EOL
+        //
+        SKIP_WS(scan);
+        if (*scan == '\0') {
+            break;
+        }
+        if (*scan == ',') {
+            scan++;     // skip ','
+            continue;
+        }
+
+        KMP_WARNING( SyntaxErrorUsing, var, "\"threads\"" );
+        return FALSE;
+    }
+
+    {
+        int len = scan - env;
+        char *retlist = (char *)__kmp_allocate((len + 1) * sizeof(char));
+        KMP_MEMCPY_S(retlist, (len+1)*sizeof(char), env, len * sizeof(char));
+        retlist[len] = '\0';
+        *place_list = retlist;
+    }
+    return TRUE;
+}
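+
+// For illustration, place lists that the routine above validates and copies
+// into *place_list (syntax <place>[:<count>[:<stride>]], comma separated):
+//
+//     {0:4},{4:4},{8:4}      three explicit places
+//     {0:4}:3:4              place {0:4} replicated 3 times with stride 4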
+
+static void
+__kmp_stg_parse_places( char const * name, char const * value, void * data )
+{
+    int count;
+    const char *scan = value;
+    const char *next = scan;
+    const char *kind = "\"threads\"";
+    kmp_setting_t **rivals = (kmp_setting_t **) data;
+    int rc;
+
+    rc = __kmp_stg_check_rivals( name, value, rivals );
+    if ( rc ) {
+        return;
+    }
+
+    //
+    // If OMP_PROC_BIND is not specified but OMP_PLACES is,
+    // then let OMP_PROC_BIND default to true.
+    //
+    if ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_default ) {
+        __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
+    }
+
+    //__kmp_affinity_num_places = 0;
+
+    if ( __kmp_match_str( "threads", scan, &next ) ) {
+        scan = next;
+        __kmp_affinity_type = affinity_compact;
+        __kmp_affinity_gran = affinity_gran_thread;
+        __kmp_affinity_dups = FALSE;
+        kind = "\"threads\"";
+    }
+    else if ( __kmp_match_str( "cores", scan, &next ) ) {
+        scan = next;
+        __kmp_affinity_type = affinity_compact;
+        __kmp_affinity_gran = affinity_gran_core;
+        __kmp_affinity_dups = FALSE;
+        kind = "\"cores\"";
+    }
+    else if ( __kmp_match_str( "sockets", scan, &next ) ) {
+        scan = next;
+        __kmp_affinity_type = affinity_compact;
+        __kmp_affinity_gran = affinity_gran_package;
+        __kmp_affinity_dups = FALSE;
+        kind = "\"sockets\"";
+    }
+    else {
+        if ( __kmp_affinity_proclist != NULL ) {
+            KMP_INTERNAL_FREE( (void *)__kmp_affinity_proclist );
+            __kmp_affinity_proclist = NULL;
+        }
+        if ( __kmp_parse_place_list( name, value, &__kmp_affinity_proclist ) ) {
+            __kmp_affinity_type = affinity_explicit;
+            __kmp_affinity_gran = affinity_gran_fine;
+            __kmp_affinity_dups = FALSE;
+            if ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_default ) {
+                 __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
+            }
+        }
+        return;
+    }
+
+    if ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_default ) {
+        __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
+    }
+
+    SKIP_WS(scan);
+    if ( *scan == '\0' ) {
+        return;
+    }
+
+    //
+    // Parse option count parameter in parentheses
+    //
+    if ( *scan != '(' ) {
+        KMP_WARNING( SyntaxErrorUsing, name, kind );
+        return;
+    }
+    scan++;     // skip '('
+
+    SKIP_WS(scan);
+    next = scan;
+    SKIP_DIGITS(next);
+    count = __kmp_str_to_int(scan, *next);
+    KMP_ASSERT(count >= 0);
+    scan = next;
+
+    SKIP_WS(scan);
+    if ( *scan != ')' ) {
+        KMP_WARNING( SyntaxErrorUsing, name, kind );
+        return;
+    }
+    scan++;     // skip ')'
+
+    SKIP_WS(scan);
+    if ( *scan != '\0' ) {
+        KMP_WARNING( ParseExtraCharsWarn, name, scan );
+    }
+    __kmp_affinity_num_places = count;
+}
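+
+// For illustration, typical OMP_PLACES values and their effect in the parser
+// above (an illustrative subset, not an exhaustive list):
+//
+//     OMP_PLACES=threads       compact affinity, thread granularity
+//     OMP_PLACES=cores(4)      compact affinity, core granularity, 4 places
+//     OMP_PLACES={0:4}:8:4     explicit affinity via __kmp_parse_place_list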
+
+static void
+__kmp_stg_print_places( kmp_str_buf_t * buffer, char const * name,
+  void * data )
+{
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_NAME;
+    } else {
+        __kmp_str_buf_print( buffer, "   %s", name );
+    }
+    if ( ( __kmp_nested_proc_bind.used == 0 )
+      || ( __kmp_nested_proc_bind.bind_types == NULL )
+      || ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_false ) ) {
+        __kmp_str_buf_print( buffer, ": %s\n", KMP_I18N_STR( NotDefined ) );
+    }
+    else if ( __kmp_affinity_type == affinity_explicit ) {
+        if ( __kmp_affinity_proclist != NULL ) {
+            __kmp_str_buf_print( buffer, "='%s'\n", __kmp_affinity_proclist );
+        }
+        else {
+            __kmp_str_buf_print( buffer, ": %s\n", KMP_I18N_STR( NotDefined ) );
+        }
+    }
+    else if ( __kmp_affinity_type == affinity_compact ) {
+        int num;
+        if ( __kmp_affinity_num_masks > 0 ) {
+            num = __kmp_affinity_num_masks;
+        }
+        else if ( __kmp_affinity_num_places > 0 ) {
+            num = __kmp_affinity_num_places;
+        }
+        else {
+            num = 0;
+        }
+        if ( __kmp_affinity_gran == affinity_gran_thread ) {
+            if ( num > 0 ) {
+                __kmp_str_buf_print( buffer, "='threads(%d)'\n", num );
+            }
+            else {
+                __kmp_str_buf_print( buffer, "='threads'\n" );
+            }
+        }
+        else if ( __kmp_affinity_gran == affinity_gran_core ) {
+            if ( num > 0 ) {
+                __kmp_str_buf_print( buffer, "='cores(%d)' \n", num );
+            }
+            else {
+                __kmp_str_buf_print( buffer, "='cores'\n" );
+            }
+        }
+        else if ( __kmp_affinity_gran == affinity_gran_package ) {
+            if ( num > 0 ) {
+                __kmp_str_buf_print( buffer, "='sockets(%d)'\n", num );
+            }
+            else {
+                __kmp_str_buf_print( buffer, "='sockets'\n" );
+            }
+        }
+        else {
+            __kmp_str_buf_print( buffer, ": %s\n", KMP_I18N_STR( NotDefined ) );
+        }
+    }
+    else {
+        __kmp_str_buf_print( buffer, ": %s\n", KMP_I18N_STR( NotDefined ) );
+    }
+}
+
+# endif /* OMP_40_ENABLED */
+
+# if (! OMP_40_ENABLED)
+
+static void
+__kmp_stg_parse_proc_bind( char const * name, char const * value, void * data )
+{
+    int enabled;
+    kmp_setting_t **rivals = (kmp_setting_t **) data;
+    int rc;
+
+    rc = __kmp_stg_check_rivals( name, value, rivals );
+    if ( rc ) {
+        return;
+    }
+
+    //
+    // in OMP 3.1, OMP_PROC_BIND is strictly a boolean
+    //
+    __kmp_stg_parse_bool( name, value, & enabled );
+    if ( enabled ) {
+        //
+        // OMP_PROC_BIND => granularity=fine,scatter on MIC
+        // OMP_PROC_BIND => granularity=core,scatter elsewhere
+        //
+        __kmp_affinity_type = affinity_scatter;
+        if( __kmp_mic_type != non_mic ) {
+            __kmp_affinity_gran = affinity_gran_fine;
+        } else {
+            __kmp_affinity_gran = affinity_gran_core;
+        }
+    }
+    else {
+        __kmp_affinity_type = affinity_none;
+    }
+} // __kmp_stg_parse_proc_bind
+
+# endif /* if (! OMP_40_ENABLED) */
+
+
+static void
+__kmp_stg_parse_topology_method( char const * name, char const * value,
+  void * data ) {
+    if ( __kmp_str_match( "all", 1, value ) ) {
+       __kmp_affinity_top_method = affinity_top_method_all;
+    }
+# if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    else if ( __kmp_str_match( "x2apic id", 9, value )
+      || __kmp_str_match( "x2apic_id", 9, value )
+      || __kmp_str_match( "x2apic-id", 9, value )
+      || __kmp_str_match( "x2apicid", 8, value )
+      || __kmp_str_match( "cpuid leaf 11", 13, value )
+      || __kmp_str_match( "cpuid_leaf_11", 13, value )
+      || __kmp_str_match( "cpuid-leaf-11", 13, value )
+      || __kmp_str_match( "cpuid leaf11", 12, value )
+      || __kmp_str_match( "cpuid_leaf11", 12, value )
+      || __kmp_str_match( "cpuid-leaf11", 12, value )
+      || __kmp_str_match( "cpuidleaf 11", 12, value )
+      || __kmp_str_match( "cpuidleaf_11", 12, value )
+      || __kmp_str_match( "cpuidleaf-11", 12, value )
+      || __kmp_str_match( "cpuidleaf11", 11, value )
+      || __kmp_str_match( "cpuid 11", 8, value )
+      || __kmp_str_match( "cpuid_11", 8, value )
+      || __kmp_str_match( "cpuid-11", 8, value )
+      || __kmp_str_match( "cpuid11", 7, value )
+      || __kmp_str_match( "leaf 11", 7, value )
+      || __kmp_str_match( "leaf_11", 7, value )
+      || __kmp_str_match( "leaf-11", 7, value )
+      || __kmp_str_match( "leaf11", 6, value ) ) {
+        __kmp_affinity_top_method = affinity_top_method_x2apicid;
+    }
+    else if ( __kmp_str_match( "apic id", 7, value )
+      || __kmp_str_match( "apic_id", 7, value )
+      || __kmp_str_match( "apic-id", 7, value )
+      || __kmp_str_match( "apicid", 6, value )
+      || __kmp_str_match( "cpuid leaf 4", 12, value )
+      || __kmp_str_match( "cpuid_leaf_4", 12, value )
+      || __kmp_str_match( "cpuid-leaf-4", 12, value )
+      || __kmp_str_match( "cpuid leaf4", 11, value )
+      || __kmp_str_match( "cpuid_leaf4", 11, value )
+      || __kmp_str_match( "cpuid-leaf4", 11, value )
+      || __kmp_str_match( "cpuidleaf 4", 11, value )
+      || __kmp_str_match( "cpuidleaf_4", 11, value )
+      || __kmp_str_match( "cpuidleaf-4", 11, value )
+      || __kmp_str_match( "cpuidleaf4", 10, value )
+      || __kmp_str_match( "cpuid 4", 7, value )
+      || __kmp_str_match( "cpuid_4", 7, value )
+      || __kmp_str_match( "cpuid-4", 7, value )
+      || __kmp_str_match( "cpuid4", 6, value )
+      || __kmp_str_match( "leaf 4", 6, value )
+      || __kmp_str_match( "leaf_4", 6, value )
+      || __kmp_str_match( "leaf-4", 6, value )
+      || __kmp_str_match( "leaf4", 5, value ) ) {
+        __kmp_affinity_top_method = affinity_top_method_apicid;
+    }
+# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+    else if ( __kmp_str_match( "/proc/cpuinfo", 2, value )
+      || __kmp_str_match( "cpuinfo", 5, value )) {
+        __kmp_affinity_top_method = affinity_top_method_cpuinfo;
+    }
+# if KMP_GROUP_AFFINITY
+    else if ( __kmp_str_match( "group", 1, value ) ) {
+        __kmp_affinity_top_method = affinity_top_method_group;
+    }
+# endif /* KMP_GROUP_AFFINITY */
+    else if ( __kmp_str_match( "flat", 1, value ) ) {
+        __kmp_affinity_top_method = affinity_top_method_flat;
+    }
+    else {
+        KMP_WARNING( StgInvalidValue, name, value );
+    }
+} // __kmp_stg_parse_topology_method
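+
+// For illustration, a few of the spellings accepted above (an illustrative
+// subset; __kmp_str_match also accepts abbreviations down to the lengths
+// given in each call):
+//
+//     KMP_TOPOLOGY_METHOD=all          affinity_top_method_all
+//     KMP_TOPOLOGY_METHOD=x2apicid     affinity_top_method_x2apicid (x86 only)
+//     KMP_TOPOLOGY_METHOD=cpuinfo      affinity_top_method_cpuinfo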
+
+static void
+__kmp_stg_print_topology_method( kmp_str_buf_t * buffer, char const * name,
+  void * data ) {
+# if KMP_DEBUG
+    char const * value = NULL;
+
+    switch ( __kmp_affinity_top_method ) {
+        case affinity_top_method_default:
+        value = "default";
+        break;
+
+        case affinity_top_method_all:
+        value = "all";
+        break;
+
+#  if KMP_ARCH_X86 || KMP_ARCH_X86_64
+        case affinity_top_method_x2apicid:
+        value = "x2APIC id";
+        break;
+
+        case affinity_top_method_apicid:
+        value = "APIC id";
+        break;
+#  endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+        case affinity_top_method_cpuinfo:
+        value = "cpuinfo";
+        break;
+
+#  if KMP_GROUP_AFFINITY
+        case affinity_top_method_group:
+        value = "group";
+        break;
+#  endif /* KMP_GROUP_AFFINITY */
+
+        case affinity_top_method_flat:
+        value = "flat";
+        break;
+    }
+
+    if ( value != NULL ) {
+        __kmp_stg_print_str( buffer, name, value );
+    }
+# endif /* KMP_DEBUG */
+} // __kmp_stg_print_topology_method
+
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+
+#if OMP_40_ENABLED
+
+//
+// OMP_PROC_BIND / bind-var is functional on all 4.0 builds, including OS X*
+// OMP_PLACES / place-partition-var is not.
+//
+static void
+__kmp_stg_parse_proc_bind( char const * name, char const * value, void * data )
+{
+    kmp_setting_t **rivals = (kmp_setting_t **) data;
+    int rc;
+
+    rc = __kmp_stg_check_rivals( name, value, rivals );
+    if ( rc ) {
+        return;
+    }
+
+    //
+    // in OMP 4.0 OMP_PROC_BIND is a vector of proc_bind types.
+    //
+    KMP_DEBUG_ASSERT( (__kmp_nested_proc_bind.bind_types != NULL)
+      && ( __kmp_nested_proc_bind.used > 0 ) );
+
+    const char *buf = value;
+    const char *next;
+    int num;
+    SKIP_WS( buf );
+    if ( (*buf >= '0') && (*buf <= '9') ) {
+        next = buf;
+        SKIP_DIGITS( next );
+        num = __kmp_str_to_int( buf, *next );
+        KMP_ASSERT( num >= 0 );
+        buf = next;
+        SKIP_WS( buf );
+    }
+    else {
+        num = -1;
+    }
+
+    next = buf;
+    if ( __kmp_match_str( "disabled", buf, &next ) ) {
+        buf = next;
+        SKIP_WS( buf );
+# if KMP_AFFINITY_SUPPORTED
+        __kmp_affinity_type = affinity_disabled;
+# endif /* KMP_AFFINITY_SUPPORTED */
+        __kmp_nested_proc_bind.used = 1;
+        __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+    }
+    else if ( ( num == (int)proc_bind_false )
+      || __kmp_match_str( "false", buf, &next ) ) {
+        buf = next;
+        SKIP_WS( buf );
+# if KMP_AFFINITY_SUPPORTED
+        __kmp_affinity_type = affinity_none;
+# endif /* KMP_AFFINITY_SUPPORTED */
+        __kmp_nested_proc_bind.used = 1;
+        __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+    }
+    else if ( ( num == (int)proc_bind_true )
+      || __kmp_match_str( "true", buf, &next ) ) {
+        buf = next;
+        SKIP_WS( buf );
+        __kmp_nested_proc_bind.used = 1;
+        __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
+    }
+    else {
+        //
+        // Count the number of values in the env var string
+        //
+        const char *scan;
+        int nelem = 1;
+        for ( scan = buf; *scan != '\0'; scan++ ) {
+            if ( *scan == ',' ) {
+                nelem++;
+            }
+        }
+
+        //
+        // Create / expand the nested proc_bind array as needed
+        //
+        if ( __kmp_nested_proc_bind.size < nelem ) {
+            __kmp_nested_proc_bind.bind_types = (kmp_proc_bind_t *)
+                KMP_INTERNAL_REALLOC( __kmp_nested_proc_bind.bind_types,
+                sizeof(kmp_proc_bind_t) * nelem );
+            if ( __kmp_nested_proc_bind.bind_types == NULL ) {
+                KMP_FATAL( MemoryAllocFailed );
+            }
+            __kmp_nested_proc_bind.size = nelem;
+        }
+        __kmp_nested_proc_bind.used = nelem;
+
+        //
+        // Save values in the nested proc_bind array
+        //
+        int i = 0;
+        for (;;) {
+            enum kmp_proc_bind_t bind;
+
+            if ( ( num == (int)proc_bind_master )
+              || __kmp_match_str( "master", buf, &next ) ) {
+                buf = next;
+                SKIP_WS( buf );
+                bind = proc_bind_master;
+            }
+            else if ( ( num == (int)proc_bind_close )
+              || __kmp_match_str( "close", buf, &next ) ) {
+                buf = next;
+                SKIP_WS( buf );
+                bind = proc_bind_close;
+            }
+            else if ( ( num == (int)proc_bind_spread )
+              || __kmp_match_str( "spread", buf, &next ) ) {
+                buf = next;
+                SKIP_WS( buf );
+                bind = proc_bind_spread;
+            }
+            else {
+                KMP_WARNING( StgInvalidValue, name, value );
+                __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+                __kmp_nested_proc_bind.used = 1;
+                return;
+            }
+
+            __kmp_nested_proc_bind.bind_types[i++] = bind;
+            if ( i >= nelem ) {
+                break;
+            }
+            KMP_DEBUG_ASSERT( *buf == ',' );
+            buf++;
+            SKIP_WS( buf );
+
+            //
+            // Read next value if it was specified as an integer
+            //
+            if ( (*buf >= '0') && (*buf <= '9') ) {
+                next = buf;
+                SKIP_DIGITS( next );
+                num = __kmp_str_to_int( buf, *next );
+                KMP_ASSERT( num >= 0 );
+                buf = next;
+                SKIP_WS( buf );
+            }
+            else {
+                num = -1;
+            }
+        }
+        SKIP_WS( buf );
+    }
+    if ( *buf != '\0' ) {
+        KMP_WARNING( ParseExtraCharsWarn, name, buf );
+    }
+}
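+
+// For illustration, values accepted by the OMP 4.0 parser above:
+//
+//     OMP_PROC_BIND=false          bind_types[0] = proc_bind_false
+//     OMP_PROC_BIND=true           bind_types[0] = proc_bind_true
+//     OMP_PROC_BIND=spread,close   bind_types = { spread, close }, used = 2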
+
+
+static void
+__kmp_stg_print_proc_bind( kmp_str_buf_t * buffer, char const * name,
+  void * data )
+{
+    int nelem = __kmp_nested_proc_bind.used;
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_NAME;
+    } else {
+        __kmp_str_buf_print( buffer, "   %s", name );
+    }
+    if ( nelem == 0 ) {
+        __kmp_str_buf_print( buffer, ": %s\n", KMP_I18N_STR( NotDefined ) );
+    }
+    else {
+        int i;
+        __kmp_str_buf_print( buffer, "='", name );
+        for ( i = 0; i < nelem; i++ ) {
+            switch ( __kmp_nested_proc_bind.bind_types[i] ) {
+                case proc_bind_false:
+                __kmp_str_buf_print( buffer, "false" );
+                break;
+
+                case proc_bind_true:
+                __kmp_str_buf_print( buffer, "true" );
+                break;
+
+                case proc_bind_master:
+                __kmp_str_buf_print( buffer, "master" );
+                break;
+
+                case proc_bind_close:
+                __kmp_str_buf_print( buffer, "close" );
+                break;
+
+                case proc_bind_spread:
+                __kmp_str_buf_print( buffer, "spread" );
+                break;
+
+                case proc_bind_intel:
+                __kmp_str_buf_print( buffer, "intel" );
+                break;
+
+                case proc_bind_default:
+                __kmp_str_buf_print( buffer, "default" );
+                break;
+            }
+            if ( i < nelem - 1 ) {
+                __kmp_str_buf_print( buffer, "," );
+            }
+        }
+        __kmp_str_buf_print( buffer, "'\n" );
+    }
+}
+
+#endif /* OMP_40_ENABLED */
+
+
+// -------------------------------------------------------------------------------------------------
+// OMP_DYNAMIC
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_omp_dynamic( char const * name, char const * value, void * data )
+{
+    __kmp_stg_parse_bool( name, value, & (__kmp_global.g.g_dynamic) );
+} // __kmp_stg_parse_omp_dynamic
+
+static void
+__kmp_stg_print_omp_dynamic( kmp_str_buf_t * buffer, char const * name, void * data )
+{
+    __kmp_stg_print_bool( buffer, name, __kmp_global.g.g_dynamic );
+} // __kmp_stg_print_omp_dynamic
+
+static void
+__kmp_stg_parse_kmp_dynamic_mode( char const * name, char const * value, void * data )
+{
+    if ( TCR_4(__kmp_init_parallel) ) {
+        KMP_WARNING( EnvParallelWarn, name );
+        __kmp_env_toPrint( name, 0 );
+        return;
+    }
+#ifdef USE_LOAD_BALANCE
+    else if ( __kmp_str_match( "load balance", 2, value )
+      || __kmp_str_match( "load_balance", 2, value )
+      || __kmp_str_match( "load-balance", 2, value )
+      || __kmp_str_match( "loadbalance", 2, value )
+      || __kmp_str_match( "balance", 1, value ) ) {
+        __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
+    }
+#endif /* USE_LOAD_BALANCE */
+    else if ( __kmp_str_match( "thread limit", 1, value )
+      || __kmp_str_match( "thread_limit", 1, value )
+      || __kmp_str_match( "thread-limit", 1, value )
+      || __kmp_str_match( "threadlimit", 1, value )
+      || __kmp_str_match( "limit", 2, value ) ) {
+        __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
+    }
+    else if ( __kmp_str_match( "random", 1, value ) ) {
+        __kmp_global.g.g_dynamic_mode = dynamic_random;
+    }
+    else {
+        KMP_WARNING( StgInvalidValue, name, value );
+    }
+} // __kmp_stg_parse_kmp_dynamic_mode
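+
+// For illustration, values accepted above (abbreviations are accepted down to
+// the lengths given in the __kmp_str_match calls):
+//
+//     "load balance"   dynamic_load_balance (requires USE_LOAD_BALANCE)
+//     "thread limit"   dynamic_thread_limit
+//     "random"         dynamic_random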
+
+static void
+__kmp_stg_print_kmp_dynamic_mode( kmp_str_buf_t * buffer, char const * name, void * data )
+{
+#if KMP_DEBUG
+    if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
+        __kmp_str_buf_print( buffer, "   %s: %s \n", name, KMP_I18N_STR( NotDefined ) );
+    }
+# ifdef USE_LOAD_BALANCE
+    else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
+        __kmp_stg_print_str( buffer, name, "load balance" );
+    }
+# endif /* USE_LOAD_BALANCE */
+    else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
+        __kmp_stg_print_str( buffer, name, "thread limit" );
+    }
+    else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
+        __kmp_stg_print_str( buffer, name, "random" );
+    }
+    else {
+        KMP_ASSERT(0);
+    }
+#endif /* KMP_DEBUG */
+} // __kmp_stg_print_kmp_dynamic_mode
+
+
+#ifdef USE_LOAD_BALANCE
+
+// -------------------------------------------------------------------------------------------------
+// KMP_LOAD_BALANCE_INTERVAL
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_ld_balance_interval( char const * name, char const * value, void * data )
+{
+    double interval = __kmp_convert_to_double( value );
+    if ( interval >= 0 ) {
+        __kmp_load_balance_interval = interval;
+    } else {
+        KMP_WARNING( StgInvalidValue, name, value );
+    }; // if
+} // __kmp_stg_parse_ld_balance_interval
+
+static void
+__kmp_stg_print_ld_balance_interval( kmp_str_buf_t * buffer, char const * name, void * data ) {
+#if KMP_DEBUG
+    __kmp_str_buf_print( buffer, "   %s=%8.6f\n", name, __kmp_load_balance_interval );
+#endif /* KMP_DEBUG */
+} // __kmp_stg_print_ld_balance_interval
+
+#endif /* USE_LOAD_BALANCE */
+
+
+
+// -------------------------------------------------------------------------------------------------
+// KMP_INIT_AT_FORK
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_init_at_fork( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_bool( name, value, & __kmp_need_register_atfork );
+    if ( __kmp_need_register_atfork ) {
+        __kmp_need_register_atfork_specified = TRUE;
+    };
+} // __kmp_stg_parse_init_at_fork
+
+static void
+__kmp_stg_print_init_at_fork( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_need_register_atfork_specified );
+} // __kmp_stg_print_init_at_fork
+
+// -------------------------------------------------------------------------------------------------
+// KMP_SCHEDULE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_schedule( char const * name, char const * value, void * data ) {
+
+    if ( value != NULL ) {
+        size_t length = KMP_STRLEN( value );
+        if ( length > INT_MAX ) {
+            KMP_WARNING( LongValue, name );
+        } else {
+            char *semicolon;
+            if( value[ length - 1 ] == '"' || value[ length -1 ] == '\'' )
+                KMP_WARNING( UnbalancedQuotes, name );
+            do {
+                char sentinel;
+
+                semicolon = (char *) strchr( value, ';' );
+                if( *value && semicolon != value ) {
+                    char *comma = (char *) strchr( value, ',' );
+
+                    if ( comma ) {
+                        ++comma;
+                        sentinel = ',';
+                    } else
+                        sentinel = ';';
+                    if ( !__kmp_strcasecmp_with_sentinel( "static", value, sentinel ) ) {
+                        if( !__kmp_strcasecmp_with_sentinel( "greedy", comma, ';' ) ) {
+                            __kmp_static = kmp_sch_static_greedy;
+                            continue;
+                        } else if( !__kmp_strcasecmp_with_sentinel( "balanced", comma, ';' ) ) {
+                            __kmp_static = kmp_sch_static_balanced;
+                            continue;
+                        }
+                    } else if ( !__kmp_strcasecmp_with_sentinel( "guided", value, sentinel ) ) {
+                        if ( !__kmp_strcasecmp_with_sentinel( "iterative", comma, ';' ) ) {
+                            __kmp_guided = kmp_sch_guided_iterative_chunked;
+                            continue;
+                        } else if ( !__kmp_strcasecmp_with_sentinel( "analytical", comma, ';' ) ) {
+                            /* analytical not allowed for too many threads */
+                            __kmp_guided = kmp_sch_guided_analytical_chunked;
+                            continue;
+                        }
+                    }
+                    KMP_WARNING( InvalidClause, name, value );
+                } else
+                    KMP_WARNING( EmptyClause, name );
+            } while ( (value = semicolon ? semicolon + 1 : NULL) );
+        }
+    }; // if
+
+} // __kmp_stg_parse_schedule
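+
+// For illustration, a KMP_SCHEDULE value with two clauses separated by ';'
+// (each clause is <kind>,<modifier>):
+//
+//     KMP_SCHEDULE=static,balanced;guided,iterative
+//
+// sets __kmp_static = kmp_sch_static_balanced and
+// __kmp_guided = kmp_sch_guided_iterative_chunked.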
+
+static void
+__kmp_stg_print_schedule( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_NAME_EX(name);
+    } else {
+        __kmp_str_buf_print( buffer, "   %s='", name );
+    }
+    if ( __kmp_static == kmp_sch_static_greedy ) {
+        __kmp_str_buf_print( buffer, "%s", "static,greedy");
+    } else if ( __kmp_static == kmp_sch_static_balanced ) {
+        __kmp_str_buf_print ( buffer, "%s", "static,balanced");
+    }
+    if ( __kmp_guided == kmp_sch_guided_iterative_chunked ) {
+        __kmp_str_buf_print( buffer, ";%s'\n", "guided,iterative");
+    } else if ( __kmp_guided == kmp_sch_guided_analytical_chunked ) {
+        __kmp_str_buf_print( buffer, ";%s'\n", "guided,analytical");
+    }
+} // __kmp_stg_print_schedule
+
+// -------------------------------------------------------------------------------------------------
+// OMP_SCHEDULE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_omp_schedule( char const * name, char const * value, void * data )
+{
+    size_t      length;
+    if( value ) {
+        length = KMP_STRLEN( value );
+        if( length ) {
+            char *comma = (char *) strchr( value, ',' );
+            if( value[ length - 1 ] == '"' || value[ length -1 ] == '\'')
+                KMP_WARNING( UnbalancedQuotes, name );
+            /* get the specified scheduling style */
+            if (!__kmp_strcasecmp_with_sentinel("dynamic", value, ','))          /* DYNAMIC */
+                __kmp_sched = kmp_sch_dynamic_chunked;
+            else if (!__kmp_strcasecmp_with_sentinel("guided", value, ','))      /* GUIDED */
+                __kmp_sched = kmp_sch_guided_chunked;
+// AC: TODO: add AUTO schedule, and probably remove TRAPEZOIDAL (OMP 3.0 does not allow it)
+            else if (!__kmp_strcasecmp_with_sentinel("auto", value, ',')) {       /* AUTO */
+                __kmp_sched = kmp_sch_auto;
+                if( comma ) {
+                    __kmp_msg( kmp_ms_warning, KMP_MSG( IgnoreChunk, name, comma ), __kmp_msg_null );
+                    comma = NULL;
+                }
+            }
+            else if (!__kmp_strcasecmp_with_sentinel("trapezoidal", value, ',')) /* TRAPEZOIDAL */
+                __kmp_sched = kmp_sch_trapezoidal;
+            else if (!__kmp_strcasecmp_with_sentinel("static", value, ','))      /* STATIC */
+                __kmp_sched = kmp_sch_static;
+#ifdef KMP_STATIC_STEAL_ENABLED
+            else if (KMP_ARCH_X86_64 &&
+                     !__kmp_strcasecmp_with_sentinel("static_steal", value, ','))
+                __kmp_sched = kmp_sch_static_steal;
+#endif
+            else {
+                KMP_WARNING( StgInvalidValue, name, value );
+                value = NULL; /* skip processing of comma */
+            }
+            if( value && comma ) {
+                __kmp_env_chunk = TRUE;
+
+                if(__kmp_sched == kmp_sch_static)
+                    __kmp_sched = kmp_sch_static_chunked;
+                ++comma;
+                __kmp_chunk = __kmp_str_to_int( comma, 0 );
+                if ( __kmp_chunk < 1 ) {
+                    __kmp_chunk = KMP_DEFAULT_CHUNK;
+                    __kmp_msg( kmp_ms_warning, KMP_MSG( InvalidChunk, name, comma ), __kmp_msg_null );
+                    KMP_INFORM( Using_int_Value, name, __kmp_chunk );
+// AC: next block commented out until KMP_DEFAULT_CHUNK != KMP_MIN_CHUNK (to improve code coverage :)
+//     The default chunk size is 1 according to the standard, so making KMP_MIN_CHUNK != 1 would introduce a mess:
+//     a wrong chunk would become 1, but it would be impossible to explicitly set 1, because it would become KMP_MIN_CHUNK...
+//                } else if ( __kmp_chunk < KMP_MIN_CHUNK ) {
+//                    __kmp_chunk = KMP_MIN_CHUNK;
+                } else if ( __kmp_chunk > KMP_MAX_CHUNK ) {
+                    __kmp_chunk = KMP_MAX_CHUNK;
+                    __kmp_msg( kmp_ms_warning, KMP_MSG( LargeChunk, name, comma ), __kmp_msg_null );
+                    KMP_INFORM( Using_int_Value, name, __kmp_chunk );
+                }
+            } else
+                __kmp_env_chunk = FALSE;
+        } else
+            KMP_WARNING( EmptyString, name );
+    }
+    K_DIAG(1, ("__kmp_static == %d\n", __kmp_static))
+    K_DIAG(1, ("__kmp_guided == %d\n", __kmp_guided))
+    K_DIAG(1, ("__kmp_sched == %d\n", __kmp_sched))
+    K_DIAG(1, ("__kmp_chunk == %d\n", __kmp_chunk))
+} // __kmp_stg_parse_omp_schedule
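+
+// For illustration, typical OMP_SCHEDULE values and their effect above:
+//
+//     OMP_SCHEDULE=dynamic,4   __kmp_sched = kmp_sch_dynamic_chunked, chunk 4
+//     OMP_SCHEDULE=static      __kmp_sched = kmp_sch_static, no chunk
+//     OMP_SCHEDULE=auto,4      __kmp_sched = kmp_sch_auto; the chunk is
+//                              ignored with an IgnoreChunk warning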
+
+static void
+__kmp_stg_print_omp_schedule( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_NAME_EX(name);
+    } else {
+        __kmp_str_buf_print( buffer, "   %s='", name );
+    }
+    if ( __kmp_chunk ) {
+        switch ( __kmp_sched ) {
+            case kmp_sch_dynamic_chunked:
+                __kmp_str_buf_print( buffer, "%s,%d'\n", "dynamic", __kmp_chunk);
+                break;
+            case kmp_sch_guided_iterative_chunked:
+            case kmp_sch_guided_analytical_chunked:
+                __kmp_str_buf_print( buffer, "%s,%d'\n", "guided", __kmp_chunk);
+                break;
+            case kmp_sch_trapezoidal:
+                __kmp_str_buf_print( buffer, "%s,%d'\n", "trapezoidal", __kmp_chunk);
+                break;
+            case kmp_sch_static:
+            case kmp_sch_static_chunked:
+            case kmp_sch_static_balanced:
+            case kmp_sch_static_greedy:
+                __kmp_str_buf_print( buffer, "%s,%d'\n", "static", __kmp_chunk);
+                break;
+            case kmp_sch_static_steal:
+                __kmp_str_buf_print( buffer, "%s,%d'\n", "static_steal", __kmp_chunk);
+                break;
+            case kmp_sch_auto:
+                __kmp_str_buf_print( buffer, "%s,%d'\n", "auto", __kmp_chunk);
+                break;
+        }
+    } else {
+        switch ( __kmp_sched ) {
+            case kmp_sch_dynamic_chunked:
+                __kmp_str_buf_print( buffer, "%s'\n", "dynamic");
+                break;
+            case kmp_sch_guided_iterative_chunked:
+            case kmp_sch_guided_analytical_chunked:
+                __kmp_str_buf_print( buffer, "%s'\n", "guided");
+                break;
+            case kmp_sch_trapezoidal:
+                __kmp_str_buf_print( buffer, "%s'\n", "trapezoidal");
+                break;
+            case kmp_sch_static:
+            case kmp_sch_static_chunked:
+            case kmp_sch_static_balanced:
+            case kmp_sch_static_greedy:
+                __kmp_str_buf_print( buffer, "%s'\n", "static");
+                break;
+            case kmp_sch_static_steal:
+                __kmp_str_buf_print( buffer, "%s'\n", "static_steal");
+                break;
+            case kmp_sch_auto:
+                __kmp_str_buf_print( buffer, "%s'\n", "auto");
+                break;
+        }
+    }
+} // __kmp_stg_print_omp_schedule
+
+// -------------------------------------------------------------------------------------------------
+// KMP_ATOMIC_MODE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_atomic_mode( char const * name, char const * value, void * data ) {
+    // Modes: 0 -- do not change default; 1 -- Intel perf mode, 2 -- GOMP compatibility mode.
+    int mode = 0;
+    int max  = 1;
+    #ifdef KMP_GOMP_COMPAT
+        max = 2;
+    #endif /* KMP_GOMP_COMPAT */
+    __kmp_stg_parse_int( name, value, 0, max, & mode );
+    // TODO: parse_int is not very suitable for this case. In case of overflow it is better to use
+    // 0 rather than the max value.
+    if ( mode > 0 ) {
+        __kmp_atomic_mode = mode;
+    }; // if
+} // __kmp_stg_parse_atomic_mode
+
+static void
+__kmp_stg_print_atomic_mode( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_atomic_mode );
+} // __kmp_stg_print_atomic_mode
+
+
+// -------------------------------------------------------------------------------------------------
+// KMP_CONSISTENCY_CHECK
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_consistency_check( char const * name, char const * value, void * data ) {
+    if ( ! __kmp_strcasecmp_with_sentinel( "all", value, 0 ) ) {
+        // Note: this will not work from kmp_set_defaults because the th_cons stack is not allocated
+        // for existing thread(s), so the first __kmp_push_<construct> will fail an assertion.
+        // TODO: allocate th_cons if called from kmp_set_defaults.
+        __kmp_env_consistency_check = TRUE;
+    } else if ( ! __kmp_strcasecmp_with_sentinel( "none", value, 0 ) ) {
+        __kmp_env_consistency_check = FALSE;
+    } else {
+        KMP_WARNING( StgInvalidValue, name, value );
+    }; // if
+} // __kmp_stg_parse_consistency_check
+
+static void
+__kmp_stg_print_consistency_check( kmp_str_buf_t * buffer, char const * name, void * data ) {
+#if KMP_DEBUG
+    const char *value = NULL;
+
+    if ( __kmp_env_consistency_check ) {
+        value = "all";
+    } else {
+        value = "none";
+    }
+
+    if ( value != NULL ) {
+        __kmp_stg_print_str( buffer, name, value );
+    }
+#endif /* KMP_DEBUG */
+} // __kmp_stg_print_consistency_check
+
+
+#if USE_ITT_BUILD
+// -------------------------------------------------------------------------------------------------
+// KMP_ITT_PREPARE_DELAY
+// -------------------------------------------------------------------------------------------------
+
+#if USE_ITT_NOTIFY
+
+static void
+__kmp_stg_parse_itt_prepare_delay( char const * name, char const * value, void * data )
+{
+    // Experimental code: KMP_ITT_PREPARE_DELAY specifies the number of loop iterations.
+    int delay = 0;
+    __kmp_stg_parse_int( name, value, 0, INT_MAX, & delay );
+    __kmp_itt_prepare_delay = delay;
+} // __kmp_stg_parse_itt_prepare_delay
+
+static void
+__kmp_stg_print_itt_prepare_delay( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_uint64( buffer, name, __kmp_itt_prepare_delay );
+
+} // __kmp_stg_print_itt_prepare_delay
+
+#endif // USE_ITT_NOTIFY
+#endif /* USE_ITT_BUILD */
+
+// -------------------------------------------------------------------------------------------------
+// KMP_MALLOC_POOL_INCR
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_malloc_pool_incr( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_size(
+            name,
+            value,
+            KMP_MIN_MALLOC_POOL_INCR,
+            KMP_MAX_MALLOC_POOL_INCR,
+            NULL,
+            & __kmp_malloc_pool_incr,
+            1
+        );
+} // __kmp_stg_parse_malloc_pool_incr
+
+static void
+__kmp_stg_print_malloc_pool_incr( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_size( buffer, name, __kmp_malloc_pool_incr );
+
+} // __kmp_stg_print_malloc_pool_incr
+
+
+#ifdef KMP_DEBUG
+
+// -------------------------------------------------------------------------------------------------
+// KMP_PAR_RANGE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_par_range_env( char const * name, char const * value, void * data ) {
+        __kmp_stg_parse_par_range(
+            name,
+            value,
+            & __kmp_par_range,
+            __kmp_par_range_routine,
+            __kmp_par_range_filename,
+            & __kmp_par_range_lb,
+            & __kmp_par_range_ub
+        );
+} // __kmp_stg_parse_par_range_env
+
+static void
+__kmp_stg_print_par_range_env( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    if (__kmp_par_range != 0) {
+        __kmp_stg_print_str( buffer, name, par_range_to_print );
+    }
+} // __kmp_stg_print_par_range_env
+
+// -------------------------------------------------------------------------------------------------
+// KMP_YIELD_CYCLE, KMP_YIELD_ON, KMP_YIELD_OFF
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_yield_cycle( char const * name, char const * value, void * data ) {
+    int flag = __kmp_yield_cycle;
+    __kmp_stg_parse_bool( name, value, & flag );
+    __kmp_yield_cycle = flag;
+} // __kmp_stg_parse_yield_cycle
+
+static void
+__kmp_stg_print_yield_cycle( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_yield_cycle );
+} // __kmp_stg_print_yield_cycle
+
+static void
+__kmp_stg_parse_yield_on( char const * name, char const * value, void * data ) {
+        __kmp_stg_parse_int( name, value, 2, INT_MAX, & __kmp_yield_on_count );
+} // __kmp_stg_parse_yield_on
+
+static void
+__kmp_stg_print_yield_on( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_yield_on_count );
+} // __kmp_stg_print_yield_on
+
+static void
+__kmp_stg_parse_yield_off( char const * name, char const * value, void * data ) {
+        __kmp_stg_parse_int( name, value, 2, INT_MAX, & __kmp_yield_off_count );
+} // __kmp_stg_parse_yield_off
+
+static void
+__kmp_stg_print_yield_off( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_yield_off_count );
+} // __kmp_stg_print_yield_off
+
+#endif
+
+// -------------------------------------------------------------------------------------------------
+// KMP_INIT_WAIT, KMP_NEXT_WAIT
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_init_wait( char const * name, char const * value, void * data ) {
+    int wait;
+    KMP_ASSERT( ( __kmp_init_wait & 1 ) == 0 );
+    wait = __kmp_init_wait / 2;
+    __kmp_stg_parse_int( name, value, KMP_MIN_INIT_WAIT, KMP_MAX_INIT_WAIT, & wait );
+    __kmp_init_wait = wait * 2;
+    KMP_ASSERT( ( __kmp_init_wait & 1 ) == 0 );
+    __kmp_yield_init = __kmp_init_wait;
+} // __kmp_stg_parse_init_wait
+
+static void
+__kmp_stg_print_init_wait( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_init_wait );
+} // __kmp_stg_print_init_wait
+
+static void
+__kmp_stg_parse_next_wait( char const * name, char const * value, void * data ) {
+    int wait;
+    KMP_ASSERT( ( __kmp_next_wait & 1 ) == 0 );
+    wait = __kmp_next_wait / 2;
+    __kmp_stg_parse_int( name, value, KMP_MIN_NEXT_WAIT, KMP_MAX_NEXT_WAIT, & wait );
+    __kmp_next_wait = wait * 2;
+    KMP_ASSERT( ( __kmp_next_wait & 1 ) == 0 );
+    __kmp_yield_next = __kmp_next_wait;
+} // __kmp_stg_parse_next_wait
+
+static void
+__kmp_stg_print_next_wait( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_next_wait );
+} // __kmp_stg_print_next_wait
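+
+// For illustration: both settings halve the current (even) internal value
+// before parsing and double the parsed value afterwards, so the stored counts
+// stay even. E.g., assuming KMP_INIT_WAIT=1024 passes the min/max clamping in
+// __kmp_stg_parse_int, the routine stores __kmp_init_wait = 2048.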
+
+
+// -------------------------------------------------------------------------------------------------
+// KMP_GTID_MODE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_gtid_mode( char const * name, char const * value, void * data ) {
+    //
+    // Modes:
+    //   0 -- do not change default
+    //   1 -- sp search
+    //   2 -- use "keyed" TLS var, i.e.
+    //        pthread_getspecific(Linux* OS/OS X*) or TlsGetValue(Windows* OS)
+    //   3 -- __declspec(thread) TLS var in tdata section
+    //
+    int mode = 0;
+    int max  = 2;
+    #ifdef KMP_TDATA_GTID
+        max = 3;
+    #endif /* KMP_TDATA_GTID */
+    __kmp_stg_parse_int( name, value, 0, max, & mode );
+    // TODO: parse_int is not very suitable for this case. In case of overflow it is better to use
+    // 0 rather than the max value.
+    if ( mode == 0 ) {
+        __kmp_adjust_gtid_mode = TRUE;
+    }
+    else {
+        __kmp_gtid_mode = mode;
+        __kmp_adjust_gtid_mode = FALSE;
+    }; // if
+} // __kmp_stg_parse_gtid_mode
+
+static void
+__kmp_stg_print_gtid_mode( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    if ( __kmp_adjust_gtid_mode ) {
+        __kmp_stg_print_int( buffer, name, 0 );
+    }
+    else {
+        __kmp_stg_print_int( buffer, name, __kmp_gtid_mode );
+    }
+} // __kmp_stg_print_gtid_mode
+
+
+// -------------------------------------------------------------------------------------------------
+// KMP_NUM_LOCKS_IN_BLOCK
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_lock_block( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_int( name, value, 0, KMP_INT_MAX, & __kmp_num_locks_in_block );
+} // __kmp_stg_parse_lock_block
+
+static void
+__kmp_stg_print_lock_block( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_num_locks_in_block );
+} // __kmp_stg_print_lock_block
+
+// -------------------------------------------------------------------------------------------------
+// KMP_LOCK_KIND
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_lock_kind( char const * name, char const * value, void * data ) {
+    if ( __kmp_init_user_locks ) {
+        KMP_WARNING( EnvLockWarn, name );
+        return;
+    }
+
+    if ( __kmp_str_match( "tas", 2, value )
+      || __kmp_str_match( "test and set", 2, value )
+      || __kmp_str_match( "test_and_set", 2, value )
+      || __kmp_str_match( "test-and-set", 2, value )
+      || __kmp_str_match( "test andset", 2, value )
+      || __kmp_str_match( "test_andset", 2, value )
+      || __kmp_str_match( "test-andset", 2, value )
+      || __kmp_str_match( "testand set", 2, value )
+      || __kmp_str_match( "testand_set", 2, value )
+      || __kmp_str_match( "testand-set", 2, value )
+      || __kmp_str_match( "testandset", 2, value ) ) {
+        __kmp_user_lock_kind = lk_tas;
+        DYNA_STORE_LOCK_SEQ(tas);
+    }
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)
+    else if ( __kmp_str_match( "futex", 1, value ) ) {
+        if ( __kmp_futex_determine_capable() ) {
+            __kmp_user_lock_kind = lk_futex;
+            DYNA_STORE_LOCK_SEQ(futex);
+        }
+        else {
+            KMP_WARNING( FutexNotSupported, name, value );
+        }
+    }
+#endif
+    else if ( __kmp_str_match( "ticket", 2, value ) ) {
+        __kmp_user_lock_kind = lk_ticket;
+        DYNA_STORE_LOCK_SEQ(ticket);
+    }
+    else if ( __kmp_str_match( "queuing", 1, value )
+      || __kmp_str_match( "queue", 1, value ) ) {
+        __kmp_user_lock_kind = lk_queuing;
+        DYNA_STORE_LOCK_SEQ(queuing);
+    }
+    else if ( __kmp_str_match( "drdpa ticket", 1, value )
+      || __kmp_str_match( "drdpa_ticket", 1, value )
+      || __kmp_str_match( "drdpa-ticket", 1, value )
+      || __kmp_str_match( "drdpaticket", 1, value )
+      || __kmp_str_match( "drdpa", 1, value ) ) {
+        __kmp_user_lock_kind = lk_drdpa;
+        DYNA_STORE_LOCK_SEQ(drdpa);
+    }
+#if KMP_USE_ADAPTIVE_LOCKS
+    else if ( __kmp_str_match( "adaptive", 1, value )  ) {
+        if( __kmp_cpuinfo.rtm ) { // ??? Is cpuinfo available here?
+            __kmp_user_lock_kind = lk_adaptive;
+            DYNA_STORE_LOCK_SEQ(adaptive);
+        } else {
+            KMP_WARNING( AdaptiveNotSupported, name, value );
+            __kmp_user_lock_kind = lk_queuing;
+            DYNA_STORE_LOCK_SEQ(queuing);
+        }
+    }
+#endif // KMP_USE_ADAPTIVE_LOCKS
+#if KMP_USE_DYNAMIC_LOCK
+    else if ( __kmp_str_match("hle", 1, value) ) {
+        DYNA_STORE_LOCK_SEQ(hle);
+    }
+#endif
+    else {
+        KMP_WARNING( StgInvalidValue, name, value );
+    }
+}
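+
+// For illustration, a few KMP_LOCK_KIND values accepted above:
+//
+//     KMP_LOCK_KIND=tas        test-and-set locks (many spellings accepted)
+//     KMP_LOCK_KIND=queuing    queuing locks
+//     KMP_LOCK_KIND=adaptive   adaptive locks if the CPU supports RTM,
+//                              otherwise falls back to queuing with a warning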
+
+static void
+__kmp_stg_print_lock_kind( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    const char *value = NULL;
+
+    switch ( __kmp_user_lock_kind ) {
+        case lk_default:
+        value = "default";
+        break;
+
+        case lk_tas:
+        value = "tas";
+        break;
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+        case lk_futex:
+        value = "futex";
+        break;
+#endif
+
+        case lk_ticket:
+        value = "ticket";
+        break;
+
+        case lk_queuing:
+        value = "queuing";
+        break;
+
+        case lk_drdpa:
+        value = "drdpa";
+        break;
+#if KMP_USE_ADAPTIVE_LOCKS
+        case lk_adaptive:
+        value = "adaptive";
+        break;
+#endif
+    }
+
+    if ( value != NULL ) {
+        __kmp_stg_print_str( buffer, name, value );
+    }
+}
+
+#if KMP_USE_ADAPTIVE_LOCKS
+
+// -------------------------------------------------------------------------------------------------
+// KMP_ADAPTIVE_LOCK_PROPS, KMP_SPECULATIVE_STATSFILE
+// -------------------------------------------------------------------------------------------------
+
+// Parse out values for the tunable parameters from a string of the form
+// KMP_ADAPTIVE_LOCK_PROPS=max_soft_retries[,max_badness]
+static void
+__kmp_stg_parse_adaptive_lock_props( const char *name, const char *value, void *data )
+{
+    int max_retries = 0;
+    int max_badness = 0;
+
+    const char *next = value;
+
+    int total = 0;          // Number of values read so far; selects which parameter is set
+    int prev_comma = FALSE; // For correct processing of sequential commas
+    int i;
+
+    // Save values in the structure __kmp_adaptive_backoff_params
+    // Run at most 3 iterations, which is enough to read the two values or detect a syntax error
+    for ( i = 0; i < 3 ; i++) {
+        SKIP_WS( next );
+
+        if ( *next == '\0' ) {
+            break;
+        }
+        // Next character is not a digit and not a comma, OR more than two values read => syntax error
+        if ( ( ( *next < '0' || *next > '9' ) && *next !=',' ) || total > 2 ) {
+            KMP_WARNING( EnvSyntaxError, name, value );
+            return;
+        }
+        // The next character is ','
+        if ( *next == ',' ) {
+            // ',' is the first character
+            if ( total == 0 || prev_comma ) {
+                total++;
+            }
+            prev_comma = TRUE;
+            next++; //skip ','
+            SKIP_WS( next );
+        }
+        // Next character is a digit
+        if ( *next >= '0' && *next <= '9' ) {
+            int num;
+            const char *buf = next;
+            char const * msg  = NULL;
+            prev_comma = FALSE;
+            SKIP_DIGITS( next );
+            total++;
+
+            const char *tmp = next;
+            SKIP_WS( tmp );
+            if ( ( *next == ' ' || *next == '\t' ) && ( *tmp >= '0' && *tmp <= '9' ) ) {
+                KMP_WARNING( EnvSpacesNotAllowed, name, value );
+                return;
+            }
+
+            num = __kmp_str_to_int( buf, *next );
+            if ( num < 1 ) { // The number of retries should be > 0
+                msg = KMP_I18N_STR( ValueTooSmall );
+                num = 1;
+            } else if ( num > KMP_INT_MAX ) {
+                msg = KMP_I18N_STR( ValueTooLarge );
+                num = KMP_INT_MAX;
+            }
+            if ( msg != NULL ) {
+                // Message is not empty. Print warning.
+                KMP_WARNING( ParseSizeIntWarn, name, value, msg );
+                KMP_INFORM( Using_int_Value, name, num );
+            }
+            if( total == 1 ) {
+                max_retries = num;
+            } else if( total == 2 ) {
+                max_badness = num;
+            }
+        }
+    }
+    KMP_DEBUG_ASSERT( total > 0 );
+    if( total <= 0 ) {
+        KMP_WARNING( EnvSyntaxError, name, value );
+        return;
+    }
+    if( max_retries != 0 ) {
+        __kmp_adaptive_backoff_params.max_soft_retries = max_retries;
+    }
+    if( max_badness != 0 ) {
+        __kmp_adaptive_backoff_params.max_badness = max_badness;
+    }
+}
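+
+// For illustration, assuming default-initialized parameters:
+//
+//     KMP_ADAPTIVE_LOCK_PROPS=1000,500   max_soft_retries = 1000, max_badness = 500
+//     KMP_ADAPTIVE_LOCK_PROPS=,500       the leading comma skips the first slot,
+//                                        so only max_badness is set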
+
+
+static void
+__kmp_stg_print_adaptive_lock_props(kmp_str_buf_t * buffer, char const * name, void * data )
+{
+    if( __kmp_env_format ) {
+        KMP_STR_BUF_PRINT_NAME_EX(name);
+    } else {
+        __kmp_str_buf_print( buffer, "   %s='", name );
+    }
+    __kmp_str_buf_print( buffer, "%d,%d'\n", __kmp_adaptive_backoff_params.max_soft_retries,
+                         __kmp_adaptive_backoff_params.max_badness );
+} // __kmp_stg_print_adaptive_lock_props
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+
+static void
+__kmp_stg_parse_speculative_statsfile( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_file( name, value, "", & __kmp_speculative_statsfile );
+} // __kmp_stg_parse_speculative_statsfile
+
+static void
+__kmp_stg_print_speculative_statsfile( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    if ( __kmp_str_match( "-", 0, __kmp_speculative_statsfile )  ) {
+        __kmp_stg_print_str( buffer, name, "stdout" );
+    } else {
+        __kmp_stg_print_str( buffer, name, __kmp_speculative_statsfile );
+    }
+
+} // __kmp_stg_print_speculative_statsfile
+
+#endif // KMP_DEBUG_ADAPTIVE_LOCKS
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+// -------------------------------------------------------------------------------------------------
+// KMP_PLACE_THREADS
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_place_threads( char const * name, char const * value, void * data ) {
+    // Value example: 5Cx2Tx15O
+    // Which means "use 5 cores with offset 15, 2 threads per core"
+
+    int         num;
+    int         prev_delim = 0;
+    const char *next = value;
+    const char *prev;
+
+    SKIP_WS( next );
+    if ( *next == '\0' ) {
+        return;   // leave default values
+    }
+
+    // Get num_cores first
+    if ( *next >= '0' && *next <= '9' ) {
+        prev = next;
+        SKIP_DIGITS( next );
+        num = __kmp_str_to_int( prev, *next );
+        SKIP_WS( next );
+        if ( *next == 'C' || *next == 'c' ) {
+            __kmp_place_num_cores = num;
+            next++;
+        } else if ( *next == ',' || *next == 'x' ) {
+            __kmp_place_num_cores = num;
+            prev_delim = 1;
+            next++;
+        } else if ( *next == 'T' || *next == 't' ) {
+            __kmp_place_num_threads_per_core = num;
+            return;   // the offset value is ignored when all cores are used
+        } else if ( *next == '\0' ) {
+            __kmp_place_num_cores = num;
+            return;   // the only value provided
+        } else {
+            KMP_WARNING( AffThrPlaceInvalid, name, value );
+            return;
+        }
+    } else if ( *next == ',' || *next == 'x' ) {
+        // First character is delimiter, skip it, leave num_cores default value
+        prev_delim = 2;
+        next++;
+    } else {
+        KMP_WARNING( AffThrPlaceInvalid, name, value );
+        return;
+    }
+    SKIP_WS( next );
+    if ( *next == '\0' ) {
+        return;   // " n  " - something like this
+    }
+    if ( ( *next == ',' || *next == 'x' ) && !prev_delim ) {
+        prev_delim = 1;
+        next++;   // skip delimiter after num_core value
+        SKIP_WS( next );
+    }
+
+    // Get threads_per_core next
+    if ( *next >= '0' && *next <= '9' ) {
+        prev_delim = 0;
+        prev = next;
+        SKIP_DIGITS( next );
+        num = __kmp_str_to_int( prev, *next );
+        SKIP_WS( next );
+        if ( *next == 'T' || *next == 't' ) {
+            __kmp_place_num_threads_per_core = num;
+            next++;
+        } else if ( *next == ',' || *next == 'x' ) {
+            __kmp_place_num_threads_per_core = num;
+            prev_delim = 1;
+            next++;
+        } else if ( *next == 'O' || *next == 'o' ) {
+            __kmp_place_core_offset = num;
+            return;   // threads_per_core remains default
+        } else if ( *next == '\0' ) {
+            __kmp_place_num_threads_per_core = num;
+            return;
+        } else {
+            KMP_WARNING( AffThrPlaceInvalid, name, value );
+            return;
+        }
+    } else if ( *next == ',' || *next == 'x' ) {
+        if ( prev_delim == 2 ) {
+            return; // an offset value alone makes no sense, thus skip the rest
+        }
+        KMP_DEBUG_ASSERT( prev_delim == 1 );
+        next++;     // no value for threads_per_core provided
+    } else {
+        KMP_WARNING( AffThrPlaceInvalid, name, value );
+        return;
+    }
+    SKIP_WS( next );
+    if ( *next == '\0' ) {
+        return;   // " nC,mT  " - something like this
+    }
+    if ( ( *next == ',' || *next == 'x' ) && !prev_delim ) {
+        prev_delim = 1;
+        next++;   // skip delimiter after threads_per_core value
+        SKIP_WS( next );
+    }
+
+    // Get core offset last if any,
+    // don't bother checking syntax after all data obtained
+    if ( *next >= '0' && *next <= '9' ) {
+        prev = next;
+        SKIP_DIGITS( next );
+        num = __kmp_str_to_int( prev, *next );
+        __kmp_place_core_offset = num;
+    }
+}
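+
+// For illustration, partial forms the parser above accepts in addition to the
+// full "5Cx2Tx15O" example:
+//
+//     KMP_PLACE_THREADS=4C       4 cores, default threads per core
+//     KMP_PLACE_THREADS=2T       2 threads per core on all cores (offset ignored)
+//     KMP_PLACE_THREADS=4Cx2T    4 cores, 2 threads per core, default offset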
+
+static void
+__kmp_stg_print_place_threads( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    if ( __kmp_place_num_cores + __kmp_place_num_threads_per_core ) {
+        kmp_str_buf_t buf;
+        __kmp_str_buf_init( &buf );
+        if( __kmp_env_format ) {
+            KMP_STR_BUF_PRINT_NAME_EX(name);
+        } else {
+            __kmp_str_buf_print( buffer, "   %s='", name );
+        }
+        __kmp_str_buf_print( &buf, "%dC", __kmp_place_num_cores );
+        __kmp_str_buf_print( &buf, "x%dT", __kmp_place_num_threads_per_core );
+        if ( __kmp_place_core_offset ) {
+            __kmp_str_buf_print( &buf, ",%dO", __kmp_place_core_offset );
+        }
+        __kmp_str_buf_print(buffer, "%s'\n", buf.str );
+        __kmp_str_buf_free(&buf);
+/*
+    } else {
+        __kmp_str_buf_print( buffer, "   %s: %s \n", name, KMP_I18N_STR( NotDefined ) );
+*/
+    }
+}
+
+#if USE_ITT_BUILD
+// -------------------------------------------------------------------------------------------------
+// KMP_FORKJOIN_FRAMES
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_forkjoin_frames( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_bool( name, value, & __kmp_forkjoin_frames );
+} // __kmp_stg_parse_forkjoin_frames
+
+static void
+__kmp_stg_print_forkjoin_frames( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_forkjoin_frames );
+} // __kmp_stg_print_forkjoin_frames
+
+// -------------------------------------------------------------------------------------------------
+// KMP_FORKJOIN_FRAMES_MODE
+// -------------------------------------------------------------------------------------------------
+
+static void
+__kmp_stg_parse_forkjoin_frames_mode( char const * name, char const * value, void * data ) {
+    __kmp_stg_parse_int( name, value, 0, 3, & __kmp_forkjoin_frames_mode );
+} // __kmp_stg_parse_forkjoin_frames
+
+static void
+__kmp_stg_print_forkjoin_frames_mode( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_int( buffer, name, __kmp_forkjoin_frames_mode );
+} // __kmp_stg_print_forkjoin_frames
+#endif /* USE_ITT_BUILD */
+
+// -------------------------------------------------------------------------------------------------
+// OMP_DISPLAY_ENV
+// -------------------------------------------------------------------------------------------------
+
+#if OMP_40_ENABLED
+
+static void
+__kmp_stg_parse_omp_display_env( char const * name, char const * value, void * data )
+{
+    if ( __kmp_str_match( "VERBOSE", 1, value ) )
+    {
+        __kmp_display_env_verbose = TRUE;
+    } else {
+        __kmp_stg_parse_bool( name, value, & __kmp_display_env );
+    }
+
+} // __kmp_stg_parse_omp_display_env
+
+static void
+__kmp_stg_print_omp_display_env( kmp_str_buf_t * buffer, char const * name, void * data )
+{
+    if ( __kmp_display_env_verbose )
+    {
+        __kmp_stg_print_str( buffer, name, "VERBOSE" );
+    } else {
+        __kmp_stg_print_bool( buffer, name, __kmp_display_env );
+    }
+} // __kmp_stg_print_omp_display_env
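+// Informally: OMP_DISPLAY_ENV accepts the usual boolean spellings plus the
+// special value VERBOSE; a plain true prints only the OMP_* settings, while
+// VERBOSE also prints implementation-specific ones (see __kmp_env_print_2).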
+
+static void
+__kmp_stg_parse_omp_cancellation( char const * name, char const * value, void * data ) {
+    if ( TCR_4(__kmp_init_parallel) ) {
+        KMP_WARNING( EnvParallelWarn, name );
+        return;
+    }   // read value before first parallel only
+    __kmp_stg_parse_bool( name, value, & __kmp_omp_cancellation );
+} // __kmp_stg_parse_omp_cancellation
+
+static void
+__kmp_stg_print_omp_cancellation( kmp_str_buf_t * buffer, char const * name, void * data ) {
+    __kmp_stg_print_bool( buffer, name, __kmp_omp_cancellation );
+} // __kmp_stg_print_omp_cancellation
+
+#endif
+
+// -------------------------------------------------------------------------------------------------
+// Table.
+// -------------------------------------------------------------------------------------------------
+
+
+static kmp_setting_t __kmp_stg_table[] = {
+
+    { "KMP_ALL_THREADS",                   __kmp_stg_parse_all_threads,        __kmp_stg_print_all_threads,        NULL, 0, 0 },
+    { "KMP_BLOCKTIME",                     __kmp_stg_parse_blocktime,          __kmp_stg_print_blocktime,          NULL, 0, 0 },
+    { "KMP_DUPLICATE_LIB_OK",              __kmp_stg_parse_duplicate_lib_ok,   __kmp_stg_print_duplicate_lib_ok,   NULL, 0, 0 },
+    { "KMP_LIBRARY",                       __kmp_stg_parse_wait_policy,        __kmp_stg_print_wait_policy,        NULL, 0, 0 },
+    { "KMP_MAX_THREADS",                   __kmp_stg_parse_all_threads,        NULL,                               NULL, 0, 0 }, // For backward compatibility
+    { "KMP_MONITOR_STACKSIZE",             __kmp_stg_parse_monitor_stacksize,  __kmp_stg_print_monitor_stacksize,  NULL, 0, 0 },
+    { "KMP_SETTINGS",                      __kmp_stg_parse_settings,           __kmp_stg_print_settings,           NULL, 0, 0 },
+    { "KMP_STACKOFFSET",                   __kmp_stg_parse_stackoffset,        __kmp_stg_print_stackoffset,        NULL, 0, 0 },
+    { "KMP_STACKSIZE",                     __kmp_stg_parse_stacksize,          __kmp_stg_print_stacksize,          NULL, 0, 0 },
+    { "KMP_STACKPAD",                      __kmp_stg_parse_stackpad,           __kmp_stg_print_stackpad,           NULL, 0, 0 },
+    { "KMP_VERSION",                       __kmp_stg_parse_version,            __kmp_stg_print_version,            NULL, 0, 0 },
+    { "KMP_WARNINGS",                      __kmp_stg_parse_warnings,           __kmp_stg_print_warnings,           NULL, 0, 0 },
+
+    { "OMP_NESTED",                        __kmp_stg_parse_nested,             __kmp_stg_print_nested,             NULL, 0, 0 },
+    { "OMP_NUM_THREADS",                   __kmp_stg_parse_num_threads,        __kmp_stg_print_num_threads,        NULL, 0, 0 },
+    { "OMP_STACKSIZE",                     __kmp_stg_parse_stacksize,          __kmp_stg_print_stacksize,          NULL, 0, 0 },
+
+    { "KMP_TASKING",                       __kmp_stg_parse_tasking,            __kmp_stg_print_tasking,            NULL, 0, 0 },
+    { "KMP_TASK_STEALING_CONSTRAINT",      __kmp_stg_parse_task_stealing,      __kmp_stg_print_task_stealing,      NULL, 0, 0 },
+    { "OMP_MAX_ACTIVE_LEVELS",             __kmp_stg_parse_max_active_levels,  __kmp_stg_print_max_active_levels,  NULL, 0, 0 },
+    { "OMP_THREAD_LIMIT",                  __kmp_stg_parse_all_threads,        __kmp_stg_print_all_threads,        NULL, 0, 0 },
+    { "OMP_WAIT_POLICY",                   __kmp_stg_parse_wait_policy,        __kmp_stg_print_wait_policy,        NULL, 0, 0 },
+#if KMP_NESTED_HOT_TEAMS
+    { "KMP_HOT_TEAMS_MAX_LEVEL",           __kmp_stg_parse_hot_teams_level,    __kmp_stg_print_hot_teams_level,    NULL, 0, 0 },
+    { "KMP_HOT_TEAMS_MODE",                __kmp_stg_parse_hot_teams_mode,     __kmp_stg_print_hot_teams_mode,     NULL, 0, 0 },
+#endif // KMP_NESTED_HOT_TEAMS
+
+#if KMP_HANDLE_SIGNALS
+    { "KMP_HANDLE_SIGNALS",                __kmp_stg_parse_handle_signals,     __kmp_stg_print_handle_signals,     NULL, 0, 0 },
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    { "KMP_INHERIT_FP_CONTROL",            __kmp_stg_parse_inherit_fp_control, __kmp_stg_print_inherit_fp_control, NULL, 0, 0 },
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#ifdef KMP_GOMP_COMPAT
+    { "GOMP_STACKSIZE",                    __kmp_stg_parse_stacksize,          NULL,                               NULL, 0, 0 },
+#endif
+
+#ifdef KMP_DEBUG
+    { "KMP_A_DEBUG",                       __kmp_stg_parse_a_debug,            __kmp_stg_print_a_debug,            NULL, 0, 0 },
+    { "KMP_B_DEBUG",                       __kmp_stg_parse_b_debug,            __kmp_stg_print_b_debug,            NULL, 0, 0 },
+    { "KMP_C_DEBUG",                       __kmp_stg_parse_c_debug,            __kmp_stg_print_c_debug,            NULL, 0, 0 },
+    { "KMP_D_DEBUG",                       __kmp_stg_parse_d_debug,            __kmp_stg_print_d_debug,            NULL, 0, 0 },
+    { "KMP_E_DEBUG",                       __kmp_stg_parse_e_debug,            __kmp_stg_print_e_debug,            NULL, 0, 0 },
+    { "KMP_F_DEBUG",                       __kmp_stg_parse_f_debug,            __kmp_stg_print_f_debug,            NULL, 0, 0 },
+    { "KMP_DEBUG",                         __kmp_stg_parse_debug,              NULL, /* no print */                NULL, 0, 0 },
+    { "KMP_DEBUG_BUF",                     __kmp_stg_parse_debug_buf,          __kmp_stg_print_debug_buf,          NULL, 0, 0 },
+    { "KMP_DEBUG_BUF_ATOMIC",              __kmp_stg_parse_debug_buf_atomic,   __kmp_stg_print_debug_buf_atomic,   NULL, 0, 0 },
+    { "KMP_DEBUG_BUF_CHARS",               __kmp_stg_parse_debug_buf_chars,    __kmp_stg_print_debug_buf_chars,    NULL, 0, 0 },
+    { "KMP_DEBUG_BUF_LINES",               __kmp_stg_parse_debug_buf_lines,    __kmp_stg_print_debug_buf_lines,    NULL, 0, 0 },
+    { "KMP_DIAG",                          __kmp_stg_parse_diag,               __kmp_stg_print_diag,               NULL, 0, 0 },
+
+    { "KMP_PAR_RANGE",                     __kmp_stg_parse_par_range_env,      __kmp_stg_print_par_range_env,      NULL, 0, 0 },
+    { "KMP_YIELD_CYCLE",                   __kmp_stg_parse_yield_cycle,        __kmp_stg_print_yield_cycle,        NULL, 0, 0 },
+    { "KMP_YIELD_ON",                      __kmp_stg_parse_yield_on,           __kmp_stg_print_yield_on,           NULL, 0, 0 },
+    { "KMP_YIELD_OFF",                     __kmp_stg_parse_yield_off,          __kmp_stg_print_yield_off,          NULL, 0, 0 },
+#endif // KMP_DEBUG
+
+    { "KMP_ALIGN_ALLOC",                   __kmp_stg_parse_align_alloc,        __kmp_stg_print_align_alloc,        NULL, 0, 0 },
+
+    { "KMP_PLAIN_BARRIER",                 __kmp_stg_parse_barrier_branch_bit, __kmp_stg_print_barrier_branch_bit, NULL, 0, 0 },
+    { "KMP_PLAIN_BARRIER_PATTERN",         __kmp_stg_parse_barrier_pattern,    __kmp_stg_print_barrier_pattern,    NULL, 0, 0 },
+    { "KMP_FORKJOIN_BARRIER",              __kmp_stg_parse_barrier_branch_bit, __kmp_stg_print_barrier_branch_bit, NULL, 0, 0 },
+    { "KMP_FORKJOIN_BARRIER_PATTERN",      __kmp_stg_parse_barrier_pattern,    __kmp_stg_print_barrier_pattern,    NULL, 0, 0 },
+#if KMP_FAST_REDUCTION_BARRIER
+    { "KMP_REDUCTION_BARRIER",             __kmp_stg_parse_barrier_branch_bit, __kmp_stg_print_barrier_branch_bit, NULL, 0, 0 },
+    { "KMP_REDUCTION_BARRIER_PATTERN",     __kmp_stg_parse_barrier_pattern,    __kmp_stg_print_barrier_pattern,    NULL, 0, 0 },
+#endif
+
+    { "KMP_ABORT_DELAY",                   __kmp_stg_parse_abort_delay,        __kmp_stg_print_abort_delay,        NULL, 0, 0 },
+    { "KMP_CPUINFO_FILE",                  __kmp_stg_parse_cpuinfo_file,       __kmp_stg_print_cpuinfo_file,       NULL, 0, 0 },
+    { "KMP_FORCE_REDUCTION",               __kmp_stg_parse_force_reduction,    __kmp_stg_print_force_reduction,    NULL, 0, 0 },
+    { "KMP_DETERMINISTIC_REDUCTION",       __kmp_stg_parse_force_reduction,    __kmp_stg_print_force_reduction,    NULL, 0, 0 },
+    { "KMP_STORAGE_MAP",                   __kmp_stg_parse_storage_map,        __kmp_stg_print_storage_map,        NULL, 0, 0 },
+    { "KMP_ALL_THREADPRIVATE",             __kmp_stg_parse_all_threadprivate,  __kmp_stg_print_all_threadprivate,  NULL, 0, 0 },
+    { "KMP_FOREIGN_THREADS_THREADPRIVATE", __kmp_stg_parse_foreign_threads_threadprivate, __kmp_stg_print_foreign_threads_threadprivate,     NULL, 0, 0 },
+
+#if KMP_AFFINITY_SUPPORTED
+    { "KMP_AFFINITY",                      __kmp_stg_parse_affinity,           __kmp_stg_print_affinity,           NULL, 0, 0 },
+# ifdef KMP_GOMP_COMPAT
+    { "GOMP_CPU_AFFINITY",                 __kmp_stg_parse_gomp_cpu_affinity,  NULL, /* no print */                NULL, 0, 0 },
+# endif /* KMP_GOMP_COMPAT */
+# if OMP_40_ENABLED
+    { "OMP_PROC_BIND",                     __kmp_stg_parse_proc_bind,          __kmp_stg_print_proc_bind,          NULL, 0, 0 },
+    { "OMP_PLACES",                        __kmp_stg_parse_places,             __kmp_stg_print_places,             NULL, 0, 0 },
+# else
+    { "OMP_PROC_BIND",                     __kmp_stg_parse_proc_bind,          NULL, /* no print */                NULL, 0, 0 },
+# endif /* OMP_40_ENABLED */
+
+    { "KMP_TOPOLOGY_METHOD",               __kmp_stg_parse_topology_method,    __kmp_stg_print_topology_method,    NULL, 0, 0 },
+
+#else
+
+    //
+    // KMP_AFFINITY is not supported on OS X*, nor is OMP_PLACES.
+    // OMP_PROC_BIND and proc-bind-var are supported, however.
+    //
+# if OMP_40_ENABLED
+    { "OMP_PROC_BIND",                     __kmp_stg_parse_proc_bind,          __kmp_stg_print_proc_bind,          NULL, 0, 0 },
+# endif
+
+#endif // KMP_AFFINITY_SUPPORTED
+
+    { "KMP_INIT_AT_FORK",                  __kmp_stg_parse_init_at_fork,       __kmp_stg_print_init_at_fork,       NULL, 0, 0 },
+    { "KMP_SCHEDULE",                      __kmp_stg_parse_schedule,           __kmp_stg_print_schedule,           NULL, 0, 0 },
+    { "OMP_SCHEDULE",                      __kmp_stg_parse_omp_schedule,       __kmp_stg_print_omp_schedule,       NULL, 0, 0 },
+    { "KMP_ATOMIC_MODE",                   __kmp_stg_parse_atomic_mode,        __kmp_stg_print_atomic_mode,        NULL, 0, 0 },
+    { "KMP_CONSISTENCY_CHECK",             __kmp_stg_parse_consistency_check,  __kmp_stg_print_consistency_check,  NULL, 0, 0 },
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    { "KMP_ITT_PREPARE_DELAY",             __kmp_stg_parse_itt_prepare_delay,  __kmp_stg_print_itt_prepare_delay,  NULL, 0, 0 },
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+    { "KMP_MALLOC_POOL_INCR",              __kmp_stg_parse_malloc_pool_incr,   __kmp_stg_print_malloc_pool_incr,   NULL, 0, 0 },
+    { "KMP_INIT_WAIT",                     __kmp_stg_parse_init_wait,          __kmp_stg_print_init_wait,          NULL, 0, 0 },
+    { "KMP_NEXT_WAIT",                     __kmp_stg_parse_next_wait,          __kmp_stg_print_next_wait,          NULL, 0, 0 },
+    { "KMP_GTID_MODE",                     __kmp_stg_parse_gtid_mode,          __kmp_stg_print_gtid_mode,          NULL, 0, 0 },
+    { "OMP_DYNAMIC",                       __kmp_stg_parse_omp_dynamic,        __kmp_stg_print_omp_dynamic,        NULL, 0, 0 },
+    { "KMP_DYNAMIC_MODE",                  __kmp_stg_parse_kmp_dynamic_mode,   __kmp_stg_print_kmp_dynamic_mode,   NULL, 0, 0 },
+
+#ifdef USE_LOAD_BALANCE
+    { "KMP_LOAD_BALANCE_INTERVAL",         __kmp_stg_parse_ld_balance_interval,__kmp_stg_print_ld_balance_interval,NULL, 0, 0 },
+#endif
+
+    { "KMP_NUM_LOCKS_IN_BLOCK",            __kmp_stg_parse_lock_block,         __kmp_stg_print_lock_block,         NULL, 0, 0 },
+    { "KMP_LOCK_KIND",                     __kmp_stg_parse_lock_kind,          __kmp_stg_print_lock_kind,          NULL, 0, 0 },
+#if KMP_USE_ADAPTIVE_LOCKS
+    { "KMP_ADAPTIVE_LOCK_PROPS",           __kmp_stg_parse_adaptive_lock_props,__kmp_stg_print_adaptive_lock_props,  NULL, 0, 0 },
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+    { "KMP_SPECULATIVE_STATSFILE",         __kmp_stg_parse_speculative_statsfile,__kmp_stg_print_speculative_statsfile,  NULL, 0, 0 },
+#endif
+#endif // KMP_USE_ADAPTIVE_LOCKS
+    { "KMP_PLACE_THREADS",                 __kmp_stg_parse_place_threads,      __kmp_stg_print_place_threads,      NULL, 0, 0 },
+#if USE_ITT_BUILD
+    { "KMP_FORKJOIN_FRAMES",               __kmp_stg_parse_forkjoin_frames,    __kmp_stg_print_forkjoin_frames,    NULL, 0, 0 },
+    { "KMP_FORKJOIN_FRAMES_MODE",          __kmp_stg_parse_forkjoin_frames_mode,__kmp_stg_print_forkjoin_frames_mode,  NULL, 0, 0 },
+#endif
+
+# if OMP_40_ENABLED
+    { "OMP_DISPLAY_ENV",                   __kmp_stg_parse_omp_display_env,    __kmp_stg_print_omp_display_env,    NULL, 0, 0 },
+    { "OMP_CANCELLATION",                  __kmp_stg_parse_omp_cancellation,   __kmp_stg_print_omp_cancellation,   NULL, 0, 0 },
+#endif
+    { "",                                  NULL,                               NULL,                               NULL, 0, 0 }
+}; // settings
+
+static int const __kmp_stg_count = sizeof( __kmp_stg_table ) / sizeof( kmp_setting_t );
+
+static inline
+kmp_setting_t *
+__kmp_stg_find( char const * name ) {
+
+    int i;
+    if ( name != NULL ) {
+        for ( i = 0; i < __kmp_stg_count; ++ i ) {
+            if ( strcmp( __kmp_stg_table[ i ].name, name ) == 0 ) {
+                return & __kmp_stg_table[ i ];
+            }; // if
+        }; // for
+    }; // if
+    return NULL;
+
+} // __kmp_stg_find
+
+
+static int
+__kmp_stg_cmp( void const * _a, void const * _b ) {
+    kmp_setting_t * a = (kmp_setting_t *) _a;
+    kmp_setting_t * b = (kmp_setting_t *) _b;
+
+    //
+    // Process KMP_AFFINITY last.
+    // It needs to come after OMP_PLACES and GOMP_CPU_AFFINITY.
+    //
+    if ( strcmp( a->name, "KMP_AFFINITY" ) == 0 ) {
+        if ( strcmp( b->name, "KMP_AFFINITY" ) == 0 ) {
+            return 0;
+        }
+        return 1;
+    }
+    else if ( strcmp( b->name, "KMP_AFFINITY" ) == 0 ) {
+        return -1;
+    }
+    return strcmp( a->name, b->name );
+} // __kmp_stg_cmp
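+// With this comparator, the qsort() in __kmp_stg_init() leaves the table
+// alphabetically ordered except that KMP_AFFINITY always sorts last, so it
+// ends up after its rivals OMP_PLACES and GOMP_CPU_AFFINITY, e.g.
+// (informally): GOMP_CPU_AFFINITY < KMP_BLOCKTIME < OMP_PLACES < KMP_AFFINITY.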
+
+
+static void
+__kmp_stg_init( void ) {
+
+    static int initialized = 0;
+
+    if ( ! initialized ) {
+
+        // Sort table.
+        qsort( __kmp_stg_table, __kmp_stg_count - 1, sizeof( kmp_setting_t ), __kmp_stg_cmp );
+
+        { // Initialize *_STACKSIZE data.
+
+            kmp_setting_t * kmp_stacksize  = __kmp_stg_find( "KMP_STACKSIZE"  );      // 1st priority.
+#ifdef KMP_GOMP_COMPAT
+            kmp_setting_t * gomp_stacksize = __kmp_stg_find( "GOMP_STACKSIZE" );      // 2nd priority.
+#endif
+            kmp_setting_t * omp_stacksize  = __kmp_stg_find( "OMP_STACKSIZE"  );      // 3rd priority.
+
+            // !!! The volatile keyword is a workaround for Intel (R) C Compiler bug CQ49908:
+            // !!! the compiler does not understand that rivals is used and optimizes out
+            // !!!     the assignments rivals[ i ++ ] = ...;
+            static kmp_setting_t * volatile rivals[ 4 ];
+            static kmp_stg_ss_data_t kmp_data  = {    1, (kmp_setting_t **)rivals };
+#ifdef KMP_GOMP_COMPAT
+            static kmp_stg_ss_data_t gomp_data = { 1024, (kmp_setting_t **)rivals };
+#endif
+            static kmp_stg_ss_data_t omp_data  = { 1024, (kmp_setting_t **)rivals };
+            int i = 0;
+
+            rivals[ i ++ ] = kmp_stacksize;
+#ifdef KMP_GOMP_COMPAT
+            if ( gomp_stacksize != NULL ) {
+                rivals[ i ++ ] = gomp_stacksize;
+            }; // if
+#endif
+            rivals[ i ++ ] = omp_stacksize;
+            rivals[ i ++ ] = NULL;
+
+            kmp_stacksize->data = & kmp_data;
+#ifdef KMP_GOMP_COMPAT
+            if ( gomp_stacksize != NULL ) {
+                gomp_stacksize->data = & gomp_data;
+            }; // if
+#endif
+            omp_stacksize->data = & omp_data;
+
+        }
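+        // How the rivals mechanism reads here (an informal sketch): if the user
+        // sets both KMP_STACKSIZE and OMP_STACKSIZE, the rivals list makes the
+        // higher-priority KMP_STACKSIZE win; the OMP_STACKSIZE value is then
+        // ignored with a warning (see __kmp_stg_check_rivals below).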
+
+        { // Initialize KMP_LIBRARY and OMP_WAIT_POLICY data.
+
+            kmp_setting_t * kmp_library     = __kmp_stg_find( "KMP_LIBRARY" );        // 1st priority.
+            kmp_setting_t * omp_wait_policy = __kmp_stg_find( "OMP_WAIT_POLICY" );    // 2nd priority.
+
+            // !!! The volatile keyword is a workaround for Intel (R) C Compiler bug CQ49908.
+            static kmp_setting_t * volatile rivals[ 3 ];
+            static kmp_stg_wp_data_t kmp_data  = { 0, (kmp_setting_t **)rivals };
+            static kmp_stg_wp_data_t omp_data  = { 1, (kmp_setting_t **)rivals };
+            int i = 0;
+
+            rivals[ i ++ ] = kmp_library;
+            if ( omp_wait_policy != NULL ) {
+                rivals[ i ++ ] = omp_wait_policy;
+            }; // if
+            rivals[ i ++ ] = NULL;
+
+            kmp_library->data  = & kmp_data;
+            if ( omp_wait_policy != NULL ) {
+                omp_wait_policy->data = & omp_data;
+            }; // if
+
+        }
+
+        { // Initialize KMP_ALL_THREADS, KMP_MAX_THREADS, and OMP_THREAD_LIMIT data.
+
+            kmp_setting_t * kmp_all_threads  = __kmp_stg_find( "KMP_ALL_THREADS"  );  // 1st priority.
+            kmp_setting_t * kmp_max_threads  = __kmp_stg_find( "KMP_MAX_THREADS"  );  // 2nd priority.
+            kmp_setting_t * omp_thread_limit = __kmp_stg_find( "OMP_THREAD_LIMIT" );  // 3rd priority.
+
+            // !!! The volatile keyword is a workaround for Intel (R) C Compiler bug CQ49908.
+            static kmp_setting_t * volatile rivals[ 4 ];
+            int i = 0;
+
+            rivals[ i ++ ] = kmp_all_threads;
+            rivals[ i ++ ] = kmp_max_threads;
+            if ( omp_thread_limit != NULL ) {
+                rivals[ i ++ ] = omp_thread_limit;
+            }; // if
+            rivals[ i ++ ] = NULL;
+
+            kmp_all_threads->data = (void*)& rivals;
+            kmp_max_threads->data = (void*)& rivals;
+            if ( omp_thread_limit != NULL ) {
+                omp_thread_limit->data = (void*)& rivals;
+            }; // if
+
+        }
+
+#if KMP_AFFINITY_SUPPORTED
+        { // Initialize KMP_AFFINITY, GOMP_CPU_AFFINITY, and OMP_PROC_BIND data.
+
+            kmp_setting_t * kmp_affinity = __kmp_stg_find( "KMP_AFFINITY"  );  // 1st priority.
+            KMP_DEBUG_ASSERT( kmp_affinity != NULL );
+
+# ifdef KMP_GOMP_COMPAT
+            kmp_setting_t * gomp_cpu_affinity = __kmp_stg_find( "GOMP_CPU_AFFINITY"  );  // 2nd priority.
+            KMP_DEBUG_ASSERT( gomp_cpu_affinity != NULL );
+# endif
+
+            kmp_setting_t * omp_proc_bind = __kmp_stg_find( "OMP_PROC_BIND" );  // 3rd priority.
+            KMP_DEBUG_ASSERT( omp_proc_bind != NULL );
+
+            // !!! The volatile keyword is a workaround for Intel (R) C Compiler bug CQ49908.
+            static kmp_setting_t * volatile rivals[ 4 ];
+            int i = 0;
+
+            rivals[ i ++ ] = kmp_affinity;
+
+# ifdef KMP_GOMP_COMPAT
+            rivals[ i ++ ] = gomp_cpu_affinity;
+            gomp_cpu_affinity->data = (void*)& rivals;
+# endif
+
+            rivals[ i ++ ] = omp_proc_bind;
+            omp_proc_bind->data = (void*)& rivals;
+            rivals[ i ++ ] = NULL;
+
+# if OMP_40_ENABLED
+            static kmp_setting_t * volatile places_rivals[ 4 ];
+            i = 0;
+
+            kmp_setting_t * omp_places = __kmp_stg_find( "OMP_PLACES" );  // 3rd priority.
+            KMP_DEBUG_ASSERT( omp_places != NULL );
+
+            places_rivals[ i ++ ] = kmp_affinity;
+#  ifdef KMP_GOMP_COMPAT
+            places_rivals[ i ++ ] = gomp_cpu_affinity;
+#  endif
+            places_rivals[ i ++ ] = omp_places;
+            omp_places->data = (void*)& places_rivals;
+            places_rivals[ i ++ ] = NULL;
+# endif
+        }
+#else
+    // KMP_AFFINITY not supported, so OMP_PROC_BIND has no rivals.
+    // OMP_PLACES not supported yet.
+#endif // KMP_AFFINITY_SUPPORTED
+
+        { // Initialize KMP_DETERMINISTIC_REDUCTION and KMP_FORCE_REDUCTION data.
+
+            kmp_setting_t * kmp_force_red  = __kmp_stg_find( "KMP_FORCE_REDUCTION" );         // 1st priority.
+            kmp_setting_t * kmp_determ_red = __kmp_stg_find( "KMP_DETERMINISTIC_REDUCTION" ); // 2nd priority.
+
+            // !!! The volatile keyword is a workaround for Intel (R) C Compiler bug CQ49908.
+            static kmp_setting_t * volatile rivals[ 3 ];
+            static kmp_stg_fr_data_t force_data   = { 1, (kmp_setting_t **)rivals };
+            static kmp_stg_fr_data_t determ_data  = { 0, (kmp_setting_t **)rivals };
+            int i = 0;
+
+            rivals[ i ++ ] = kmp_force_red;
+            if ( kmp_determ_red != NULL ) {
+                rivals[ i ++ ] = kmp_determ_red;
+            }; // if
+            rivals[ i ++ ] = NULL;
+
+            kmp_force_red->data = & force_data;
+            if ( kmp_determ_red != NULL ) {
+                kmp_determ_red->data  = & determ_data;
+            }; // if
+        }
+
+        initialized = 1;
+
+    }; // if
+
+    // Reset flags.
+    int i;
+    for ( i = 0; i < __kmp_stg_count; ++ i ) {
+        __kmp_stg_table[ i ].set = 0;
+    }; // for
+
+} // __kmp_stg_init
+
+
+static void
+__kmp_stg_parse(
+    char const * name,
+    char const * value
+) {
+
+    // On Windows* OS there are some nameless variables like "C:=C:\" (yes, really
+    // nameless: they appear in the environment block as "=C:=C\\\x00=D:=D:\\\x00..."), so skip them.
+    if ( name[ 0 ] == 0 ) {
+        return;
+    }; // if
+
+    if ( value != NULL ) {
+        kmp_setting_t * setting = __kmp_stg_find( name );
+        if ( setting != NULL ) {
+            setting->parse( name, value, setting->data );
+            setting->defined = 1;
+        }; // if
+    }; // if
+
+} // __kmp_stg_parse
+
+
+static int
+__kmp_stg_check_rivals(          // 0 -- Ok, 1 -- errors found.
+    char const *       name,     // Name of variable.
+    char const *       value,    // Value of the variable.
+    kmp_setting_t * *  rivals    // List of rival settings (the list must include current one).
+) {
+
+    if ( rivals == NULL ) {
+        return 0;
+    }
+
+    // Loop through the higher-priority settings (listed before the current one).
+    int i = 0;
+    for ( ; strcmp( rivals[ i ]->name, name ) != 0; i++ ) {
+        KMP_DEBUG_ASSERT( rivals[ i ] != NULL );
+
+#if KMP_AFFINITY_SUPPORTED
+        if ( rivals[ i ] == __kmp_affinity_notype ) {
+            //
+            // If KMP_AFFINITY is specified without a type name,
+            // it does not rival OMP_PROC_BIND or GOMP_CPU_AFFINITY.
+            //
+            continue;
+        }
+#endif
+
+        if ( rivals[ i ]->set ) {
+            KMP_WARNING( StgIgnored, name, rivals[ i ]->name );
+            return 1;
+        }; // if
+    }; // for
+
+    ++ i; // Skip current setting.
+    return 0;
+
+}; // __kmp_stg_check_rivals
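+// Worked example (informal): when OMP_THREAD_LIMIT is parsed, its rivals list
+// is { KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT, NULL }; the loop
+// above scans the higher-priority entries listed before it, and if, say,
+// KMP_ALL_THREADS was already set, a StgIgnored warning is issued and the new
+// value is dropped.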
+
+
+static int
+__kmp_env_toPrint( char const * name, int flag ) {
+    int rc = 0;
+    kmp_setting_t * setting = __kmp_stg_find( name );
+    if ( setting != NULL ) {
+        rc = setting->defined;
+        if ( flag >= 0 ) {
+            setting->defined = flag;
+        }; // if
+    }; // if
+    return rc;
+}
+
+
+static void
+__kmp_aux_env_initialize( kmp_env_blk_t* block ) {
+
+    char const * value;
+
+    /* OMP_NUM_THREADS */
+    value = __kmp_env_blk_var( block, "OMP_NUM_THREADS" );
+    if ( value ) {
+        ompc_set_num_threads( __kmp_dflt_team_nth );
+    }
+
+    /* KMP_BLOCKTIME */
+    value = __kmp_env_blk_var( block, "KMP_BLOCKTIME" );
+    if ( value ) {
+        kmpc_set_blocktime( __kmp_dflt_blocktime );
+    }
+
+    /* OMP_NESTED */
+    value = __kmp_env_blk_var( block, "OMP_NESTED" );
+    if ( value ) {
+        ompc_set_nested( __kmp_dflt_nested );
+    }
+
+    /* OMP_DYNAMIC */
+    value = __kmp_env_blk_var( block, "OMP_DYNAMIC" );
+    if ( value ) {
+        ompc_set_dynamic( __kmp_global.g.g_dynamic );
+    }
+
+}
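+// Post-processing sketch: for each variable present in the block, the
+// already-parsed global default (not the raw string) is pushed back through
+// the corresponding public setter above, so kmp_set_defaults() takes effect
+// as if the user had called the API directly.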
+
+void
+__kmp_env_initialize( char const * string ) {
+
+    kmp_env_blk_t block;
+    int           i;
+
+    __kmp_stg_init();
+
+    // Hack!!!
+    if ( string == NULL ) {
+        // __kmp_max_nth = __kmp_sys_max_nth;
+        __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
+    }; // if
+    __kmp_env_blk_init( & block, string );
+
+    //
+    // update the set flag on all entries that have an env var
+    //
+    for ( i = 0; i < block.count; ++ i ) {
+        if (( block.vars[ i ].name == NULL )
+          || ( *block.vars[ i ].name == '\0')) {
+            continue;
+        }
+        if ( block.vars[ i ].value == NULL ) {
+            continue;
+        }
+        kmp_setting_t * setting = __kmp_stg_find( block.vars[ i ].name );
+        if ( setting != NULL ) {
+            setting->set = 1;
+        }
+    }; // for i
+
+    // Special case: when parsing the environment (rather than a string), process KMP_WARNINGS first.
+    if ( string == NULL ) {
+        char const * name  = "KMP_WARNINGS";
+        char const * value = __kmp_env_blk_var( & block, name );
+        __kmp_stg_parse( name, value );
+    }; // if
+
+#if KMP_AFFINITY_SUPPORTED
+    //
+    // Special case. KMP_AFFINITY is not a rival to other affinity env vars
+    // if no affinity type is specified.  We want to allow
+    // KMP_AFFINITY=[no]verbose/[no]warnings/etc. to be enabled when
+    // specifying the affinity type via GOMP_CPU_AFFINITY or the OMP 4.0
+    // affinity mechanism.
+    //
+    __kmp_affinity_notype = NULL;
+    char const *aff_str = __kmp_env_blk_var( & block, "KMP_AFFINITY" );
+    if ( aff_str != NULL ) {
+        //
+        // Check if the KMP_AFFINITY type is specified in the string.
+        // We just search the string for "compact", "scatter", etc.
+        // without really parsing the string.  The syntax of the
+        // KMP_AFFINITY env var is such that none of the affinity
+        // type names can appear anywhere other than the type
+        // specifier, even as substrings.
+        //
+        // I can't find a case-insensitive version of strstr on Windows* OS.
+        // Use the case-sensitive version for now.
+        //
+
+# if KMP_OS_WINDOWS
+#  define FIND strstr
+# else
+#  define FIND strcasestr
+# endif
+
+        if ( ( FIND( aff_str, "none" ) == NULL )
+          && ( FIND( aff_str, "physical" ) == NULL )
+          && ( FIND( aff_str, "logical" ) == NULL )
+          && ( FIND( aff_str, "compact" ) == NULL )
+          && ( FIND( aff_str, "scatter" ) == NULL )
+          && ( FIND( aff_str, "explicit" ) == NULL )
+          && ( FIND( aff_str, "balanced" ) == NULL )
+          && ( FIND( aff_str, "disabled" ) == NULL ) ) {
+            __kmp_affinity_notype = __kmp_stg_find( "KMP_AFFINITY"  );
+        }
+        else {
+            //
+            // A new affinity type is specified.
+            // Reset the affinity flags to their default values,
+            // in case this is called from kmp_set_defaults().
+            //
+            __kmp_affinity_type = affinity_default;
+            __kmp_affinity_gran = affinity_gran_default;
+            __kmp_affinity_top_method = affinity_top_method_default;
+            __kmp_affinity_respect_mask = affinity_respect_mask_default;
+        }
+# undef FIND
+
+#if OMP_40_ENABLED
+        //
+        // Also reset the affinity flags if OMP_PROC_BIND is specified.
+        //
+        aff_str = __kmp_env_blk_var( & block, "OMP_PROC_BIND" );
+        if ( aff_str != NULL ) {
+            __kmp_affinity_type = affinity_default;
+            __kmp_affinity_gran = affinity_gran_default;
+            __kmp_affinity_top_method = affinity_top_method_default;
+            __kmp_affinity_respect_mask = affinity_respect_mask_default;
+        }
+#endif /* OMP_40_ENABLED */
+    }
+
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+#if OMP_40_ENABLED
+    //
+    // Set up the nested proc bind type vector.
+    //
+    if ( __kmp_nested_proc_bind.bind_types == NULL ) {
+        __kmp_nested_proc_bind.bind_types = (kmp_proc_bind_t *)
+          KMP_INTERNAL_MALLOC( sizeof(kmp_proc_bind_t) );
+        if ( __kmp_nested_proc_bind.bind_types == NULL ) {
+            KMP_FATAL( MemoryAllocFailed );
+        }
+        __kmp_nested_proc_bind.size = 1;
+        __kmp_nested_proc_bind.used = 1;
+# if KMP_AFFINITY_SUPPORTED
+        __kmp_nested_proc_bind.bind_types[0] = proc_bind_default;
+# else
+        // default proc bind is false if affinity not supported
+        __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+# endif
+
+    }
+#endif /* OMP_40_ENABLED */
+
+    //
+    // Now process all of the settings.
+    //
+    for ( i = 0; i < block.count; ++ i ) {
+        __kmp_stg_parse( block.vars[ i ].name, block.vars[ i ].value );
+    }; // for i
+
+    //
+    // If user locks have not been allocated yet, pick the default lock kind
+    // and set up the lock vptr table; once user locks exist, the kind is fixed.
+    //
+    if ( ! __kmp_init_user_locks ) {
+        if ( __kmp_user_lock_kind == lk_default ) {
+            __kmp_user_lock_kind = lk_queuing;
+        }
+#if KMP_USE_DYNAMIC_LOCK
+        __kmp_init_dynamic_user_locks();
+#else
+        __kmp_set_user_lock_vptrs( __kmp_user_lock_kind );
+#endif
+    }
+    else {
+        KMP_DEBUG_ASSERT( string != NULL); // kmp_set_defaults() was called
+        KMP_DEBUG_ASSERT( __kmp_user_lock_kind != lk_default );
+        // Re-bind the lock functions to follow a transition between different
+        // KMP_CONSISTENCY_CHECK values. Calling this again is harmless as long
+        // as lock kind changes are not allowed after any user lock function
+        // has been called (and they are not).
+#if KMP_USE_DYNAMIC_LOCK
+        __kmp_init_dynamic_user_locks();
+#else
+        __kmp_set_user_lock_vptrs( __kmp_user_lock_kind );
+#endif
+    }
+
+#if KMP_AFFINITY_SUPPORTED
+
+    if ( ! TCR_4(__kmp_init_middle) ) {
+        //
+        // Determine if the machine/OS is actually capable of supporting
+        // affinity.
+        //
+        const char *var = "KMP_AFFINITY";
+        if ( __kmp_affinity_type == affinity_disabled ) {
+            KMP_AFFINITY_DISABLE();
+        }
+        else if ( ! KMP_AFFINITY_CAPABLE() ) {
+            __kmp_affinity_determine_capable( var );
+            if ( ! KMP_AFFINITY_CAPABLE() ) {
+                if ( __kmp_affinity_verbose || ( __kmp_affinity_warnings
+                  && ( __kmp_affinity_type != affinity_default )
+                  && ( __kmp_affinity_type != affinity_none )
+                  && ( __kmp_affinity_type != affinity_disabled ) ) ) {
+                    KMP_WARNING( AffNotSupported, var );
+                }
+                __kmp_affinity_type = affinity_disabled;
+                __kmp_affinity_respect_mask = 0;
+                __kmp_affinity_gran = affinity_gran_fine;
+            }
+        }
+
+# if OMP_40_ENABLED
+        if ( __kmp_affinity_type == affinity_disabled )  {
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+        }
+        else if ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_true ) {
+            //
+            // OMP_PROC_BIND=true maps to OMP_PROC_BIND=spread.
+            //
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_spread;
+        }
+# endif /* OMP_40_ENABLED */
+
+        if ( KMP_AFFINITY_CAPABLE() ) {
+
+# if KMP_GROUP_AFFINITY
+
+            //
+            // Handle the Win 64 group affinity stuff if there are multiple
+            // processor groups, or if the user requested it, and OMP 4.0
+            // affinity is not in effect.
+            //
+            if ( ( ( __kmp_num_proc_groups > 1 )
+              && ( __kmp_affinity_type == affinity_default )
+#  if OMP_40_ENABLED
+              && ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_default ) )
+#  endif
+              || ( __kmp_affinity_top_method == affinity_top_method_group ) ) {
+                if ( __kmp_affinity_respect_mask == affinity_respect_mask_default ) {
+                    __kmp_affinity_respect_mask = FALSE;
+                }
+                if ( __kmp_affinity_type == affinity_default ) {
+                    __kmp_affinity_type = affinity_compact;
+#  if OMP_40_ENABLED
+                    __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+#  endif
+                }
+                if ( __kmp_affinity_top_method == affinity_top_method_default ) {
+                    if ( __kmp_affinity_gran == affinity_gran_default ) {
+                        __kmp_affinity_top_method = affinity_top_method_group;
+                        __kmp_affinity_gran = affinity_gran_group;
+                    }
+                    else if ( __kmp_affinity_gran == affinity_gran_group ) {
+                        __kmp_affinity_top_method = affinity_top_method_group;
+                    }
+                    else {
+                        __kmp_affinity_top_method = affinity_top_method_all;
+                    }
+                }
+                else if ( __kmp_affinity_top_method == affinity_top_method_group ) {
+                    if ( __kmp_affinity_gran == affinity_gran_default ) {
+                        __kmp_affinity_gran = affinity_gran_group;
+                    }
+                    else if ( ( __kmp_affinity_gran != affinity_gran_group )
+                      && ( __kmp_affinity_gran != affinity_gran_fine )
+                      && ( __kmp_affinity_gran != affinity_gran_thread ) ) {
+                        char *str = NULL;
+                        switch ( __kmp_affinity_gran ) {
+                            case affinity_gran_core: str = "core"; break;
+                            case affinity_gran_package: str = "package"; break;
+                            case affinity_gran_node: str = "node"; break;
+                            default: KMP_DEBUG_ASSERT( 0 );
+                        }
+                        KMP_WARNING( AffGranTopGroup, var, str );
+                        __kmp_affinity_gran = affinity_gran_fine;
+                    }
+                }
+                else {
+                    if ( __kmp_affinity_gran == affinity_gran_default ) {
+                        __kmp_affinity_gran = affinity_gran_core;
+                    }
+                    else if ( __kmp_affinity_gran == affinity_gran_group ) {
+                        char *str = NULL;
+                        switch ( __kmp_affinity_type ) {
+                            case affinity_physical: str = "physical"; break;
+                            case affinity_logical: str = "logical"; break;
+                            case affinity_compact: str = "compact"; break;
+                            case affinity_scatter: str = "scatter"; break;
+                            case affinity_explicit: str = "explicit"; break;
+                            // No MIC on Windows, so no affinity_balanced case
+                            default: KMP_DEBUG_ASSERT( 0 );
+                        }
+                        KMP_WARNING( AffGranGroupType, var, str );
+                        __kmp_affinity_gran = affinity_gran_core;
+                    }
+                }
+            }
+            else
+
+# endif /* KMP_GROUP_AFFINITY */
+
+            {
+                if ( __kmp_affinity_respect_mask == affinity_respect_mask_default ) {
+# if KMP_GROUP_AFFINITY
+                    if ( __kmp_num_proc_groups > 1 ) {
+                        __kmp_affinity_respect_mask = FALSE;
+                    }
+                    else
+# endif /* KMP_GROUP_AFFINITY */
+                    {
+                        __kmp_affinity_respect_mask = TRUE;
+                    }
+                }
+# if OMP_40_ENABLED
+                if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
+                  && ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_default ) ) {
+                    if ( __kmp_affinity_type == affinity_default ) {
+                        __kmp_affinity_type = affinity_compact;
+                        __kmp_affinity_dups = FALSE;
+                    }
+                }
+                else
+# endif /* OMP_40_ENABLED */
+                if ( __kmp_affinity_type == affinity_default ) {
+#if OMP_40_ENABLED
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+                    if( __kmp_mic_type != non_mic ) {
+                        __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+                    } else
+#endif
+                    {
+                        __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+                    }
+#endif /* OMP_40_ENABLED */
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+                    if( __kmp_mic_type != non_mic ) {
+                        __kmp_affinity_type = affinity_scatter;
+                    } else
+#endif
+                    {
+                        __kmp_affinity_type = affinity_none;
+                    }
+
+                }
+                if ( ( __kmp_affinity_gran == affinity_gran_default )
+                  &&  ( __kmp_affinity_gran_levels < 0 ) ) {
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+                    if( __kmp_mic_type != non_mic ) {
+                        __kmp_affinity_gran = affinity_gran_fine;
+                    } else
+#endif
+                    {
+                        __kmp_affinity_gran = affinity_gran_core;
+                    }
+                }
+                if ( __kmp_affinity_top_method == affinity_top_method_default ) {
+                    __kmp_affinity_top_method = affinity_top_method_all;
+                }
+            }
+        }
+
+        K_DIAG( 1, ( "__kmp_affinity_type         == %d\n", __kmp_affinity_type         ) );
+        K_DIAG( 1, ( "__kmp_affinity_compact      == %d\n", __kmp_affinity_compact      ) );
+        K_DIAG( 1, ( "__kmp_affinity_offset       == %d\n", __kmp_affinity_offset       ) );
+        K_DIAG( 1, ( "__kmp_affinity_verbose      == %d\n", __kmp_affinity_verbose      ) );
+        K_DIAG( 1, ( "__kmp_affinity_warnings     == %d\n", __kmp_affinity_warnings     ) );
+        K_DIAG( 1, ( "__kmp_affinity_respect_mask == %d\n", __kmp_affinity_respect_mask ) );
+        K_DIAG( 1, ( "__kmp_affinity_gran         == %d\n", __kmp_affinity_gran         ) );
+
+        KMP_DEBUG_ASSERT( __kmp_affinity_type != affinity_default);
+# if OMP_40_ENABLED
+        KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.bind_types[0] != proc_bind_default );
+# endif
+    }
+
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+    if ( __kmp_version ) {
+        __kmp_print_version_1();
+    }; // if
+
+    // Post-initialization step: some env. vars need further processing of their values
+    if ( string != NULL) { // kmp_set_defaults() was called
+        __kmp_aux_env_initialize( &block );
+    }
+
+    __kmp_env_blk_free( & block );
+
+    KMP_MB();
+
+} // __kmp_env_initialize
+
+
+void
+__kmp_env_print() {
+
+    kmp_env_blk_t block;
+    int           i;
+    kmp_str_buf_t buffer;
+
+    __kmp_stg_init();
+    __kmp_str_buf_init( & buffer );
+
+    __kmp_env_blk_init( & block, NULL );
+    __kmp_env_blk_sort( & block );
+
+    // Print real environment values.
+    __kmp_str_buf_print( & buffer, "\n%s\n\n", KMP_I18N_STR( UserSettings )  );
+    for ( i = 0; i < block.count; ++ i ) {
+        char const * name  = block.vars[ i ].name;
+        char const * value = block.vars[ i ].value;
+        if (
+            ( KMP_STRLEN( name ) > 4 && strncmp( name, "KMP_", 4 ) == 0 )
+            || strncmp( name, "OMP_", 4 ) == 0
+            #ifdef KMP_GOMP_COMPAT
+                || strncmp( name, "GOMP_", 5 ) == 0
+            #endif // KMP_GOMP_COMPAT
+        ) {
+            __kmp_str_buf_print( & buffer, "   %s=%s\n", name, value );
+        }; // if
+    }; // for
+    __kmp_str_buf_print( & buffer, "\n" );
+
+    // Print internal (effective) settings.
+    __kmp_str_buf_print( & buffer, "%s\n\n", KMP_I18N_STR( EffectiveSettings ) );
+    for ( int i = 0; i < __kmp_stg_count; ++ i ) {
+        if (  __kmp_stg_table[ i ].print != NULL ) {
+            __kmp_stg_table[ i ].print( & buffer, __kmp_stg_table[ i ].name, __kmp_stg_table[ i ].data );
+        }; // if
+    }; // for
+
+    __kmp_printf( "%s", buffer.str );
+
+    __kmp_env_blk_free( & block );
+    __kmp_str_buf_free( & buffer );
+
+    __kmp_printf("\n");
+
+} // __kmp_env_print
+
+
+#if OMP_40_ENABLED
+void
+__kmp_env_print_2() {
+
+    kmp_env_blk_t block;
+    kmp_str_buf_t buffer;
+
+    __kmp_env_format = 1;
+
+    __kmp_stg_init();
+    __kmp_str_buf_init( & buffer );
+
+    __kmp_env_blk_init( & block, NULL );
+    __kmp_env_blk_sort( & block );
+
+    __kmp_str_buf_print( & buffer, "\n%s\n", KMP_I18N_STR( DisplayEnvBegin )  );
+    __kmp_str_buf_print( & buffer, "   _OPENMP='%d'\n", __kmp_openmp_version );
+
+    for ( int i = 0; i < __kmp_stg_count; ++ i ) {
+        if (  __kmp_stg_table[ i ].print != NULL &&
+              ( ( __kmp_display_env && strncmp( __kmp_stg_table[ i ].name, "OMP_", 4 ) == 0 ) || __kmp_display_env_verbose ) ) {
+            __kmp_stg_table[ i ].print( & buffer, __kmp_stg_table[ i ].name, __kmp_stg_table[ i ].data );
+        }; // if
+    }; // for
+
+    __kmp_str_buf_print( & buffer, "%s\n", KMP_I18N_STR( DisplayEnvEnd )  );
+    __kmp_str_buf_print( & buffer, "\n" );
+
+    __kmp_printf( "%s", buffer.str );
+
+    __kmp_env_blk_free( & block );
+    __kmp_str_buf_free( & buffer );
+
+    __kmp_printf("\n");
+
+} // __kmp_env_print_2
+#endif // OMP_40_ENABLED
+
+
+
+// end of file
+
diff --git a/final/runtime/src/kmp_settings.h b/final/runtime/src/kmp_settings.h
new file mode 100644
index 0000000..7232e61
--- /dev/null
+++ b/final/runtime/src/kmp_settings.h
@@ -0,0 +1,50 @@
+/*
+ * kmp_settings.h -- Initialize environment variables
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_SETTINGS_H
+#define KMP_SETTINGS_H
+
+void __kmp_reset_global_vars( void );
+void __kmp_env_initialize( char const * );
+void __kmp_env_print();
+#if OMP_40_ENABLED
+void __kmp_env_print_2();
+#endif // OMP_40_ENABLED
+
+int __kmp_initial_threads_capacity( int req_nproc );
+void __kmp_init_dflt_team_nth();
+int __kmp_convert_to_milliseconds( char const * );
+int __kmp_default_tp_capacity( int, int, int);
+
+#if KMP_MIC
+#define KMP_STR_BUF_PRINT_NAME          __kmp_str_buf_print( buffer, "  %s %s", KMP_I18N_STR(Device), name )
+#define KMP_STR_BUF_PRINT_NAME_EX(x)    __kmp_str_buf_print( buffer, "  %s %s='", KMP_I18N_STR(Device), x )
+#define KMP_STR_BUF_PRINT_BOOL          __kmp_str_buf_print( buffer, "  %s %s='%s'\n", KMP_I18N_STR(Device), name, value ? "TRUE" : "FALSE" );
+#define KMP_STR_BUF_PRINT_INT           __kmp_str_buf_print( buffer, "  %s %s='%d'\n", KMP_I18N_STR(Device), name, value )
+#define KMP_STR_BUF_PRINT_UINT64        __kmp_str_buf_print( buffer, "  %s %s='%" KMP_UINT64_SPEC "'\n", KMP_I18N_STR(Device), name, value );
+#define KMP_STR_BUF_PRINT_STR           __kmp_str_buf_print( buffer, "  %s %s='%s'\n", KMP_I18N_STR(Device), name, value )
+#else
+#define KMP_STR_BUF_PRINT_NAME          __kmp_str_buf_print( buffer, "  %s %s", KMP_I18N_STR(Host), name )
+#define KMP_STR_BUF_PRINT_NAME_EX(x)    __kmp_str_buf_print( buffer, "  %s %s='", KMP_I18N_STR(Host), x )
+#define KMP_STR_BUF_PRINT_BOOL          __kmp_str_buf_print( buffer, "  %s %s='%s'\n", KMP_I18N_STR(Host), name, value ? "TRUE" : "FALSE" );
+#define KMP_STR_BUF_PRINT_INT           __kmp_str_buf_print( buffer, "  %s %s='%d'\n", KMP_I18N_STR(Host), name, value )
+#define KMP_STR_BUF_PRINT_UINT64        __kmp_str_buf_print( buffer, "  %s %s='%" KMP_UINT64_SPEC "'\n", KMP_I18N_STR(Host), name, value );
+#define KMP_STR_BUF_PRINT_STR           __kmp_str_buf_print( buffer, "  %s %s='%s'\n", KMP_I18N_STR(Host), name, value )
+#endif
+
+#endif // KMP_SETTINGS_H
+
+// end of file //
+
diff --git a/final/runtime/src/kmp_stats.cpp b/final/runtime/src/kmp_stats.cpp
new file mode 100644
index 0000000..9750f7b
--- /dev/null
+++ b/final/runtime/src/kmp_stats.cpp
@@ -0,0 +1,615 @@
+/** @file kmp_stats.cpp
+ * Statistics gathering and processing.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#if KMP_STATS_ENABLED
+
+#include "kmp.h"
+#include "kmp_str.h"
+#include "kmp_lock.h"
+#include "kmp_stats.h"
+
+#include <algorithm>
+#include <sstream>
+#include <iomanip>
+#include <stdlib.h>                             // for atexit
+
+#define STRINGIZE2(x) #x
+#define STRINGIZE(x) STRINGIZE2(x)
+
+#define expandName(name,flags,ignore)  {STRINGIZE(name),flags},
+statInfo timeStat::timerInfo[] = {
+    KMP_FOREACH_TIMER(expandName,0)
+    {0,0}
+};
+const statInfo counter::counterInfo[] = {
+    KMP_FOREACH_COUNTER(expandName,0)
+    {0,0}
+};
+#undef expandName
+
+#define expandName(ignore1,ignore2,ignore3)  {0.0,0.0,0.0},
+kmp_stats_output_module::rgb_color kmp_stats_output_module::timerColorInfo[] = {
+    KMP_FOREACH_TIMER(expandName,0)
+    {0.0,0.0,0.0}
+};
+#undef expandName
+
+const kmp_stats_output_module::rgb_color kmp_stats_output_module::globalColorArray[] = {
+    {1.0, 0.0, 0.0}, // red
+    {1.0, 0.6, 0.0}, // orange
+    {1.0, 1.0, 0.0}, // yellow
+    {0.0, 1.0, 0.0}, // green 
+    {0.0, 0.0, 1.0}, // blue
+    {0.6, 0.2, 0.8}, // purple
+    {1.0, 0.0, 1.0}, // magenta
+    {0.0, 0.4, 0.2}, // dark green
+    {1.0, 1.0, 0.6}, // light yellow
+    {0.6, 0.4, 0.6}, // dirty purple
+    {0.0, 1.0, 1.0}, // cyan
+    {1.0, 0.4, 0.8}, // pink
+    {0.5, 0.5, 0.5}, // grey
+    {0.8, 0.7, 0.5}, // brown
+    {0.6, 0.6, 1.0}, // light blue
+    {1.0, 0.7, 0.5}, // peach
+    {0.8, 0.5, 1.0}, // lavender
+    {0.6, 0.0, 0.0}, // dark red
+    {0.7, 0.6, 0.0}, // gold
+    {0.0, 0.0, 0.0}  // black
+};
+
+// Ensure that the atexit handler only runs once.
+static uint32_t statsPrinted = 0;
+
+// output interface
+static kmp_stats_output_module __kmp_stats_global_output;
+
+/* ****************************************************** */
+/* ************* statistic member functions ************* */
+
+void statistic::addSample(double sample)
+{
+    double delta = sample - meanVal;
+
+    sampleCount = sampleCount + 1;
+    meanVal     = meanVal + delta/sampleCount;
+    m2          = m2 + delta*(sample - meanVal);
+
+    minVal = std::min(minVal, sample);
+    maxVal = std::max(maxVal, sample);
+}
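+// The update above is Welford's online algorithm: meanVal carries the running
+// mean and m2 the running sum of squared deviations, so the sample variance is
+// available at any point as m2 / (sampleCount - 1) without a second pass.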
+
+statistic & statistic::operator+= (const statistic & other)
+{
+    if (sampleCount == 0)
+    {
+        *this = other;
+        return *this;
+    }
+
+    uint64_t newSampleCount = sampleCount + other.sampleCount;
+    double dnsc  = double(newSampleCount);
+    double dsc   = double(sampleCount);
+    double dscBydnsc = dsc/dnsc;
+    double dosc  = double(other.sampleCount);
+    double delta = other.meanVal - meanVal;
+
+    // Try to order these calculations to avoid overflows.
+    // If this were Fortran, the compiler would not be able to re-order over brackets.
+    // In C++ it may be legal to do so (we certainly hope the compiler doesn't;
+    // "The C++ Programming Language", 2nd edition, suggests it shouldn't, since
+    // associativity can only be exploited if the operation really is associative,
+    // which floating-point addition isn't...).
+    meanVal     = meanVal*dscBydnsc + other.meanVal*(1-dscBydnsc);
+    m2          = m2 + other.m2 + dscBydnsc*dosc*delta*delta;
+    minVal      = std::min (minVal, other.minVal);
+    maxVal      = std::max (maxVal, other.maxVal);
+    sampleCount = newSampleCount;
+
+
+    return *this;
+}
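+// The above is the standard pairwise combination of two running (mean, m2)
+// pairs (Chan et al.); informally, for disjoint sample sets A and B:
+//     mean(AB) = mean(A)*|A|/|AB| + mean(B)*|B|/|AB|
+//     m2(AB)   = m2(A) + m2(B) + delta^2 * |A|*|B|/|AB|
+// which is what the dscBydnsc arithmetic above computes.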
+
+void statistic::scale(double factor)
+{
+    minVal = minVal*factor;
+    maxVal = maxVal*factor;
+    meanVal= meanVal*factor;
+    m2     = m2*factor*factor;
+    return;
+}
+
+std::string statistic::format(char unit, bool total) const
+{
+    std::string result = formatSI(sampleCount,9,' ');
+
+    result = result + std::string(", ") + formatSI(minVal,  9, unit);
+    result = result + std::string(", ") + formatSI(meanVal, 9, unit);
+    result = result + std::string(", ") + formatSI(maxVal,  9, unit);
+    if (total)
+        result = result + std::string(", ") + formatSI(meanVal*sampleCount, 9, unit);
+    result = result + std::string(", ") + formatSI(getSD(), 9, unit);
+
+    return result;
+}
+
+/* ********************************************************** */
+/* ************* explicitTimer member functions ************* */
+
+void explicitTimer::start(timer_e timerEnumValue) { 
+    startTime = tsc_tick_count::now(); 
+    if(timeStat::logEvent(timerEnumValue)) {
+        __kmp_stats_thread_ptr->incrementNestValue();
+    }
+    return;
+}
+
+void explicitTimer::stop(timer_e timerEnumValue) {
+    if (startTime.getValue() == 0)
+        return;
+
+    tsc_tick_count finishTime = tsc_tick_count::now();
+
+    //stat->addSample ((tsc_tick_count::now() - startTime).ticks());
+    stat->addSample ((finishTime - startTime).ticks());
+
+    if(timeStat::logEvent(timerEnumValue)) {
+        __kmp_stats_thread_ptr->push_event(startTime.getValue() - __kmp_stats_start_time.getValue(), finishTime.getValue() - __kmp_stats_start_time.getValue(), __kmp_stats_thread_ptr->getNestValue(), timerEnumValue); 
+        __kmp_stats_thread_ptr->decrementNestValue();
+    }
+
+    /* We accept the risk that we drop a sample because it really did start at t==0. */
+    startTime = 0; 
+    return;
+}
+
+/* ******************************************************************* */
+/* ************* kmp_stats_event_vector member functions ************* */
+
+void kmp_stats_event_vector::deallocate() {
+    __kmp_free(events);
+    internal_size = 0;
+    allocated_size = 0;
+    events = NULL;
+}
+
+// This function is for qsort(), which requires the compare function to return
+// a negative number if event1 < event2, a positive number if event1 > event2,
+// or zero if event1 == event2.
+// This sorts by start time (lowest to highest).
+int compare_two_events(const void* event1, const void* event2) {
+    kmp_stats_event* ev1 = (kmp_stats_event*)event1;
+    kmp_stats_event* ev2 = (kmp_stats_event*)event2;
+
+    if(ev1->getStart() < ev2->getStart()) return -1;
+    else if(ev1->getStart() > ev2->getStart()) return 1;
+    else return 0;
+}
+
+void kmp_stats_event_vector::sort() {
+    qsort(events, internal_size, sizeof(kmp_stats_event), compare_two_events);
+}
+
+/* *********************************************************** */
+/* ************* kmp_stats_list member functions ************* */
+
+// Returns a pointer to the newly created stats node.
+kmp_stats_list* kmp_stats_list::push_back(int gtid) { 
+    kmp_stats_list* newnode = (kmp_stats_list*)__kmp_allocate(sizeof(kmp_stats_list));
+    // Placement new only needs raw memory and a pointer; it runs the constructor
+    // in place (which is why __kmp_allocate is used instead of C++ new).
+    new (newnode) kmp_stats_list();
+    newnode->setGtid(gtid);
+    newnode->prev = this->prev;
+    newnode->next = this;
+    newnode->prev->next = newnode;
+    newnode->next->prev = newnode;
+    return newnode;
+}
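+// Note: the list is a circular doubly-linked list with *this acting as the
+// sentinel, so push_back() splices the new node in just before the sentinel,
+// i.e. at the tail (begin() and end() below rely on the same structure).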
+void kmp_stats_list::deallocate() {
+    kmp_stats_list* ptr = this->next;
+    kmp_stats_list* delptr = this->next;
+    while(ptr != this) {
+        delptr = ptr;
+        ptr=ptr->next;
+        // placement new means we have to explicitly call destructor.
+        delptr->_event_vector.deallocate();
+        delptr->~kmp_stats_list();
+        __kmp_free(delptr);
+    }
+}
+kmp_stats_list::iterator kmp_stats_list::begin() {
+    kmp_stats_list::iterator it;
+    it.ptr = this->next;
+    return it;
+}
+kmp_stats_list::iterator kmp_stats_list::end() {
+    kmp_stats_list::iterator it;
+    it.ptr = this;
+    return it;
+}
+int kmp_stats_list::size() {
+    int retval;
+    kmp_stats_list::iterator it;
+    for(retval=0, it=begin(); it!=end(); it++, retval++) {}
+    return retval;
+}
+
+/* ********************************************************************* */
+/* ************* kmp_stats_list::iterator member functions ************* */
+
+kmp_stats_list::iterator::iterator() : ptr(NULL) {} 
+kmp_stats_list::iterator::~iterator() {}
+kmp_stats_list::iterator kmp_stats_list::iterator::operator++() {
+    this->ptr = this->ptr->next;
+    return *this;
+}
+kmp_stats_list::iterator kmp_stats_list::iterator::operator++(int dummy) {
+    // Postfix increment: return the iterator as it was before advancing.
+    kmp_stats_list::iterator old = *this;
+    this->ptr = this->ptr->next;
+    return old;
+}
+kmp_stats_list::iterator kmp_stats_list::iterator::operator--() {
+    this->ptr = this->ptr->prev;
+    return *this;
+}
+kmp_stats_list::iterator kmp_stats_list::iterator::operator--(int dummy) {
+    // Postfix decrement: return the iterator as it was before stepping back.
+    kmp_stats_list::iterator old = *this;
+    this->ptr = this->ptr->prev;
+    return old;
+}
+bool kmp_stats_list::iterator::operator!=(const kmp_stats_list::iterator & rhs) {
+   return this->ptr!=rhs.ptr; 
+}
+bool kmp_stats_list::iterator::operator==(const kmp_stats_list::iterator & rhs) {
+   return this->ptr==rhs.ptr; 
+}
+kmp_stats_list* kmp_stats_list::iterator::operator*() const {
+    return this->ptr;
+}
+
+/* *************************************************************** */
+/* *************  kmp_stats_output_module functions ************** */
+
+const char* kmp_stats_output_module::outputFileName = NULL;
+const char* kmp_stats_output_module::eventsFileName = NULL;
+const char* kmp_stats_output_module::plotFileName   = NULL;
+int kmp_stats_output_module::printPerThreadFlag       = 0;
+int kmp_stats_output_module::printPerThreadEventsFlag = 0;
+
+// init() is called very early in execution, from the constructor of __kmp_stats_global_output
+void kmp_stats_output_module::init() 
+{
+    char * statsFileName  = getenv("KMP_STATS_FILE");
+    eventsFileName        = getenv("KMP_STATS_EVENTS_FILE");
+    plotFileName          = getenv("KMP_STATS_PLOT_FILE");
+    char * threadStats    = getenv("KMP_STATS_THREADS");
+    char * threadEvents   = getenv("KMP_STATS_EVENTS");
+
+    // set the stats output filenames based on environment variables and defaults
+    outputFileName = statsFileName;
+    eventsFileName = eventsFileName ? eventsFileName : "events.dat";
+    plotFileName   = plotFileName   ? plotFileName   : "events.plt";
+
+    // set the flags based on environment variables matching: true, on, 1, .true., .t., yes
+    printPerThreadFlag        = __kmp_str_match_true(threadStats);
+    printPerThreadEventsFlag  = __kmp_str_match_true(threadEvents);
+
+    if(printPerThreadEventsFlag) {
+        // assigns a color to each timer for printing
+        setupEventColors();
+    } else {
+        // clear the event flags so that no events will be logged
+        timeStat::clearEventFlags();
+    }
+
+    return;
+}
+
+void kmp_stats_output_module::setupEventColors() {
+    int i;
+    int globalColorIndex = 0;
+    int numGlobalColors = sizeof(globalColorArray) / sizeof(rgb_color);
+    for(i=0;i<TIMER_LAST;i++) {
+        if(timeStat::logEvent((timer_e)i)) {
+            timerColorInfo[i] = globalColorArray[globalColorIndex];
+            globalColorIndex = (globalColorIndex+1)%numGlobalColors;
+        }
+    }
+    return;
+}
+
+void kmp_stats_output_module::printStats(FILE *statsOut, statistic const * theStats, bool areTimers)
+{
+    if (areTimers)
+    {
+        // Check if we have useful timers, since we don't print zero value timers we need to avoid
+        // printing a header and then no data.
+        bool haveTimers = false;
+        for (int s = 0; s<TIMER_LAST; s++)
+        {
+            if (theStats[s].getCount() != 0)
+            {
+                haveTimers = true;
+                break;
+            }
+        }
+        if (!haveTimers)
+            return;
+    }
+
+    // Print
+    const char * title = areTimers ? "Timer,                   SampleCount," : "Counter,                 ThreadCount,";
+    fprintf (statsOut, "%s    Min,      Mean,       Max,     Total,        SD\n", title);    
+    if (areTimers) {
+        for (int s = 0; s<TIMER_LAST; s++) {
+            statistic const * stat = &theStats[s];
+            if (stat->getCount() != 0) {
+                char tag = timeStat::noUnits(timer_e(s)) ? ' ' : 'T';
+                fprintf (statsOut, "%-25s, %s\n", timeStat::name(timer_e(s)), stat->format(tag, true).c_str());
+            }
+        }
+    } else {   // Counters
+        for (int s = 0; s<COUNTER_LAST; s++) {
+            statistic const * stat = &theStats[s];
+            fprintf (statsOut, "%-25s, %s\n", counter::name(counter_e(s)), stat->format(' ', true).c_str());
+        }
+    }
+} 
+
+void kmp_stats_output_module::printCounters(FILE * statsOut, counter const * theCounters)
+{
+    // We print all the counters even if they are zero.
+    // That makes it easier to slice them into a spreadsheet if you need to.
+    fprintf (statsOut, "\nCounter,                    Count\n");
+    for (int c = 0; c<COUNTER_LAST; c++) {
+        counter const * stat = &theCounters[c];
+        fprintf (statsOut, "%-25s, %s\n", counter::name(counter_e(c)), formatSI(stat->getValue(), 9, ' ').c_str());
+    }
+}
+
+void kmp_stats_output_module::printEvents(FILE* eventsOut, kmp_stats_event_vector* theEvents, int gtid) {
+    // sort by start time before printing
+    theEvents->sort();
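+    // Each line written below is: <gtid> <start> <stop> <barwidth> rgb(<r>,<g>,<b>) <timer name>
+    // e.g. (illustrative values only): 0 12345 67890 1.0 rgb(1.0,0.0,0.0) KMP_fork_barrier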
+    for (int i = 0; i < theEvents->size(); i++) {
+        kmp_stats_event ev = theEvents->at(i);
+        rgb_color color = getEventColor(ev.getTimerName());
+        fprintf(eventsOut, "%d %lu %lu %1.1f rgb(%1.1f,%1.1f,%1.1f) %s\n", 
+                gtid, 
+                ev.getStart(), 
+                ev.getStop(), 
+                1.2 - (ev.getNestLevel() * 0.2),
+                color.r, color.g, color.b,
+                timeStat::name(ev.getTimerName())
+               );
+    }
+    return;
+}
+
+void kmp_stats_output_module::windupExplicitTimers()
+{
+    // Wind up any explicit timers. We assume that it's fair at this point to just walk all the explicit timers in all threads
+    // and say "it's over".
+    // If the timer wasn't running, this won't record anything anyway.
+    kmp_stats_list::iterator it;
+    for(it = __kmp_stats_list.begin(); it != __kmp_stats_list.end(); it++) {
+        for (int timer=0; timer<EXPLICIT_TIMER_LAST; timer++) {
+            (*it)->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer);
+        }
+    }
+}
+
+void kmp_stats_output_module::printPloticusFile() {
+    int i;
+    int size = __kmp_stats_list.size();
+    FILE* plotOut = fopen(plotFileName, "w+");
+
+    fprintf(plotOut, "#proc page\n"
+                     "   pagesize: 15 10\n"
+                     "   scale: 1.0\n\n");
+
+    fprintf(plotOut, "#proc getdata\n"
+                     "   file: %s\n\n", 
+                     eventsFileName);
+
+    fprintf(plotOut, "#proc areadef\n"
+                     "   title: OpenMP Sampling Timeline\n"
+                     "   titledetails: align=center size=16\n"
+                     "   rectangle: 1 1 13 9\n"
+                     "   xautorange: datafield=2,3\n"
+                     "   yautorange: -1 %d\n\n", 
+                     size);
+
+    fprintf(plotOut, "#proc xaxis\n"
+                     "   stubs: inc\n"
+                     "   stubdetails: size=12\n"
+                     "   label: Time (ticks)\n"
+                     "   labeldetails: size=14\n\n");
+
+    fprintf(plotOut, "#proc yaxis\n"
+                     "   stubs: inc 1\n"
+                     "   stubrange: 0 %d\n"
+                     "   stubdetails: size=12\n"
+                     "   label: Thread #\n"
+                     "   labeldetails: size=14\n\n", 
+                     size-1);
+
+    fprintf(plotOut, "#proc bars\n"
+                     "   exactcolorfield: 5\n"
+                     "   axis: x\n"
+                     "   locfield: 1\n"
+                     "   segmentfields: 2 3\n"
+                     "   barwidthfield: 4\n\n");
+
+    // create legend entries corresponding to the timer color
+    for(i=0;i<TIMER_LAST;i++) {
+        if(timeStat::logEvent((timer_e)i)) {
+            rgb_color c = getEventColor((timer_e)i);
+            fprintf(plotOut, "#proc legendentry\n"
+                             "   sampletype: color\n"
+                             "   label: %s\n"
+                             "   details: rgb(%1.1f,%1.1f,%1.1f)\n\n",
+                             timeStat::name((timer_e)i),
+                             c.r, c.g, c.b);
+
+        }
+    }
+
+    fprintf(plotOut, "#proc legend\n"
+                     "   format: down\n"
+                     "   location: max max\n\n");
+    fclose(plotOut);
+    return;
+}
+
+void kmp_stats_output_module::outputStats(const char* heading) 
+{
+    statistic allStats[TIMER_LAST];
+    statistic allCounters[COUNTER_LAST];
+
+    // stop all the explicit timers for all threads
+    windupExplicitTimers();
+
+    FILE * eventsOut;
+    FILE * statsOut = outputFileName ? fopen (outputFileName, "a+") : stderr;
+
+    if (eventPrintingEnabled()) {
+        eventsOut = fopen(eventsFileName, "w+");
+    }
+
+    if (!statsOut)
+        statsOut = stderr;
+
+    fprintf(statsOut, "%s\n",heading);
+    // Accumulate across threads.
+    kmp_stats_list::iterator it;
+    for (it = __kmp_stats_list.begin(); it != __kmp_stats_list.end(); it++) {
+        int t = (*it)->getGtid();
+        // Output per thread stats if requested.
+        if (perThreadPrintingEnabled()) {
+            fprintf (statsOut, "Thread %d\n", t);
+            printStats(statsOut, (*it)->getTimers(), true);
+            printCounters(statsOut, (*it)->getCounters());
+            fprintf(statsOut,"\n");
+        }
+        // Output per thread events if requested.
+        if (eventPrintingEnabled()) {
+            kmp_stats_event_vector events = (*it)->getEventVector();
+            printEvents(eventsOut, &events, t);
+        }
+
+        for (int s = 0; s<TIMER_LAST; s++) {
+            // See if we should ignore this timer when aggregating
+            if ((timeStat::masterOnly(timer_e(s)) && (t != 0)) || // Timer is only valid on the master and this thread is a worker
+                (timeStat::workerOnly(timer_e(s)) && (t == 0)) || // Timer is only valid on a worker and this thread is the master
+                timeStat::synthesized(timer_e(s))                 // It's a synthesized stat, so there's no raw data for it.
+               )            
+            {
+                continue;
+            }
+
+            statistic * threadStat = (*it)->getTimer(timer_e(s));
+            allStats[s] += *threadStat;
+        }
+
+        // Special handling for synthesized statistics.
+        // These just have to be coded specially here for now.
+        // At present we only have one: the total parallel work done in each thread.
+        // The variance here makes it easy to see load imbalance over the whole program (though, of course,
+        // it's possible to have code with awful load balance in every parallel region but perfect load
+        // balance over the whole program.)
+        allStats[TIMER_Total_work].addSample ((*it)->getTimer(TIMER_OMP_work)->getTotal());
+
+        // Time waiting for work (synthesized)
+        if ((t != 0) || !timeStat::workerOnly(timer_e(TIMER_OMP_await_work)))
+            allStats[TIMER_Total_await_work].addSample ((*it)->getTimer(TIMER_OMP_await_work)->getTotal());
+
+        // Time in explicit barriers.
+        allStats[TIMER_Total_barrier].addSample ((*it)->getTimer(TIMER_OMP_barrier)->getTotal());
+
+        for (int c = 0; c<COUNTER_LAST; c++) {
+            if (counter::masterOnly(counter_e(c)) && t != 0)
+                continue;
+            allCounters[c].addSample ((*it)->getCounter(counter_e(c))->getValue());
+        }
+    }
+
+    if (eventPrintingEnabled()) {
+        printPloticusFile();
+        fclose(eventsOut);
+    }
+
+    fprintf (statsOut, "Aggregate for all threads\n");
+    printStats (statsOut, &allStats[0], true);
+    fprintf (statsOut, "\n");
+    printStats (statsOut, &allCounters[0], false);
+
+    if (statsOut != stderr)
+        fclose(statsOut);
+
+}
+
+/* ************************************************** */
+/* *************  exported C functions ************** */
+
+// No name mangling for these functions; the C files need to be able to call them.
+extern "C" {
+
+void __kmp_reset_stats()
+{
+    kmp_stats_list::iterator it;
+    for(it = __kmp_stats_list.begin(); it != __kmp_stats_list.end(); it++) {
+        timeStat * timers     = (*it)->getTimers();
+        counter * counters    = (*it)->getCounters();
+        explicitTimer * eTimers = (*it)->getExplicitTimers();
+
+        for (int t = 0; t<TIMER_LAST; t++)
+            timers[t].reset();
+
+        for (int c = 0; c<COUNTER_LAST; c++)
+            counters[c].reset();
+
+        for (int t=0; t<EXPLICIT_TIMER_LAST; t++)
+            eTimers[t].reset();
+
+        // reset the event vector so all previous events are "erased"
+        (*it)->resetEventVector();
+
+        // May need to restart the explicit timers in thread zero?
+    }
+    KMP_START_EXPLICIT_TIMER(OMP_serial);
+    KMP_START_EXPLICIT_TIMER(OMP_start_end);
+}
+
+// This function outputs all stats (stopping any still-running explicit timers first) and then resets them.
+void __kmp_output_stats(const char * heading)
+{
+    __kmp_stats_global_output.outputStats(heading);
+    __kmp_reset_stats();
+}
+
+void __kmp_accumulate_stats_at_exit(void)
+{
+    // Only do this once.
+    if (KMP_XCHG_FIXED32(&statsPrinted, 1) != 0)
+        return;
+
+    __kmp_output_stats("Statistics on exit");
+    return;
+}
+
+void __kmp_stats_init(void) 
+{
+    return;
+}
+
+} // extern "C" 
+
+#endif // KMP_STATS_ENABLED
diff --git a/final/runtime/src/kmp_stats.h b/final/runtime/src/kmp_stats.h
new file mode 100644
index 0000000..9189b80
--- /dev/null
+++ b/final/runtime/src/kmp_stats.h
@@ -0,0 +1,706 @@
+#ifndef KMP_STATS_H
+#define KMP_STATS_H
+
+/** @file kmp_stats.h
+ * Functions for collecting statistics.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#if KMP_STATS_ENABLED
+/*
+ * Statistics accumulator.
+ * Accumulates number of samples and computes min, max, mean, standard deviation on the fly.
+ *
+ * Online variance calculation algorithm from http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
+ */
+
+#include <limits>
+#include <math.h>
+#include <string>
+#include <stdint.h>
+#include <new> // placement new
+#include "kmp_stats_timing.h"
+
+
+/*!
+ * @ingroup STATS_GATHERING
+ * \brief flags to describe the statistic (timer or counter)
+ *
+*/
+class stats_flags_e {
+    public:
+        const static int onlyInMaster = 1<<0; //!< statistic is valid only for master
+        const static int noUnits      = 1<<1; //!< statistic doesn't need units printed next to it in output
+        const static int synthesized  = 1<<2; //!< statistic's value is created at exit time in the __kmp_output_stats function
+        const static int notInMaster  = 1<<3; //!< statistic is valid for non-master threads
+        const static int logEvent     = 1<<4; //!< statistic can be logged when KMP_STATS_EVENTS is on (valid only for timers)
+};
+
+/*!
+ * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
+ *
+ * @param macro a user defined macro that takes three arguments - macro(COUNTER_NAME, flags, arg)
+ * @param arg a user defined argument to send to the user defined macro
+ *
+ * \details A counter counts the occurrence of some event.
+ * Each thread accumulates its own count; at the end of execution the counts are aggregated, treating each thread
+ * as a separate measurement. (Unless onlyInMaster is set, in which case there's only a single measurement).
+ * The min, mean, max are therefore the values over the threads.
+ * Adding the counter here and then putting a KMP_COUNT_BLOCK(name) in the code is all you need to do (see the sketch below the macro).
+ * All of the tables and printing are generated from this macro.
+ * Format is "macro(name, flags, arg)"
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_FOREACH_COUNTER(macro, arg)                         \
+    macro (OMP_PARALLEL, stats_flags_e::onlyInMaster, arg)      \
+    macro (OMP_FOR_static, 0, arg)                              \
+    macro (OMP_FOR_dynamic, 0, arg)                             \
+    macro (OMP_DISTR_FOR_static, 0, arg)                        \
+    macro (OMP_DISTR_FOR_dynamic, 0, arg)                       \
+    macro (OMP_BARRIER, 0, arg)                                 \
+    macro (OMP_CRITICAL,0, arg)                                 \
+    macro (OMP_SINGLE, 0, arg)                                  \
+    macro (OMP_MASTER, 0, arg)                                  \
+    macro (OMP_set_lock, 0, arg)                                \
+    macro (OMP_test_lock, 0, arg)                               \
+    macro (OMP_test_lock_failure, 0, arg)                       \
+    macro (REDUCE_wait, 0, arg)                                 \
+    macro (REDUCE_nowait, 0, arg)                               \
+    macro (LAST,0,arg)
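+
+// For example (illustrative only, MY_EVENT is not an existing counter): adding
+//     macro (MY_EVENT, 0, arg)
+// to the list above and placing KMP_COUNT_BLOCK(MY_EVENT) at the point of
+// interest is all that is needed; the COUNTER_MY_EVENT enum value, the name
+// table entry and the printing all fall out of this macro.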
+
+/*!
+ * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
+ *
+ * @param macro a user defined macro that takes three arguments - macro(TIMER_NAME, flags, arg)
+ * @param arg a user defined argument to send to the user defined macro
+ *
+ * \details A timer collects multiple samples of some count in each thread and then finally aggregates them over all the threads.
+ * The count is normally a time (in ticks), hence the name "timer". (But it can be any value, so we also use this for "number of arguments
+ * passed to fork", and could equally collect "loop iteration count" if we wanted to.)
+ * For timers the individual threads are not significant; it's the individual observations that count, so the statistics are at that level
+ * (see the sketch below the macro).
+ * Format is "macro(name, flags, arg)"
+ *
+ * @ingroup STATS_GATHERING
+ */
+#define KMP_FOREACH_TIMER(macro, arg)                                       \
+    macro (OMP_PARALLEL_args, stats_flags_e::onlyInMaster | stats_flags_e::noUnits, arg) \
+    macro (FOR_static_iterations, stats_flags_e::onlyInMaster | stats_flags_e::noUnits, arg) \
+    macro (FOR_dynamic_iterations, stats_flags_e::noUnits, arg)         \
+    macro (OMP_start_end, stats_flags_e::onlyInMaster, arg)             \
+    macro (OMP_serial, stats_flags_e::onlyInMaster, arg)                \
+    macro (OMP_work, 0, arg)                                            \
+    macro (Total_work, stats_flags_e::synthesized, arg)                 \
+    macro (OMP_await_work, stats_flags_e::notInMaster, arg)             \
+    macro (Total_await_work, stats_flags_e::synthesized, arg)           \
+    macro (OMP_barrier, 0, arg)                                         \
+    macro (Total_barrier, stats_flags_e::synthesized, arg)              \
+    macro (OMP_test_lock, 0, arg)                                       \
+    macro (FOR_static_scheduling, 0, arg)                               \
+    macro (FOR_dynamic_scheduling, 0, arg)                              \
+    macro (KMP_fork_call, 0, arg) \
+    macro (KMP_join_call, 0, arg) \
+    macro (KMP_fork_barrier, stats_flags_e::logEvent, arg)              \
+    macro (KMP_join_barrier, stats_flags_e::logEvent, arg)              \
+    macro (KMP_barrier, 0, arg)                   \
+    macro (KMP_end_split_barrier, 0, arg) \
+    macro (KMP_wait_sleep, 0, arg) \
+    macro (KMP_release, 0, arg)                   \
+    macro (KMP_hier_gather, 0, arg) \
+    macro (KMP_hier_release, 0, arg) \
+    macro (KMP_hyper_gather,  stats_flags_e::logEvent, arg) \
+    macro (KMP_hyper_release,  stats_flags_e::logEvent, arg) \
+    macro (KMP_linear_gather, 0, arg)                                   \
+    macro (KMP_linear_release, 0, arg)                                  \
+    macro (KMP_tree_gather, 0, arg)                                     \
+    macro (KMP_tree_release, 0, arg)                                    \
+    macro (USER_master_invoke, stats_flags_e::logEvent, arg) \
+    macro (USER_worker_invoke, stats_flags_e::logEvent, arg) \
+    macro (USER_resume, stats_flags_e::logEvent, arg) \
+    macro (USER_suspend, stats_flags_e::logEvent, arg) \
+    macro (USER_launch_thread_loop, stats_flags_e::logEvent, arg) \
+    macro (KMP_allocate_team, 0, arg) \
+    macro (KMP_setup_icv_copy, 0, arg) \
+    macro (USER_icv_copy, 0, arg) \
+    macro (LAST,0, arg)
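+
+// For example, OMP_PARALLEL_args is a "timer" whose samples are argument
+// counts rather than times (hence the noUnits flag); a sketch of its use:
+//     KMP_COUNT_VALUE(OMP_PARALLEL_args, nargs);   // nargs: arguments passed to this fork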
+
+
+
+// OMP_PARALLEL_args      -- the number of arguments passed to a fork
+// FOR_static_iterations  -- Number of available parallel chunks of work in a static for
+// FOR_dynamic_iterations -- Number of available parallel chunks of work in a dynamic for
+//                           Both adjust for any chunking, so if there were an iteration count of 20 but a chunk size of 10, we'd record 2.
+// OMP_serial             -- thread zero time executing serial code
+// OMP_start_end          -- time from when OpenMP is initialized until the stats are printed at exit
+// OMP_work               -- elapsed time in code dispatched by a fork (measured in the thread)
+// Total_work             -- a synthesized statistic summarizing how much parallel work each thread executed.
+// OMP_barrier            -- time at "real" barriers
+// Total_barrier          -- a synthesized statistic summarizing how much time at real barriers in each thread
+// OMP_set_lock           -- time in lock setting
+// OMP_test_lock          -- time in testing a lock
+// LOCK_WAIT              -- time waiting for a lock
+// FOR_static_scheduling  -- time spent doing scheduling for a static "for"
+// FOR_dynamic_scheduling -- time spent doing scheduling for a dynamic "for"
+// KMP_wait_sleep         -- time in __kmp_wait_sleep
+// KMP_release            -- time in __kmp_release
+// KMP_fork_barrier       -- time in __kmp_fork_barrier
+// KMP_join_barrier       -- time in __kmp_join_barrier
+// KMP_barrier            -- time in __kmp_barrier
+// KMP_end_split_barrier  -- time in __kmp_end_split_barrier
+// KMP_setup_icv_copy     -- time in __kmp_setup_icv_copy
+// KMP_icv_copy           -- start/stop timer for any ICV copying
+// KMP_linear_gather      -- time in __kmp_linear_barrier_gather
+// KMP_linear_release     -- time in __kmp_linear_barrier_release
+// KMP_tree_gather        -- time in __kmp_tree_barrier_gather
+// KMP_tree_release       -- time in __kmp_tree_barrier_release
+// KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
+// KMP_hyper_release      -- time in __kmp_hyper_barrier_release
+
+/*!
+ * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
+ *
+ * @param macro a user defined macro that takes three arguments - macro(TIMER_NAME, flags, arg)
+ * @param arg a user defined argument to send to the user defined macro
+ *
+ * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE BAD THINGS WILL HAPPEN!
+ *
+ * \details Explicit timers are ones where we need to allocate a timer itself (as well as the accumulated timing statistics).
+ * We allocate these on a per-thread basis, and explicitly start and stop them.
+ * Block timers just allocate the timer itself on the stack, and use the destructor to notice block exit; they don't
+ * need to be defined here.
+ * The name here should be the same as that of a timer above.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)  \
+    macro(OMP_serial, 0, arg)                   \
+    macro(OMP_start_end, 0, arg)                \
+    macro(USER_icv_copy, 0, arg) \
+    macro(USER_launch_thread_loop, stats_flags_e::logEvent, arg) \
+    macro(LAST, 0, arg)
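+
+// A sketch of explicit timer use (OMP_serial is started/stopped around serial
+// code in thread zero; the exact call sites are elsewhere in the runtime):
+//     KMP_START_EXPLICIT_TIMER(OMP_serial);
+//     /* ... serial code ... */
+//     KMP_STOP_EXPLICIT_TIMER(OMP_serial);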
+
+#define ENUMERATE(name,ignore,prefix) prefix##name,
+enum timer_e {
+    KMP_FOREACH_TIMER(ENUMERATE, TIMER_)
+};
+
+enum explicit_timer_e {
+    KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_)
+};
+
+enum counter_e {
+    KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_)
+};
+#undef ENUMERATE
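+// ENUMERATE pastes the prefix onto each name, so timer_e contains
+// TIMER_OMP_PARALLEL_args, ..., TIMER_LAST, counter_e contains
+// COUNTER_OMP_PARALLEL, ..., COUNTER_LAST, and similarly for
+// explicit_timer_e with the EXPLICIT_TIMER_ prefix.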
+
+class statistic
+{
+    double   minVal;
+    double   maxVal;
+    double   meanVal;
+    double   m2;
+    uint64_t sampleCount;
+
+ public:
+    statistic() { reset(); }
+    statistic (statistic const &o): minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2), sampleCount(o.sampleCount) {}
+
+    double   getMin()   const { return minVal; }
+    double   getMean()  const { return meanVal; }
+    double   getMax()   const { return maxVal; }
+    uint64_t getCount() const { return sampleCount; }
+    double   getSD()    const { return sqrt(m2/sampleCount); }
+    double   getTotal() const { return sampleCount*meanVal; }
+
+    void reset()
+    {
+        minVal =  std::numeric_limits<double>::max();
+        maxVal = -std::numeric_limits<double>::max();
+        meanVal= 0.0;
+        m2     = 0.0;
+        sampleCount = 0;
+    }
+    void addSample(double sample);
+    void scale    (double factor);
+    void scaleDown(double f)  { scale (1./f); }
+    statistic & operator+= (statistic const & other);
+
+    std::string format(char unit, bool total=false) const;
+};
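+
+// addSample() (defined in the .cpp) implements the on-line (Welford) update
+// referenced above; roughly (a sketch, see the definition for the exact code):
+//     delta    = sample - meanVal;
+//     meanVal += delta / ++sampleCount;
+//     m2      += delta * (sample - meanVal);
+// so getSD() can return sqrt(m2/sampleCount) without storing the raw samples.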
+
+struct statInfo
+{
+    const char * name;
+    uint32_t     flags;
+};
+
+class timeStat : public statistic
+{
+    static statInfo timerInfo[];
+
+ public:
+    timeStat() : statistic() {}
+    static const char * name(timer_e e) { return timerInfo[e].name; }
+    static bool  masterOnly (timer_e e) { return timerInfo[e].flags & stats_flags_e::onlyInMaster; }
+    static bool  workerOnly (timer_e e) { return timerInfo[e].flags & stats_flags_e::notInMaster;  }
+    static bool  noUnits    (timer_e e) { return timerInfo[e].flags & stats_flags_e::noUnits;      }
+    static bool  synthesized(timer_e e) { return timerInfo[e].flags & stats_flags_e::synthesized;  }
+    static bool  logEvent   (timer_e e) { return timerInfo[e].flags & stats_flags_e::logEvent;     }
+    static void  clearEventFlags()      {
+        int i;
+        for(i=0;i<TIMER_LAST;i++) {
+            timerInfo[i].flags &= (~(stats_flags_e::logEvent));
+        }
+    }
+};
+
+// Where we need to start and end the timer explicitly, this version can be used.
+// Since these timers normally aren't nicely scoped, they don't have a good place to live
+// on the stack of the thread, and are therefore more work to use.
+class explicitTimer
+{
+    timeStat * stat;
+    tsc_tick_count startTime;
+
+ public:
+    explicitTimer () : stat(0), startTime(0) { }
+    explicitTimer (timeStat * s) : stat(s), startTime() { }
+
+    void setStat (timeStat *s) { stat = s; }
+    void start(timer_e timerEnumValue);
+    void stop(timer_e timerEnumValue);
+    void reset() { startTime = 0; }
+};
+
+// Where all you need is to time a block, this is enough.
+// (It avoids the need to have an explicit end, leaving the scope suffices.)
+class blockTimer : public explicitTimer
+{
+    timer_e timerEnumValue;
+ public:
+    blockTimer (timeStat * s, timer_e newTimerEnumValue) : explicitTimer(s), timerEnumValue(newTimerEnumValue) { start(timerEnumValue); }
+    ~blockTimer() { stop(timerEnumValue); }
+};
+
+// If all you want is a count, then you can use this...
+// The individual per-thread counts will be aggregated into a statistic at program exit.
+class counter
+{
+    uint64_t value;
+    static const statInfo counterInfo[];
+
+ public:
+    counter() : value(0) {}
+    void increment() { value++; }
+    uint64_t getValue() const { return value; }
+    void reset() { value = 0; }
+    static const char * name(counter_e e) { return counterInfo[e].name; }
+    static bool  masterOnly (counter_e e) { return counterInfo[e].flags & stats_flags_e::onlyInMaster; }
+};
+
+/* ****************************************************************
+    Class to implement an event
+
+    There are four components to an event: start time, stop time,
+    nest_level, and timer_name.
+    The start and stop time should be obvious (recorded in clock ticks).
+    The nest_level relates to the bar width in the timeline graph.
+    The timer_name is used to determine which timer event triggered this event.
+
+    The interface to this class is through four read-only operations:
+    1) getStart()     -- returns the start time as 64 bit integer
+    2) getStop()      -- returns the stop time as 64 bit integer
+    3) getNestLevel() -- returns the nest level of the event
+    4) getTimerName() -- returns the timer name that triggered event
+
+    *MORE ON NEST_LEVEL*
+    The nest level is used in the bar graph that represents the timeline.
+    Its main purpose is to show how events are nested inside each other.
+    For example, say events A, B, and C are recorded.  If the timeline
+    looks like this:
+
+Begin -------------------------------------------------------------> Time
+         |    |          |        |          |              |
+         A    B          C        C          B              A
+       start start     start     end        end            end
+
+       Then A, B, C will have a nest level of 1, 2, 3 respectively.
+       These values are then used to calculate the barwidth so you can
+       see that inside A, B has occurred, and inside B, C has occurred.
+       Currently, this is shown with A's bar width being larger than B's
+       bar width, and B's bar width being larger than C's bar width.
+
+**************************************************************** */
+class kmp_stats_event {
+    uint64_t start;
+    uint64_t stop;
+    int nest_level;
+    timer_e timer_name;
+ public:
+    kmp_stats_event() : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
+    kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme) : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
+    inline uint64_t  getStart() const { return start; }
+    inline uint64_t  getStop() const  { return stop;  }
+    inline int       getNestLevel() const { return nest_level; }
+    inline timer_e   getTimerName() const { return timer_name; }
+};
+
+/* ****************************************************************
+    Class to implement a dynamically expandable array of events
+
+    ---------------------------------------------------------
+    | event 1 | event 2 | event 3 | event 4 | ... | event N |
+    ---------------------------------------------------------
+
+    An event is pushed onto the back of this array at every
+    explicitTimer->stop() call.  The event records the thread #,
+    start time, stop time, and nest level related to the bar width.
+
+    The event vector starts at size INIT_SIZE and grows (doubles in size)
+    if needed.  An implication of this behavior is that log(N)
+    reallocations are needed (where N is number of events).  If you want
+    to avoid reallocations, then set INIT_SIZE to a large value.
+
+    The interface to this class is through six operations:
+    1) reset() -- sets the internal_size back to 0 but does not deallocate any memory
+    2) size()  -- returns the number of valid elements in the vector
+    3) push_back(start, stop, nest, timer_name) -- pushes an event onto
+                                                   the back of the array
+    4) deallocate() -- frees all memory associated with the vector
+    5) sort() -- sorts the vector by start time
+    6) operator[index] or at(index) -- returns event reference at that index
+
+**************************************************************** */
+class kmp_stats_event_vector {
+    kmp_stats_event* events;
+    int internal_size;
+    int allocated_size;
+    static const int INIT_SIZE = 1024;
+ public:
+    kmp_stats_event_vector() {
+        events = (kmp_stats_event*)__kmp_allocate(sizeof(kmp_stats_event)*INIT_SIZE);
+        internal_size = 0;
+        allocated_size = INIT_SIZE;
+    }
+   ~kmp_stats_event_vector() {}
+    inline void reset() { internal_size = 0; }
+    inline int  size() const { return internal_size; }
+    void push_back(uint64_t start_time, uint64_t stop_time, int nest_level, timer_e name) {
+        int i;
+        if(internal_size == allocated_size) {
+            kmp_stats_event* tmp = (kmp_stats_event*)__kmp_allocate(sizeof(kmp_stats_event)*allocated_size*2);
+            for(i=0;i<internal_size;i++) tmp[i] = events[i];
+            __kmp_free(events);
+            events = tmp;
+            allocated_size*=2;
+        }
+        events[internal_size] = kmp_stats_event(start_time, stop_time, nest_level, name);
+        internal_size++;
+        return;
+    }
+    void deallocate();
+    void sort();
+    const kmp_stats_event & operator[](int index) const { return events[index]; }
+          kmp_stats_event & operator[](int index) { return events[index]; }
+    const kmp_stats_event & at(int index) const { return events[index]; }
+          kmp_stats_event & at(int index) { return events[index]; }
+};
+
+/* ****************************************************************
+    Class to implement a doubly-linked, circular, statistics list
+
+    |---| ---> |---| ---> |---| ---> |---| ---> ... next
+    |   |      |   |      |   |      |   |
+    |---| <--- |---| <--- |---| <--- |---| <--- ... prev
+    Sentinel   first      second     third
+    Node       node       node       node
+
+    The Sentinel Node is the user handle on the list.
+    The first node corresponds to thread 0's statistics.
+    The second node corresponds to thread 1's statistics and so on...
+
+    Each node has a _timers, _counters, and _explicitTimers array to
+    hold that thread's statistics.  The _explicitTimers
+    point to the correct _timer and update its statistics at every stop() call.
+    The explicitTimers' pointers are set up in the constructor.
+    Each node also has an event vector to hold that thread's timing events.
+    The event vector expands as necessary and records the start-stop times
+    for each timer.
+
+    The nestLevel variable is for plotting events and is related
+    to the bar width in the timeline graph.
+
+    Every thread will have a __thread local pointer to its node in
+    the list.  The sentinel node is used by the master thread to
+    store "dummy" statistics before __kmp_create_worker() is called.
+
+**************************************************************** */
+class kmp_stats_list {
+    int gtid;
+    timeStat      _timers[TIMER_LAST+1];
+    counter       _counters[COUNTER_LAST+1];
+    explicitTimer _explicitTimers[EXPLICIT_TIMER_LAST+1];
+    int           _nestLevel; // one per thread
+    kmp_stats_event_vector _event_vector;
+    kmp_stats_list* next;
+    kmp_stats_list* prev;
+ public:
+    kmp_stats_list() : _nestLevel(0), _event_vector(), next(this), prev(this) {
+#define doInit(name,ignore1,ignore2) \
+        getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name));
+        KMP_FOREACH_EXPLICIT_TIMER(doInit,0);
+#undef doInit
+    }
+   ~kmp_stats_list() { }
+    inline timeStat *      getTimer(timer_e idx)                  { return &_timers[idx]; }
+    inline counter  *      getCounter(counter_e idx)              { return &_counters[idx]; }
+    inline explicitTimer * getExplicitTimer(explicit_timer_e idx) { return &_explicitTimers[idx]; }
+    inline timeStat *      getTimers()                            { return _timers; }
+    inline counter  *      getCounters()                          { return _counters; }
+    inline explicitTimer * getExplicitTimers()                    { return _explicitTimers; }
+    inline kmp_stats_event_vector & getEventVector()              { return _event_vector; }
+    inline void resetEventVector()                                { _event_vector.reset(); }
+    inline void incrementNestValue()                              { _nestLevel++; }
+    inline int  getNestValue()                                    { return _nestLevel; }
+    inline void decrementNestValue()                              { _nestLevel--; }
+    inline int  getGtid() const                                   { return gtid; }
+    inline void setGtid(int newgtid)                              { gtid = newgtid; }
+    kmp_stats_list* push_back(int gtid); // returns newly created list node
+    inline void     push_event(uint64_t start_time, uint64_t stop_time, int nest_level, timer_e name) {
+        _event_vector.push_back(start_time, stop_time, nest_level, name);
+    }
+    void deallocate();
+    class iterator;
+    kmp_stats_list::iterator begin();
+    kmp_stats_list::iterator end();
+    int size();
+    class iterator {
+        kmp_stats_list* ptr;
+        friend kmp_stats_list::iterator kmp_stats_list::begin();
+        friend kmp_stats_list::iterator kmp_stats_list::end();
+      public:
+        iterator();
+       ~iterator();
+        iterator operator++();
+        iterator operator++(int dummy);
+        iterator operator--();
+        iterator operator--(int dummy);
+        bool operator!=(const iterator & rhs);
+        bool operator==(const iterator & rhs);
+        kmp_stats_list* operator*() const; // dereference operator
+    };
+};
+
+/* ****************************************************************
+   Class to encapsulate all output functions and the environment variables
+
+   This module holds filenames for various outputs (normal stats, events, plot file),
+   as well as coloring information for the plot file.
+
+   The filenames and flags variables are read from environment variables.
+   These are read once by the constructor of the global variable __kmp_stats_output
+   which calls init().
+
+   During this init() call, event flags for the timeStat::timerInfo[] global array
+   are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
+
+   The only public interface function is outputStats(heading).  This function
+   prints out everything it needs to, either to files or to stderr,
+   depending on the environment variables described below.
+
+   ENVIRONMENT VARIABLES:
+   KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this file,
+                     otherwise, print to stderr
+   KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to either
+                        KMP_STATS_FILE or stderr
+   KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
+                          otherwise, the plot file is sent to "events.plt"
+   KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log events
+   KMP_STATS_EVENTS_FILE -- if set, all events are written to this file,
+                            otherwise, output is sent to "events.dat"
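+
+   Example (a sketch; "my_omp_app" stands for any program linked against this runtime):
+       $ KMP_STATS_FILE=stats.txt KMP_STATS_THREADS=true ./my_omp_app
+       $ KMP_STATS_EVENTS=true KMP_STATS_EVENTS_FILE=run.dat KMP_STATS_PLOT_FILE=run.plt ./my_omp_app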
+
+**************************************************************** */
+class kmp_stats_output_module {
+
+ public:
+    struct rgb_color {
+        float r;
+        float g;
+        float b;
+    };
+
+ private:
+    static const char* outputFileName;
+    static const char* eventsFileName;
+    static const char* plotFileName;
+    static int printPerThreadFlag;
+    static int printPerThreadEventsFlag;
+    static const rgb_color globalColorArray[];
+    static       rgb_color timerColorInfo[];
+
+    void init();
+    static void setupEventColors();
+    static void printPloticusFile();
+    static void printStats(FILE *statsOut, statistic const * theStats, bool areTimers);
+    static void printCounters(FILE * statsOut, counter const * theCounters);
+    static void printEvents(FILE * eventsOut, kmp_stats_event_vector* theEvents, int gtid);
+    static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
+    static void windupExplicitTimers();
+    bool eventPrintingEnabled()     { return printPerThreadEventsFlag != 0; }
+    bool perThreadPrintingEnabled() { return printPerThreadFlag != 0; }
+
+ public:
+    kmp_stats_output_module() { init(); }
+    void outputStats(const char* heading);
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void __kmp_stats_init();
+void __kmp_reset_stats();
+void __kmp_output_stats(const char *);
+void __kmp_accumulate_stats_at_exit(void);
+// thread local pointer to stats node within list
+extern __thread kmp_stats_list* __kmp_stats_thread_ptr;
+// head to stats list.
+extern kmp_stats_list __kmp_stats_list;
+// lock for __kmp_stats_list
+extern kmp_tas_lock_t  __kmp_stats_lock;
+// reference start time
+extern tsc_tick_count __kmp_stats_start_time;
+// interface to output
+extern kmp_stats_output_module __kmp_stats_output;
+
+#ifdef __cplusplus
+}
+#endif
+
+// Simple, standard interfaces that drop out completely if stats aren't enabled
+
+
+/*!
+ * \brief Uses specified timer (name) to time code block.
+ *
+ * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
+ *
+ * \details Use KMP_TIME_BLOCK(name) macro to time a code block.  This will record the time taken in the block
+ * and use the destructor to stop the timer.  Convenient!
+ * With this definition you can't have more than one KMP_TIME_BLOCK in the same code block.
+ * I don't think that's a problem.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_TIME_BLOCK(name) \
+    blockTimer __BLOCKTIME__(__kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)
+
+/*!
+ * \brief Adds value to specified timer (name).
+ *
+ * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
+ * @param value double precision sample value to add to statistics for the timer
+ *
+ * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to a timer statistics.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_COUNT_VALUE(name, value) \
+    __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value)
+
+/*!
+ * \brief Increments specified counter (name).
+ *
+ * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
+ *
+ * \details Use the KMP_COUNT_BLOCK(name) macro to increment a statistics counter for the executing thread.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_COUNT_BLOCK(name) \
+   __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
+
+/*!
+ * \brief "Starts" an explicit timer which will need a corresponding KMP_STOP_EXPLICIT_TIMER() macro.
+ *
+ * @param name explicit timer name as specified under the KMP_FOREACH_EXPLICIT_TIMER() macro
+ *
+ * \details Use to start a timer.  This will need a corresponding KMP_STOP_EXPLICIT_TIMER()
+ * macro to stop the timer, unlike the KMP_TIME_BLOCK(name) macro, which stops implicitly at the end
+ * of the code block.  All explicit timers are stopped at library exit time, before the final statistics are printed.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_START_EXPLICIT_TIMER(name) \
+    __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name)->start(TIMER_##name)
+
+/*!
+ * \brief "Stops" an explicit timer.
+ *
+ * @param name explicit timer name as specified under the KMP_FOREACH_EXPLICIT_TIMER() macro
+ *
+ * \details Use KMP_STOP_EXPLICIT_TIMER(name) to stop a timer.  When this is done, the time between the last KMP_START_EXPLICIT_TIMER(name)
+ * and this KMP_STOP_EXPLICIT_TIMER(name) will be added to the timer's stat value.  The timer will then be reset.
+ * After the KMP_STOP_EXPLICIT_TIMER(name) macro is called, another call to KMP_START_EXPLICIT_TIMER(name) will start the timer once again.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_STOP_EXPLICIT_TIMER(name) \
+    __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name)->stop(TIMER_##name)
+
+/*!
+ * \brief Outputs the current thread statistics and reset them.
+ *
+ * @param heading_string heading put above the final stats output
+ *
+ * \details Explicitly stops all timers and outputs all stats.
+ * The environment variable `KMP_STATS_FILE=filename` can be used to send the stats to a file instead of stderr.
+ * The environment variable `KMP_STATS_THREADS=true` can be used to print thread-specific stats as well;
+ * if it is undefined (or does not match a true value) only the aggregated stats are printed.
+ * It should be noted that all statistics are reset when this macro is called.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_OUTPUT_STATS(heading_string) \
+    __kmp_output_stats(heading_string)
+
+/*!
+ * \brief Resets all stats (counters to 0, timers to 0 elapsed ticks)
+ *
+ * \details Reset all stats for all threads.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_RESET_STATS()  __kmp_reset_stats()
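+
+// A combined usage sketch (hypothetical function, not part of the runtime):
+//     void measured_region(int nargs)
+//     {
+//         KMP_TIME_BLOCK(KMP_fork_call);              // stops at end of scope
+//         KMP_COUNT_BLOCK(OMP_PARALLEL);              // one more parallel region seen
+//         KMP_COUNT_VALUE(OMP_PARALLEL_args, nargs);  // a non-time sample
+//         /* ... work ... */
+//     }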
+
+#else // KMP_STATS_ENABLED
+
+// Null definitions
+#define KMP_TIME_BLOCK(n)             ((void)0)
+#define KMP_COUNT_VALUE(n,v)          ((void)0)
+#define KMP_COUNT_BLOCK(n)            ((void)0)
+#define KMP_START_EXPLICIT_TIMER(n)   ((void)0)
+#define KMP_STOP_EXPLICIT_TIMER(n)    ((void)0)
+
+#define KMP_OUTPUT_STATS(heading_string) ((void)0)
+#define KMP_RESET_STATS()  ((void)0)
+
+#endif  // KMP_STATS_ENABLED
+
+#endif // KMP_STATS_H
diff --git a/final/runtime/src/kmp_stats_timing.cpp b/final/runtime/src/kmp_stats_timing.cpp
new file mode 100644
index 0000000..0826367
--- /dev/null
+++ b/final/runtime/src/kmp_stats_timing.cpp
@@ -0,0 +1,167 @@
+/** @file kmp_stats_timing.cpp
+ * Timing functions
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+
+#include "kmp_stats_timing.h"
+
+using namespace std;
+
+#if KMP_OS_LINUX
+# if KMP_MIC
+double tsc_tick_count::tick_time()
+{
+    // pretty bad assumption of 1GHz clock for MIC
+    return 1/((double)1000*1.e6);
+}
+# else
+#  include <string.h>
+// Extract the value from the CPUID information
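+// e.g. a brand string ending in "... @ 2.50GHz" yields freq = 2.50 and
+// multiplier = 1e9, so tick_time() returns 1/2.5e9 seconds per tick.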
+double tsc_tick_count::tick_time()
+{
+    static double result = 0.0;
+
+    if (result == 0.0)
+    {
+        int cpuinfo[4];
+        char brand[256];
+
+        __cpuid(cpuinfo, 0x80000000);
+        memset(brand, 0, sizeof(brand));
+        int ids = cpuinfo[0];
+
+        for (unsigned int i=2; i<(ids^0x80000000)+2; i++)
+            __cpuid(brand+(i-2)*sizeof(cpuinfo), i | 0x80000000);
+
+        char * start = &brand[0];
+        for (;*start == ' '; start++)
+            ;
+    
+        char * end = brand + KMP_STRLEN(brand) - 3;
+        uint64_t multiplier;
+
+        if (*end == 'M') multiplier = 1000LL*1000LL;
+        else if (*end == 'G') multiplier = 1000LL*1000LL*1000LL;
+        else if (*end == 'T') multiplier = 1000LL*1000LL*1000LL*1000LL;
+        else 
+        {
+            cout << "Error determining multiplier '" << *end << "'\n";
+            exit (-1);
+        }
+        *end = 0;
+        while (*end != ' ') end--;
+        end++;
+    
+        double freq = strtod(end, &start);
+        if (freq == 0.0) 
+        {
+            cout << "Error calculating frequency " <<  end << "\n";
+            exit (-1);
+        }
+
+        result = ((double)1.0)/(freq * multiplier);
+    }
+    return result;
+}
+# endif
+#endif
+
+static bool useSI = true;
+
+// Return a formatted string after normalising the value into
+// engineering style and using a suitable unit prefix (e.g. ms, us, ns).
+std::string formatSI(double interval, int width, char unit)
+{
+    std::stringstream os;
+
+    if (useSI)
+    {
+        // Preserve accuracy for small numbers, since we only multiply and the positive powers
+        // of ten are precisely representable. 
+        static struct { double scale; char prefix; } ranges[] = {
+            {1.e12,'f'},
+            {1.e9, 'p'},
+            {1.e6, 'n'},
+            {1.e3, 'u'},
+            {1.0,  'm'},
+            {1.e-3,' '},
+            {1.e-6,'k'},
+            {1.e-9,'M'},
+            {1.e-12,'G'},
+            {1.e-15,'T'},
+            {1.e-18,'P'},
+            {1.e-21,'E'},
+            {1.e-24,'Z'},
+            {1.e-27,'Y'}
+        };
+        
+        if (interval == 0.0)
+        {
+            os << std::setw(width-3) << std::right << "0.00" << std::setw(3) << unit;
+            return os.str();
+        }
+
+        bool negative = false;
+        if (interval < 0.0)
+        {
+            negative = true;
+            interval = -interval;
+        }
+        
+        for (int i=0; i<(int)(sizeof(ranges)/sizeof(ranges[0])); i++)
+        {
+            if (interval*ranges[i].scale < 1.e0)
+            {
+                interval = interval * 1000.e0 * ranges[i].scale;
+                os << std::fixed << std::setprecision(2) << std::setw(width-3) << std::right << 
+                    (negative ? -interval : interval) << std::setw(2) << ranges[i].prefix << std::setw(1) << unit;
+
+                return os.str();
+            }
+        }
+    }
+    os << std::setprecision(2) << std::fixed << std::right << std::setw(width-3) << interval << std::setw(3) << unit;
+
+    return os.str();
+}
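+
+// For example (illustrative): formatSI(1.2e-4, 10, 'S') selects the 'u'
+// prefix and returns roughly " 120.00 uS", i.e. 120 microseconds.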
+
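+// Reduce an array of timePairs to the interval from the latest start
+// ("last in") to the latest end ("last out") across all the pairs.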
+tsc_tick_count::tsc_interval_t computeLastInLastOutInterval(timePair * times, int nTimes)
+{
+    timePair lastTimes = times[0];
+    tsc_tick_count * startp = lastTimes.get_startp();
+    tsc_tick_count * endp   = lastTimes.get_endp();
+
+    for (int i=1; i<nTimes; i++)
+    {
+       (*startp) = startp->later(times[i].get_start());
+       (*endp)   = endp->later  (times[i].get_end());
+    }
+
+    return lastTimes.duration();
+}
+
+std::string timePair::format() const
+{
+    std::ostringstream oss;
+
+    oss << start.getValue() << ":" << end.getValue() << " = " << (end-start).getValue();
+
+    return oss.str();
+}
diff --git a/final/runtime/src/kmp_stats_timing.h b/final/runtime/src/kmp_stats_timing.h
new file mode 100644
index 0000000..2bdfdea
--- /dev/null
+++ b/final/runtime/src/kmp_stats_timing.h
@@ -0,0 +1,104 @@
+#ifndef KMP_STATS_TIMING_H
+#define KMP_STATS_TIMING_H
+
+/** @file kmp_stats_timing.h
+ * Access to real time clock and timers.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+#include <stdint.h>
+#include <string>
+#include <limits>
+#include "kmp_os.h"
+
+class tsc_tick_count {
+  private:
+    int64_t my_count;
+
+  public:
+    class tsc_interval_t {
+        int64_t value;
+        explicit tsc_interval_t(int64_t _value) : value(_value) {}
+     public:
+        tsc_interval_t() : value(0) {}; // Construct 0 time duration
+        double seconds() const; // Return the length of a time interval in seconds
+        double ticks() const { return double(value); }
+        int64_t getValue() const { return value; }
+
+        friend class tsc_tick_count;
+
+        friend tsc_interval_t operator-(const tsc_tick_count t1, const tsc_tick_count t0);
+    };
+
+    tsc_tick_count() : my_count(static_cast<int64_t>(__rdtsc())) {};
+    tsc_tick_count(int64_t value) : my_count(value) {};
+    int64_t getValue() const { return my_count; }
+    tsc_tick_count later (tsc_tick_count const other) const { 
+        return my_count > other.my_count ? (*this) : other; 
+    }
+    tsc_tick_count earlier(tsc_tick_count const other) const { 
+        return my_count < other.my_count ? (*this) : other; 
+    }
+    static double tick_time(); // returns seconds per cycle (period) of clock
+    static tsc_tick_count now() { return tsc_tick_count(); } // returns the rdtsc register value
+    friend tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count t1, const tsc_tick_count t0);
+};
+
+inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count t1, const tsc_tick_count t0) 
+{
+    return tsc_tick_count::tsc_interval_t( t1.my_count-t0.my_count );
+}
+
+inline double tsc_tick_count::tsc_interval_t::seconds() const 
+{
+    return value*tick_time();
+}
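+
+// Usage sketch (assumes a TSC-style counter is available via __rdtsc()):
+//     tsc_tick_count t0 = tsc_tick_count::now();
+//     /* ... work ... */
+//     tsc_tick_count t1 = tsc_tick_count::now();
+//     double elapsed = (t1 - t0).seconds();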
+
+extern std::string formatSI(double interval, int width, char unit);
+
+inline std::string formatSeconds(double interval, int width)
+{
+    return formatSI(interval, width, 'S');
+}
+
+inline std::string formatTicks(double interval, int width)
+{
+    return formatSI(interval, width, 'T');
+}
+
+class timePair
+{
+    tsc_tick_count KMP_ALIGN_CACHE start;
+    tsc_tick_count end;
+
+public:
+    timePair() : start(-std::numeric_limits<int64_t>::max()), end(-std::numeric_limits<int64_t>::max()) {}
+    tsc_tick_count get_start() const { return start; }
+    tsc_tick_count get_end()   const { return end; }
+    tsc_tick_count * get_startp()    { return &start; }
+    tsc_tick_count * get_endp()      { return &end; }
+
+    void markStart() { start = tsc_tick_count::now(); }
+    void markEnd()   { end   = tsc_tick_count::now(); }
+    void set_start(tsc_tick_count s) { start = s; }
+    void set_end  (tsc_tick_count e) { end = e; }
+
+    tsc_tick_count::tsc_interval_t duration() const { return end-start; }
+    std::string format() const;
+
+};
+
+extern tsc_tick_count::tsc_interval_t computeLastInLastOutInterval(timePair * times, int nTimes);
+#endif // KMP_STATS_TIMING_H
diff --git a/final/runtime/src/kmp_str.c b/final/runtime/src/kmp_str.c
new file mode 100644
index 0000000..b5f7005
--- /dev/null
+++ b/final/runtime/src/kmp_str.c
@@ -0,0 +1,883 @@
+/*
+ * kmp_str.c -- String manipulation routines.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp_str.h"
+
+#include <stdarg.h>    // va_*
+#include <stdio.h>     // vsnprintf()
+#include <stdlib.h>    // malloc(), realloc()
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+
+/*
+    ------------------------------------------------------------------------------------------------
+    String buffer.
+    ------------------------------------------------------------------------------------------------
+
+    Usage:
+
+        // Declare buffer and initialize it.
+        kmp_str_buf_t  buffer;
+        __kmp_str_buf_init( & buffer );
+
+        // Print to buffer.
+        __kmp_str_buf_print( & buffer, "Error in file \"%s\" line %d\n", "foo.c", 12 );
+        __kmp_str_buf_print( & buffer, "    <%s>\n", line );
+
+        // Use buffer contents. buffer.str is a pointer to data, buffer.used is a number of printed
+        // characters (not including terminating zero).
+        write( fd, buffer.str, buffer.used );
+
+        // Free buffer.
+        __kmp_str_buf_free( & buffer );
+
+        // Alternatively, you can detach allocated memory from buffer:
+        __kmp_str_buf_detach( & buffer );
+        return buffer.str;    // That memory should be freed eventually.
+
+
+    Notes:
+
+        * Buffer users may use buffer.str and buffer.used. Users should not change any fields of
+          buffer directly.
+
+        * buffer.str is never NULL. If buffer is empty, buffer.str points to empty string ("").
+
+        * For performance reasons, buffer uses stack memory (buffer.bulk) first. If stack memory is
+          exhausted, buffer allocates memory on heap by malloc(), and reallocates it by realloc()
+          as amount of used memory grows.
+
+        * Buffer doubles amount of allocated memory each time it is exhausted.
+
+    ------------------------------------------------------------------------------------------------
+*/
+
+// TODO: __kmp_str_buf_print() can use thread local memory allocator.
+
+#define KMP_STR_BUF_INVARIANT( b )                                                                \
+    {                                                                                             \
+        KMP_DEBUG_ASSERT( (b)->str != NULL );                                                     \
+        KMP_DEBUG_ASSERT( (b)->size >= sizeof( (b)->bulk ) );                                     \
+        KMP_DEBUG_ASSERT( (b)->size % sizeof( (b)->bulk ) == 0 );                                 \
+        KMP_DEBUG_ASSERT( (unsigned)(b)->used < (b)->size );                                      \
+        KMP_DEBUG_ASSERT( (b)->size == sizeof( (b)->bulk ) ? (b)->str == & (b)->bulk[ 0 ] : 1 );  \
+        KMP_DEBUG_ASSERT( (b)->size > sizeof( (b)->bulk ) ? (b)->str != & (b)->bulk[ 0 ] : 1 );   \
+    }
+
+void
+ __kmp_str_buf_clear(
+     kmp_str_buf_t * buffer
+) {
+    KMP_STR_BUF_INVARIANT( buffer );
+    if ( buffer->used > 0 ) {
+        buffer->used = 0;
+        buffer->str[ 0 ] = 0;
+    }; // if
+    KMP_STR_BUF_INVARIANT( buffer );
+} // __kmp_str_buf_clear
+
+
+void
+__kmp_str_buf_reserve(
+    kmp_str_buf_t * buffer,
+    int             size
+) {
+
+    KMP_STR_BUF_INVARIANT( buffer );
+    KMP_DEBUG_ASSERT( size >= 0 );
+
+    if ( buffer->size < (unsigned int)size ) {
+
+        // Calculate buffer size.
+        do {
+            buffer->size *= 2;
+        } while ( buffer->size < (unsigned int)size );
+
+        // Enlarge buffer.
+        if ( buffer->str == & buffer->bulk[ 0 ] ) {
+            buffer->str = (char *) KMP_INTERNAL_MALLOC( buffer->size );
+            if ( buffer->str == NULL ) {
+                KMP_FATAL( MemoryAllocFailed );
+            }; // if
+            KMP_MEMCPY_S( buffer->str, buffer->size, buffer->bulk, buffer->used + 1 );
+        } else {
+            buffer->str = (char *) KMP_INTERNAL_REALLOC( buffer->str, buffer->size );
+            if ( buffer->str == NULL ) {
+                KMP_FATAL( MemoryAllocFailed );
+            }; // if
+        }; // if
+
+    }; // if
+
+    KMP_DEBUG_ASSERT( buffer->size > 0 );
+    KMP_DEBUG_ASSERT( buffer->size >= (unsigned)size );
+    KMP_STR_BUF_INVARIANT( buffer );
+
+} // __kmp_str_buf_reserve
+
+
+void
+__kmp_str_buf_detach(
+    kmp_str_buf_t *  buffer
+) {
+
+    KMP_STR_BUF_INVARIANT( buffer );
+
+    // If internal bulk is used, allocate memory and copy it.
+    if ( buffer->size <= sizeof( buffer->bulk ) ) {
+        buffer->str = (char *) KMP_INTERNAL_MALLOC( buffer->size );
+        if ( buffer->str == NULL ) {
+            KMP_FATAL( MemoryAllocFailed );
+        }; // if
+        KMP_MEMCPY_S( buffer->str, buffer->size, buffer->bulk, buffer->used + 1 );
+    }; // if
+
+} // __kmp_str_buf_detach
+
+
+void
+__kmp_str_buf_free(
+    kmp_str_buf_t * buffer
+) {
+    KMP_STR_BUF_INVARIANT( buffer );
+    if ( buffer->size > sizeof( buffer->bulk ) ) {
+        KMP_INTERNAL_FREE( buffer->str );
+    }; // if
+    buffer->str  = buffer->bulk;
+    buffer->size = sizeof( buffer->bulk );
+    buffer->used = 0;
+    KMP_STR_BUF_INVARIANT( buffer );
+} // __kmp_str_buf_free
+
+
+void
+__kmp_str_buf_cat(
+    kmp_str_buf_t * buffer,
+    char const *    str,
+    int             len
+) {
+    KMP_STR_BUF_INVARIANT( buffer );
+    KMP_DEBUG_ASSERT( str != NULL );
+    KMP_DEBUG_ASSERT( len >= 0 );
+    __kmp_str_buf_reserve( buffer, buffer->used + len + 1 );
+    KMP_MEMCPY( buffer->str + buffer->used, str, len );
+    buffer->str[ buffer->used + len ] = 0;
+    buffer->used += len;
+    KMP_STR_BUF_INVARIANT( buffer );
+} // __kmp_str_buf_cat
+
+
+void
+__kmp_str_buf_vprint(
+    kmp_str_buf_t *  buffer,
+    char const *     format,
+    va_list          args
+) {
+
+    KMP_STR_BUF_INVARIANT( buffer );
+
+    for ( ; ; ) {
+
+        int const free = buffer->size - buffer->used;
+        int       rc;
+        int       size;
+
+        // Try to format string.
+        {
+            /*
+                On Linux* OS Intel(R) 64, vsnprintf() modifies its args argument, so calling
+                vsnprintf() a second time with the same args crashes. To prevent the crash, we
+                have to pass a fresh, intact copy of args to vsnprintf() on each iteration.
+
+                Unfortunately, the standard va_copy() macro is not available on Windows* OS.
+                However, vsnprintf() does not seem to modify its args argument on Windows* OS.
+            */
+
+            #if ! KMP_OS_WINDOWS
+                va_list _args;
+                __va_copy( _args, args );  // Make copy of args.
+                #define args _args         // Substitute args with its copy, _args.
+            #endif // KMP_OS_WINDOWS
+            rc = KMP_VSNPRINTF( buffer->str + buffer->used, free, format, args );
+            #if ! KMP_OS_WINDOWS
+                #undef args                // Remove substitution.
+                va_end( _args );
+            #endif // KMP_OS_WINDOWS
+        }
+
+        // No errors, string has been formatted.
+        if ( rc >= 0 && rc < free ) {
+            buffer->used += rc;
+            break;
+        }; // if
+
+        // Error occurred, buffer is too small.
+        if ( rc >= 0 ) {
+            // C99-conforming implementation of vsnprintf returns required buffer size.
+            size = buffer->used + rc + 1;
+        } else {
+            // Older implementations just return -1. Double buffer size.
+            size = buffer->size * 2;
+        }; // if
+
+        // Enlarge buffer.
+        __kmp_str_buf_reserve( buffer, size );
+
+        // And try again.
+
+    }; // forever
+
+    KMP_DEBUG_ASSERT( buffer->size > 0 );
+    KMP_STR_BUF_INVARIANT( buffer );
+
+} // __kmp_str_buf_vprint
+
+
+void
+__kmp_str_buf_print(
+    kmp_str_buf_t *  buffer,
+    char const *     format,
+    ...
+) {
+
+    va_list args;
+    va_start( args, format );
+    __kmp_str_buf_vprint( buffer, format, args );
+    va_end( args );
+
+} // __kmp_str_buf_print
+
+
+/*
+    The function prints specified size to buffer. Size is expressed using biggest possible unit, for
+    example 1024 is printed as "1k".
+*/
+
+void
+__kmp_str_buf_print_size(
+    kmp_str_buf_t * buf,
+    size_t          size
+) {
+
+    char const * names[] = { "", "k", "M", "G", "T", "P", "E", "Z", "Y" };
+    int const    units   = sizeof( names ) / sizeof( char const * );
+    int          u       = 0;
+    if ( size > 0 ) {
+        while ( ( size % 1024 == 0 ) && ( u + 1 < units ) ) {
+            size = size / 1024;
+            ++ u;
+        }; // while
+    }; // if
+
+    __kmp_str_buf_print( buf, "%" KMP_SIZE_T_SPEC "%s", size, names[ u ] );
+
+} // __kmp_str_buf_print_size
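+// For example: 1024 is printed as "1k" and 1048576 as "1M", but 1536 is printed as "1536",
+// because the value is reduced only while it divides evenly by 1024.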
+
+
+void
+__kmp_str_fname_init(
+    kmp_str_fname_t * fname,
+    char const *      path
+) {
+
+    fname->path = NULL;
+    fname->dir  = NULL;
+    fname->base = NULL;
+
+    if ( path != NULL ) {
+        char * slash = NULL;    // Pointer to the last character of dir.
+        char * base  = NULL;    // Pointer to the beginning of basename.
+        fname->path = __kmp_str_format( "%s", path );
+            // Original code used strdup() to copy the string, but on Windows* OS Intel(R) 64 it
+            // triggers an assertion in the debug heap, so strdup() was replaced with __kmp_str_format().
+        if ( KMP_OS_WINDOWS ) {
+            __kmp_str_replace( fname->path, '\\', '/' );
+        }; // if
+        fname->dir = __kmp_str_format( "%s", fname->path );
+        slash = strrchr( fname->dir, '/' );
+        if ( KMP_OS_WINDOWS && slash == NULL ) {           // On Windows* OS, if slash not found,
+            char first = TOLOWER( fname->dir[ 0 ] );     // look for drive.
+            if ( 'a' <= first && first <= 'z' && fname->dir[ 1 ] == ':' ) {
+                slash = & fname->dir[ 1 ];
+            }; // if
+        }; // if
+        base = ( slash == NULL ? fname->dir : slash + 1 );
+        fname->base = __kmp_str_format( "%s", base );    // Copy basename
+        * base = 0;                    // and truncate dir.
+    }; // if
+
+} // __kmp_str_fname_init
+
+
+void
+__kmp_str_fname_free(
+    kmp_str_fname_t * fname
+) {
+    __kmp_str_free( (char const **)( & fname->path ) );
+    __kmp_str_free( (char const **)( & fname->dir  ) );
+    __kmp_str_free( (char const **)( & fname->base ) );
+} // __kmp_str_fname_free
+
+
+int
+__kmp_str_fname_match(
+    kmp_str_fname_t const * fname,
+    char const *            pattern
+) {
+
+    int dir_match  = 1;
+    int base_match = 1;
+
+    if ( pattern != NULL ) {
+        kmp_str_fname_t ptrn;
+        __kmp_str_fname_init( & ptrn, pattern );
+        dir_match =
+            strcmp( ptrn.dir, "*/" ) == 0
+            ||
+            ( fname->dir != NULL && __kmp_str_eqf( fname->dir, ptrn.dir ) );
+        base_match =
+            strcmp( ptrn.base, "*" ) == 0
+            ||
+            ( fname->base != NULL && __kmp_str_eqf( fname->base, ptrn.base ) );
+        __kmp_str_fname_free( & ptrn );
+    }; // if
+
+    return dir_match && base_match;
+
+} // __kmp_str_fname_match
+
+
+kmp_str_loc_t
+__kmp_str_loc_init(
+    char const * psource,
+    int          init_fname
+) {
+
+    kmp_str_loc_t loc;
+
+    loc._bulk = NULL;
+    loc.file  = NULL;
+    loc.func  = NULL;
+    loc.line  = 0;
+    loc.col   = 0;
+
+    if ( psource != NULL ) {
+
+        char * str   = NULL;
+        char * dummy = NULL;
+        char * line  = NULL;
+        char * col   = NULL;
+
+        // Copy psource to keep it intact.
+        loc._bulk = __kmp_str_format( "%s", psource );
+
+        // Parse psource string: ";file;func;line;col;;"
+        str = loc._bulk;
+        __kmp_str_split( str, ';', & dummy,    & str );
+        __kmp_str_split( str, ';', & loc.file, & str );
+        __kmp_str_split( str, ';', & loc.func, & str );
+        __kmp_str_split( str, ';', & line,     & str );
+        __kmp_str_split( str, ';', & col,      & str );
+
+        // Convert line and col into numeric values.
+        if ( line != NULL ) {
+            loc.line = atoi( line );
+            if ( loc.line < 0 ) {
+                loc.line = 0;
+            }; // if
+        }; // if
+        if ( col != NULL ) {
+            loc.col = atoi( col );
+            if ( loc.col < 0 ) {
+                loc.col = 0;
+            }; // if
+        }; // if
+
+    }; // if
+
+    __kmp_str_fname_init( & loc.fname, init_fname ? loc.file : NULL );
+
+    return loc;
+
+} // __kmp_str_loc_init
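+// Example (illustrative): psource ";foo.c;bar;12;3;;" yields loc.file == "foo.c",
+// loc.func == "bar", loc.line == 12, loc.col == 3.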
+
+
+void
+__kmp_str_loc_free(
+    kmp_str_loc_t * loc
+) {
+    __kmp_str_fname_free( & loc->fname );
+    KMP_INTERNAL_FREE( loc->_bulk );
+    loc->_bulk = NULL;
+    loc->file  = NULL;
+    loc->func  = NULL;
+} // __kmp_str_loc_free
+
+
+
+/*
+    This function is intended to compare file names. On Windows* OS file names are
+    case-insensitive, so the function performs a case-insensitive comparison. On Linux* OS it
+    performs a case-sensitive comparison.
+    Note: The function returns *true* if the strings are *equal*.
+*/
+
+int
+__kmp_str_eqf(         // True, if strings are equal, false otherwise.
+    char const * lhs,  // First string.
+    char const * rhs   // Second string.
+) {
+    int result;
+    #if KMP_OS_WINDOWS
+        result = ( _stricmp( lhs, rhs ) == 0 );
+    #else
+        result = ( strcmp( lhs, rhs ) == 0 );
+    #endif
+    return result;
+} // __kmp_str_eqf
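+// For example, __kmp_str_eqf( "A.TXT", "a.txt" ) is true on Windows* OS and false on Linux* OS.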
+
+
+/*
+    This function is like sprintf, but it *allocates* a new buffer, which must eventually be freed
+    by __kmp_str_free(). The function is convenient for constructing strings: it replaces strdup()
+    and strcat(), frees the programmer from buffer allocation, and helps avoid buffer overflows.
+    Examples:
+
+        str = __kmp_str_format( "%s", orig );              // strdup(), do not care about buffer size.
+        __kmp_str_free( & str );
+        str = __kmp_str_format( "%s%s", orig1, orig2 );    // strcat(), do not care about buffer size.
+        __kmp_str_free( & str );
+        str = __kmp_str_format( "%s/%s.txt", path, file ); // constructing string.
+        __kmp_str_free( & str );
+
+    Performance note:
+        This function allocates memory with malloc() calls, so do not call it from
+        performance-critical code. In performance-critical code consider using kmp_str_buf_t
+        instead, since it uses stack-allocated buffer for short strings.
+
+    Why does this function use malloc()?
+        1. __kmp_allocate() returns cache-aligned memory allocated with malloc(). There is no
+           reason to use __kmp_allocate() for strings: cache-aligned memory is not necessary and
+           only adds overhead.
+        2. __kmp_thread_malloc() cannot be used because it requires a pointer to the thread
+           structure. We need to perform string operations during library startup (for example, in
+           __kmp_register_library_startup()) when no thread structures are allocated yet.
+*/
+
+// TODO: Find and replace all regular free() with __kmp_str_free().
+
+char *
+__kmp_str_format(           // Allocated string.
+    char const * format,    // Format string.
+    ...                     // Other parameters.
+) {
+
+    va_list args;
+    int     size   = 512;
+    char *  buffer = NULL;
+    int     rc;
+
+    // Allocate buffer.
+    buffer = (char *) KMP_INTERNAL_MALLOC( size );
+    if ( buffer == NULL ) {
+        KMP_FATAL( MemoryAllocFailed );
+    }; // if
+
+    for ( ; ; ) {
+
+        // Try to format string.
+        va_start( args, format );
+        rc = KMP_VSNPRINTF( buffer, size, format, args );
+        va_end( args );
+
+        // No errors, string has been formatted.
+        if ( rc >= 0 && rc < size ) {
+            break;
+        }; // if
+
+        // Error occurred, buffer is too small.
+        if ( rc >= 0 ) {
+            // C99-conforming implementation of vsnprintf returns required buffer size.
+            size = rc + 1;
+        } else {
+            // Older implementations just return -1.
+            size = size * 2;
+        }; // if
+
+        // Enlarge buffer and try again.
+        buffer = (char *) KMP_INTERNAL_REALLOC( buffer, size );
+        if ( buffer == NULL ) {
+            KMP_FATAL( MemoryAllocFailed );
+        }; // if
+
+    }; // forever
+
+    return buffer;
+
+} // func __kmp_str_format
+
+
+void
+__kmp_str_free(
+    char const * * str
+) {
+    KMP_DEBUG_ASSERT( str != NULL );
+    KMP_INTERNAL_FREE( (void *) * str );
+    * str = NULL;
+} // func __kmp_str_free
+
+
+/* If len is zero, returns true iff target and data match exactly (case-insensitive).
+   If len is negative, returns true iff target is a case-insensitive prefix of data.
+   If len is positive, returns true iff target is a case-insensitive prefix of data or
+     vice versa, and the common prefix is at least len characters long.
+*/
+int
+__kmp_str_match(
+    char const * target,
+    int          len,
+    char const * data
+) {
+    int i;
+    if ( target == NULL || data == NULL ) {
+        return FALSE;
+    }; // if
+    for ( i = 0; target[i] && data[i]; ++ i ) {
+        if ( TOLOWER( target[i] ) != TOLOWER( data[i] ) ) {
+            return FALSE;
+        }; // if
+    }; // for i
+    return ( ( len > 0 ) ? i >= len : ( ! target[i] && ( len || ! data[i] ) ) );
+} // __kmp_str_match
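+// Examples (derived from the rules above):
+//     __kmp_str_match( "true", 1, "t" )    --> TRUE   ("t" is a prefix of "true", 1 char >= len).
+//     __kmp_str_match( "true", 0, "TRUE" ) --> TRUE   (exact case-insensitive match).
+//     __kmp_str_match( "true", 0, "tr" )   --> FALSE  (len == 0 requires an exact match).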
+
+
+int
+__kmp_str_match_false( char const * data ) {
+    int result =
+        __kmp_str_match( "false",   1, data ) ||
+        __kmp_str_match( "off",     2, data ) ||
+        __kmp_str_match( "0",       1, data ) ||
+        __kmp_str_match( ".false.", 2, data ) ||
+        __kmp_str_match( ".f.",     2, data ) ||
+        __kmp_str_match( "no",      1, data );
+    return result;
+} // __kmp_str_match_false
+
+
+int
+__kmp_str_match_true( char const * data ) {
+    int result =
+        __kmp_str_match( "true",   1, data ) ||
+        __kmp_str_match( "on",     2, data ) ||
+        __kmp_str_match( "1",      1, data ) ||
+        __kmp_str_match( ".true.", 2, data ) ||
+        __kmp_str_match( ".t.",    2, data ) ||
+        __kmp_str_match( "yes",    1, data );
+    return result;
+} // __kmp_str_match_true
+
+void
+__kmp_str_replace(
+    char * str,
+    char   search_for,
+    char   replace_with
+) {
+
+    char * found = NULL;
+
+    found = strchr( str, search_for );
+    while ( found ) {
+        * found = replace_with;
+        found = strchr( found + 1, search_for );
+    }; // while
+
+} // __kmp_str_replace
+
+
+void
+__kmp_str_split(
+    char *  str,    // I: String to split.
+    char    delim,  // I: Character to split on.
+    char ** head,   // O: Pointer to head (may be NULL).
+    char ** tail    // O: Pointer to tail (may be NULL).
+) {
+    char * h = str;
+    char * t = NULL;
+    if ( str != NULL ) {
+        char * ptr = strchr( str, delim );
+        if ( ptr != NULL ) {
+            * ptr  = 0;
+            t = ptr + 1;
+        }; // if
+    }; // if
+    if ( head != NULL ) {
+        * head = h;
+    }; // if
+    if ( tail != NULL ) {
+        * tail = t;
+    }; // if
+} // __kmp_str_split
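+// Example (illustrative): given char s[] = "a;b;c", __kmp_str_split( s, ';', & head, & tail )
+// leaves head == "a" and tail == "b;c" (the first ';' in s is overwritten with '\0').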
+
+/*
+    strtok_r() is not available on Windows* OS. This function reimplements strtok_r().
+*/
+char *
+__kmp_str_token(
+    char *       str,   // String to split into tokens. Note: String *is* modified!
+    char const * delim, // Delimiters.
+    char **      buf    // Internal buffer.
+) {
+    char * token = NULL;
+    #if KMP_OS_WINDOWS
+        // On Windows* OS there is no strtok_r() function. Let us implement it.
+        if ( str != NULL ) {
+            * buf = str;                       // First call, initialize buf.
+        }; // if
+        * buf += strspn( * buf, delim );       // Skip leading delimiters.
+        if ( ** buf != 0 ) {                   // Rest of the string is not yet empty.
+            token = * buf;                     // Use it as result.
+            * buf += strcspn( * buf, delim );  // Skip non-delimiters.
+            if ( ** buf != 0 ) {               // Rest of the string is not yet empty.
+                ** buf = 0;                    // Terminate token here.
+                * buf += 1;                    // Advance buf to start with the next token next time.
+            }; // if
+        }; // if
+    #else
+        // On Linux* OS and OS X*, strtok_r() is available. Let us use it.
+        token = strtok_r( str, delim, buf );
+    #endif
+    return token;
+}; // __kmp_str_token
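+// Typical usage, mirroring strtok_r() (illustrative sketch):
+//     char * buf;
+//     char * tok = __kmp_str_token( str, ",", & buf );    // First call passes the string.
+//     while ( tok != NULL ) {
+//         /* ... process tok ... */
+//         tok = __kmp_str_token( NULL, ",", & buf );      // Subsequent calls pass NULL.
+//     }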
+
+
+int
+__kmp_str_to_int(
+    char const * str,
+    char         sentinel
+) {
+    int result, factor;
+    char const * t;
+
+    result = 0;
+
+    for (t = str; *t != '\0'; ++t) {
+        if (*t < '0' || *t > '9')
+            break;
+        result = (result * 10) + (*t - '0');
+    }
+
+    switch (*t) {
+    case '\0':          /* the current default for no suffix is bytes */
+        factor = 1;
+        break;
+    case 'b': case 'B': /* bytes */
+        ++t;
+        factor = 1;
+        break;
+    case 'k': case 'K': /* kilo-bytes */
+        ++t;
+        factor = 1024;
+        break;
+    case 'm': case 'M': /* mega-bytes */
+        ++t;
+        factor = (1024 * 1024);
+        break;
+    default:
+        if (*t != sentinel)
+            return (-1);
+        t = "";
+        factor = 1;
+    }
+
+    if (result > (INT_MAX / factor))
+        result = INT_MAX;
+    else
+        result *= factor;
+
+    return (*t != 0 ? 0 : result);
+
+} // __kmp_str_to_int
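+// Examples (derived from the code above): __kmp_str_to_int( "16k", ',' ) yields 16384;
+// __kmp_str_to_int( "16,", ',' ) stops at the sentinel and yields 16;
+// __kmp_str_to_int( "16x", ',' ) returns -1 (unknown suffix).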
+
+
+/*
+    The routine parses the input string, which is expected to be an unsigned integer with an
+    optional unit. Units are: "b" for bytes, "kb" or just "k" for kilobytes, "mb" or "m" for
+    megabytes, ..., "yb" or "y" for yottabytes. :-) Unit names are case-insensitive. On success,
+    *error is set to NULL and *out to the parsed value; on failure, *error points to an error
+    message. In case of overflow, *out is set to KMP_SIZE_T_MAX.
+*/
+void
+__kmp_str_to_size(
+    char const *   str,     // I: String of characters, unsigned number and unit ("b", "kb", etc).
+    size_t *       out,     // O: Parsed number.
+    size_t         dfactor, // I: Default factor used if no unit is specified.
+    char const * * error    // O: NULL if everything is ok, error message otherwise.
+) {
+
+    size_t value    = 0;
+    size_t factor   = 0;
+    int    overflow = 0;
+    int    i        = 0;
+    int    digit;
+
+
+    KMP_DEBUG_ASSERT( str != NULL );
+
+    // Skip spaces.
+    while ( str[ i ] == ' ' || str[ i ] == '\t') {
+        ++ i;
+    }; // while
+
+    // Parse number.
+    if ( str[ i ] < '0' || str[ i ] > '9' ) {
+        * error = KMP_I18N_STR( NotANumber );
+        return;
+    }; // if
+    do {
+        digit = str[ i ] - '0';
+        overflow = overflow || ( value > ( KMP_SIZE_T_MAX - digit ) / 10 );
+        value = ( value * 10 ) + digit;
+        ++ i;
+    } while ( str[ i ] >= '0' && str[ i ] <= '9' );
+
+    // Skip spaces.
+    while ( str[ i ] == ' ' || str[ i ] == '\t' ) {
+        ++ i;
+    }; // while
+
+    // Parse unit.
+    #define _case( ch, exp )                            \
+        case ch :                                       \
+        case ch - ( 'a' - 'A' ) : {                     \
+            size_t shift = (exp) * 10;                  \
+            ++ i;                                       \
+            if ( shift < sizeof( size_t ) * 8 ) {       \
+                factor = (size_t)( 1 ) << shift;        \
+            } else {                                    \
+                overflow = 1;                           \
+            };                                          \
+        } break;
+    switch ( str[ i ] ) {
+        _case( 'k', 1 ); // Kilo
+        _case( 'm', 2 ); // Mega
+        _case( 'g', 3 ); // Giga
+        _case( 't', 4 ); // Tera
+        _case( 'p', 5 ); // Peta
+        _case( 'e', 6 ); // Exa
+        _case( 'z', 7 ); // Zetta
+        _case( 'y', 8 ); // Yotta
+        // Oops. No more units...
+    }; // switch
+    #undef _case
+    if ( str[ i ] == 'b' || str[ i ] == 'B' ) {    // Skip optional "b".
+        if ( factor == 0 ) {
+            factor = 1;
+        }
+        ++ i;
+    }; // if
+    if ( ! ( str[ i ] == ' ' || str[ i ] == '\t' || str[ i ] == 0 ) ) { // Bad unit
+        * error = KMP_I18N_STR( BadUnit );
+        return;
+    }; // if
+
+    if ( factor == 0 ) {
+        factor = dfactor;
+    }
+
+    // Apply factor.
+    overflow = overflow || ( value > ( KMP_SIZE_T_MAX / factor ) );
+    value *= factor;
+
+    // Skip spaces.
+    while ( str[ i ] == ' ' || str[ i ] == '\t' ) {
+        ++ i;
+    }; // while
+
+    if ( str[ i ] != 0 ) {
+        * error = KMP_I18N_STR( IllegalCharacters );
+        return;
+    }; // if
+
+    if ( overflow ) {
+        * error = KMP_I18N_STR( ValueTooLarge );
+        * out = KMP_SIZE_T_MAX;
+        return;
+    }; // if
+
+    * error = NULL;
+    * out = value;
+
+} // __kmp_str_to_size
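+// Usage sketch (illustrative): given size_t sz and char const * err,
+//     __kmp_str_to_size( "16 M", & sz, 1, & err );
+// sets err == NULL and sz == 16777216; on failure err points to a message (and on overflow
+// sz is set to KMP_SIZE_T_MAX).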
+
+
+void
+__kmp_str_to_uint(         // R: Error code.
+    char const *   str,    // I: String of characters, unsigned number.
+    kmp_uint64 *   out,    // O: Parsed number.
+    char const * * error   // O: Null if everything is ok, error message otherwise.
+) {
+
+    size_t value    = 0;
+    int    overflow = 0;
+    int    i        = 0;
+    int    digit;
+
+
+    KMP_DEBUG_ASSERT( str != NULL );
+
+    // Skip spaces.
+    while ( str[ i ] == ' ' || str[ i ] == '\t' ) {
+        ++ i;
+    }; // while
+
+    // Parse number.
+    if ( str[ i ] < '0' || str[ i ] > '9' ) {
+        * error = KMP_I18N_STR( NotANumber );
+        return;
+    }; // if
+    do {
+        digit = str[ i ] - '0';
+        overflow = overflow || ( value > ( KMP_SIZE_T_MAX - digit ) / 10 );
+        value = ( value * 10 ) + digit;
+        ++ i;
+    } while ( str[ i ] >= '0' && str[ i ] <= '9' );
+
+    // Skip spaces.
+    while ( str[ i ] == ' ' || str[ i ] == '\t' ) {
+        ++ i;
+    }; // while
+
+    if ( str[ i ] != 0 ) {
+        * error = KMP_I18N_STR( IllegalCharacters );
+        return;
+    }; // if
+
+    if ( overflow ) {
+        * error = KMP_I18N_STR( ValueTooLarge );
+        * out = (kmp_uint64) -1;
+        return;
+    }; // if
+
+    * error = NULL;
+    * out = value;
+
+} // __kmp_str_to_uint
+
+
+
+// end of file //
diff --git a/final/runtime/src/kmp_str.h b/final/runtime/src/kmp_str.h
new file mode 100644
index 0000000..ba71bba
--- /dev/null
+++ b/final/runtime/src/kmp_str.h
@@ -0,0 +1,119 @@
+/*
+ * kmp_str.h -- String manipulation routines.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_STR_H
+#define KMP_STR_H
+
+#include <string.h>
+#include <stdarg.h>
+
+#include "kmp_os.h"
+
+#ifdef __cplusplus
+    extern "C" {
+#endif // __cplusplus
+
+#if KMP_OS_WINDOWS
+# define strdup    _strdup
+#endif
+
+/*  some macros to replace ctype.h functions  */
+#define TOLOWER(c)	((((c) >= 'A') && ((c) <= 'Z')) ? ((c) + 'a' - 'A') : (c))
+
+struct kmp_str_buf {
+    char       * str;         // Pointer to buffer content, read only.
+    unsigned int size;        // Do not change this field!
+    int          used;        // Number of characters printed to buffer, read only.
+    char         bulk[ 512 ]; // Do not use this field!
+}; // struct kmp_str_buf
+typedef struct kmp_str_buf  kmp_str_buf_t;
+
+#define __kmp_str_buf_init( b )   { (b)->str = (b)->bulk; (b)->size = sizeof( (b)->bulk ); (b)->used = 0; (b)->bulk[ 0 ] = 0; }
+
+void   __kmp_str_buf_clear( kmp_str_buf_t * buffer );
+void   __kmp_str_buf_reserve( kmp_str_buf_t * buffer, int size );
+void   __kmp_str_buf_detach( kmp_str_buf_t * buffer );
+void   __kmp_str_buf_free( kmp_str_buf_t * buffer );
+void   __kmp_str_buf_cat( kmp_str_buf_t * buffer, char const * str, int len );
+void   __kmp_str_buf_vprint( kmp_str_buf_t * buffer, char const * format, va_list args );
+void   __kmp_str_buf_print( kmp_str_buf_t * buffer, char const * format, ... );
+void   __kmp_str_buf_print_size( kmp_str_buf_t * buffer, size_t size );
+
+/*
+    File name parser. Usage:
+
+        kmp_str_fname_t fname = __kmp_str_fname_init( path );
+        // Use fname.path (copy of the original path), fname.dir, fname.base.
+        // Note: fname.dir concatenated with fname.base gives an exact copy of the path.
+        __kmp_str_fname_free( & fname );
+
+*/
+struct kmp_str_fname {
+    char * path;
+    char * dir;
+    char * base;
+}; // struct kmp_str_fname
+typedef struct kmp_str_fname kmp_str_fname_t;
+void __kmp_str_fname_init( kmp_str_fname_t * fname, char const * path );
+void __kmp_str_fname_free( kmp_str_fname_t * fname );
+// Compares a file name with the specified pattern. If the pattern is NULL, any fname matches.
+int __kmp_str_fname_match( kmp_str_fname_t const * fname, char const * pattern );
+
+/*
+    The compiler provides source locations in string form ";file;func;line;col;;", which is not
+    convenient to manipulate. This structure keeps the source location in a more convenient form.
+    Usage:
+
+        kmp_str_loc_t loc = __kmp_str_loc_init( ident->psource, 0 );
+        // use loc.file, loc.func, loc.line, loc.col.
+        // loc.fname is available if the second argument of __kmp_str_loc_init is true.
+        __kmp_str_loc_free( & loc );
+
+    If psource is NULL or does not follow the format above, file and/or func may be NULL pointers.
+*/
+struct kmp_str_loc {
+    char *          _bulk;  // Do not use this field.
+    kmp_str_fname_t fname;  // Will be initialized if init_fname is true.
+    char *          file;
+    char *          func;
+    int             line;
+    int             col;
+}; // struct kmp_str_loc
+typedef struct kmp_str_loc kmp_str_loc_t;
+kmp_str_loc_t __kmp_str_loc_init( char const * psource, int init_fname );
+void __kmp_str_loc_free( kmp_str_loc_t * loc );
+
+int    __kmp_str_eqf( char const * lhs, char const * rhs );
+char * __kmp_str_format( char const * format, ... );
+void   __kmp_str_free( char const * * str );
+int    __kmp_str_match( char const * target, int len, char const * data );
+int    __kmp_str_match_false( char const * data );
+int    __kmp_str_match_true( char const * data );
+void   __kmp_str_replace( char * str, char search_for, char replace_with );
+void   __kmp_str_split( char * str, char delim, char ** head, char ** tail );
+char * __kmp_str_token( char * str, char const * delim, char ** buf );
+int    __kmp_str_to_int( char const * str, char sentinel );
+
+void __kmp_str_to_size( char const * str, size_t * out, size_t dfactor, char const * * error );
+void __kmp_str_to_uint( char const * str, kmp_uint64 * out, char const * * error );
+
+#ifdef __cplusplus
+    } // extern "C"
+#endif // __cplusplus
+
+#endif // KMP_STR_H
+
+// end of file //
+
diff --git a/final/runtime/src/kmp_stub.c b/final/runtime/src/kmp_stub.c
new file mode 100644
index 0000000..1e0953a
--- /dev/null
+++ b/final/runtime/src/kmp_stub.c
@@ -0,0 +1,252 @@
+/*
+ * kmp_stub.c -- stub versions of user-callable OpenMP RT functions.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include <stdlib.h>
+#include <limits.h>
+#include <errno.h>
+
+#include "omp.h"                // Function renamings.
+#include "kmp.h"                // KMP_DEFAULT_STKSIZE
+#include "kmp_stub.h"
+
+#if KMP_OS_WINDOWS
+    #include <windows.h>
+#else
+    #include <sys/time.h>
+#endif
+
+// Moved from omp.h
+#define omp_set_max_active_levels    ompc_set_max_active_levels
+#define omp_set_schedule             ompc_set_schedule
+#define omp_get_ancestor_thread_num  ompc_get_ancestor_thread_num
+#define omp_get_team_size            ompc_get_team_size
+
+#define omp_set_num_threads          ompc_set_num_threads
+#define omp_set_dynamic              ompc_set_dynamic
+#define omp_set_nested               ompc_set_nested
+#define kmp_set_stacksize            kmpc_set_stacksize
+#define kmp_set_stacksize_s          kmpc_set_stacksize_s
+#define kmp_set_blocktime            kmpc_set_blocktime
+#define kmp_set_library              kmpc_set_library
+#define kmp_set_defaults             kmpc_set_defaults
+#define kmp_malloc                   kmpc_malloc
+#define kmp_calloc                   kmpc_calloc
+#define kmp_realloc                  kmpc_realloc
+#define kmp_free                     kmpc_free
+
+static double frequency = 0.0;
+
+// Helper functions.
+static size_t __kmps_init() {
+    static int    initialized = 0;
+    static size_t dummy = 0;
+    if ( ! initialized ) {
+
+        // TODO: Analyze the KMP_VERSION environment variable, print __kmp_version_copyright and
+        // __kmp_version_build_time.
+        // WARNING: Do not use "fprintf( stderr, ... )" because it will cause an unresolved "__iob"
+        // symbol (see C70080). We need to extract the __kmp_printf() stuff from kmp_runtime.c and
+        // use it.
+
+        // The trick with the dummy variable forces the linker to keep the __kmp_version_copyright
+        // and __kmp_version_build_time strings in the executable file (in case of static linkage).
+        // When KMP_VERSION analysis is implemented, the dummy variable should be deleted and the
+        // function should return void.
+        dummy = __kmp_version_copyright - __kmp_version_build_time;
+
+        #if KMP_OS_WINDOWS
+            LARGE_INTEGER freq;
+            BOOL status = QueryPerformanceFrequency( & freq );
+            if ( status ) {
+                frequency = (double) freq.QuadPart;
+            }; // if
+        #endif
+
+        initialized = 1;
+    }; // if
+    return dummy;
+}; // __kmps_init
+
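+// Note: each stub entry point below begins with "i;", which expands (via the deliberately terse
+// macro that follows) to a call to __kmps_init().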
+#define i __kmps_init();
+
+/* set API functions */
+void omp_set_num_threads( omp_int_t num_threads ) { i; }
+void omp_set_dynamic( omp_int_t dynamic )         { i; __kmps_set_dynamic( dynamic ); }
+void omp_set_nested( omp_int_t nested )           { i; __kmps_set_nested( nested );   }
+void omp_set_max_active_levels( omp_int_t max_active_levels ) { i; }
+void omp_set_schedule( omp_sched_t kind, omp_int_t modifier ) { i; __kmps_set_schedule( (kmp_sched_t)kind, modifier ); }
+int omp_get_ancestor_thread_num( omp_int_t level ) { i; return ( level ) ? ( -1 ) : ( 0 ); }
+int omp_get_team_size( omp_int_t level ) { i; return ( level ) ? ( -1 ) : ( 1 ); }
+int kmpc_set_affinity_mask_proc( int proc, void **mask ) { i; return -1; }
+int kmpc_unset_affinity_mask_proc( int proc, void **mask ) { i; return -1; }
+int kmpc_get_affinity_mask_proc( int proc, void **mask ) { i; return -1; }
+
+/* kmp API functions */
+void kmp_set_stacksize( omp_int_t arg )   { i; __kmps_set_stacksize( arg ); }
+void kmp_set_stacksize_s( size_t arg )    { i; __kmps_set_stacksize( arg ); }
+void kmp_set_blocktime( omp_int_t arg )   { i; __kmps_set_blocktime( arg ); }
+void kmp_set_library( omp_int_t arg )     { i; __kmps_set_library( arg ); }
+void kmp_set_defaults( char const * str ) { i; }
+
+/* KMP memory management functions. */
+void * kmp_malloc( size_t size )                 { i; return malloc( size ); }
+void * kmp_calloc( size_t nelem, size_t elsize ) { i; return calloc( nelem, elsize ); }
+void * kmp_realloc( void *ptr, size_t size )     { i; return realloc( ptr, size ); }
+void   kmp_free( void * ptr )                    { i; free( ptr ); }
+
+static int __kmps_blocktime = INT_MAX;
+
+void __kmps_set_blocktime( int arg ) {
+    i;
+    __kmps_blocktime = arg;
+} // __kmps_set_blocktime
+
+int __kmps_get_blocktime( void ) {
+    i;
+    return __kmps_blocktime;
+} // __kmps_get_blocktime
+
+static int __kmps_dynamic = 0;
+
+void __kmps_set_dynamic( int arg ) {
+    i;
+    __kmps_dynamic = arg;
+} // __kmps_set_dynamic
+
+int __kmps_get_dynamic( void ) {
+    i;
+    return __kmps_dynamic;
+} // __kmps_get_dynamic
+
+static int __kmps_library = 1000;
+
+void __kmps_set_library( int arg ) {
+    i;
+    __kmps_library = arg;
+} // __kmps_set_library
+
+int __kmps_get_library( void ) {
+    i;
+    return __kmps_library;
+} // __kmps_get_library
+
+static int __kmps_nested = 0;
+
+void __kmps_set_nested( int arg ) {
+    i;
+    __kmps_nested = arg;
+} // __kmps_set_nested
+
+int __kmps_get_nested( void ) {
+    i;
+    return __kmps_nested;
+} // __kmps_get_nested
+
+static size_t __kmps_stacksize = KMP_DEFAULT_STKSIZE;
+
+void __kmps_set_stacksize( int arg ) {
+    i;
+    __kmps_stacksize = arg;
+} // __kmps_set_stacksize
+
+int __kmps_get_stacksize( void ) {
+    i;
+    return __kmps_stacksize;
+} // __kmps_get_stacksize
+
+static kmp_sched_t __kmps_sched_kind     = kmp_sched_default;
+static int         __kmps_sched_modifier = 0;
+
+void __kmps_set_schedule( kmp_sched_t kind, int modifier ) {
+    i;
+    __kmps_sched_kind     = kind;
+    __kmps_sched_modifier = modifier;
+} // __kmps_set_schedule
+
+void __kmps_get_schedule( kmp_sched_t *kind, int *modifier ) {
+    i;
+    *kind     = __kmps_sched_kind;
+    *modifier = __kmps_sched_modifier;
+} // __kmps_get_schedule
+
+#if OMP_40_ENABLED
+
+static kmp_proc_bind_t __kmps_proc_bind = proc_bind_false;
+
+void __kmps_set_proc_bind( kmp_proc_bind_t arg ) {
+    i;
+    __kmps_proc_bind = arg;
+} // __kmps_set_proc_bind
+
+kmp_proc_bind_t __kmps_get_proc_bind( void ) {
+    i;
+    return __kmps_proc_bind;
+} // __kmps_get_proc_bind
+
+#endif /* OMP_40_ENABLED */
+
+double __kmps_get_wtime( void ) {
+    // Elapsed wall clock time (in seconds) from "sometime in the past".
+    double wtime = 0.0;
+    i;
+    #if KMP_OS_WINDOWS
+        if ( frequency > 0.0 ) {
+            LARGE_INTEGER now;
+            BOOL status = QueryPerformanceCounter( & now );
+            if ( status ) {
+                wtime = (double) now.QuadPart / frequency;
+            }; // if
+        }; // if
+    #else
+        // gettimeofday() returns seconds and microseconds since the Epoch.
+        struct timeval  tval;
+        int             rc;
+        rc = gettimeofday( & tval, NULL );
+        if ( rc == 0 ) {
+            wtime = (double)( tval.tv_sec ) + 1.0E-06 * (double)( tval.tv_usec );
+        } else {
+            // TODO: Assert or abort here.
+        }; // if
+    #endif
+    return wtime;
+}; // __kmps_get_wtime
+
+double __kmps_get_wtick( void ) {
+    // Number of seconds between successive clock ticks.
+    double wtick = 0.0;
+    i;
+    #if KMP_OS_WINDOWS
+        {
+            DWORD increment;
+            DWORD adjustment;
+            BOOL  disabled;
+            BOOL  rc;
+            rc = GetSystemTimeAdjustment( & adjustment, & increment, & disabled );
+            if ( rc ) {
+                wtick = 1.0E-07 * (double)( disabled ? increment : adjustment );
+            } else {
+                // TODO: Assert or abort here.
+                wtick = 1.0E-03;
+            }; // if
+        }
+    #else
+        // TODO: gettimeofday() returns in microseconds, but what is the precision?
+        wtick = 1.0E-06;
+    #endif
+    return wtick;
+}; // __kmps_get_wtick
+
+// end of file //
+
diff --git a/final/runtime/src/kmp_stub.h b/final/runtime/src/kmp_stub.h
new file mode 100644
index 0000000..cdcffa3
--- /dev/null
+++ b/final/runtime/src/kmp_stub.h
@@ -0,0 +1,61 @@
+/*
+ * kmp_stub.h
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_STUB_H
+#define KMP_STUB_H
+
+#ifdef __cplusplus
+    extern "C" {
+#endif // __cplusplus
+
+void __kmps_set_blocktime( int arg );
+int  __kmps_get_blocktime( void );
+void __kmps_set_dynamic( int arg );
+int  __kmps_get_dynamic( void );
+void __kmps_set_library( int arg );
+int  __kmps_get_library( void );
+void __kmps_set_nested( int arg );
+int  __kmps_get_nested( void );
+void __kmps_set_stacksize( int arg );
+int  __kmps_get_stacksize( void );
+
+#ifndef KMP_SCHED_TYPE_DEFINED
+#define KMP_SCHED_TYPE_DEFINED
+typedef enum kmp_sched {
+    kmp_sched_static            = 1, // mapped to kmp_sch_static_chunked           (33)
+    kmp_sched_dynamic           = 2, // mapped to kmp_sch_dynamic_chunked          (35)
+    kmp_sched_guided            = 3, // mapped to kmp_sch_guided_chunked           (36)
+    kmp_sched_auto              = 4, // mapped to kmp_sch_auto                     (38)
+    kmp_sched_default = kmp_sched_static   // default scheduling
+} kmp_sched_t;
+#endif
+void __kmps_set_schedule( kmp_sched_t kind, int modifier );
+void __kmps_get_schedule( kmp_sched_t *kind, int *modifier );
+
+#if OMP_40_ENABLED
+void __kmps_set_proc_bind( kmp_proc_bind_t arg );
+kmp_proc_bind_t __kmps_get_proc_bind( void );
+#endif /* OMP_40_ENABLED */
+
+double __kmps_get_wtime( void );
+double __kmps_get_wtick( void );
+
+#ifdef __cplusplus
+    } // extern "C"
+#endif // __cplusplus
+
+#endif // KMP_STUB_H
+
+// end of file //
diff --git a/final/runtime/src/kmp_taskdeps.cpp b/final/runtime/src/kmp_taskdeps.cpp
new file mode 100644
index 0000000..abc037b
--- /dev/null
+++ b/final/runtime/src/kmp_taskdeps.cpp
@@ -0,0 +1,512 @@
+/*
+ * kmp_taskdeps.cpp
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+//#define KMP_SUPPORT_GRAPH_OUTPUT 1
+
+#include "kmp.h"
+#include "kmp_io.h"
+#include "kmp_wait_release.h"
+
+#if OMP_40_ENABLED
+
+//TODO: Improve memory allocation? Keep a list of pre-allocated structures? Allocate in blocks? Re-use finished list entries?
+//TODO: don't use atomic ref counters for stack-allocated nodes.
+//TODO: find an alternate to atomic refs for heap-allocated nodes?
+//TODO: Finish graph output support
+//TODO: kmp_lock_t seems a tad too big (and heavyweight) for this. Check other runtime locks.
+//TODO: Any ITT support needed?
+
+#ifdef KMP_SUPPORT_GRAPH_OUTPUT
+static kmp_int32 kmp_node_id_seed = 0;
+#endif
+
+static void
+__kmp_init_node ( kmp_depnode_t *node )
+{
+    node->dn.task = NULL; // set to NULL initially; it will point to the right task once dependences have been processed
+    node->dn.successors = NULL;
+    __kmp_init_lock(&node->dn.lock);
+    node->dn.nrefs = 1; // init creates the first reference to the node
+#ifdef KMP_SUPPORT_GRAPH_OUTPUT
+    node->dn.id = KMP_TEST_THEN_INC32(&kmp_node_id_seed);
+#endif
+}
+
+static inline kmp_depnode_t *
+__kmp_node_ref ( kmp_depnode_t *node )
+{
+    KMP_TEST_THEN_INC32(&node->dn.nrefs);
+    return node;
+}
+
+static inline void
+__kmp_node_deref ( kmp_info_t *thread, kmp_depnode_t *node )
+{
+    if (!node) return;
+
+    kmp_int32 n = KMP_TEST_THEN_DEC32(&node->dn.nrefs) - 1;
+    if ( n == 0 ) {
+        KMP_ASSERT(node->dn.nrefs == 0);
+#if USE_FAST_MEMORY
+        __kmp_fast_free(thread,node);
+#else
+        __kmp_thread_free(thread,node);
+#endif
+    }
+}
+
+#define KMP_ACQUIRE_DEPNODE(gtid,n) __kmp_acquire_lock(&(n)->dn.lock,(gtid))
+#define KMP_RELEASE_DEPNODE(gtid,n) __kmp_release_lock(&(n)->dn.lock,(gtid))
+
+static void
+__kmp_depnode_list_free ( kmp_info_t *thread, kmp_depnode_list *list );
+
+static const kmp_int32 kmp_dephash_log2 = 6;
+static const kmp_int32 kmp_dephash_size = (1 << kmp_dephash_log2);
+
+static inline kmp_int32
+__kmp_dephash_hash ( kmp_intptr_t addr )
+{
+    //TODO alternate to try: set = (((Addr64)(addrUsefulBits * 9.618)) % m_num_sets );
+    return ((addr >> kmp_dephash_log2) ^ addr) % kmp_dephash_size;
+}
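+// E.g., with kmp_dephash_log2 == 6 the table has 64 buckets, and the hash xors bits 6..11 of the
+// address into its low 6 bits, so addresses that differ only within a 64-byte block still spread
+// across buckets.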
+
+static kmp_dephash_t *
+__kmp_dephash_create ( kmp_info_t *thread )
+{
+    kmp_dephash_t *h;
+
+    kmp_int32 size = kmp_dephash_size * sizeof(kmp_dephash_entry_t) + sizeof(kmp_dephash_t);
+
+#if USE_FAST_MEMORY
+    h = (kmp_dephash_t *) __kmp_fast_allocate( thread, size );
+#else
+    h = (kmp_dephash_t *) __kmp_thread_malloc( thread, size );
+#endif
+
+#ifdef KMP_DEBUG
+    h->nelements = 0;
+#endif
+    h->buckets = (kmp_dephash_entry **)(h+1);
+
+    for ( kmp_int32 i = 0; i < kmp_dephash_size; i++ )
+        h->buckets[i] = 0;
+
+    return h;
+}
+
+static void
+__kmp_dephash_free ( kmp_info_t *thread, kmp_dephash_t *h )
+{
+    for ( kmp_int32 i=0; i < kmp_dephash_size; i++ ) {
+        if ( h->buckets[i] ) {
+            kmp_dephash_entry_t *next;
+            for ( kmp_dephash_entry_t *entry = h->buckets[i]; entry; entry = next ) {
+                next = entry->next_in_bucket;
+                __kmp_depnode_list_free(thread,entry->last_ins);
+                __kmp_node_deref(thread,entry->last_out);
+#if USE_FAST_MEMORY
+                __kmp_fast_free(thread,entry);
+#else
+                __kmp_thread_free(thread,entry);
+#endif
+            }
+        }
+    }
+#if USE_FAST_MEMORY
+    __kmp_fast_free(thread,h);
+#else
+    __kmp_thread_free(thread,h);
+#endif
+}
+
+static kmp_dephash_entry *
+__kmp_dephash_find ( kmp_info_t *thread, kmp_dephash_t *h, kmp_intptr_t addr )
+{
+    kmp_int32 bucket = __kmp_dephash_hash(addr);
+
+    kmp_dephash_entry_t *entry;
+    for ( entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket )
+        if ( entry->addr == addr ) break;
+
+    if ( entry == NULL ) {
+        // create entry. This is only done by one thread so no locking required
+#if USE_FAST_MEMORY
+        entry = (kmp_dephash_entry_t *) __kmp_fast_allocate( thread, sizeof(kmp_dephash_entry_t) );
+#else
+        entry = (kmp_dephash_entry_t *) __kmp_thread_malloc( thread, sizeof(kmp_dephash_entry_t) );
+#endif
+        entry->addr = addr;
+        entry->last_out = NULL;
+        entry->last_ins = NULL;
+        entry->next_in_bucket = h->buckets[bucket];
+        h->buckets[bucket] = entry;
+#ifdef KMP_DEBUG
+        h->nelements++;
+        if ( entry->next_in_bucket ) h->nconflicts++;
+#endif
+    }
+    return entry;
+}
+
+static kmp_depnode_list_t *
+__kmp_add_node ( kmp_info_t *thread, kmp_depnode_list_t *list, kmp_depnode_t *node )
+{
+    kmp_depnode_list_t *new_head;
+
+#if USE_FAST_MEMORY
+    new_head = (kmp_depnode_list_t *) __kmp_fast_allocate(thread,sizeof(kmp_depnode_list_t));
+#else
+    new_head = (kmp_depnode_list_t *) __kmp_thread_malloc(thread,sizeof(kmp_depnode_list_t));
+#endif
+
+    new_head->node = __kmp_node_ref(node);
+    new_head->next = list;
+
+    return new_head;
+}
+
+static void
+__kmp_depnode_list_free ( kmp_info_t *thread, kmp_depnode_list *list )
+{
+    kmp_depnode_list *next;
+
+    for ( ; list ; list = next ) {
+        next = list->next;
+
+        __kmp_node_deref(thread,list->node);
+#if USE_FAST_MEMORY
+        __kmp_fast_free(thread,list);
+#else
+        __kmp_thread_free(thread,list);
+#endif
+    }
+}
+
+static inline void
+__kmp_track_dependence ( kmp_depnode_t *source, kmp_depnode_t *sink )
+{
+#ifdef KMP_SUPPORT_GRAPH_OUTPUT
+    kmp_taskdata_t * task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
+    kmp_taskdata_t * task_sink = KMP_TASK_TO_TASKDATA(sink->dn.task);    // this can be NULL when if(0) ...
+
+    __kmp_printf("%d(%s) -> %d(%s)\n", source->dn.id, task_source->td_ident->psource, sink->dn.id, task_sink->td_ident->psource);
+#endif
+}
+
+template< bool filter >
+static inline kmp_int32
+__kmp_process_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash,
+                     bool dep_barrier,kmp_int32 ndeps, kmp_depend_info_t *dep_list)
+{
+    KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d processing %d depencies : dep_barrier = %d\n", filter, gtid, ndeps, dep_barrier ) );
+    
+    kmp_info_t *thread = __kmp_threads[ gtid ];
+    kmp_int32 npredecessors=0;
+    for ( kmp_int32 i = 0; i < ndeps ; i++ ) {
+        const kmp_depend_info_t * dep = &dep_list[i];
+
+        KMP_DEBUG_ASSERT(dep->flags.in);
+
+        if ( filter && dep->base_addr == 0 ) continue; // skip filtered entries
+
+        kmp_dephash_entry_t *info = __kmp_dephash_find(thread,hash,dep->base_addr);
+        kmp_depnode_t *last_out = info->last_out;
+
+        if ( dep->flags.out && info->last_ins ) {
+            for ( kmp_depnode_list_t * p = info->last_ins; p; p = p->next ) {
+                kmp_depnode_t * indep = p->node;
+                if ( indep->dn.task ) {
+                    KMP_ACQUIRE_DEPNODE(gtid,indep);
+                    if ( indep->dn.task ) {
+                        __kmp_track_dependence(indep,node);
+                        indep->dn.successors = __kmp_add_node(thread, indep->dn.successors, node);
+                        KA_TRACE(40,("__kmp_process_deps<%d>: T#%d adding dependence from %p to %p\n",
+                                 filter,gtid, KMP_TASK_TO_TASKDATA(indep->dn.task), KMP_TASK_TO_TASKDATA(node->dn.task)));
+                        npredecessors++;
+                    }
+                    KMP_RELEASE_DEPNODE(gtid,indep);
+                }
+            }
+
+            __kmp_depnode_list_free(thread,info->last_ins);
+            info->last_ins = NULL;
+
+        } else if ( last_out && last_out->dn.task ) {
+            KMP_ACQUIRE_DEPNODE(gtid,last_out);
+            if ( last_out->dn.task ) {
+                __kmp_track_dependence(last_out,node);
+                last_out->dn.successors = __kmp_add_node(thread, last_out->dn.successors, node);
+                KA_TRACE(40,("__kmp_process_deps<%d>: T#%d adding dependence from %p to %p\n",
+                             filter,gtid, KMP_TASK_TO_TASKDATA(last_out->dn.task), KMP_TASK_TO_TASKDATA(node->dn.task)));
+
+                npredecessors++;
+            }
+            KMP_RELEASE_DEPNODE(gtid,last_out);
+        }
+
+        if ( dep_barrier ) {
+            // if this is a sync point in the serial sequence, then the previous outputs are guaranteed to be completed by
+            // the time this task executes, so the previous output nodes can be cleared.
+            __kmp_node_deref(thread,last_out);
+            info->last_out = NULL;
+        } else {
+            if ( dep->flags.out ) {
+                __kmp_node_deref(thread,last_out);
+                info->last_out = __kmp_node_ref(node);
+            } else
+                info->last_ins = __kmp_add_node(thread, info->last_ins, node);
+        }
+
+    }
+
+    KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d found %d predecessors\n", filter, gtid, npredecessors ) );
+
+    return npredecessors;
+}
+
+#define NO_DEP_BARRIER (false)
+#define DEP_BARRIER (true)
+
+// returns true if the task has any outstanding dependence
+static bool
+__kmp_check_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_task_t *task, kmp_dephash_t *hash, bool dep_barrier,
+                   kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+                   kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list )
+{
+    int i;
+    kmp_taskdata_t * taskdata;
+
+    taskdata = KMP_TASK_TO_TASKDATA(task);
+    KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependencies for task %p : %d possibly aliased dependencies, %d non-aliased depedencies : dep_barrier=%d .\n", gtid, taskdata, ndeps, ndeps_noalias, dep_barrier ) );
+
+    // Filter deps in dep_list
+    // TODO: Different algorithm for large dep_list ( > 10 ? )
+    for ( i = 0; i < ndeps; i ++ ) {
+        if ( dep_list[i].base_addr != 0 )
+            for ( int j = i+1; j < ndeps; j++ )
+                if ( dep_list[i].base_addr == dep_list[j].base_addr ) {
+                    dep_list[i].flags.in |= dep_list[j].flags.in;
+                    dep_list[i].flags.out |= dep_list[j].flags.out;
+                    dep_list[j].base_addr = 0; // Mark j element as void
+                }
+    }
+
+    // doesn't need to be atomic as no other thread is going to be accessing this node just yet
+    // npredecessors is set to -1 to ensure that none of the releasing tasks queues this task before we have finished processing all the dependencies
+    node->dn.npredecessors = -1;
+
+    // used to pack all npredecessors additions into a single atomic operation at the end
+    int npredecessors;
+
+    npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier, ndeps, dep_list);
+    npredecessors += __kmp_process_deps<false>(gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list);
+
+    node->dn.task = task;
+    KMP_MB();
+
+    // Account for our initial fake value
+    npredecessors++;
+
+    // Update predecessors and obtain current value to check if there are still any outstanding dependences (some tasks may have finished while we processed the dependences)
+    npredecessors = KMP_TEST_THEN_ADD32(&node->dn.npredecessors, npredecessors) + npredecessors;
+
+    KA_TRACE(20, ("__kmp_check_deps: T#%d found %d predecessors for task %p \n", gtid, npredecessors, taskdata ) );
+
+    // beyond this point the task could be queued (and executed) by a releasing task...
+    return npredecessors > 0 ? true : false;
+}
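+// Net effect of the counting above: the initial fake -1 and the final "+1" cancel out, so
+// dn.npredecessors ends up holding the number of registered predecessors that have not yet
+// released this node; predecessors that completed during registration have already applied
+// their decrements.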
+
+void
+__kmp_release_deps ( kmp_int32 gtid, kmp_taskdata_t *task )
+{
+    kmp_info_t *thread = __kmp_threads[ gtid ];
+    kmp_depnode_t *node = task->td_depnode;
+
+    if ( task->td_dephash ) {
+        KA_TRACE(40, ("__kmp_realease_deps: T#%d freeing dependencies hash of task %p.\n", gtid, task ) );
+        __kmp_dephash_free(thread,task->td_dephash);
+    }
+
+    if ( !node ) return;
+
+    KA_TRACE(20, ("__kmp_realease_deps: T#%d notifying succesors of task %p.\n", gtid, task ) );
+    
+    KMP_ACQUIRE_DEPNODE(gtid,node);
+    node->dn.task = NULL; // mark this task as finished, so no new dependencies are generated
+    KMP_RELEASE_DEPNODE(gtid,node);
+
+    kmp_depnode_list_t *next;
+    for ( kmp_depnode_list_t *p = node->dn.successors; p; p = next ) {
+        kmp_depnode_t *successor = p->node;
+        kmp_int32 npredecessors = KMP_TEST_THEN_DEC32(&successor->dn.npredecessors) - 1;
+
+        // successor task can be NULL for wait_depends or because deps are still being processed
+        if ( npredecessors == 0 ) {
+            KMP_MB();
+            if ( successor->dn.task ) {
+                KA_TRACE(20, ("__kmp_release_deps: T#%d successor %p of %p scheduled for execution.\n", gtid, successor->dn.task, task ) );
+                __kmp_omp_task(gtid,successor->dn.task,false);
+            }
+        }
+
+        next = p->next;
+        __kmp_node_deref(thread,p->node);
+#if USE_FAST_MEMORY
+        __kmp_fast_free(thread,p);
+#else
+        __kmp_thread_free(thread,p);
+#endif
+    }
+
+    __kmp_node_deref(thread,node);
+
+    KA_TRACE(20, ("__kmp_realease_deps: T#%d all successors of %p notified of completation\n", gtid, task ) );
+}
+
+/*!
+@ingroup TASKING
+@param loc_ref location of the original task directive
+@param gtid Global Thread ID of encountering thread
+@param new_task task thunk allocated by __kmp_omp_task_alloc() for the "new task"
+@param ndeps Number of depend items with possible aliasing
+@param dep_list List of depend items with possible aliasing
+@param ndeps_noalias Number of depend items with no aliasing
+@param noalias_dep_list List of depend items with no aliasing
+
+@return Returns either TASK_CURRENT_NOT_QUEUED if the current task was not suspended and queued, or TASK_CURRENT_QUEUED if it was suspended and queued
+
+Schedule a non-thread-switchable task with dependences for execution
+*/
+kmp_int32
+__kmpc_omp_task_with_deps( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task,
+                            kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+                            kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list )
+{
+
+    kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+    KA_TRACE(10, ("__kmpc_omp_task_with_deps(enter): T#%d loc=%p task=%p\n",
+                  gtid, loc_ref, new_taskdata ) );
+
+    kmp_info_t *thread = __kmp_threads[ gtid ];
+    kmp_taskdata_t * current_task = thread->th.th_current_task;
+
+    bool serial = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final;
+#if OMP_41_ENABLED
+    serial = serial && !(new_taskdata->td_flags.proxy == TASK_PROXY);
+#endif
+
+    if ( !serial && ( ndeps > 0 || ndeps_noalias > 0 )) {
+        /* if no dependencies have been tracked yet, create the dependence hash */
+        if ( current_task->td_dephash == NULL )
+            current_task->td_dephash = __kmp_dephash_create(thread);
+
+#if USE_FAST_MEMORY
+        kmp_depnode_t *node = (kmp_depnode_t *) __kmp_fast_allocate(thread,sizeof(kmp_depnode_t));
+#else
+        kmp_depnode_t *node = (kmp_depnode_t *) __kmp_thread_malloc(thread,sizeof(kmp_depnode_t));
+#endif
+
+        __kmp_init_node(node);
+        new_taskdata->td_depnode = node;
+
+        if ( __kmp_check_deps( gtid, node, new_task, current_task->td_dephash, NO_DEP_BARRIER,
+                               ndeps, dep_list, ndeps_noalias,noalias_dep_list ) ) {
+            KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had blocking dependencies: "
+                  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", gtid, loc_ref,
+                  new_taskdata ) );
+            return TASK_CURRENT_NOT_QUEUED;
+        }
+    } else {
+#if OMP_41_ENABLED
+        kmp_task_team_t * task_team = thread->th.th_task_team;
+        if ( task_team && task_team->tt.tt_found_proxy_tasks )
+           __kmpc_omp_wait_deps ( loc_ref, gtid, ndeps, dep_list, ndeps_noalias, noalias_dep_list );
+        else
+#endif
+           KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependencies for task (serialized)"
+                           "loc=%p task=%p\n", gtid, loc_ref, new_taskdata ) );
+    }
+
+    KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking dependencies : "
+                  "loc=%p task=%p, transferring to __kmpc_omp_task\n", gtid, loc_ref,
+                  new_taskdata ) );
+
+    return __kmpc_omp_task(loc_ref,gtid,new_task);
+}
+
+/*!
+@ingroup TASKING
+@param loc_ref location of the original task directive
+@param gtid Global Thread ID of encountering thread
+@param ndeps Number of depend items with possible aliasing
+@param dep_list List of depend items with possible aliasing
+@param ndeps_noalias Number of depend items with no aliasing
+@param noalias_dep_list List of depend items with no aliasing
+
+Blocks the current task until all specified dependencies have been fulfilled.
+*/
+void
+__kmpc_omp_wait_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+                       kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list )
+{
+    KA_TRACE(10, ("__kmpc_omp_wait_deps(enter): T#%d loc=%p\n", gtid, loc_ref) );
+
+    if ( ndeps == 0 && ndeps_noalias == 0 ) {
+        KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no dependencies to wait upon : loc=%p\n", gtid, loc_ref) );
+        return;
+    }
+
+    kmp_info_t *thread = __kmp_threads[ gtid ];
+    kmp_taskdata_t * current_task = thread->th.th_current_task;
+
+    // We can return immediately as:
+    //   - dependences are not computed in serial teams (except if we have proxy tasks)
+    //   - if the dephash is not yet created it means we have nothing to wait for
+    bool ignore = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final;
+#if OMP_41_ENABLED
+    ignore = ignore && thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE;
+#endif
+    ignore = ignore || current_task->td_dephash == NULL;
+
+    if ( ignore ) {
+        KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking dependencies : loc=%p\n", gtid, loc_ref) );
+        return;
+    }
+
+    kmp_depnode_t node;
+    __kmp_init_node(&node);
+
+    if (!__kmp_check_deps( gtid, &node, NULL, current_task->td_dephash, DEP_BARRIER,
+                           ndeps, dep_list, ndeps_noalias, noalias_dep_list )) {
+        KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking dependencies : loc=%p\n", gtid, loc_ref) );
+        return;
+    }
+
+    int thread_finished = FALSE;
+    kmp_flag_32 flag((volatile kmp_uint32 *)&(node.dn.npredecessors), 0U);
+    while ( node.dn.npredecessors > 0 ) {
+        flag.execute_tasks(thread, gtid, FALSE, &thread_finished,
+#if USE_ITT_BUILD
+                           NULL,
+#endif
+                           __kmp_task_stealing_constraint );
+    }
+
+    KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d finished waiting : loc=%p\n", gtid, loc_ref) );
+}
+
+#endif /* OMP_40_ENABLED */
+
diff --git a/final/runtime/src/kmp_tasking.c b/final/runtime/src/kmp_tasking.c
new file mode 100644
index 0000000..d1a94f6
--- /dev/null
+++ b/final/runtime/src/kmp_tasking.c
@@ -0,0 +1,2835 @@
+/*
+ * kmp_tasking.c -- OpenMP 3.0 tasking support.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_wait_release.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+
+/* forward declaration */
+static void __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr );
+static void __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data );
+static int  __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team );
+
+#ifdef OMP_41_ENABLED
+static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask );
+#endif
+
+static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
+    switch (((kmp_flag_64 *)flag)->get_type()) {
+    case flag32: __kmp_resume_32(gtid, NULL); break;
+    case flag64: __kmp_resume_64(gtid, NULL); break;
+    case flag_oncore: __kmp_resume_oncore(gtid, NULL); break;
+    }
+}
+
+#ifdef BUILD_TIED_TASK_STACK
+
+//---------------------------------------------------------------------------
+//  __kmp_trace_task_stack: print the tied tasks from the task stack in order
+//     from top to bottom
+//
+//  gtid: global thread identifier for thread containing stack
+//  thread_data: thread data for task team thread containing stack
+//  threshold: value above which the trace statement triggers
+//  location: string identifying call site of this function (for trace)
+
+static void
+__kmp_trace_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data, int threshold, char *location )
+{
+    kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
+    kmp_taskdata_t **stack_top = task_stack -> ts_top;
+    kmp_int32 entries = task_stack -> ts_entries;
+    kmp_taskdata_t *tied_task;
+
+    KA_TRACE(threshold, ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
+                         "first_block = %p, stack_top = %p \n",
+                         location, gtid, entries, task_stack->ts_first_block, stack_top ) );
+
+    KMP_DEBUG_ASSERT( stack_top != NULL );
+    KMP_DEBUG_ASSERT( entries > 0 );
+
+    while ( entries != 0 )
+    {
+        KMP_DEBUG_ASSERT( stack_top != & task_stack->ts_first_block.sb_block[0] );
+        // fix up ts_top if we need to pop from previous block
+        if ( ( entries & TASK_STACK_INDEX_MASK ) == 0 )
+        {
+            kmp_stack_block_t *stack_block = (kmp_stack_block_t *) (stack_top) ;
+
+            stack_block = stack_block -> sb_prev;
+            stack_top = & stack_block -> sb_block[TASK_STACK_BLOCK_SIZE];
+        }
+
+        // finish bookkeeping
+        stack_top--;
+        entries--;
+
+        tied_task = * stack_top;
+
+        KMP_DEBUG_ASSERT( tied_task != NULL );
+        KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
+
+        KA_TRACE(threshold, ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
+                             "stack_top=%p, tied_task=%p\n",
+                             location, gtid, entries, stack_top, tied_task ) );
+    }
+    KMP_DEBUG_ASSERT( stack_top == & task_stack->ts_first_block.sb_block[0] );
+
+    KA_TRACE(threshold, ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
+                         location, gtid ) );
+}
+
+//---------------------------------------------------------------------------
+//  __kmp_init_task_stack: initialize the task stack for the first time
+//    after a thread_data structure is created.
+//    It should not be necessary to do this again (assuming the stack works).
+//
+//  gtid: global thread identifier of calling thread
+//  thread_data: thread data for task team thread containing stack
+
+static void
+__kmp_init_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data )
+{
+    kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
+    kmp_stack_block_t *first_block;
+
+    // set up the first block of the stack
+    first_block = & task_stack -> ts_first_block;
+    task_stack -> ts_top = (kmp_taskdata_t **) first_block;
+    memset( (void *) first_block, '\0', TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
+
+    // initialize the stack to be empty
+    task_stack  -> ts_entries = TASK_STACK_EMPTY;
+    first_block -> sb_next = NULL;
+    first_block -> sb_prev = NULL;
+}
+
+
+//---------------------------------------------------------------------------
+//  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
+//
+//  gtid: global thread identifier for calling thread
+//  thread_data: thread info for thread containing stack
+
+static void
+__kmp_free_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data )
+{
+    kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
+    kmp_stack_block_t *stack_block = & task_stack -> ts_first_block;
+
+    KMP_DEBUG_ASSERT( task_stack -> ts_entries == TASK_STACK_EMPTY );
+    // free from the second block of the stack
+    while ( stack_block != NULL ) {
+        kmp_stack_block_t *next_block = (stack_block) ? stack_block -> sb_next : NULL;
+
+        stack_block -> sb_next = NULL;
+        stack_block -> sb_prev = NULL;
+        if (stack_block != & task_stack -> ts_first_block) {
+            __kmp_thread_free( __kmp_threads[ gtid ], stack_block );  // free the block, if not the first
+        }
+        stack_block = next_block;
+    }
+    // initialize the stack to be empty
+    task_stack -> ts_entries = 0;
+    task_stack -> ts_top = NULL;
+}
+
+
+//---------------------------------------------------------------------------
+//  __kmp_push_task_stack: Push the tied task onto the task stack.
+//     Grow the stack if necessary by allocating another block.
+//
+//  gtid: global thread identifier for calling thread
+//  thread: thread info for thread containing stack
+//  tied_task: the task to push on the stack
+
+static void
+__kmp_push_task_stack( kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t * tied_task )
+{
+    // GEH - need to consider what to do if tt_threads_data not allocated yet
+    kmp_thread_data_t *thread_data = & thread -> th.th_task_team ->
+                                        tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
+    kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks ;
+
+    if ( tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser ) {
+        return;  // Don't push anything on stack if team or team tasks are serialized
+    }
+
+    KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
+    KMP_DEBUG_ASSERT( task_stack -> ts_top != NULL );
+
+    KA_TRACE(20, ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
+                  gtid, thread, tied_task ) );
+    // Store entry
+    * (task_stack -> ts_top) = tied_task;
+
+    // Do bookkeeping for next push
+    task_stack -> ts_top++;
+    task_stack -> ts_entries++;
+
+    if ( ( task_stack -> ts_entries & TASK_STACK_INDEX_MASK ) == 0 )
+    {
+        // Find beginning of this task block
+        kmp_stack_block_t *stack_block =
+             (kmp_stack_block_t *) (task_stack -> ts_top - TASK_STACK_BLOCK_SIZE);
+
+        // Check if we already have a block
+        if ( stack_block -> sb_next != NULL )
+        {    // reset ts_top to beginning of next block
+            task_stack -> ts_top = & stack_block -> sb_next -> sb_block[0];
+        }
+        else
+        {   // Alloc new block and link it up
+            kmp_stack_block_t *new_block = (kmp_stack_block_t *)
+              __kmp_thread_calloc(thread, sizeof(kmp_stack_block_t));
+
+            task_stack -> ts_top  = & new_block -> sb_block[0];
+            stack_block -> sb_next = new_block;
+            new_block  -> sb_prev = stack_block;
+            new_block  -> sb_next = NULL;
+
+            KA_TRACE(30, ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
+                          gtid, tied_task, new_block ) );
+        }
+    }
+    KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, tied_task ) );
+}
+
+//---------------------------------------------------------------------------
+//  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
+//     the task, just check to make sure it matches the ending task passed in.
+//
+//  gtid: global thread identifier for the calling thread
+//  thread: thread info structure containing stack
+//  tied_task: the task popped off the stack
+//  ending_task: the task that is ending (should match popped task)
+
+static void
+__kmp_pop_task_stack( kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t *ending_task )
+{
+    // GEH - need to consider what to do if tt_threads_data not allocated yet
+    kmp_thread_data_t *thread_data = & thread -> th.th_task_team -> tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
+    kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks ;
+    kmp_taskdata_t *tied_task;
+
+    if ( ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser ) {
+        return;  // Don't pop anything from stack if team or team tasks are serialized
+    }
+
+    KMP_DEBUG_ASSERT( task_stack -> ts_top != NULL );
+    KMP_DEBUG_ASSERT( task_stack -> ts_entries > 0 );
+
+    KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, thread ) );
+
+    // fix up ts_top if we need to pop from previous block
+    if ( ( task_stack -> ts_entries & TASK_STACK_INDEX_MASK ) == 0 )
+    {
+        kmp_stack_block_t *stack_block =
+           (kmp_stack_block_t *) (task_stack -> ts_top) ;
+
+        stack_block = stack_block -> sb_prev;
+        task_stack -> ts_top = & stack_block -> sb_block[TASK_STACK_BLOCK_SIZE];
+    }
+
+    // finish bookkeeping
+    task_stack -> ts_top--;
+    task_stack -> ts_entries--;
+
+    tied_task = * (task_stack -> ts_top );
+
+    KMP_DEBUG_ASSERT( tied_task != NULL );
+    KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
+    KMP_DEBUG_ASSERT( tied_task == ending_task );  // If we built the stack correctly
+
+    KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, tied_task ) );
+    return;
+}
+#endif /* BUILD_TIED_TASK_STACK */
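+
+// [Editorial sketch] Shape of the tied-task stack manipulated above: a chain of
+// fixed-size blocks of TASK_STACK_BLOCK_SIZE kmp_taskdata_t* slots, with ts_top
+// always pointing at the next free slot:
+//
+//     ts_first_block.sb_block: [t0][t1] ... [tN]<-ts_top  --sb_next--> next block
+//
+//     push:  *ts_top++ = task;   // crossing a block boundary allocates/links a block
+//     pop:   task = *--ts_top;   // crossing a boundary follows sb_prev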
+
+//---------------------------------------------------
+//  __kmp_push_task: Add a task to the thread's deque
+
+static kmp_int32
+__kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
+{
+    kmp_info_t *        thread = __kmp_threads[ gtid ];
+    kmp_taskdata_t *    taskdata = KMP_TASK_TO_TASKDATA(task);
+    kmp_task_team_t *   task_team = thread->th.th_task_team;
+    kmp_int32           tid = __kmp_tid_from_gtid( gtid );
+    kmp_thread_data_t * thread_data;
+
+    KA_TRACE(20, ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata ) );
+
+    // The first check avoids building task_team thread data if serialized
+    if ( taskdata->td_flags.task_serial ) {
+        KA_TRACE(20, ( "__kmp_push_task: T#%d team serialized; returning TASK_NOT_PUSHED for task %p\n",
+                       gtid, taskdata ) );
+        return TASK_NOT_PUSHED;
+    }
+
+    // Now that serialized tasks have returned, we can assume that we are not in immediate exec mode
+    KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
+    if ( ! KMP_TASKING_ENABLED(task_team) ) {
+         __kmp_enable_tasking( task_team, thread );
+    }
+    KMP_DEBUG_ASSERT( TCR_4(task_team -> tt.tt_found_tasks) == TRUE );
+    KMP_DEBUG_ASSERT( TCR_PTR(task_team -> tt.tt_threads_data) != NULL );
+
+    // Find tasking deque specific to encountering thread
+    thread_data = & task_team -> tt.tt_threads_data[ tid ];
+
+    // No lock needed since only owner can allocate
+    if (thread_data -> td.td_deque == NULL ) {
+        __kmp_alloc_task_deque( thread, thread_data );
+    }
+
+    // Check if deque is full
+    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+    {
+        KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full; returning TASK_NOT_PUSHED for task %p\n",
+                       gtid, taskdata ) );
+        return TASK_NOT_PUSHED;
+    }
+
+    // Lock the deque for the task push operation
+    __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
+
+#if OMP_41_ENABLED
+    // Need to recheck as we can get a proxy task from a thread outside of OpenMP
+    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+    {
+        __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
+        KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full on 2nd check; returning TASK_NOT_PUSHED for task %p\n",
+                       gtid, taskdata ) );
+        return TASK_NOT_PUSHED;
+    }
+#else
+    // Must have room since no thread other than the calling thread can add tasks
+    KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE );
+#endif
+
+    thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;  // Push taskdata
+    // Wrap index.
+    thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
+    TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);             // Adjust task count
+
+    __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
+
+    KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
+                  "task=%p ntasks=%d head=%u tail=%u\n",
+                  gtid, taskdata, thread_data->td.td_deque_ntasks,
+                  thread_data->td.td_deque_tail, thread_data->td.td_deque_head) );
+
+    return TASK_SUCCESSFULLY_PUSHED;
+}
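+
+// [Editorial sketch] The deque above is a power-of-two ring buffer, so the tail
+// index wraps with a mask instead of a modulo.  For example, if TASK_DEQUE_SIZE
+// is 256 (TASK_DEQUE_MASK == 0xFF):
+//
+//     tail = 255;  tail = (tail + 1) & TASK_DEQUE_MASK;   // -> 0   (wrapped)
+//     tail =  17;  tail = (tail + 1) & TASK_DEQUE_MASK;   // -> 18  (no wrap)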
+
+
+//-----------------------------------------------------------------------------------------
+// __kmp_pop_current_task_from_thread: restore the thread's current task to its parent when the team ends
+// this_thr: thread structure to set current_task in.
+
+void
+__kmp_pop_current_task_from_thread( kmp_info_t *this_thr )
+{
+    KF_TRACE( 10, ("__kmp_pop_current_task_from_thread(enter): T#%d this_thread=%p, curtask=%p, "
+                   "curtask_parent=%p\n",
+                   0, this_thr, this_thr -> th.th_current_task,
+                   this_thr -> th.th_current_task -> td_parent ) );
+
+    this_thr -> th.th_current_task = this_thr -> th.th_current_task -> td_parent;
+
+    KF_TRACE( 10, ("__kmp_pop_current_task_from_thread(exit): T#%d this_thread=%p, curtask=%p, "
+                   "curtask_parent=%p\n",
+                   0, this_thr, this_thr -> th.th_current_task,
+                   this_thr -> th.th_current_task -> td_parent ) );
+}
+
+
+//---------------------------------------------------------------------------------------
+// __kmp_push_current_task_to_thread: set up current task in called thread for a new team
+// this_thr: thread structure to set up
+// team: team for implicit task data
+// tid: thread within team to set up
+
+void
+__kmp_push_current_task_to_thread( kmp_info_t *this_thr, kmp_team_t *team, int tid )
+{
+    // the thread's current task becomes the parent of the just-created implicit tasks of the new team
+    KF_TRACE( 10, ( "__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p curtask=%p "
+                    "parent_task=%p\n",
+                    tid, this_thr, this_thr->th.th_current_task,
+                    team->t.t_implicit_task_taskdata[tid].td_parent ) );
+
+    KMP_DEBUG_ASSERT (this_thr != NULL);
+
+    if( tid == 0 ) {
+        if( this_thr->th.th_current_task != & team -> t.t_implicit_task_taskdata[ 0 ] ) {
+            team -> t.t_implicit_task_taskdata[ 0 ].td_parent = this_thr->th.th_current_task;
+            this_thr->th.th_current_task = & team -> t.t_implicit_task_taskdata[ 0 ];
+        }
+    } else {
+        team -> t.t_implicit_task_taskdata[ tid ].td_parent = team -> t.t_implicit_task_taskdata[ 0 ].td_parent;
+        this_thr->th.th_current_task = & team -> t.t_implicit_task_taskdata[ tid ];
+    }
+
+    KF_TRACE( 10, ( "__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p curtask=%p "
+                    "parent_task=%p\n",
+                    tid, this_thr, this_thr->th.th_current_task,
+                    team->t.t_implicit_task_taskdata[tid].td_parent ) );
+}
+
+
+//----------------------------------------------------------------------
+// __kmp_task_start: bookkeeping for a task starting execution
+// GTID: global thread id of calling thread
+// task: task starting execution
+// current_task: task suspending
+
+static void
+__kmp_task_start( kmp_int32 gtid, kmp_task_t * task, kmp_taskdata_t * current_task )
+{
+    kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
+    kmp_info_t * thread = __kmp_threads[ gtid ];
+
+    KA_TRACE(10, ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
+                  gtid, taskdata, current_task) );
+
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
+
+    // mark currently executing task as suspended
+    // TODO: GEH - make sure root team implicit task is initialized properly.
+    // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
+    current_task -> td_flags.executing = 0;
+
+    // Add task to stack if tied
+#ifdef BUILD_TIED_TASK_STACK
+    if ( taskdata -> td_flags.tiedness == TASK_TIED )
+    {
+        __kmp_push_task_stack( gtid, thread, taskdata );
+    }
+#endif /* BUILD_TIED_TASK_STACK */
+
+    // mark starting task as executing and as current task
+    thread -> th.th_current_task = taskdata;
+
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.started == 0 );
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.executing == 0 );
+    taskdata -> td_flags.started = 1;
+    taskdata -> td_flags.executing = 1;
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
+
+    // GEH TODO: shouldn't we pass some sort of location identifier here?
+    // APT: yes, we will pass location here.
+    // need to store current thread state (in a thread or taskdata structure)
+    // before setting work_state, otherwise wrong state is set after end of task
+
+    KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n",
+                  gtid, taskdata ) );
+
+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
+        kmp_taskdata_t *parent = taskdata->td_parent;
+        ompt_callbacks.ompt_callback(ompt_event_task_begin)(
+            parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
+            parent ? &(parent->ompt_task_info.frame) : NULL,
+            taskdata->ompt_task_info.task_id,
+            taskdata->ompt_task_info.function);
+    }
+#endif
+
+    return;
+}
+
+
+//----------------------------------------------------------------------
+// __kmpc_omp_task_begin_if0: report that a given serialized task has started execution
+// loc_ref: source location information; points to beginning of task block.
+// gtid: global thread number.
+// task: task thunk for the started task.
+
+void
+__kmpc_omp_task_begin_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task )
+{
+    kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
+    kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
+
+    KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p current_task=%p\n",
+                  gtid, loc_ref, taskdata, current_task ) );
+
+    taskdata -> td_flags.task_serial = 1;  // Execute this task immediately, not deferred.
+    __kmp_task_start( gtid, task, current_task );
+
+    KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n",
+                  gtid, loc_ref, taskdata ) );
+
+    return;
+}
+
+#ifdef TASK_UNUSED
+//----------------------------------------------------------------------
+// __kmpc_omp_task_begin: report that a given task has started execution
+// NEVER GENERATED BY COMPILER, DEPRECATED!!!
+
+void
+__kmpc_omp_task_begin( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task )
+{
+    kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
+
+    KA_TRACE(10, ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
+                  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task ) );
+
+    __kmp_task_start( gtid, task, current_task );
+
+    KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n",
+                  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
+
+    return;
+}
+#endif // TASK_UNUSED
+
+
+//-------------------------------------------------------------------------------------
+// __kmp_free_task: free the current task space and the space for shareds
+// gtid: Global thread ID of calling thread
+// taskdata: task to free
+// thread: thread data structure of caller
+
+static void
+__kmp_free_task( kmp_int32 gtid, kmp_taskdata_t * taskdata, kmp_info_t * thread )
+{
+    KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n",
+                  gtid, taskdata) );
+
+    // Check to make sure all flags and counters have the correct values
+    KMP_DEBUG_ASSERT( taskdata->td_flags.tasktype == TASK_EXPLICIT );
+    KMP_DEBUG_ASSERT( taskdata->td_flags.executing == 0 );
+    KMP_DEBUG_ASSERT( taskdata->td_flags.complete == 1 );
+    KMP_DEBUG_ASSERT( taskdata->td_flags.freed == 0 );
+    KMP_DEBUG_ASSERT( TCR_4(taskdata->td_allocated_child_tasks) == 0  || taskdata->td_flags.task_serial == 1);
+    KMP_DEBUG_ASSERT( TCR_4(taskdata->td_incomplete_child_tasks) == 0 );
+
+    taskdata->td_flags.freed = 1;
+    // deallocate the taskdata and shared variable blocks associated with this task
+    #if USE_FAST_MEMORY
+        __kmp_fast_free( thread, taskdata );
+    #else /* ! USE_FAST_MEMORY */
+        __kmp_thread_free( thread, taskdata );
+    #endif
+
+    KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n",
+                  gtid, taskdata) );
+}
+
+//-------------------------------------------------------------------------------------
+// __kmp_free_task_and_ancestors: free the current task and ancestors without children
+//
+// gtid: Global thread ID of calling thread
+// taskdata: task to free
+// thread: thread data structure of caller
+
+static void
+__kmp_free_task_and_ancestors( kmp_int32 gtid, kmp_taskdata_t * taskdata, kmp_info_t * thread )
+{
+    kmp_int32 children = 0;
+    kmp_int32 team_or_tasking_serialized = taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser;
+
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
+
+    if ( !team_or_tasking_serialized ) {
+        children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_allocated_child_tasks) ) - 1;
+        KMP_DEBUG_ASSERT( children >= 0 );
+    }
+
+    // Now, go up the ancestor tree to see if any ancestors can now be freed.
+    while ( children == 0 )
+    {
+        kmp_taskdata_t * parent_taskdata = taskdata -> td_parent;
+
+        KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
+                      "and freeing itself\n", gtid, taskdata) );
+
+        // --- Deallocate my ancestor task ---
+        __kmp_free_task( gtid, taskdata, thread );
+
+        taskdata = parent_taskdata;
+
+        // Stop walking up the ancestor tree at an implicit task or if tasking is
+        // serialized, to avoid premature deallocation of ancestors.
+        if ( team_or_tasking_serialized || taskdata -> td_flags.tasktype == TASK_IMPLICIT )
+            return;
+
+        if ( !team_or_tasking_serialized ) {
+            // Predecrement simulated by "- 1" calculation
+            children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_allocated_child_tasks) ) - 1;
+            KMP_DEBUG_ASSERT( children >= 0 );
+        }
+    }
+
+    KA_TRACE(20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
+                  "not freeing it yet\n", gtid, taskdata, children) );
+}
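+
+// [Editorial sketch] The deallocation scheme above is a plain reference count:
+// td_allocated_child_tasks starts at 1 for the task itself (see __kmp_task_alloc
+// below) and gains 1 per child allocated.  Roughly:
+//
+//     on child alloc:   parent->td_allocated_child_tasks++;
+//     on task release:  if (--task->td_allocated_child_tasks == 0) {
+//                           free(task);      // then repeat the test on its parent
+//                       }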
+
+//---------------------------------------------------------------------
+// __kmp_task_finish: bookkeeping to do when a task finishes execution
+// gtid: global thread ID for calling thread
+// task: task to be finished
+// resumed_task: task to be resumed.  (may be NULL if task is serialized)
+
+static void
+__kmp_task_finish( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_task )
+{
+    kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
+    kmp_info_t * thread = __kmp_threads[ gtid ];
+    kmp_int32 children = 0;
+
+#if OMPT_SUPPORT
+    if ((ompt_status == ompt_status_track_callback) &&
+        ompt_callbacks.ompt_callback(ompt_event_task_end)) {
+        ompt_callbacks.ompt_callback(ompt_event_task_end)(
+            taskdata->ompt_task_info.task_id);
+    }
+#endif
+
+    KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming task %p\n",
+                  gtid, taskdata, resumed_task) );
+
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
+
+    // Pop task from stack if tied
+#ifdef BUILD_TIED_TASK_STACK
+    if ( taskdata -> td_flags.tiedness == TASK_TIED )
+    {
+        __kmp_pop_task_stack( gtid, thread, taskdata );
+    }
+#endif /* BUILD_TIED_TASK_STACK */
+
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
+    taskdata -> td_flags.complete = 1;   // mark the task as completed
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.started == 1 );
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
+
+    // Only need to keep track of count if team parallel and tasking not serialized
+    if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) ) {
+        // Predecrement simulated by "- 1" calculation
+        children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
+        KMP_DEBUG_ASSERT( children >= 0 );
+#if OMP_40_ENABLED
+        if ( taskdata->td_taskgroup )
+            KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
+        __kmp_release_deps(gtid,taskdata);
+#endif
+    }
+
+    // td_flags.executing must be marked as 0 after __kmp_release_deps has been called.
+    // Otherwise, if a task is executed immediately from the release_deps code,
+    // the flag will be reset to 1 again by this same function.
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.executing == 1 );
+    taskdata -> td_flags.executing = 0;  // suspend the finishing task
+
+    KA_TRACE(20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
+                  gtid, taskdata, children) );
+
+#if OMP_40_ENABLED
+    /* If the task's destructor thunk flag has been set, we need to invoke the
+       destructor thunk that has been generated by the compiler.
+       The code is placed here, since at this point other tasks might have been
+       released, hence overlapping the destructor invocations with some other work
+       in the released tasks.  The OpenMP spec is not specific on when the
+       destructors are invoked, so we should be free to choose.
+     */
+    if (taskdata->td_flags.destructors_thunk) {
+        kmp_routine_entry_t destr_thunk = task->destructors;
+        KMP_ASSERT(destr_thunk);
+        destr_thunk(gtid, task);
+    }
+#endif // OMP_40_ENABLED
+
+    // bookkeeping for resuming task:
+    // GEH - note tasking_ser => task_serial
+    KMP_DEBUG_ASSERT( (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
+                       taskdata->td_flags.task_serial);
+    if ( taskdata->td_flags.task_serial )
+    {
+        if (resumed_task == NULL) {
+            resumed_task = taskdata->td_parent;  // In a serialized task, the resumed task is the parent
+        }
+        else {
+            // verify resumed task passed in points to parent
+            KMP_DEBUG_ASSERT( resumed_task == taskdata->td_parent );
+        }
+    }
+    else {
+        KMP_DEBUG_ASSERT( resumed_task != NULL );        // verify that resumed task is passed as argument
+    }
+
+    // Free this task and then ancestor tasks if they have no children.
+    __kmp_free_task_and_ancestors(gtid, taskdata, thread);
+
+    // FIXME johnmc: I think this statement should be before the previous one so that if
+    // an asynchronous inquiry peers into the runtime system it doesn't see the freed
+    // task as the current task.
+    __kmp_threads[ gtid ] -> th.th_current_task = resumed_task; // restore current_task
+
+    // TODO: GEH - make sure root team implicit task is initialized properly.
+    // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
+    resumed_task->td_flags.executing = 1;  // resume previous task
+
+    KA_TRACE(10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
+                  gtid, taskdata, resumed_task) );
+
+    return;
+}
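+
+// [Editorial sketch] Flag lifecycle of an explicit task, as enforced by the
+// asserts in __kmp_task_start, __kmp_task_finish and __kmp_free_task:
+//
+//     alloc:   started=0  executing=0  complete=0  freed=0
+//     start:   started=1  executing=1
+//     finish:  complete=1 executing=0  (executing cleared after __kmp_release_deps)
+//     free:    freed=1                 (storage released)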
+
+//---------------------------------------------------------------------
+// __kmpc_omp_task_complete_if0: report that a task has completed execution
+// loc_ref: source location information; points to end of task block.
+// gtid: global thread number.
+// task: task thunk for the completed task.
+
+void
+__kmpc_omp_task_complete_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task )
+{
+    KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
+                  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
+
+    __kmp_task_finish( gtid, task, NULL );  // this routine will provide task to resume
+
+    KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
+                  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
+
+    return;
+}
+
+#ifdef TASK_UNUSED
+//---------------------------------------------------------------------
+// __kmpc_omp_task_complete: report that a task has completed execution
+// NEVER GENERATED BY COMPILER, DEPRECATED!!!
+
+void
+__kmpc_omp_task_complete( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task )
+{
+    KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n",
+                  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
+
+    __kmp_task_finish( gtid, task, NULL );  // Not sure how to find task to resume
+
+    KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n",
+                  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
+    return;
+}
+#endif // TASK_UNUSED
+
+
+#if OMPT_SUPPORT
+//----------------------------------------------------------------------------------------------------
+// __kmp_task_init_ompt:
+//   Initialize OMPT fields maintained by a task.  Because the serial task is
+//   initialized before ompt_initialize is called, we cannot yet know, at the point
+//   the serial task is initialized, whether OMPT will be used.  This function
+//   provides the support needed to initialize OMPT for the serial task after the fact.
+
+void
+__kmp_task_init_ompt( kmp_taskdata_t * task, int tid )
+{
+    task->ompt_task_info.task_id = __ompt_task_id_new(tid);
+    task->ompt_task_info.function = NULL;
+    task->ompt_task_info.frame.exit_runtime_frame = NULL;
+    task->ompt_task_info.frame.reenter_runtime_frame = NULL;
+}
+#endif
+
+
+//----------------------------------------------------------------------------------------------------
+// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit task for a given thread
+//
+// loc_ref:  reference to source location of parallel region
+// this_thr:  thread data structure corresponding to implicit task
+// team: team for this_thr
+// tid: thread id of given thread within team
+// set_curr_task: TRUE if need to push current task to thread
+// NOTE: Routine does not set up the implicit task ICVs.  This is assumed to have already been done elsewhere.
+// TODO: Get better loc_ref.  Value passed in may be NULL
+
+void
+__kmp_init_implicit_task( ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *team, int tid, int set_curr_task )
+{
+    kmp_taskdata_t * task   = & team->t.t_implicit_task_taskdata[ tid ];
+
+    KF_TRACE(10, ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
+                  tid, team, task, set_curr_task ? "TRUE" : "FALSE" ) );
+
+    task->td_task_id  = KMP_GEN_TASK_ID();
+    task->td_team     = team;
+//    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info in debugger)
+    task->td_ident    = loc_ref;
+    task->td_taskwait_ident   = NULL;
+    task->td_taskwait_counter = 0;
+    task->td_taskwait_thread  = 0;
+
+    task->td_flags.tiedness    = TASK_TIED;
+    task->td_flags.tasktype    = TASK_IMPLICIT;
+#if OMP_41_ENABLED
+    task->td_flags.proxy       = TASK_FULL;
+#endif
+
+    // All implicit tasks are executed immediately, not deferred
+    task->td_flags.task_serial = 1;
+    task->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
+    task->td_flags.team_serial = ( team->t.t_serialized ) ? 1 : 0;
+
+    task->td_flags.started     = 1;
+    task->td_flags.executing   = 1;
+    task->td_flags.complete    = 0;
+    task->td_flags.freed       = 0;
+
+#if OMP_40_ENABLED
+    task->td_dephash = NULL;
+    task->td_depnode = NULL;
+#endif
+
+    if (set_curr_task) {  // only do this initialization the first time a thread is created
+        task->td_incomplete_child_tasks = 0;
+        task->td_allocated_child_tasks  = 0; // Not used because implicit tasks are never deallocated
+#if OMP_40_ENABLED
+        task->td_taskgroup = NULL;           // An implicit task does not have taskgroup
+#endif
+        __kmp_push_current_task_to_thread( this_thr, team, tid );
+    } else {
+        KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
+        KMP_DEBUG_ASSERT(task->td_allocated_child_tasks  == 0);
+    }
+
+#if OMPT_SUPPORT
+    __kmp_task_init_ompt(task, tid);
+#endif
+
+    KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n",
+                  tid, team, task ) );
+}
+
+// Round up a size to a power of two specified by val
+// Used to insert padding between structures co-allocated using a single malloc() call
+static size_t
+__kmp_round_up_to_val( size_t size, size_t val ) {
+    if ( size & ( val - 1 ) ) {
+        size &= ~ ( val - 1 );
+        if ( size <= KMP_SIZE_T_MAX - val ) {
+            size += val;    // Round up if there is no overflow.
+        }; // if
+    }; // if
+    return size;
+} // __kmp_round_up_to_val
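+
+// Example (editorial): __kmp_round_up_to_val( 20, 8 ) clears the low bits
+// (20 & ~7 == 16) and, since no overflow occurs, adds val back: 16 + 8 == 24.
+// A size that is already a multiple of val is returned unchanged:
+// __kmp_round_up_to_val( 24, 8 ) == 24.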
+
+
+//---------------------------------------------------------------------------------
+// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
+//
+// loc_ref: source location information
+// gtid: global thread number.
+// flags: include tiedness & task type (explicit vs. implicit) of the 'new' task encountered.
+//        Converted from kmp_int32 to kmp_tasking_flags_t in routine.
+// sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including private vars accessed in task.
+// sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed in task.
+// task_entry: Pointer to task code entry point generated by compiler.
+// returns: a pointer to the allocated kmp_task_t structure (task).
+
+kmp_task_t *
+__kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
+                  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                  kmp_routine_entry_t task_entry )
+{
+    kmp_task_t *task;
+    kmp_taskdata_t *taskdata;
+    kmp_info_t *thread = __kmp_threads[ gtid ];
+    kmp_team_t *team = thread->th.th_team;
+    kmp_taskdata_t *parent_task = thread->th.th_current_task;
+    size_t shareds_offset;
+
+    KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
+                  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
+                  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
+                  sizeof_shareds, task_entry) );
+
+    if ( parent_task->td_flags.final ) {
+        flags->final = 1;
+    }
+
+#if OMP_41_ENABLED
+    if ( flags->proxy == TASK_PROXY ) {
+        flags->tiedness = TASK_UNTIED;
+        flags->merged_if0 = 1;
+
+        /* are we running in a serialized parallel region or in tskm_immediate_exec mode?  we need tasking support enabled */
+        if ( (thread->th.th_task_team) == NULL ) {
+            /* This should only happen if the team is serialized;
+               set up a task team and propagate it to the thread. */
+            KMP_DEBUG_ASSERT(team->t.t_serialized);
+            KA_TRACE(30,("T#%d creating task team in __kmp_task_alloc for proxy task\n", gtid));
+            __kmp_task_team_setup(thread,team,0,1); // 0,1 indicates only setup the current team regardless of nthreads
+            thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
+        }
+        kmp_task_team_t * task_team = thread->th.th_task_team;
+
+        /* tasking must be enabled now as the task might not be pushed */
+        if ( !KMP_TASKING_ENABLED( task_team ) ) {
+            KA_TRACE(30,("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
+            __kmp_enable_tasking( task_team, thread );
+            kmp_int32 tid = thread->th.th_info.ds.ds_tid;
+            kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
+            // No lock needed since only owner can allocate
+            if (thread_data -> td.td_deque == NULL ) {
+                __kmp_alloc_task_deque( thread, thread_data );
+            }
+        }
+
+        if ( task_team->tt.tt_found_proxy_tasks == FALSE )
+          TCW_4(task_team -> tt.tt_found_proxy_tasks, TRUE);
+    }
+#endif
+
+    // Calculate shared structure offset including padding after kmp_task_t struct
+    // to align pointers in shared struct
+    shareds_offset = sizeof( kmp_taskdata_t ) + sizeof_kmp_task_t;
+    shareds_offset = __kmp_round_up_to_val( shareds_offset, sizeof( void * ));
+
+    // Allocate a kmp_taskdata_t block and a kmp_task_t block.
+    KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n",
+                  gtid, shareds_offset) );
+    KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n",
+                  gtid, sizeof_shareds) );
+
+    // Avoid double allocation here by combining shareds with taskdata
+    #if USE_FAST_MEMORY
+    taskdata = (kmp_taskdata_t *) __kmp_fast_allocate( thread, shareds_offset + sizeof_shareds );
+    #else /* ! USE_FAST_MEMORY */
+    taskdata = (kmp_taskdata_t *) __kmp_thread_malloc( thread, shareds_offset + sizeof_shareds );
+    #endif /* USE_FAST_MEMORY */
+
+    task                      = KMP_TASKDATA_TO_TASK(taskdata);
+
+    // Make sure task & taskdata are aligned appropriately
+#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
+    KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)taskdata) & (sizeof(double)-1) ) == 0 );
+    KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task) & (sizeof(double)-1) ) == 0 );
+#else
+    KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)taskdata) & (sizeof(_Quad)-1) ) == 0 );
+    KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task) & (sizeof(_Quad)-1) ) == 0 );
+#endif
+    if (sizeof_shareds > 0) {
+        // Avoid double allocation here by combining shareds with taskdata
+        task->shareds         = & ((char *) taskdata)[ shareds_offset ];
+        // Make sure shareds struct is aligned to pointer size
+        KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task->shareds) & (sizeof(void *)-1) ) == 0 );
+    } else {
+        task->shareds         = NULL;
+    }
+    task->routine             = task_entry;
+    task->part_id             = 0;      // AC: Always start with 0 part id
+
+    taskdata->td_task_id      = KMP_GEN_TASK_ID();
+    taskdata->td_team         = team;
+    taskdata->td_alloc_thread = thread;
+    taskdata->td_parent       = parent_task;
+    taskdata->td_level        = parent_task->td_level + 1; // increment nesting level
+    taskdata->td_ident        = loc_ref;
+    taskdata->td_taskwait_ident   = NULL;
+    taskdata->td_taskwait_counter = 0;
+    taskdata->td_taskwait_thread  = 0;
+    KMP_DEBUG_ASSERT( taskdata->td_parent != NULL );
+#if OMP_41_ENABLED
+    // avoid copying icvs for proxy tasks
+    if ( flags->proxy == TASK_FULL )
+#endif
+       copy_icvs( &taskdata->td_icvs, &taskdata->td_parent->td_icvs );
+
+    taskdata->td_flags.tiedness    = flags->tiedness;
+    taskdata->td_flags.final       = flags->final;
+    taskdata->td_flags.merged_if0  = flags->merged_if0;
+#if OMP_40_ENABLED
+    taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
+#endif // OMP_40_ENABLED
+#if OMP_41_ENABLED
+    taskdata->td_flags.proxy           = flags->proxy;
+#endif
+    taskdata->td_flags.tasktype    = TASK_EXPLICIT;
+
+    // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
+    taskdata->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
+
+    // GEH - TODO: fix this to copy parent task's value of team_serial flag
+    taskdata->td_flags.team_serial = ( team->t.t_serialized ) ? 1 : 0;
+
+    // GEH - Note we serialize the task if the team is serialized to make sure implicit parallel region
+    //       tasks are not left until program termination to execute.  Also, it helps locality to execute
+    //       immediately.
+    taskdata->td_flags.task_serial = ( parent_task->td_flags.final
+      || taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser );
+
+    taskdata->td_flags.started     = 0;
+    taskdata->td_flags.executing   = 0;
+    taskdata->td_flags.complete    = 0;
+    taskdata->td_flags.freed       = 0;
+
+    taskdata->td_flags.native      = flags->native;
+
+    taskdata->td_incomplete_child_tasks = 0;
+    taskdata->td_allocated_child_tasks  = 1; // start at one because counts current task and children
+#if OMP_40_ENABLED
+    taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
+    taskdata->td_dephash = NULL;
+    taskdata->td_depnode = NULL;
+#endif
+
+    // Only need to keep track of child task counts if team parallel and tasking not serialized or if it is a proxy task
+#if OMP_41_ENABLED
+    if ( flags->proxy == TASK_PROXY || !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) ) 
+#else
+    if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) ) 
+#endif
+    {
+        KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
+#if OMP_40_ENABLED
+        if ( parent_task->td_taskgroup )
+            KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
+#endif
+        // Only need to keep track of allocated child tasks for explicit tasks since implicit ones are not deallocated
+        if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT ) {
+            KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
+        }
+    }
+
+    KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
+                  gtid, taskdata, taskdata->td_parent) );
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        taskdata->ompt_task_info.task_id = __ompt_task_id_new(gtid);
+        taskdata->ompt_task_info.function = (void*) task_entry;
+        taskdata->ompt_task_info.frame.exit_runtime_frame = NULL; 
+        taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
+    }
+#endif
+
+    return task;
+}
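+
+// [Editorial sketch] Layout of the single block allocated above, with offsets as
+// computed via __kmp_round_up_to_val:
+//
+//     +------------------+-----------------------------+---------+----------+
+//     | kmp_taskdata_t   | kmp_task_t (+ private vars) | padding | shareds  |
+//     +------------------+-----------------------------+---------+----------+
+//     ^ taskdata         ^ task                                  ^ task->shareds
+//     |<-------------------- shareds_offset ------------------->|  (pointer-aligned)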
+
+
+kmp_task_t *
+__kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+                       size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                       kmp_routine_entry_t task_entry )
+{
+    kmp_task_t *retval;
+    kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *) & flags;
+
+    input_flags->native = FALSE;
+    // __kmp_task_alloc() sets up all other runtime flags
+
+#if OMP_41_ENABLED
+    KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
+                  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
+                  gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
+                  input_flags->proxy ? "proxy" : "",
+                  sizeof_kmp_task_t, sizeof_shareds, task_entry) );
+#else
+    KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
+                  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
+                  gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
+                  sizeof_kmp_task_t, sizeof_shareds, task_entry) );
+#endif
+
+    retval = __kmp_task_alloc( loc_ref, gtid, input_flags, sizeof_kmp_task_t,
+                               sizeof_shareds, task_entry );
+
+    KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval) );
+
+    return retval;
+}
+
+//-----------------------------------------------------------
+//  __kmp_invoke_task: invoke the specified task
+//
+// gtid: global thread ID of caller
+// task: the task to invoke
+// current_task: the task to resume after task invocation
+
+static void
+__kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_task )
+{
+    kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
+#if OMP_40_ENABLED
+    int discard = 0 /* false */;
+#endif
+    KA_TRACE(30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
+                  gtid, taskdata, current_task) );
+
+#if OMP_41_ENABLED
+    if ( taskdata->td_flags.proxy == TASK_PROXY &&
+         taskdata->td_flags.complete == 1 )
+    {
+        // This is a proxy task that was already completed but it needs to run
+        // its bottom-half finish
+        KA_TRACE(30, ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
+                      gtid, taskdata) );
+
+        __kmp_bottom_half_finish_proxy(gtid,task);
+
+        KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for proxy task %p, "
+                      "resuming task %p\n", gtid, taskdata, current_task) );
+
+        return;
+    }
+#endif
+
+#if OMP_41_ENABLED
+    // Proxy tasks are not handled by the runtime
+    if ( taskdata->td_flags.proxy != TASK_PROXY )
+#endif
+    __kmp_task_start( gtid, task, current_task );
+
+#if OMPT_SUPPORT
+    ompt_thread_info_t oldInfo;
+    kmp_info_t * thread;
+    if (ompt_status & ompt_status_track) {
+        // Store the threads states and restore them after the task
+        thread = __kmp_threads[ gtid ];
+        oldInfo = thread->th.ompt_thread_info;
+        thread->th.ompt_thread_info.wait_id = 0;
+        thread->th.ompt_thread_info.state = ompt_state_work_parallel;
+        taskdata->ompt_task_info.frame.exit_runtime_frame = __builtin_frame_address(0);
+    }
+#endif
+
+#if OMP_40_ENABLED
+    // TODO: cancel tasks if the parallel region has also been cancelled
+    // TODO: check if this sequence can be hoisted above __kmp_task_start
+    // if cancellation has been enabled for this run ...
+    if (__kmp_omp_cancellation) {
+        kmp_info_t *this_thr = __kmp_threads [ gtid ];
+        kmp_team_t * this_team = this_thr->th.th_team;
+        kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
+        if ((taskgroup && taskgroup->cancel_request) || (this_team->t.t_cancel_request == cancel_parallel)) {
+            // this task belongs to a task group and we need to cancel it
+            discard = 1 /* true */;
+        }
+    }
+
+    //
+    // Invoke the task routine and pass in relevant data.
+    // Thunks generated by gcc take a different argument list.
+    //
+    if (!discard) {
+#endif // OMP_40_ENABLED
+#ifdef KMP_GOMP_COMPAT
+        if (taskdata->td_flags.native) {
+            ((void (*)(void *))(*(task->routine)))(task->shareds);
+        }
+        else
+#endif /* KMP_GOMP_COMPAT */
+        {
+            (*(task->routine))(gtid, task);
+        }
+#if OMP_40_ENABLED
+    }
+#endif // OMP_40_ENABLED
+
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        thread->th.ompt_thread_info = oldInfo;
+        taskdata->ompt_task_info.frame.exit_runtime_frame = 0;
+    }
+#endif
+
+#if OMP_41_ENABLED
+    // Proxy tasks are not handled by the runtime
+    if ( taskdata->td_flags.proxy != TASK_PROXY )
+#endif
+       __kmp_task_finish( gtid, task, current_task );
+
+    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
+                  gtid, taskdata, current_task) );
+    return;
+}
+
+//-----------------------------------------------------------------------
+// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
+//
+// loc_ref: location of original task pragma (ignored)
+// gtid: Global Thread ID of encountering thread
+// new_task: task thunk allocated by __kmp_omp_task_alloc() for the 'new task'
+// Returns:
+//    TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and queued to be resumed later.
+//    TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to be resumed later.
+
+kmp_int32
+__kmpc_omp_task_parts( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
+{
+    kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+
+    KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n",
+                  gtid, loc_ref, new_taskdata ) );
+
+    /* Should we execute the new task or queue it?   For now, let's just always try to
+       queue it.  If the queue fills up, then we'll execute it.  */
+
+    if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
+    {                                                           // Execute this task immediately
+        kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
+        new_taskdata->td_flags.task_serial = 1;
+        __kmp_invoke_task( gtid, new_task, current_task );
+    }
+
+    KA_TRACE(10, ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
+                  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", gtid, loc_ref,
+                  new_taskdata ) );
+
+    return TASK_CURRENT_NOT_QUEUED;
+}
+
+//---------------------------------------------------------------------
+// __kmp_omp_task: Schedule a non-thread-switchable task for execution
+// gtid: Global Thread ID of encountering thread
+// new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
+// serialize_immediate: if TRUE and the task is executed immediately, its execution is serialized
+// returns:
+//
+//    TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and queued to be resumed later.
+//    TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to be resumed later.
+kmp_int32
+__kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate )
+{
+    kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
+            __builtin_frame_address(0);
+    }
+#endif
+
+    /* Should we execute the new task or queue it?   For now, let's just always try to
+       queue it.  If the queue fills up, then we'll execute it.  */
+#if OMP_41_ENABLED
+    if ( new_taskdata->td_flags.proxy == TASK_PROXY || __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
+#else
+    if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
+#endif
+    {                                                           // Execute this task immediately
+        kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
+        if ( serialize_immediate )
+          new_taskdata -> td_flags.task_serial = 1;
+        __kmp_invoke_task( gtid, new_task, current_task );
+    }
+
+#if OMPT_SUPPORT
+    if (ompt_status & ompt_status_track) {
+        new_taskdata->ompt_task_info.frame.reenter_runtime_frame = 0;
+    }
+#endif
+
+    return TASK_CURRENT_NOT_QUEUED;
+}
+
+//---------------------------------------------------------------------
+// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a non-thread-switchable task from
+// the parent thread only!
+// loc_ref: location of original task pragma (ignored)
+// gtid: Global Thread ID of encountering thread
+// new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
+// returns:
+//
+//    TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and queued to be resumed later.
+//    TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to be resumed later.
+
+kmp_int32
+__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
+{
+    kmp_taskdata_t * new_taskdata;
+    kmp_int32 res;
+
+    new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+    KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n",
+                  gtid, loc_ref, new_taskdata ) );
+
+    res =  __kmp_omp_task(gtid,new_task,true);
+
+    KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
+                  gtid, loc_ref, new_taskdata ) );
+    return res;
+}
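+
+// [Editorial sketch] A compiler typically lowers a plain "#pragma omp task" onto
+// the two entry points above; roughly (the entry-function name and sizes are
+// illustrative):
+//
+//     void my_task_entry( kmp_int32 gtid, kmp_task_t *t ) { /* outlined body */ }
+//     ...
+//     kmp_task_t *t = __kmpc_omp_task_alloc( &loc, gtid, 1 /* tied */,
+//                                            sizeof( kmp_task_t ), 0 /* no shareds */,
+//                                            (kmp_routine_entry_t) my_task_entry );
+//     __kmpc_omp_task( &loc, gtid, t );   // queue, or run now if it cannot be deferred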
+
+//-------------------------------------------------------------------------------------
+// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are complete
+
+kmp_int32
+__kmpc_omp_taskwait( ident_t *loc_ref, kmp_int32 gtid )
+{
+    kmp_taskdata_t * taskdata;
+    kmp_info_t * thread;
+    int thread_finished = FALSE;
+
+    KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n",
+                  gtid, loc_ref) );
+
+    if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+        // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark begin wait?
+
+        thread = __kmp_threads[ gtid ];
+        taskdata = thread -> th.th_current_task;
+#if USE_ITT_BUILD
+        // Note: These values are used by ITT events as well.
+#endif /* USE_ITT_BUILD */
+        taskdata->td_taskwait_counter += 1;
+        taskdata->td_taskwait_ident    = loc_ref;
+        taskdata->td_taskwait_thread   = gtid + 1;
+
+#if USE_ITT_BUILD
+        void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
+        if ( itt_sync_obj != NULL )
+            __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
+#endif /* USE_ITT_BUILD */
+
+#if OMP_41_ENABLED
+        if ( ! taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks) ) 
+#else
+        if ( ! taskdata->td_flags.team_serial ) 
+#endif
+        {
+            // GEH: if team serialized, avoid reading the volatile variable below.
+            kmp_flag_32 flag(&(taskdata->td_incomplete_child_tasks), 0U);
+            while ( TCR_4(taskdata -> td_incomplete_child_tasks) != 0 ) {
+                flag.execute_tasks(thread, gtid, FALSE, &thread_finished
+                                   USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
+            }
+        }
+#if USE_ITT_BUILD
+        if ( itt_sync_obj != NULL )
+            __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
+#endif /* USE_ITT_BUILD */
+
+        // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark end of wait?
+        taskdata->td_taskwait_thread = - taskdata->td_taskwait_thread;
+    }
+
+    KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
+                  "returning TASK_CURRENT_NOT_QUEUED\n", gtid, taskdata) );
+
+    return TASK_CURRENT_NOT_QUEUED;
+}
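+
+// [Editorial sketch] Typical emitted sequence for "#pragma omp task" children
+// followed by "#pragma omp taskwait":
+//
+//     __kmpc_omp_task( &loc, gtid, t1 );   // spawn children
+//     __kmpc_omp_task( &loc, gtid, t2 );
+//     __kmpc_omp_taskwait( &loc, gtid );   // returns once the direct children are
+//                                          // done; the thread executes other queued
+//                                          // tasks while it waits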
+
+
+//-------------------------------------------------
+// __kmpc_omp_taskyield: switch to a different task
+
+kmp_int32
+__kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part )
+{
+    kmp_taskdata_t * taskdata;
+    kmp_info_t * thread;
+    int thread_finished = FALSE;
+
+    KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
+                  gtid, loc_ref, end_part) );
+
+    if ( __kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel ) {
+        // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark begin wait?
+
+        thread = __kmp_threads[ gtid ];
+        taskdata = thread -> th.th_current_task;
+        // Should we model this as a task wait or not?
+#if USE_ITT_BUILD
+        // Note: These values are used by ITT events as well.
+#endif /* USE_ITT_BUILD */
+        taskdata->td_taskwait_counter += 1;
+        taskdata->td_taskwait_ident    = loc_ref;
+        taskdata->td_taskwait_thread   = gtid + 1;
+
+#if USE_ITT_BUILD
+        void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
+        if ( itt_sync_obj != NULL )
+            __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
+#endif /* USE_ITT_BUILD */
+        if ( ! taskdata->td_flags.team_serial ) {
+            kmp_task_team_t * task_team = thread->th.th_task_team;
+            if (task_team != NULL) {
+                if (KMP_TASKING_ENABLED(task_team)) {
+                    __kmp_execute_tasks_32( thread, gtid, NULL, FALSE, &thread_finished
+                                            USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
+                }
+            }
+        }
+#if USE_ITT_BUILD
+        if ( itt_sync_obj != NULL )
+            __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
+#endif /* USE_ITT_BUILD */
+
+        // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark end of wait?
+        taskdata->td_taskwait_thread = - taskdata->td_taskwait_thread;
+    }
+
+    KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
+                  "returning TASK_CURRENT_NOT_QUEUED\n", gtid, taskdata) );
+
+    return TASK_CURRENT_NOT_QUEUED;
+}
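+
+// Editor's note: a hedged sketch (not part of this file) of how a compiler
+// front end typically lowers the construct into the entry point above; the
+// loc/gtid plumbing shown is illustrative, not taken from this source.
+#if 0
+    // user code:
+    //     #pragma omp taskyield
+    // emitted call sequence (sketch):
+    ident_t loc = { /* source location info */ };
+    kmp_int32 gtid = __kmpc_global_thread_num( &loc );
+    __kmpc_omp_taskyield( &loc, gtid, 0 );   // 0 passed for end_part
+#endif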
+
+
+#if OMP_40_ENABLED
+//-------------------------------------------------------------------------------------
+// __kmpc_taskgroup: Start a new taskgroup
+
+void
+__kmpc_taskgroup( ident_t* loc, int gtid )
+{
+    kmp_info_t      * thread = __kmp_threads[ gtid ];
+    kmp_taskdata_t  * taskdata = thread->th.th_current_task;
+    kmp_taskgroup_t * tg_new =
+        (kmp_taskgroup_t *)__kmp_thread_malloc( thread, sizeof( kmp_taskgroup_t ) );
+    KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new) );
+    tg_new->count = 0;
+    tg_new->cancel_request = cancel_noreq;
+    tg_new->parent = taskdata->td_taskgroup;
+    taskdata->td_taskgroup = tg_new;
+}
+
+
+//-------------------------------------------------------------------------------------
+// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
+//                       and its descendants are complete
+
+void
+__kmpc_end_taskgroup( ident_t* loc, int gtid )
+{
+    kmp_info_t      * thread = __kmp_threads[ gtid ];
+    kmp_taskdata_t  * taskdata = thread->th.th_current_task;
+    kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
+    int thread_finished = FALSE;
+
+    KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc) );
+    KMP_DEBUG_ASSERT( taskgroup != NULL );
+
+    if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+#if USE_ITT_BUILD
+        // For ITT the taskgroup wait is similar to taskwait until we need to distinguish them
+        void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
+        if ( itt_sync_obj != NULL )
+            __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
+#endif /* USE_ITT_BUILD */
+
+#if OMP_41_ENABLED
+        if ( ! taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks) ) 
+#else
+        if ( ! taskdata->td_flags.team_serial ) 
+#endif
+        {
+            kmp_flag_32 flag(&(taskgroup->count), 0U);
+            while ( TCR_4(taskgroup->count) != 0 ) {
+                flag.execute_tasks(thread, gtid, FALSE, &thread_finished
+                                   USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
+            }
+        }
+
+#if USE_ITT_BUILD
+        if ( itt_sync_obj != NULL )
+            __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
+#endif /* USE_ITT_BUILD */
+    }
+    KMP_DEBUG_ASSERT( taskgroup->count == 0 );
+
+    // Restore parent taskgroup for the current task
+    taskdata->td_taskgroup = taskgroup->parent;
+    __kmp_thread_free( thread, taskgroup );
+
+    KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", gtid, taskdata) );
+}
+#endif
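+
+// Editor's note: a hedged sketch (not part of this file) of the pairing a
+// compiler emits around a taskgroup region; names and values are illustrative.
+#if 0
+    // user code:
+    //     #pragma omp taskgroup
+    //     { ... task-spawning code ... }
+    __kmpc_taskgroup( &loc, gtid );         // push a new kmp_taskgroup_t
+    /* ... body; child tasks bump taskgroup->count ... */
+    __kmpc_end_taskgroup( &loc, gtid );     // wait for count == 0, pop and free
+#endif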
+
+
+//------------------------------------------------------
+// __kmp_remove_my_task: remove a task from my own deque
+
+static kmp_task_t *
+__kmp_remove_my_task( kmp_info_t * thread, kmp_int32 gtid, kmp_task_team_t *task_team,
+                      kmp_int32 is_constrained )
+{
+    kmp_task_t * task;
+    kmp_taskdata_t * taskdata;
+    kmp_thread_data_t *thread_data;
+    kmp_uint32 tail;
+
+    KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
+    KMP_DEBUG_ASSERT( task_team -> tt.tt_threads_data != NULL ); // Caller should check this condition
+
+    thread_data = & task_team -> tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
+
+    KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
+                  gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
+                  thread_data->td.td_deque_tail) );
+
+    if (TCR_4(thread_data -> td.td_deque_ntasks) == 0) {
+        KA_TRACE(10, ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
+                      gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
+                      thread_data->td.td_deque_tail) );
+        return NULL;
+    }
+
+    __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
+
+    if (TCR_4(thread_data -> td.td_deque_ntasks) == 0) {
+        __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
+        KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
+                      gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
+                      thread_data->td.td_deque_tail) );
+        return NULL;
+    }
+
+    tail = ( thread_data -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK;  // Wrap index.
+    taskdata = thread_data -> td.td_deque[ tail ];
+
+    if (is_constrained) {
+        // we need to check if the candidate obeys task scheduling constraint:
+        // only child of current task can be scheduled
+        kmp_taskdata_t * current = thread->th.th_current_task;
+        kmp_int32        level = current->td_level;
+        kmp_taskdata_t * parent = taskdata->td_parent;
+        while ( parent != current && parent->td_level > level ) {
+            parent = parent->td_parent;  // check generation up to the level of the current task
+            KMP_DEBUG_ASSERT(parent != NULL);
+        }
+        if ( parent != current ) {
+            // If the tail task is not a child, then no other children can appear in the deque.
+            __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
+            KA_TRACE(10, ("__kmp_remove_my_task(exit #3): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
+                          gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
+                          thread_data->td.td_deque_tail) );
+            return NULL;
+        }
+    }
+
+    thread_data -> td.td_deque_tail = tail;
+    TCW_4(thread_data -> td.td_deque_ntasks, thread_data -> td.td_deque_ntasks - 1);
+
+    __kmp_release_bootstrap_lock( & thread_data->td.td_deque_lock );
+
+    KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: ntasks=%d head=%u tail=%u\n",
+                  gtid, taskdata, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
+                  thread_data->td.td_deque_tail) );
+
+    task = KMP_TASKDATA_TO_TASK( taskdata );
+    return task;
+}
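+
+// Editor's note: the "& TASK_DEQUE_MASK" wrap above relies on TASK_DEQUE_SIZE
+// being a power of two.  A minimal, self-contained sketch with hypothetical
+// sizes (not the runtime's actual constants):
+#if 0
+    enum { DEQUE_SIZE = 256, DEQUE_MASK = DEQUE_SIZE - 1 };
+    kmp_uint32 tail = 0;
+    kmp_uint32 prev = ( tail - 1 ) & DEQUE_MASK;   // wraps to 255, not 0xFFFFFFFF
+#endif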
+
+
+//-----------------------------------------------------------
+// __kmp_steal_task: remove a task from another thread's deque
+// Assumes that the calling thread has already checked for the existence of
+// the task_team thread_data before calling this routine.
+
+static kmp_task_t *
+__kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team,
+                  volatile kmp_uint32 *unfinished_threads, int *thread_finished,
+                  kmp_int32 is_constrained )
+{
+    kmp_task_t * task;
+    kmp_taskdata_t * taskdata;
+    kmp_thread_data_t *victim_td, *threads_data;
+    kmp_int32 victim_tid;
+
+    KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
+
+    threads_data = task_team -> tt.tt_threads_data;
+    KMP_DEBUG_ASSERT( threads_data != NULL );  // Caller should check this condition
+
+    victim_tid = victim->th.th_info.ds.ds_tid;
+    victim_td = & threads_data[ victim_tid ];
+
+    KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: task_team=%p ntasks=%d "
+                  "head=%u tail=%u\n",
+                  gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
+                  victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
+
+    if ( (TCR_4(victim_td -> td.td_deque_ntasks) == 0) || // Caller should not check this condition
+         (TCR_PTR(victim->th.th_task_team) != task_team)) // GEH: why would this happen?
+    {
+        KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: task_team=%p "
+                      "ntasks=%d head=%u tail=%u\n",
+                      gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
+                      victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
+        return NULL;
+    }
+
+    __kmp_acquire_bootstrap_lock( & victim_td -> td.td_deque_lock );
+
+    // Check again after we acquire the lock
+    if ( (TCR_4(victim_td -> td.td_deque_ntasks) == 0) ||
+         (TCR_PTR(victim->th.th_task_team) != task_team)) // GEH: why would this happen?
+    {
+        __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
+        KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: task_team=%p "
+                      "ntasks=%d head=%u tail=%u\n",
+                      gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
+                      victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
+        return NULL;
+    }
+
+    KMP_DEBUG_ASSERT( victim_td -> td.td_deque != NULL );
+
+    if ( !is_constrained ) {
+        taskdata = victim_td -> td.td_deque[ victim_td -> td.td_deque_head ];
+        // Bump head pointer and wrap.
+        victim_td -> td.td_deque_head = ( victim_td -> td.td_deque_head + 1 ) & TASK_DEQUE_MASK;
+    } else {
+        // While we have postponed tasks, steal from the tail of the deque (smaller tasks)
+        kmp_int32 tail = ( victim_td -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK;  // Wrap index.
+        taskdata = victim_td -> td.td_deque[ tail ];
+        // we need to check if the candidate obeys task scheduling constraint:
+        // only child of current task can be scheduled
+        kmp_taskdata_t * current = __kmp_threads[ gtid ]->th.th_current_task;
+        kmp_int32        level = current->td_level;
+        kmp_taskdata_t * parent = taskdata->td_parent;
+        while ( parent != current && parent->td_level > level ) {
+            parent = parent->td_parent;  // check generation up to the level of the current task
+            KMP_DEBUG_ASSERT(parent != NULL);
+        }
+        if ( parent != current ) {
+            // If the tail task is not a child, then no other children can appear in the deque (?).
+            __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
+            KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from T#%d: task_team=%p "
+                          "ntasks=%d head=%u tail=%u\n",
+                          gtid, __kmp_gtid_from_thread( threads_data[victim_tid].td.td_thr ),
+                          task_team, victim_td->td.td_deque_ntasks,
+                          victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
+            return NULL;
+        }
+        victim_td -> td.td_deque_tail = tail;
+    }
+    if (*thread_finished) {
+        // We need to un-mark this victim as a finished victim.  This must be done before
+        // releasing the lock, or else other threads (starting with the master victim)
+        // might be prematurely released from the barrier!!!
+        kmp_uint32 count;
+
+        count = KMP_TEST_THEN_INC32( (kmp_int32 *)unfinished_threads );
+
+        KA_TRACE(20, ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
+                      gtid, count + 1, task_team) );
+
+        *thread_finished = FALSE;
+    }
+    TCW_4(victim_td -> td.td_deque_ntasks, TCR_4(victim_td -> td.td_deque_ntasks) - 1);
+
+    __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
+
+    KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d stole task %p from T#%d: task_team=%p "
+                  "ntasks=%d head=%u tail=%u\n",
+                  gtid, taskdata, __kmp_gtid_from_thread( victim ), task_team,
+                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
+                  victim_td->td.td_deque_tail) );
+
+    task = KMP_TASKDATA_TO_TASK( taskdata );
+    return task;
+}
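+
+// Editor's note: a worked example of the constrained-stealing check above.
+// With a hypothetical task tree (td_level in parentheses)
+//
+//     implicit(0) -> A(1) -> B(2) -> C(3)
+//
+// a thief whose current task is A may take C: the walk C->B->A reaches A
+// before td_level drops to A's level.  A thief whose current task is a
+// sibling A'(1) walks C->B, then stops at A because td_level is no longer
+// greater than 1; parent != current, so the steal is refused.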
+
+
+//-----------------------------------------------------------------------------
+// __kmp_execute_tasks_template: Choose and execute tasks until either the
+// condition is satisfied (return TRUE) or there are none left (return FALSE).
+// final_spin is TRUE if this is the spin at the release barrier.
+// thread_finished indicates whether the thread is finished executing all
+// the tasks it has on its deque, and is at the release barrier.
+// flag encapsulates the location on which to spin and the value that
+// terminates the spin.
+// flag == NULL means only execute a single task and return.
+template <class C>
+static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, 
+                                               int *thread_finished
+                                               USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
+{
+    kmp_task_team_t *     task_team;
+    kmp_thread_data_t *   threads_data;
+    kmp_task_t *          task;
+    kmp_taskdata_t *      current_task = thread -> th.th_current_task;
+    volatile kmp_uint32 * unfinished_threads;
+    kmp_int32             nthreads, last_stolen, k, tid;
+
+    KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
+    KMP_DEBUG_ASSERT( thread == __kmp_threads[ gtid ] );
+
+    task_team = thread -> th.th_task_team;
+    KMP_DEBUG_ASSERT( task_team != NULL );
+
+    KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d *thread_finished=%d\n",
+                  gtid, final_spin, *thread_finished) );
+
+    threads_data = (kmp_thread_data_t *)TCR_PTR(task_team -> tt.tt_threads_data);
+    KMP_DEBUG_ASSERT( threads_data != NULL );
+
+    nthreads = task_team -> tt.tt_nproc;
+    unfinished_threads = &(task_team -> tt.tt_unfinished_threads);
+#if OMP_41_ENABLED
+    KMP_DEBUG_ASSERT( nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
+#else
+    KMP_DEBUG_ASSERT( nthreads > 1 );
+#endif
+    KMP_DEBUG_ASSERT( TCR_4((int)*unfinished_threads) >= 0 );
+
+    // Choose tasks from our own work queue.
+    start:
+    while (( task = __kmp_remove_my_task( thread, gtid, task_team, is_constrained )) != NULL ) {
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+        if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
+            if ( itt_sync_obj == NULL ) {
+                // we are at a fork barrier, where we could not get the object reliably
+                itt_sync_obj  = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
+            }
+            __kmp_itt_task_starting( itt_sync_obj );
+        }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+        __kmp_invoke_task( gtid, task, current_task );
+#if USE_ITT_BUILD
+        if ( itt_sync_obj != NULL )
+            __kmp_itt_task_finished( itt_sync_obj );
+#endif /* USE_ITT_BUILD */
+
+        // If this thread is only partway through the barrier and the condition
+        // is met, then return now, so that the barrier gather/release pattern can proceed.
+        // If this thread is in the last spin loop in the barrier, waiting to be
+        // released, we know that the termination condition will not be satisfied,
+        // so don't waste any cycles checking it.
+        if (flag == NULL || (!final_spin && flag->done_check())) {
+            KA_TRACE(15, ("__kmp_execute_tasks_template(exit #1): T#%d spin condition satisfied\n", gtid) );
+            return TRUE;
+        }
+        KMP_YIELD( __kmp_library == library_throughput );   // Yield before executing next task
+    }
+
+    // This thread's work queue is empty.  If we are in the final spin loop
+    // of the barrier, check and see if the termination condition is satisfied.
+#if OMP_41_ENABLED
+    // The work queue may be empty but there might be proxy tasks still executing
+    if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0) 
+#else
+    if (final_spin) 
+#endif
+    {
+        // First, decrement the #unfinished threads, if that has not already
+        // been done.  This decrement might be to the spin location, and
+        // result in the termination condition being satisfied.
+        if (! *thread_finished) {
+            kmp_uint32 count;
+
+            count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
+            KA_TRACE(20, ("__kmp_execute_tasks_template(dec #1): T#%d dec unfinished_threads to %d task_team=%p\n",
+                          gtid, count, task_team) );
+            *thread_finished = TRUE;
+        }
+
+        // It is now unsafe to reference thread->th.th_team !!!
+        // Decrementing task_team->tt.tt_unfinished_threads can allow the master
+        // thread to pass through the barrier, where it might reset each thread's
+        // th.th_team field for the next parallel region.
+        // If we can steal more work, we know that this has not happened yet.
+        if (flag != NULL && flag->done_check()) {
+            KA_TRACE(15, ("__kmp_execute_tasks_template(exit #2): T#%d spin condition satisfied\n", gtid) );
+            return TRUE;
+        }
+    }
+
+#if OMP_41_ENABLED
+    // check if there are other threads to steal from, otherwise go back
+    if ( nthreads  == 1 )
+        goto start;
+#endif
+
+    // Try to steal from the last place I stole from successfully.
+    tid = thread -> th.th_info.ds.ds_tid;  // == __kmp_tid_from_gtid( gtid )
+    last_stolen = threads_data[ tid ].td.td_deque_last_stolen;
+
+    if (last_stolen != -1) {
+        kmp_info_t *other_thread = threads_data[last_stolen].td.td_thr;
+
+        while ((task = __kmp_steal_task( other_thread, gtid, task_team, unfinished_threads,
+                                         thread_finished, is_constrained )) != NULL)
+        {
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+            if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
+                if ( itt_sync_obj == NULL ) {
+                    // we are at a fork barrier, where we could not get the object reliably
+                    itt_sync_obj  = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
+                }
+                __kmp_itt_task_starting( itt_sync_obj );
+            }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+            __kmp_invoke_task( gtid, task, current_task );
+#if USE_ITT_BUILD
+            if ( itt_sync_obj != NULL )
+                __kmp_itt_task_finished( itt_sync_obj );
+#endif /* USE_ITT_BUILD */
+
+            // Check to see if this thread can proceed.
+            if (flag == NULL || (!final_spin && flag->done_check())) {
+                KA_TRACE(15, ("__kmp_execute_tasks_template(exit #3): T#%d spin condition satisfied\n",
+                              gtid) );
+                return TRUE;
+            }
+
+            KMP_YIELD( __kmp_library == library_throughput );   // Yield before executing next task
+            // If the execution of the stolen task resulted in more tasks being
+            // placed on our run queue, then restart the whole process.
+            if (TCR_4(threads_data[ tid ].td.td_deque_ntasks) != 0) {
+                KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n",
+                              gtid) );
+                goto start;
+            }
+        }
+
+        // Don't give priority to stealing from this thread anymore.
+        threads_data[ tid ].td.td_deque_last_stolen = -1;
+
+        // The victim's work queue is empty.  If we are in the final spin loop
+        // of the barrier, check and see if the termination condition is satisfied.
+#if OMP_41_ENABLED
+        // The work queue may be empty but there might be proxy tasks still executing
+        if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0) 
+#else
+        if (final_spin) 
+#endif
+        {
+            // First, decrement the #unfinished threads, if that has not already
+            // been done.  This decrement might be to the spin location, and
+            // result in the termination condition being satisfied.
+            if (! *thread_finished) {
+                kmp_uint32 count;
+
+                count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
+                KA_TRACE(20, ("__kmp_execute_tasks_template(dec #2): T#%d dec unfinished_threads to %d "
+                              "task_team=%p\n", gtid, count, task_team) );
+                *thread_finished = TRUE;
+            }
+
+            // If __kmp_tasking_mode != tskm_immediate_exec
+            // then it is now unsafe to reference thread->th.th_team !!!
+            // Decrementing task_team->tt.tt_unfinished_threads can allow the master
+            // thread to pass through the barrier, where it might reset each thread's
+            // th.th_team field for the next parallel region.
+            // If we can steal more work, we know that this has not happened yet.
+            if (flag != NULL && flag->done_check()) {
+                KA_TRACE(15, ("__kmp_execute_tasks_template(exit #4): T#%d spin condition satisfied\n",
+                              gtid) );
+                return TRUE;
+            }
+        }
+    }
+
+    // Find a different thread to steal work from.  Pick a random thread.
+    // My initial plan was to cycle through all the threads, and only return
+    // if we tried to steal from every thread, and failed.  Arch says that's
+    // not such a great idea.
+    // GEH - need yield code in this loop for throughput library mode?
+    new_victim:
+    k = __kmp_get_random( thread ) % (nthreads - 1);
+    if ( k >= thread -> th.th_info.ds.ds_tid ) {
+        ++k;               // Adjusts random distribution to exclude self
+    }
+    {
+        kmp_info_t *other_thread = threads_data[k].td.td_thr;
+        int first;
+
+        // There is a slight chance that __kmp_enable_tasking() did not wake up
+        // all threads waiting at the barrier.  If this thread is sleeping,
+        // then wake it up.  Since we were going to pay the cache miss penalty
+        // for referencing another thread's kmp_info_t struct anyway, the check
+        // shouldn't cost too much performance at this point.
+        // In extra barrier mode, tasks do not sleep at the separate tasking
+        // barrier, so this isn't a problem.
+        if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
+             (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
+             (TCR_PTR(other_thread->th.th_sleep_loc) != NULL))
+        {
+            __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), other_thread->th.th_sleep_loc);
+            // A sleeping thread should not have any tasks on its queue.
+            // There is a slight possibility that it resumes, steals a task from
+            // another thread, which spawns more tasks, all in the time that it takes
+            // this thread to check => don't write an assertion that the victim's
+            // queue is empty.  Try stealing from a different thread.
+            goto new_victim;
+        }
+
+        // Now try to steal work from the selected thread
+        first = TRUE;
+        while ((task = __kmp_steal_task( other_thread, gtid, task_team, unfinished_threads,
+                                         thread_finished, is_constrained )) != NULL)
+        {
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+            if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
+                if ( itt_sync_obj == NULL ) {
+                    // we are at a fork barrier, where we could not get the object reliably
+                    itt_sync_obj  = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
+                }
+                __kmp_itt_task_starting( itt_sync_obj );
+            }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+            __kmp_invoke_task( gtid, task, current_task );
+#if USE_ITT_BUILD
+            if ( itt_sync_obj != NULL )
+                __kmp_itt_task_finished( itt_sync_obj );
+#endif /* USE_ITT_BUILD */
+
+            // Try stealing from this victim again, in the future.
+            if (first) {
+                threads_data[ tid ].td.td_deque_last_stolen = k;
+                first = FALSE;
+            }
+
+            // Check to see if this thread can proceed.
+            if (flag == NULL || (!final_spin && flag->done_check())) {
+                KA_TRACE(15, ("__kmp_execute_tasks_template(exit #5): T#%d spin condition satisfied\n",
+                              gtid) );
+                return TRUE;
+            }
+            KMP_YIELD( __kmp_library == library_throughput );   // Yield before executing next task
+
+            // If the execution of the stolen task resulted in more tasks being
+            // placed on our run queue, then restart the whole process.
+            if (TCR_4(threads_data[ tid ].td.td_deque_ntasks) != 0) {
+                KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n",
+                              gtid) );
+                goto start;
+            }
+        }
+
+        // The victim's work queue is empty.  If we are in the final spin loop
+        // of the barrier, check and see if the termination condition is satisfied.
+        // Going on and finding a new victim to steal from is expensive, as it
+        // involves a lot of cache misses, so we definitely want to re-check the
+        // termination condition before doing that.
+#if OMP_41_ENABLED
+        // The work queue may be empty but there might be proxy tasks still executing
+        if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0) 
+#else
+        if (final_spin) 
+#endif
+        {
+            // First, decrement the #unfinished threads, if that has not already
+            // been done.  This decrement might be to the spin location, and
+            // result in the termination condition being satisfied.
+            if (! *thread_finished) {
+                kmp_uint32 count;
+
+                count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
+                KA_TRACE(20, ("__kmp_execute_tasks_template(dec #3): T#%d dec unfinished_threads to %d; "
+                              "task_team=%p\n",
+                              gtid, count, task_team) );
+                *thread_finished = TRUE;
+            }
+
+            // If __kmp_tasking_mode != tskm_immediate_exec,
+            // then it is now unsafe to reference thread->th.th_team !!!
+            // Decrementing task_team->tt.tt_unfinished_threads can allow the master
+            // thread to pass through the barrier, where it might reset each thread's
+            // th.th_team field for the next parallel region.
+            // If we can steal more work, we know that this has not happened yet.
+            if (flag != NULL && flag->done_check()) {
+                KA_TRACE(15, ("__kmp_execute_tasks_template(exit #6): T#%d spin condition satisfied\n", gtid) );
+                return TRUE;
+            }
+        }
+    }
+
+    KA_TRACE(15, ("__kmp_execute_tasks_template(exit #7): T#%d can't find work\n", gtid) );
+    return FALSE;
+}
+
+int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
+                           int *thread_finished
+                           USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
+{
+    return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
+                                        USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+}
+
+int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
+                           int *thread_finished
+                           USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
+{
+    return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
+                                        USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+}
+
+int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
+                               int *thread_finished
+                               USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
+{
+    return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
+                                        USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+}
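+
+// Editor's note: the three wrappers above differ only in the flag type they
+// instantiate the template with.  A hedged sketch of the interface the
+// template itself requires of C (illustrative, not the real kmp_flag_*
+// class definitions):
+#if 0
+class flag_concept {
+public:
+    bool done_check();   // TRUE once the spin/termination condition holds
+};
+// kmp_flag_32::execute_tasks (used by the taskwait/taskgroup code earlier)
+// presumably forwards back into __kmp_execute_tasks_32 with "this" as flag.
+#endif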
+
+
+
+//-----------------------------------------------------------------------------
+// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
+// next barrier so they can assist in executing enqueued tasks.
+// First thread in allocates the task team atomically.
+
+static void
+__kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr )
+{
+    kmp_team_t *team;
+    kmp_thread_data_t *threads_data;
+    int nthreads, i, is_init_thread;
+
+    KA_TRACE( 10, ( "__kmp_enable_tasking(enter): T#%d\n",
+                    __kmp_gtid_from_thread( this_thr ) ) );
+
+    team = this_thr->th.th_team;
+    KMP_DEBUG_ASSERT(task_team != NULL);
+    KMP_DEBUG_ASSERT(team != NULL);
+
+    nthreads = task_team->tt.tt_nproc;
+    KMP_DEBUG_ASSERT(nthreads > 0);
+    KMP_DEBUG_ASSERT(nthreads == team->t.t_nproc);
+
+    // Allocate or increase the size of threads_data if necessary
+    is_init_thread = __kmp_realloc_task_threads_data( this_thr, task_team );
+
+    if (!is_init_thread) {
+        // Some other thread already set up the array.
+        KA_TRACE( 20, ( "__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
+                        __kmp_gtid_from_thread( this_thr ) ) );
+        return;
+    }
+    threads_data = (kmp_thread_data_t *)TCR_PTR(task_team -> tt.tt_threads_data);
+    KMP_DEBUG_ASSERT( threads_data != NULL );
+
+    if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
+         ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) )
+    {
+        // Release any threads sleeping at the barrier, so that they can steal
+        // tasks and execute them.  In extra barrier mode, tasks do not sleep
+        // at the separate tasking barrier, so this isn't a problem.
+        for (i = 0; i < nthreads; i++) {
+            volatile void *sleep_loc;
+            kmp_info_t *thread = threads_data[i].td.td_thr;
+
+            if (i == this_thr->th.th_info.ds.ds_tid) {
+                continue;
+            }
+            // Since we haven't locked the thread's suspend mutex lock at this
+            // point, there is a small window where a thread might be putting
+            // itself to sleep, but hasn't set the th_sleep_loc field yet.
+            // To work around this, __kmp_execute_tasks_template() periodically
+            // checks to see if other threads are sleeping (using the same random
+            // mechanism that is used for task stealing) and awakens them if
+            // they are.
+            if ( ( sleep_loc = TCR_PTR( thread -> th.th_sleep_loc) ) != NULL )
+            {
+                KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d waking up thread T#%d\n",
+                                 __kmp_gtid_from_thread( this_thr ),
+                                 __kmp_gtid_from_thread( thread ) ) );
+                __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
+            }
+            else {
+                KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
+                                 __kmp_gtid_from_thread( this_thr ),
+                                 __kmp_gtid_from_thread( thread ) ) );
+            }
+        }
+    }
+
+    KA_TRACE( 10, ( "__kmp_enable_tasking(exit): T#%d\n",
+                    __kmp_gtid_from_thread( this_thr ) ) );
+}
+
+
+/* ------------------------------------------------------------------------ */
+/* // TODO: Check the comment consistency
+ * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind of
+ * like a shadow of the kmp_team_t data struct, with a different lifetime.
+ * After a child thread checks into a barrier and calls __kmp_release() from
+ * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
+ * longer assume that the kmp_team_t structure is intact (at any moment, the
+ * master thread may exit the barrier code and free the team data structure,
+ * and return the threads to the thread pool).
+ *
+ * This does not work with the tasking code, as the thread is still
+ * expected to participate in the execution of any tasks that may have been
+ * spawned by a member of the team, and the thread still needs access to
+ * each thread in the team, so that it can steal work from it.
+ *
+ * Enter the kmp_task_team_t struct.  It employs a reference
+ * counting mechanism, and is allocated by the master thread before calling
+ * __kmp_<barrier_kind>_release, and then is released by the last thread to
+ * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
+ * of the kmp_task_team_t structs for consecutive barriers can overlap
+ * (and will, unless the master thread is the last thread to exit the barrier
+ * release phase, which is not typical).
+ *
+ * The existence of such a struct is useful outside the context of tasking,
+ * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
+ * so that any performance differences show up when comparing the 2.5 vs. 3.0
+ * libraries.
+ *
+ * We currently use the existence of the threads array as an indicator that
+ * tasks were spawned since the last barrier.  If the structure is to be
+ * useful outside the context of tasking, then this will have to change, but
+ * not setting the field minimizes the performance impact of tasking on
+ * barriers, when no explicit tasks were spawned (pushed, actually).
+ */
+
+
+static kmp_task_team_t *__kmp_free_task_teams = NULL;           // Free list for task_team data structures
+// Lock for task team data structures
+static kmp_bootstrap_lock_t __kmp_task_team_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_task_team_lock );
+
+
+//------------------------------------------------------------------------------
+// __kmp_alloc_task_deque:
+// Allocates a task deque for a particular thread, and initializes the necessary
+// data structures relating to the deque.  This only happens once per thread
+// per task team since task teams are recycled.
+// No lock is needed during allocation since each thread allocates its own
+// deque.
+
+static void
+__kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data )
+{
+    __kmp_init_bootstrap_lock( & thread_data -> td.td_deque_lock );
+    KMP_DEBUG_ASSERT( thread_data -> td.td_deque == NULL );
+
+    // Initialize last stolen task field to "none"
+    thread_data -> td.td_deque_last_stolen = -1;
+
+    KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) == 0 );
+    KMP_DEBUG_ASSERT( thread_data -> td.td_deque_head == 0 );
+    KMP_DEBUG_ASSERT( thread_data -> td.td_deque_tail == 0 );
+
+    KE_TRACE( 10, ( "__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
+                   __kmp_gtid_from_thread( thread ), TASK_DEQUE_SIZE, thread_data ) );
+    // Allocate space for task deque, and zero the deque
+    // Cannot use __kmp_thread_calloc() because threads are not around for
+    // __kmp_reap_task_teams().
+    thread_data -> td.td_deque = (kmp_taskdata_t **)
+            __kmp_allocate( TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
+}
+
+
+//------------------------------------------------------------------------------
+// __kmp_free_task_deque:
+// Deallocates a task deque for a particular thread.
+// Happens at library deallocation, so we don't need to reset all thread data fields.
+
+static void
+__kmp_free_task_deque( kmp_thread_data_t *thread_data )
+{
+    __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
+
+    if ( thread_data -> td.td_deque != NULL ) {
+        TCW_4(thread_data -> td.td_deque_ntasks, 0);
+        __kmp_free( thread_data -> td.td_deque );
+        thread_data -> td.td_deque = NULL;
+    }
+    __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
+
+#ifdef BUILD_TIED_TASK_STACK
+    // GEH: Figure out what to do here for td_susp_tied_tasks
+    if ( thread_data -> td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY ) {
+        __kmp_free_task_stack( __kmp_thread_from_gtid( gtid ), thread_data );
+    }
+#endif // BUILD_TIED_TASK_STACK
+}
+
+
+//------------------------------------------------------------------------------
+// __kmp_realloc_task_threads_data:
+// Allocates a threads_data array for a task team, either by allocating an initial
+// array or enlarging an existing array.  Only the first thread to get the lock
+// allocates or enlarges the array and re-initializes the array elements.
+// That thread returns "TRUE", the rest return "FALSE".
+// Assumes that the new array size is given by task_team -> tt.tt_nproc.
+// The current size is given by task_team -> tt.tt_max_threads.
+
+static int
+__kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team )
+{
+    kmp_thread_data_t ** threads_data_p;
+    kmp_int32            nthreads, maxthreads;
+    int                  is_init_thread = FALSE;
+
+    if ( TCR_4(task_team -> tt.tt_found_tasks) ) {
+        // Already reallocated and initialized.
+        return FALSE;
+    }
+
+    threads_data_p = & task_team -> tt.tt_threads_data;
+    nthreads   = task_team -> tt.tt_nproc;
+    maxthreads = task_team -> tt.tt_max_threads;
+
+    // All threads must lock when they encounter the first task of the implicit task
+    // region to make sure threads_data fields are (re)initialized before they are used.
+    __kmp_acquire_bootstrap_lock( & task_team -> tt.tt_threads_lock );
+
+    if ( ! TCR_4(task_team -> tt.tt_found_tasks) ) {
+        // first thread to enable tasking
+        kmp_team_t *team = thread -> th.th_team;
+        int i;
+
+        is_init_thread = TRUE;
+        if ( maxthreads < nthreads ) {
+
+            if ( *threads_data_p != NULL ) {
+                kmp_thread_data_t *old_data = *threads_data_p;
+                kmp_thread_data_t *new_data = NULL;
+
+                KE_TRACE( 10, ( "__kmp_realloc_task_threads_data: T#%d reallocating "
+                               "threads data for task_team %p, new_size = %d, old_size = %d\n",
+                               __kmp_gtid_from_thread( thread ), task_team,
+                               nthreads, maxthreads ) );
+                // Reallocate threads_data to have more elements than current array
+                // Cannot use __kmp_thread_realloc() because threads are not around for
+                // __kmp_reap_task_teams().  Note all new array entries are initialized
+                // to zero by __kmp_allocate().
+                new_data = (kmp_thread_data_t *)
+                            __kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
+                // copy old data to new data
+                KMP_MEMCPY_S( (void *) new_data, nthreads * sizeof(kmp_thread_data_t),
+                              (void *) old_data,
+                              maxthreads * sizeof(kmp_thread_data_t) );
+
+#ifdef BUILD_TIED_TASK_STACK
+                // GEH: Figure out if this is the right thing to do
+                // Note: *threads_data_p still points at old_data here, so use new_data
+                for (i = maxthreads; i < nthreads; i++) {
+                    kmp_thread_data_t *thread_data = & new_data[i];
+                    __kmp_init_task_stack( __kmp_gtid_from_thread( thread ), thread_data );
+                }
+#endif // BUILD_TIED_TASK_STACK
+                // Install the new data and free the old data
+                (*threads_data_p) = new_data;
+                __kmp_free( old_data );
+            }
+            else {
+                KE_TRACE( 10, ( "__kmp_realloc_task_threads_data: T#%d allocating "
+                               "threads data for task_team %p, size = %d\n",
+                               __kmp_gtid_from_thread( thread ), task_team, nthreads ) );
+                // Make the initial allocate for threads_data array, and zero entries
+                // Cannot use __kmp_thread_calloc() because threads are not around for
+                // __kmp_reap_task_teams().
+                *threads_data_p = (kmp_thread_data_t *)
+                                  __kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
+#ifdef BUILD_TIED_TASK_STACK
+                // GEH: Figure out if this is the right thing to do
+                for (i = 0; i < nthreads; i++) {
+                    kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
+                    __kmp_init_task_stack( __kmp_gtid_from_thread( thread ), thread_data );
+                }
+#endif // BUILD_TIED_TASK_STACK
+            }
+            task_team -> tt.tt_max_threads = nthreads;
+        }
+        else {
+            // If array has (more than) enough elements, go ahead and use it
+            KMP_DEBUG_ASSERT( *threads_data_p != NULL );
+        }
+
+        // initialize threads_data pointers back to thread_info structures
+        for (i = 0; i < nthreads; i++) {
+            kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
+            thread_data -> td.td_thr = team -> t.t_threads[i];
+
+            if ( thread_data -> td.td_deque_last_stolen >= nthreads) {
+                // The last stolen field survives across teams / barrier, and the number
+                // of threads may have changed.  It's possible (likely?) that a new
+                // parallel region will exhibit the same behavior as the previous region.
+                thread_data -> td.td_deque_last_stolen = -1;
+            }
+        }
+
+        KMP_MB();
+        TCW_SYNC_4(task_team -> tt.tt_found_tasks, TRUE);
+    }
+
+    __kmp_release_bootstrap_lock( & task_team -> tt.tt_threads_lock );
+    return is_init_thread;
+}
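+
+// Editor's note: the routine above is a double-checked initialization:
+// tt_found_tasks is tested before and after taking tt_threads_lock, so only
+// the first thread pays for the (re)allocation.  A minimal sketch of the
+// pattern with hypothetical names:
+#if 0
+    if ( !TCR_4( initialized ) ) {                  // cheap unlocked check
+        __kmp_acquire_bootstrap_lock( &init_lock );
+        if ( !TCR_4( initialized ) ) {              // re-check under the lock
+            /* ... allocate or enlarge the shared array ... */
+            KMP_MB();                               // publish the data first
+            TCW_SYNC_4( initialized, TRUE );        // then set the flag
+        }
+        __kmp_release_bootstrap_lock( &init_lock );
+    }
+#endif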
+
+
+//------------------------------------------------------------------------------
+// __kmp_free_task_threads_data:
+// Deallocates a threads_data array for a task team, including any attached
+// tasking deques.  Only occurs at library shutdown.
+
+static void
+__kmp_free_task_threads_data( kmp_task_team_t *task_team )
+{
+    __kmp_acquire_bootstrap_lock( & task_team -> tt.tt_threads_lock );
+    if ( task_team -> tt.tt_threads_data != NULL ) {
+        int i;
+        for (i = 0; i < task_team->tt.tt_max_threads; i++ ) {
+            __kmp_free_task_deque( & task_team -> tt.tt_threads_data[i] );
+        }
+        __kmp_free( task_team -> tt.tt_threads_data );
+        task_team -> tt.tt_threads_data = NULL;
+    }
+    __kmp_release_bootstrap_lock( & task_team -> tt.tt_threads_lock );
+}
+
+
+//------------------------------------------------------------------------------
+// __kmp_allocate_task_team:
+// Allocates a task team associated with a specific team, taking it from
+// the global task team free list if possible.  Also initializes data structures.
+
+static kmp_task_team_t *
+__kmp_allocate_task_team( kmp_info_t *thread, kmp_team_t *team )
+{
+    kmp_task_team_t *task_team = NULL;
+    int nthreads;
+
+    KA_TRACE( 20, ( "__kmp_allocate_task_team: T#%d entering; team = %p\n",
+                    (thread ? __kmp_gtid_from_thread( thread ) : -1), team ) );
+
+    if (TCR_PTR(__kmp_free_task_teams) != NULL) {
+        // Take a task team from the task team pool
+        __kmp_acquire_bootstrap_lock( &__kmp_task_team_lock );
+        if (__kmp_free_task_teams != NULL) {
+            task_team = __kmp_free_task_teams;
+            TCW_PTR(__kmp_free_task_teams, task_team -> tt.tt_next);
+            task_team -> tt.tt_next = NULL;
+        }
+        __kmp_release_bootstrap_lock( &__kmp_task_team_lock );
+    }
+
+    if (task_team == NULL) {
+        KE_TRACE( 10, ( "__kmp_allocate_task_team: T#%d allocating "
+                       "task team for team %p\n",
+                       __kmp_gtid_from_thread( thread ), team ) );
+        // Allocate a new task team if one is not available.
+        // Cannot use __kmp_thread_malloc() because threads are not around for
+        // __kmp_reap_task_teams().
+        task_team = (kmp_task_team_t *) __kmp_allocate( sizeof(kmp_task_team_t) );
+        __kmp_init_bootstrap_lock( & task_team -> tt.tt_threads_lock );
+        //task_team -> tt.tt_threads_data = NULL;   // AC: __kmp_allocate zeroes returned memory
+        //task_team -> tt.tt_max_threads = 0;
+        //task_team -> tt.tt_next = NULL;
+    }
+
+    TCW_4(task_team -> tt.tt_found_tasks, FALSE);
+#if OMP_41_ENABLED
+    TCW_4(task_team -> tt.tt_found_proxy_tasks, FALSE);
+#endif
+    task_team -> tt.tt_nproc = nthreads = team->t.t_nproc;
+
+    TCW_4( task_team -> tt.tt_unfinished_threads, nthreads );
+    TCW_4( task_team -> tt.tt_active, TRUE );
+    TCW_4( task_team -> tt.tt_ref_ct, nthreads - 1);
+
+    KA_TRACE( 20, ( "__kmp_allocate_task_team: T#%d exiting; task_team = %p\n",
+                    (thread ? __kmp_gtid_from_thread( thread ) : -1), task_team ) );
+    return task_team;
+}
+
+
+//------------------------------------------------------------------------------
+// __kmp_free_task_team:
+// Frees the task team associated with a specific thread, and adds it
+// to the global task team free list.
+//
+
+static void
+__kmp_free_task_team( kmp_info_t *thread, kmp_task_team_t *task_team )
+{
+    KA_TRACE( 20, ( "__kmp_free_task_team: T#%d task_team = %p\n",
+                    thread ? __kmp_gtid_from_thread( thread ) : -1, task_team ) );
+
+    KMP_DEBUG_ASSERT( TCR_4(task_team -> tt.tt_ref_ct) == 0 );
+
+    // Put task team back on free list
+    __kmp_acquire_bootstrap_lock( & __kmp_task_team_lock );
+
+    KMP_DEBUG_ASSERT( task_team -> tt.tt_next == NULL );
+    task_team -> tt.tt_next = __kmp_free_task_teams;
+    TCW_4(task_team -> tt.tt_found_tasks, FALSE);
+    TCW_PTR(__kmp_free_task_teams, task_team);
+
+    __kmp_release_bootstrap_lock( & __kmp_task_team_lock );
+}
+
+
+//------------------------------------------------------------------------------
+// __kmp_reap_task_teams:
+// Free all the task teams on the task team free list.
+// Should only be done during library shutdown.
+// Cannot do anything that needs a thread structure or gtid since they are already gone.
+
+void
+__kmp_reap_task_teams( void )
+{
+    kmp_task_team_t   *task_team;
+
+    if ( TCR_PTR(__kmp_free_task_teams) != NULL ) {
+        // Free all task_teams on the free list
+        __kmp_acquire_bootstrap_lock( &__kmp_task_team_lock );
+        while ( ( task_team = __kmp_free_task_teams ) != NULL ) {
+            __kmp_free_task_teams = task_team -> tt.tt_next;
+            task_team -> tt.tt_next = NULL;
+
+            // Free threads_data if necessary
+            if ( task_team -> tt.tt_threads_data != NULL ) {
+                __kmp_free_task_threads_data( task_team );
+            }
+            __kmp_free( task_team );
+        }
+        __kmp_release_bootstrap_lock( &__kmp_task_team_lock );
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// __kmp_unref_task_team:
+// Remove one thread's reference to the task team structure by
+// decreasing the reference count, and deallocate the task team if there are
+// no more references to it.
+//
+void
+__kmp_unref_task_team( kmp_task_team_t *task_team, kmp_info_t *thread )
+{
+    kmp_uint ref_ct;
+
+    ref_ct = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& task_team->tt.tt_ref_ct) ) - 1;
+
+    KA_TRACE( 20, ( "__kmp_unref_task_team: T#%d task_team = %p ref_ct = %d\n",
+                    __kmp_gtid_from_thread( thread ), task_team, ref_ct ) );
+
+    if ( ref_ct == 0 ) {
+        __kmp_free_task_team( thread, task_team );
+    }
+
+    TCW_PTR( *((volatile kmp_task_team_t **)(&thread->th.th_task_team)), NULL );
+}
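+
+// Editor's note: KMP_TEST_THEN_DEC32 returns the value *before* the decrement,
+// so the "- 1" above yields the new count and exactly one thread sees zero.
+// A minimal sketch of the same release pattern (hypothetical names):
+#if 0
+    kmp_uint32 new_ct = KMP_TEST_THEN_DEC32( (kmp_int32 *)&obj->ref_ct ) - 1;
+    if ( new_ct == 0 )
+        release_object( obj );   // only the thread that dropped the last ref
+#endif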
+
+
+//------------------------------------------------------------------------------
+// __kmp_wait_to_unref_task_teams:
+// Some threads could still be in the fork barrier release code, possibly
+// trying to steal tasks.  Wait for each thread to unreference its task team.
+//
+void
+__kmp_wait_to_unref_task_teams(void)
+{
+    kmp_info_t *thread;
+    kmp_uint32 spins;
+    int done;
+
+    KMP_INIT_YIELD( spins );
+
+
+    for (;;) {
+        done = TRUE;
+
+        // TODO: GEH - this may be wrong because some sync would be necessary
+        //             in case threads are added to the pool during the traversal.
+        //             Need to verify that the lock for the thread pool is held
+        //             when calling this routine.
+        for (thread = (kmp_info_t *)__kmp_thread_pool;
+             thread != NULL;
+             thread = thread->th.th_next_pool)
+        {
+#if KMP_OS_WINDOWS
+            DWORD exit_val;
+#endif
+            if ( TCR_PTR(thread->th.th_task_team) == NULL ) {
+                KA_TRACE( 10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
+                               __kmp_gtid_from_thread( thread ) ) );
+                continue;
+            }
+#if KMP_OS_WINDOWS
+            // TODO: GEH - add this check for Linux* OS / OS X* as well?
+            if (!__kmp_is_thread_alive(thread, &exit_val)) {
+                if (TCR_PTR(thread->th.th_task_team) != NULL) {
+                    __kmp_unref_task_team( thread->th.th_task_team, thread );
+                }
+                continue;
+            }
+#endif
+
+            done = FALSE;  // Because th_task_team pointer is not NULL for this thread
+
+            KA_TRACE( 10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to unreference task_team\n",
+                           __kmp_gtid_from_thread( thread ) ) );
+
+            if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
+                volatile void *sleep_loc;
+                // If the thread is sleeping, awaken it.
+                if ( ( sleep_loc = TCR_PTR( thread->th.th_sleep_loc) ) != NULL ) {
+                    KA_TRACE( 10, ( "__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
+                                    __kmp_gtid_from_thread( thread ), __kmp_gtid_from_thread( thread ) ) );
+                    __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
+                }
+            }
+        }
+        if (done) {
+            break;
+        }
+
+        // If we are oversubscribed,
+        // or have waited a bit (and library mode is throughput), yield.
+        // Pause is in the following code.
+        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
+        KMP_YIELD_SPIN( spins );        // Yields only if KMP_LIBRARY=throughput
+    }
+
+
+}
+
+
+//------------------------------------------------------------------------------
+// __kmp_task_team_setup:  Create a task_team for the current team, but use
+// an already created, unused one if it already exists.
+// This may be called by any thread, but only for teams with # threads >1.
+void
+__kmp_task_team_setup( kmp_info_t *this_thr, kmp_team_t *team, int both, int always )
+{
+    KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
+
+    if ( ( team->t.t_task_team[this_thr->th.th_task_state] == NULL ) && ( always || team->t.t_nproc > 1 ) ) {
+        // Allocate a new task team, which will be propagated to
+        // all of the worker threads after the barrier.  As they
+        // spin in the barrier release phase, they will continue
+        // to use the previous task team struct, until they receive
+        // the signal to stop checking for tasks (they can't safely
+        // reference the kmp_team_t struct, which could be reallocated
+        // by the master thread).
+        team->t.t_task_team[this_thr->th.th_task_state] = __kmp_allocate_task_team( this_thr, team );
+        KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p for team %d\n",
+                      __kmp_gtid_from_thread(this_thr), team->t.t_task_team[this_thr->th.th_task_state],
+                        ((team != NULL) ? team->t.t_id : -1)) );
+    }
+    //else
+        // All threads have reported in, and no tasks were spawned
+        // for this release->gather region.  Leave the old task
+        // team struct in place for the upcoming region.  No task
+        // teams are formed for serialized teams.
+    if (both) {
+        int other_team = 1 - this_thr->th.th_task_state;
+        if ( ( team->t.t_task_team[other_team] == NULL ) && ( team->t.t_nproc > 1 ) ) { // setup other team as well
+            team->t.t_task_team[other_team] = __kmp_allocate_task_team( this_thr, team );
+            KA_TRACE( 20, ( "__kmp_task_team_setup: Master T#%d created new task_team %p for team %d\n",
+                            __kmp_gtid_from_thread( this_thr ), team->t.t_task_team[other_team],
+                            ((team != NULL) ? team->t.t_id : -1)) );
+        }
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// __kmp_task_team_sync: Propagation of task team data from team to threads
+// which happens just after the release phase of a team barrier.  This may be
+// called by any thread, but only for teams with # threads > 1.
+
+void
+__kmp_task_team_sync( kmp_info_t *this_thr, kmp_team_t *team )
+{
+    KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
+
+    // In case this thread never saw that the task team was no longer active, unref/deallocate it now.
+    if ( this_thr->th.th_task_team != NULL ) {
+        if ( ! TCR_SYNC_4( this_thr->th.th_task_team->tt.tt_active ) ) {
+            KMP_DEBUG_ASSERT( ! KMP_MASTER_TID( __kmp_tid_from_gtid( __kmp_gtid_from_thread( this_thr ) ) ) );
+            __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
+        } else {  // We are re-using a task team that was never enabled.
+            KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
+        }
+    }
+
+    // Toggle the th_task_state field, to switch which task_team this thread refers to
+    this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
+    // It is now safe to propagate the task team pointer from the team struct to the current thread.
+    TCW_PTR(this_thr->th.th_task_team, team->t.t_task_team[this_thr->th.th_task_state]);
+    KA_TRACE( 20, ( "__kmp_task_team_sync: Thread T#%d task team assigned pointer (%p), value %p, from Team #%d task team\n",
+                    __kmp_gtid_from_thread( this_thr ), &this_thr->th.th_task_team,
+                    this_thr->th.th_task_team, ((team != NULL) ? (team->t.t_id) : -1) ) );
+}
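+
+// Editor's note: th_task_state only ever holds 0 or 1, so the toggle above
+// double-buffers between t_task_team[0] and t_task_team[1]; the task teams of
+// two consecutive barriers can then overlap without a race.  Sketch:
+#if 0
+    state = 1 - state;                              // 0 <-> 1 each barrier
+    my_task_team = team->t.t_task_team[ state ];    // other slot stays live
+#endif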
+
+
+//------------------------------------------------------------------------------
+// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
+// barrier gather phase.  Only called by master thread if #threads in team > 1 or if proxy tasks were created
+void
+__kmp_task_team_wait( kmp_info_t *this_thr, kmp_team_t *team
+                      USE_ITT_BUILD_ARG(void * itt_sync_obj)
+                      )
+{
+    kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
+
+    KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
+    KMP_DEBUG_ASSERT( task_team == this_thr->th.th_task_team );
+
+    if ( ( task_team != NULL ) && KMP_TASKING_ENABLED(task_team) ) {
+        KA_TRACE( 20, ( "__kmp_task_team_wait: Master T#%d waiting for all tasks: task_team = %p\n",
+                          __kmp_gtid_from_thread( this_thr ), task_team ) );
+        // All worker threads might have dropped through to the release phase, but could still
+        // be executing tasks. Wait here for all tasks to complete.  To avoid memory contention,
+        // only the master thread checks for the termination condition.
+        kmp_flag_32 flag(&task_team->tt.tt_unfinished_threads, 0U);
+        flag.wait(this_thr, TRUE
+                  USE_ITT_BUILD_ARG(itt_sync_obj));
+
+        // Kill the old task team, so that the worker threads will stop referencing it while spinning.
+        // They will deallocate it when the reference count reaches zero.
+        // The master thread is not included in the ref count.
+        KA_TRACE( 20, ( "__kmp_task_team_wait: Master T#%d deactivating task_team %p\n",
+                          __kmp_gtid_from_thread( this_thr ), task_team ) );
+#if OMP_41_ENABLED
+        KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 || task_team->tt.tt_found_proxy_tasks == TRUE );
+        TCW_SYNC_4( task_team->tt.tt_found_proxy_tasks, FALSE );
+#else
+        KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 );
+#endif
+        TCW_SYNC_4( task_team->tt.tt_active, FALSE );
+        KMP_MB();
+
+        TCW_PTR(this_thr->th.th_task_team, NULL);
+        team->t.t_task_team[this_thr->th.th_task_state] = NULL;
+    }
+}
+
+
+//------------------------------------------------------------------------------
+// __kmp_tasking_barrier:
+// Internal function to execute all tasks prior to a regular barrier or a
+// join barrier.  It is a full barrier itself, which unfortunately turns
+// regular barriers into double barriers and join barriers into 1 1/2
+// barriers.
+// This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
+
+void
+__kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid )
+{
+    volatile kmp_uint32 *spin = &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads;
+    int flag = FALSE;
+    KMP_DEBUG_ASSERT( __kmp_tasking_mode == tskm_extra_barrier );
+
+#if USE_ITT_BUILD
+    KMP_FSYNC_SPIN_INIT( spin, (kmp_uint32*) NULL );
+#endif /* USE_ITT_BUILD */
+    kmp_flag_32 spin_flag(spin, 0U);
+    while (! spin_flag.execute_tasks(thread, gtid, TRUE, &flag
+                                     USE_ITT_BUILD_ARG(NULL), 0 ) ) {
+#if USE_ITT_BUILD
+        // TODO: What about itt_sync_obj??
+        KMP_FSYNC_SPIN_PREPARE( spin );
+#endif /* USE_ITT_BUILD */
+
+        if( TCR_4(__kmp_global.g.g_done) ) {
+            if( __kmp_global.g.g_abort )
+                __kmp_abort_thread( );
+            break;
+        }
+        KMP_YIELD( TRUE );       // GH: We always yield here
+    }
+#if USE_ITT_BUILD
+    KMP_FSYNC_SPIN_ACQUIRED( (void*) spin );
+#endif /* USE_ITT_BUILD */
+}
+
+
+#if OMP_41_ENABLED
+
+/* __kmp_give_task puts a task into a given thread's queue if:
+    - the queue for that thread has been created
+    - there's space in that queue
+
+    Because of this, __kmp_push_task needs to check if there's space after getting the lock
+ */
+static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task )
+{
+    kmp_task_team_t *   task_team = thread->th.th_task_team;
+    kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
+    kmp_taskdata_t *    taskdata = KMP_TASK_TO_TASKDATA(task);
+    bool result = false;
+
+    KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", taskdata, tid ) );
+
+    // assert tasking is enabled? what if not?
+    KMP_DEBUG_ASSERT( task_team != NULL );
+
+    if (thread_data -> td.td_deque == NULL ) {
+        // There's no queue in this thread, go find another one
+        // We're guaranteed that at least one thread has a queue
+        KA_TRACE(30, ("__kmp_give_task: thread %d has no queue while giving task %p.\n", tid, taskdata ) );
+        return result;
+    }
+
+    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+    {
+        KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
+        return result;
+    }
+
+    __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
+
+    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+    {
+        KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
+        goto release_and_exit;
+    }
+
+    thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;
+    // Wrap index.
+    thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
+    TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);
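+    // Illustrative example of the wrap above, assuming TASK_DEQUE_SIZE is a
+    // power of two and TASK_DEQUE_MASK == TASK_DEQUE_SIZE - 1 (e.g. size 256,
+    // mask 255):
+    //     tail == 255:  (255 + 1) & 255 == 0    -- wraps back to slot 0
+    //     tail ==  17:  ( 17 + 1) & 255 == 18   -- normal advance, no division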
+
+    result = true;
+    KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", taskdata, tid ) );
+
+release_and_exit:
+    __kmp_release_bootstrap_lock( & thread_data-> td.td_deque_lock );
+
+    return result;
+}
+
+
+/* The finish of a proxy task is divided into two pieces:
+    - the top half is the part that can be done from a thread outside the team
+    - the bottom half must be run from a thread within the team
+
+    In order to run the bottom half, the task gets queued back into one of the threads of the team.
+    Once the td_incomplete_child_tasks counter of the parent is decremented, the threads can leave the barriers.
+    So, the bottom half needs to be queued before the counter is decremented. The top half is therefore divided into two parts:
+    - things that can be run before queuing the bottom half
+    - things that must be run after queuing the bottom half
+
+    This creates a second race, as the bottom half can free the task before the second top half is executed. To avoid this,
+    we use the td_incomplete_child_tasks counter of the proxy task to synchronize the top and bottom halves.
+*/
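+/* Sketch of the resulting protocol (illustrative):
+
+     releasing thread (possibly outside the team):
+       1) first top half: mark the task complete and bump the proxy's own
+          incomplete-child count
+       2) __kmp_give_task: queue the bottom half on a team thread
+       3) second top half: decrement the parent's incomplete-child count,
+          then drop the proxy's own count
+
+     team thread:
+       4) bottom half: spin until the proxy's own count drops (i.e. step 3
+          has run), then release dependencies and free the task
+ */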
+
+static void __kmp_first_top_half_finish_proxy( kmp_taskdata_t * taskdata )
+{
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
+
+    taskdata -> td_flags.complete = 1;   // mark the task as completed
+
+    if ( taskdata->td_taskgroup )
+       KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
+
+    // Create an imaginary child for this task so the bottom half cannot release the task before we have completed the second top half
+    TCR_4(taskdata->td_incomplete_child_tasks++);
+}
+
+static void __kmp_second_top_half_finish_proxy( kmp_taskdata_t * taskdata )
+{
+    kmp_int32 children = 0;
+
+    // Predecrement simulated by "- 1" calculation
+    children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
+    KMP_DEBUG_ASSERT( children >= 0 );
+
+    // Remove the imaginary children
+    TCR_4(taskdata->td_incomplete_child_tasks--);
+}
+
+static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask )
+{
+    kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
+    kmp_info_t * thread = __kmp_threads[ gtid ];
+
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
+    KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 1 ); // top half must run before bottom half
+
+    // We need to wait to make sure the top half is finished
+    // Spinning here should be ok as this should happen quickly
+    while ( TCR_4(taskdata->td_incomplete_child_tasks) > 0 ) ;
+
+    __kmp_release_deps(gtid,taskdata);
+    __kmp_free_task_and_ancestors(gtid, taskdata, thread);
+}
+
+/*!
+@ingroup TASKING
+@param gtid Global Thread ID of encountering thread
+@param ptask Task whose execution is completed
+
+Execute the completion of a proxy task from a thread that is part of the team. Runs both top halves and the bottom half directly.
+*/
+void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask )
+{
+    KMP_DEBUG_ASSERT( ptask != NULL );
+    kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
+    KA_TRACE(10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", gtid, taskdata ) );
+
+    KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
+
+    __kmp_first_top_half_finish_proxy(taskdata);
+    __kmp_second_top_half_finish_proxy(taskdata);
+    __kmp_bottom_half_finish_proxy(gtid,ptask);
+
+    KA_TRACE(10, ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", gtid, taskdata ) );
+}
+
+/*!
+@ingroup TASKING
+@param ptask Task whose execution is completed
+
+Execute the completion of a proxy task from a thread that could not belong to the team.
+*/
+void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
+{
+    KMP_DEBUG_ASSERT( ptask != NULL );
+    kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
+
+    KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", taskdata ) );
+
+    KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
+
+    __kmp_first_top_half_finish_proxy(taskdata);
+
+    // Enqueue the task so the bottom half is completed from a thread within the corresponding team
+    kmp_team_t * team = taskdata->td_team;
+    kmp_int32 nthreads = team->t.t_nproc;
+    kmp_info_t *thread;
+    kmp_int32 k = 0;
+
+    do {
+        // This should be similar to k = __kmp_get_random( thread ) % nthreads, but we cannot use __kmp_get_random here.
+        // For now we just try threads linearly.
+        k = (k+1) % nthreads;
+        thread = team->t.t_threads[k];
+    } while ( !__kmp_give_task( thread, k,  ptask ) );
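+    // Note: with more than one thread in the team, k starts at 0 and is
+    // incremented before use, so the probe begins at thread 1 and only reaches
+    // the master (k == 0) after wrapping around.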
+
+    __kmp_second_top_half_finish_proxy(taskdata);
+
+    KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
+}
+
+#endif
diff --git a/final/runtime/src/kmp_taskq.c b/final/runtime/src/kmp_taskq.c
new file mode 100644
index 0000000..3079d45
--- /dev/null
+++ b/final/runtime/src/kmp_taskq.c
@@ -0,0 +1,2032 @@
+/*
+ * kmp_taskq.c -- TASKQ support for OpenMP.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_error.h"
+
+#define MAX_MESSAGE 512
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/*
+ * Taskq routines and global variables
+ */
+
+#define KMP_DEBUG_REF_CTS(x)    KF_TRACE(1, x);
+
+#define THREAD_ALLOC_FOR_TASKQ
+
+static int
+in_parallel_context( kmp_team_t *team )
+{
+    return ! team -> t.t_serialized;
+}
+
+static void
+__kmp_taskq_eo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
+{
+    int                gtid = *gtid_ref;
+    int                tid  = __kmp_tid_from_gtid( gtid );
+    kmp_uint32         my_token;
+    kmpc_task_queue_t *taskq;
+    kmp_taskq_t       *tq   = & __kmp_threads[gtid] -> th.th_team -> t.t_taskq;
+
+    if ( __kmp_env_consistency_check )
+#if KMP_USE_DYNAMIC_LOCK
+        __kmp_push_sync( gtid, ct_ordered_in_taskq, loc_ref, NULL, 0 );
+#else
+        __kmp_push_sync( gtid, ct_ordered_in_taskq, loc_ref, NULL );
+#endif
+
+    if ( ! __kmp_threads[ gtid ]-> th.th_team -> t.t_serialized ) {
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+        /* GEH - need check here under stats to make sure   */
+        /*       inside task (curr_thunk[*tid_ref] != NULL) */
+
+        my_token = tq->tq_curr_thunk[ tid ]-> th_tasknum;
+
+        taskq = tq->tq_curr_thunk[ tid ]-> th.th_shareds -> sv_queue;
+
+        KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
+        KMP_MB();
+    }
+}
+
+static void
+__kmp_taskq_xo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
+{
+    int           gtid = *gtid_ref;
+    int           tid  = __kmp_tid_from_gtid( gtid );
+    kmp_uint32    my_token;
+    kmp_taskq_t  *tq   = & __kmp_threads[gtid] -> th.th_team -> t.t_taskq;
+
+    if ( __kmp_env_consistency_check )
+        __kmp_pop_sync( gtid, ct_ordered_in_taskq, loc_ref );
+
+    if ( ! __kmp_threads[ gtid ]-> th.th_team -> t.t_serialized ) {
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+        /* GEH - need check here under stats to make sure */
+        /*       inside task (curr_thunk[tid] != NULL)    */
+
+        my_token = tq->tq_curr_thunk[ tid ]->th_tasknum;
+
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+        tq->tq_curr_thunk[ tid ]-> th.th_shareds -> sv_queue -> tq_tasknum_serving = my_token + 1;
+
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+    }
+}
+
+static void
+__kmp_taskq_check_ordered( kmp_int32 gtid, kmpc_thunk_t *thunk )
+{
+    kmp_uint32 my_token;
+    kmpc_task_queue_t *taskq;
+
+    /* assume we are always called from an active parallel context */
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    my_token =  thunk -> th_tasknum;
+
+    taskq =  thunk -> th.th_shareds -> sv_queue;
+
+    if(taskq->tq_tasknum_serving <= my_token) {
+        KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
+        KMP_MB();
+        taskq->tq_tasknum_serving = my_token + 1;
+        KMP_MB();
+    }
+}
+
+#ifdef KMP_DEBUG
+
+static void
+__kmp_dump_TQF(kmp_int32 flags)
+{
+    if (flags & TQF_IS_ORDERED)
+        __kmp_printf("ORDERED ");
+    if (flags & TQF_IS_LASTPRIVATE)
+        __kmp_printf("LAST_PRIV ");
+    if (flags & TQF_IS_NOWAIT)
+        __kmp_printf("NOWAIT ");
+    if (flags & TQF_HEURISTICS)
+        __kmp_printf("HEURIST ");
+    if (flags & TQF_INTERFACE_RESERVED1)
+        __kmp_printf("RESERV1 ");
+    if (flags & TQF_INTERFACE_RESERVED2)
+        __kmp_printf("RESERV2 ");
+    if (flags & TQF_INTERFACE_RESERVED3)
+        __kmp_printf("RESERV3 ");
+    if (flags & TQF_INTERFACE_RESERVED4)
+        __kmp_printf("RESERV4 ");
+    if (flags & TQF_IS_LAST_TASK)
+        __kmp_printf("LAST_TASK ");
+    if (flags & TQF_TASKQ_TASK)
+        __kmp_printf("TASKQ_TASK ");
+    if (flags & TQF_RELEASE_WORKERS)
+        __kmp_printf("RELEASE ");
+    if (flags & TQF_ALL_TASKS_QUEUED)
+        __kmp_printf("ALL_QUEUED ");
+    if (flags & TQF_PARALLEL_CONTEXT)
+        __kmp_printf("PARALLEL ");
+    if (flags & TQF_DEALLOCATED)
+        __kmp_printf("DEALLOC ");
+    if (!(flags & (TQF_INTERNAL_FLAGS|TQF_INTERFACE_FLAGS)))
+        __kmp_printf("(NONE)");
+}
+
+static void
+__kmp_dump_thunk( kmp_taskq_t *tq, kmpc_thunk_t *thunk, kmp_int32 global_tid )
+{
+    int i;
+    int nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
+
+    __kmp_printf("\tThunk at %p on (%d):  ", thunk, global_tid);
+
+    if (thunk != NULL) {
+        for (i = 0; i < nproc; i++) {
+            if( tq->tq_curr_thunk[i] == thunk ) {
+                __kmp_printf("[%i] ", i);
+            }
+        }
+        __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds);
+        __kmp_printf("th_task=%p, ", thunk->th_task);
+        __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk);
+        __kmp_printf("th_status=%d, ", thunk->th_status);
+        __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum);
+        __kmp_printf("th_flags="); __kmp_dump_TQF(thunk->th_flags);
+    }
+
+    __kmp_printf("\n");
+}
+
+static void
+__kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num)
+{
+    kmpc_thunk_t *th;
+
+    __kmp_printf("    Thunk stack for T#%d:  ", thread_num);
+
+    for (th = thunk; th != NULL; th = th->th_encl_thunk )
+        __kmp_printf("%p ", th);
+
+    __kmp_printf("\n");
+}
+
+static void
+__kmp_dump_task_queue( kmp_taskq_t *tq, kmpc_task_queue_t *queue, kmp_int32 global_tid )
+{
+    int                  qs, count, i;
+    kmpc_thunk_t        *thunk;
+    kmpc_task_queue_t   *taskq;
+
+    __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid);
+
+    if (queue != NULL) {
+        int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT;
+
+        if ( __kmp_env_consistency_check ) {
+            __kmp_printf("    tq_loc             : ");
+        }
+        if (in_parallel) {
+
+            //if (queue->tq.tq_parent != 0)
+                //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+
+            //__kmp_acquire_lock(& queue->tq_link_lck, global_tid);
+
+            KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                       /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+            __kmp_printf("    tq_parent          : %p\n", queue->tq.tq_parent);
+            __kmp_printf("    tq_first_child     : %p\n", queue->tq_first_child);
+            __kmp_printf("    tq_next_child      : %p\n", queue->tq_next_child);
+            __kmp_printf("    tq_prev_child      : %p\n", queue->tq_prev_child);
+            __kmp_printf("    tq_ref_count       : %d\n", queue->tq_ref_count);
+
+            //__kmp_release_lock(& queue->tq_link_lck, global_tid);
+
+            //if (queue->tq.tq_parent != 0)
+                //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+
+            //__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
+            //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
+
+            KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                       /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+        }
+
+        __kmp_printf("    tq_shareds         : ");
+        for (i=0; i<((queue == tq->tq_root) ? queue->tq_nproc : 1); i++)
+            __kmp_printf("%p ", queue->tq_shareds[i].ai_data);
+        __kmp_printf("\n");
+
+        if (in_parallel) {
+            __kmp_printf("    tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing);
+            __kmp_printf("    tq_tasknum_serving : %u\n", queue->tq_tasknum_serving);
+        }
+
+        __kmp_printf("    tq_queue           : %p\n", queue->tq_queue);
+        __kmp_printf("    tq_thunk_space     : %p\n", queue->tq_thunk_space);
+        __kmp_printf("    tq_taskq_slot      : %p\n", queue->tq_taskq_slot);
+
+        __kmp_printf("    tq_free_thunks     : ");
+        for (thunk = queue->tq_free_thunks; thunk != NULL; thunk = thunk->th.th_next_free )
+            __kmp_printf("%p ", thunk);
+        __kmp_printf("\n");
+
+        __kmp_printf("    tq_nslots          : %d\n", queue->tq_nslots);
+        __kmp_printf("    tq_head            : %d\n", queue->tq_head);
+        __kmp_printf("    tq_tail            : %d\n", queue->tq_tail);
+        __kmp_printf("    tq_nfull           : %d\n", queue->tq_nfull);
+        __kmp_printf("    tq_hiwat           : %d\n", queue->tq_hiwat);
+        __kmp_printf("    tq_flags           : "); __kmp_dump_TQF(queue->tq_flags);
+        __kmp_printf("\n");
+
+        if (in_parallel) {
+            __kmp_printf("    tq_th_thunks       : ");
+            for (i = 0; i < queue->tq_nproc; i++) {
+                __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data);
+            }
+            __kmp_printf("\n");
+        }
+
+        __kmp_printf("\n");
+        __kmp_printf("    Queue slots:\n");
+
+
+        qs = queue->tq_tail;
+        for ( count = 0; count < queue->tq_nfull; ++count ) {
+            __kmp_printf("(%d)", qs);
+            __kmp_dump_thunk( tq, queue->tq_queue[qs].qs_thunk, global_tid );
+            qs = (qs+1) % queue->tq_nslots;
+        }
+
+        __kmp_printf("\n");
+
+        if (in_parallel) {
+            if (queue->tq_taskq_slot != NULL) {
+                __kmp_printf("    TaskQ slot:\n");
+                __kmp_dump_thunk( tq, (kmpc_thunk_t *) queue->tq_taskq_slot, global_tid );
+                __kmp_printf("\n");
+            }
+            //__kmp_release_lock(& queue->tq_queue_lck, global_tid);
+            //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
+        }
+    }
+
+    __kmp_printf("    Taskq freelist: ");
+
+    //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
+
+    KMP_MB();  /* make sure data structures are in consistent state before querying them */
+               /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+    for( taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free )
+        __kmp_printf("%p ", taskq);
+
+    //__kmp_release_lock( & tq->tq_freelist_lck, global_tid );
+
+    __kmp_printf("\n\n");
+}
+
+static void
+__kmp_aux_dump_task_queue_tree( kmp_taskq_t *tq, kmpc_task_queue_t *curr_queue, kmp_int32 level, kmp_int32 global_tid )
+{
+    int i, count, qs;
+    int nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
+    kmpc_task_queue_t *queue = curr_queue;
+
+    if (curr_queue == NULL)
+        return;
+
+    __kmp_printf("    ");
+
+    for (i=0; i<level; i++)
+        __kmp_printf("  ");
+
+    __kmp_printf("%p", curr_queue);
+
+    for (i = 0; i < nproc; i++) {
+        if( tq->tq_curr_thunk[i] && tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue ) {
+            __kmp_printf(" [%i]", i);
+        }
+    }
+
+    __kmp_printf(":");
+
+    //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid);
+
+    KMP_MB();  /* make sure data structures are in consistent state before querying them */
+               /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+    qs = curr_queue->tq_tail;
+
+    for ( count = 0; count < curr_queue->tq_nfull; ++count ) {
+        __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk);
+         qs = (qs+1) % curr_queue->tq_nslots;
+    }
+
+    //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid);
+
+    __kmp_printf("\n");
+
+    if (curr_queue->tq_first_child) {
+        //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
+
+        KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                   /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+        if (curr_queue->tq_first_child) {
+            for(queue = (kmpc_task_queue_t *)curr_queue->tq_first_child;
+                queue != NULL;
+                queue = queue->tq_next_child) {
+                __kmp_aux_dump_task_queue_tree( tq, queue, level+1, global_tid );
+            }
+        }
+
+        //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
+    }
+}
+
+static void
+__kmp_dump_task_queue_tree( kmp_taskq_t *tq, kmpc_task_queue_t *tqroot, kmp_int32 global_tid)
+{
+    __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid);
+
+    __kmp_aux_dump_task_queue_tree( tq, tqroot, 0, global_tid );
+
+    __kmp_printf("\n");
+}
+#endif
+
+/* --------------------------------------------------------------------------- */
+
+/*
+    New taskq storage routines that try to minimize overhead of mallocs but
+    still provide cache line alignment.
+*/
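+/*
+   Layout sketch (illustrative): __kmp_taskq_allocate() over-allocates by
+   sizeof(void *) + CACHE_LINE bytes, rounds the address up to a cache-line
+   boundary, stashes the original malloc pointer in the word at that boundary,
+   and returns the address just past that word:
+
+       orig_addr          cache-line boundary   returned pointer
+       |                  |                     |
+       v                  v                     v
+       [ ... padding ... ][ orig_addr backptr ][ user data ... ]
+
+   __kmpc_taskq_free() reads the word just below the pointer it is handed to
+   recover orig_addr for the underlying free.
+*/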
+
+
+static void *
+__kmp_taskq_allocate(size_t size, kmp_int32 global_tid)
+{
+    void *addr, *orig_addr;
+    size_t bytes;
+
+    KB_TRACE( 5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int) size, global_tid ) );
+
+    bytes = sizeof(void *) + CACHE_LINE + size;
+
+#ifdef THREAD_ALLOC_FOR_TASKQ
+    orig_addr = (void *) __kmp_thread_malloc( __kmp_thread_from_gtid(global_tid), bytes );
+#else
+    KE_TRACE( 10, ("%%%%%% MALLOC( %d )\n", bytes ) );
+    orig_addr = (void *) KMP_INTERNAL_MALLOC( bytes );
+#endif /* THREAD_ALLOC_FOR_TASKQ */
+
+    if (orig_addr == 0)
+        KMP_FATAL( OutOfHeapMemory );
+
+    addr = orig_addr;
+
+    if (((kmp_uintptr_t) addr & ( CACHE_LINE - 1 )) != 0) {
+        KB_TRACE( 50, ("__kmp_taskq_allocate:  adjust for cache alignment\n" ) );
+        addr = (void *) (((kmp_uintptr_t) addr + CACHE_LINE) & ~( CACHE_LINE - 1 ));
+    }
+
+    (* (void **) addr) = orig_addr;
+
+    KB_TRACE( 10, ("__kmp_taskq_allocate:  allocate: %p, use: %p - %p, size: %d, gtid: %d\n",
+             orig_addr, ((void **) addr) + 1, ((char *)(((void **) addr) + 1)) + size-1,
+             (int) size, global_tid ));
+
+    return ( ((void **) addr) + 1 );
+}
+
+static void
+__kmpc_taskq_free(void *p, kmp_int32 global_tid)
+{
+    KB_TRACE( 5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid ) );
+
+    KB_TRACE(10, ("__kmpc_taskq_free:  freeing: %p, gtid: %d\n", (*( ((void **) p)-1)), global_tid ));
+
+#ifdef THREAD_ALLOC_FOR_TASKQ
+    __kmp_thread_free( __kmp_thread_from_gtid(global_tid), *( ((void **) p)-1) );
+#else
+    KMP_INTERNAL_FREE( *( ((void **) p)-1) );
+#endif /* THREAD_ALLOC_FOR_TASKQ */
+}
+
+/* --------------------------------------------------------------------------- */
+
+/*
+ *      Keep freed kmpc_task_queue_t on an internal freelist and recycle since
+ *      they're of constant size.
+ */
+
+static kmpc_task_queue_t *
+__kmp_alloc_taskq ( kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots, kmp_int32 nthunks,
+                    kmp_int32 nshareds, kmp_int32 nproc, size_t sizeof_thunk,
+                    size_t sizeof_shareds, kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid )
+{
+    kmp_int32                  i;
+    size_t                     bytes;
+    kmpc_task_queue_t          *new_queue;
+    kmpc_aligned_shared_vars_t *shared_var_array;
+    char                       *shared_var_storage;
+    char                       *pt; /* for doing byte-adjusted address computations */
+
+    __kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
+
+    KMP_MB();  /* make sure data structures are in consistent state before querying them */
+               /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+    if( tq->tq_freelist ) {
+        new_queue =  tq -> tq_freelist;
+        tq -> tq_freelist =  tq -> tq_freelist -> tq.tq_next_free;
+
+        KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED);
+
+        new_queue->tq_flags = 0;
+
+        __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
+    }
+    else {
+        __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
+
+        new_queue = (kmpc_task_queue_t *) __kmp_taskq_allocate (sizeof (kmpc_task_queue_t), global_tid);
+        new_queue->tq_flags = 0;
+    }
+
+    /*  space in the task queue for queue slots (allocate as one big chunk */
+    /* of storage including new_taskq_task space)                          */
+
+    sizeof_thunk += (CACHE_LINE - (sizeof_thunk % CACHE_LINE));         /* pad to cache line size */
+    pt = (char *) __kmp_taskq_allocate (nthunks * sizeof_thunk, global_tid);
+    new_queue->tq_thunk_space = (kmpc_thunk_t *)pt;
+    *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk);
+
+    /*  chain the allocated thunks into a freelist for this queue  */
+
+    new_queue->tq_free_thunks = (kmpc_thunk_t *)pt;
+
+    for (i = 0; i < (nthunks - 2); i++) {
+        ((kmpc_thunk_t *)(pt+i*sizeof_thunk))->th.th_next_free = (kmpc_thunk_t *)(pt + (i+1)*sizeof_thunk);
+#ifdef KMP_DEBUG
+        ((kmpc_thunk_t *)(pt+i*sizeof_thunk))->th_flags = TQF_DEALLOCATED;
+#endif
+    }
+
+    ((kmpc_thunk_t *)(pt+(nthunks-2)*sizeof_thunk))->th.th_next_free = NULL;
+#ifdef KMP_DEBUG
+    ((kmpc_thunk_t *)(pt+(nthunks-2)*sizeof_thunk))->th_flags = TQF_DEALLOCATED;
+#endif
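+    /* Illustrative: for nthunks == 4 the code above yields the freelist
+           thunk[0] -> thunk[1] -> thunk[2] -> NULL
+       while thunk[3], the last slot, is returned separately through
+       *new_taskq_thunk for use as the taskq task itself.                */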
+
+    /* initialize the locks */
+
+    if (in_parallel) {
+        __kmp_init_lock( & new_queue->tq_link_lck );
+        __kmp_init_lock( & new_queue->tq_free_thunks_lck );
+        __kmp_init_lock( & new_queue->tq_queue_lck );
+    }
+
+    /* now allocate the slots */
+
+    bytes = nslots * sizeof (kmpc_aligned_queue_slot_t);
+    new_queue->tq_queue = (kmpc_aligned_queue_slot_t *) __kmp_taskq_allocate( bytes, global_tid );
+
+    /*  space for array of pointers to shared variable structures */
+    sizeof_shareds += sizeof(kmpc_task_queue_t *);
+    sizeof_shareds += (CACHE_LINE - (sizeof_shareds % CACHE_LINE));     /* pad to cache line size */
+
+    bytes = nshareds * sizeof (kmpc_aligned_shared_vars_t);
+    shared_var_array = (kmpc_aligned_shared_vars_t *) __kmp_taskq_allocate ( bytes, global_tid);
+
+    bytes = nshareds * sizeof_shareds;
+    shared_var_storage = (char *) __kmp_taskq_allocate ( bytes, global_tid);
+
+    for (i=0; i<nshareds; i++) {
+        shared_var_array[i].ai_data = (kmpc_shared_vars_t *) (shared_var_storage + i*sizeof_shareds);
+        shared_var_array[i].ai_data->sv_queue = new_queue;
+    }
+    new_queue->tq_shareds = shared_var_array;
+
+
+    /* array for number of outstanding thunks per thread */
+
+    if (in_parallel) {
+        bytes = nproc * sizeof(kmpc_aligned_int32_t);
+        new_queue->tq_th_thunks = (kmpc_aligned_int32_t *) __kmp_taskq_allocate ( bytes, global_tid);
+        new_queue->tq_nproc     = nproc;
+
+        for (i=0; i<nproc; i++)
+            new_queue->tq_th_thunks[i].ai_data = 0;
+    }
+
+    return new_queue;
+}
+
+static void
+__kmp_free_taskq (kmp_taskq_t *tq, kmpc_task_queue_t *p, int in_parallel, kmp_int32 global_tid)
+{
+    __kmpc_taskq_free(p->tq_thunk_space, global_tid);
+    __kmpc_taskq_free(p->tq_queue, global_tid);
+
+    /* free shared var structure storage */
+    __kmpc_taskq_free((void *) p->tq_shareds[0].ai_data, global_tid);
+
+    /* free array of pointers to shared vars storage */
+    __kmpc_taskq_free(p->tq_shareds, global_tid);
+
+#ifdef KMP_DEBUG
+    p->tq_first_child = NULL;
+    p->tq_next_child = NULL;
+    p->tq_prev_child = NULL;
+    p->tq_ref_count = -10;
+    p->tq_shareds = NULL;
+    p->tq_tasknum_queuing = 0;
+    p->tq_tasknum_serving = 0;
+    p->tq_queue = NULL;
+    p->tq_thunk_space = NULL;
+    p->tq_taskq_slot = NULL;
+    p->tq_free_thunks = NULL;
+    p->tq_nslots = 0;
+    p->tq_head = 0;
+    p->tq_tail = 0;
+    p->tq_nfull = 0;
+    p->tq_hiwat = 0;
+
+    if (in_parallel) {
+        int i;
+
+        for (i=0; i<p->tq_nproc; i++)
+            p->tq_th_thunks[i].ai_data = 0;
+    }
+    if ( __kmp_env_consistency_check )
+        p->tq_loc = NULL;
+    KMP_DEBUG_ASSERT( p->tq_flags & TQF_DEALLOCATED );
+    p->tq_flags = TQF_DEALLOCATED;
+#endif /* KMP_DEBUG */
+
+    if (in_parallel)  {
+        __kmpc_taskq_free(p->tq_th_thunks, global_tid);
+        __kmp_destroy_lock(& p->tq_link_lck);
+        __kmp_destroy_lock(& p->tq_queue_lck);
+        __kmp_destroy_lock(& p->tq_free_thunks_lck);
+    }
+#ifdef KMP_DEBUG
+    p->tq_th_thunks = NULL;
+#endif /* KMP_DEBUG */
+
+    KMP_MB();  /* make sure data structures are in consistent state before querying them */
+               /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+    __kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
+    p->tq.tq_next_free = tq->tq_freelist;
+
+    tq->tq_freelist = p;
+    __kmp_release_lock( & tq->tq_freelist_lck, global_tid );
+}
+
+/*
+ *    Once a group of thunks has been allocated for use in a particular queue,
+ *    these are managed via a per-queue freelist.
+ *    We force a check that there's always a thunk free if we need one.
+ */
+
+static kmpc_thunk_t *
+__kmp_alloc_thunk (kmpc_task_queue_t *queue, int in_parallel, kmp_int32 global_tid)
+{
+    kmpc_thunk_t *fl;
+
+    if (in_parallel) {
+        __kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
+
+        KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                   /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+    }
+
+    fl = queue->tq_free_thunks;
+
+    KMP_DEBUG_ASSERT (fl != NULL);
+
+    queue->tq_free_thunks = fl->th.th_next_free;
+    fl->th_flags = 0;
+
+    if (in_parallel)
+        __kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
+
+    return fl;
+}
+
+static void
+__kmp_free_thunk (kmpc_task_queue_t *queue, kmpc_thunk_t *p, int in_parallel, kmp_int32 global_tid)
+{
+#ifdef KMP_DEBUG
+    p->th_task = 0;
+    p->th_encl_thunk = 0;
+    p->th_status = 0;
+    p->th_tasknum = 0;
+    /* Also could zero pointers to private vars */
+#endif
+
+    if (in_parallel) {
+        __kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
+
+        KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                   /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+    }
+
+    p->th.th_next_free = queue->tq_free_thunks;
+    queue->tq_free_thunks = p;
+
+#ifdef KMP_DEBUG
+    p->th_flags = TQF_DEALLOCATED;
+#endif
+
+    if (in_parallel)
+        __kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
+}
+
+/* --------------------------------------------------------------------------- */
+
+/*  returns nonzero if the queue just became full after the enqueue  */
+
+static kmp_int32
+__kmp_enqueue_task ( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue, kmpc_thunk_t *thunk, int in_parallel )
+{
+    kmp_int32    ret;
+
+    /*  dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the master is executing then)  */
+    if (in_parallel) {
+        __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
+
+        KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                   /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+    }
+
+    KMP_DEBUG_ASSERT (queue->tq_nfull < queue->tq_nslots);  /*  check queue not full  */
+
+    queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk;
+
+    if (queue->tq_head >= queue->tq_nslots)
+        queue->tq_head = 0;
+
+    (queue->tq_nfull)++;
+
+    KMP_MB();   /* to assure that nfull is seen to increase before TQF_ALL_TASKS_QUEUED is set */
+
+    ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE;
+
+    if (in_parallel) {
+        /* don't need to wait until workers are released before unlocking */
+        __kmp_release_lock(& queue->tq_queue_lck, global_tid);
+
+        if( tq->tq_global_flags & TQF_RELEASE_WORKERS ) {
+            /* If just creating the root queue, the worker threads are waiting at */
+            /* a join barrier until now, when there's something in the queue for  */
+            /* them to do; release them now to do work.                           */
+            /* This should only be done when this is the first task enqueued,     */
+            /* so reset the flag here also.                                       */
+
+            tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;  /* no lock needed, workers are still in spin mode */
+
+            KMP_MB();   /* avoid releasing barrier twice if taskq_task switches threads */
+
+            __kmpc_end_barrier_master( NULL, global_tid);
+        }
+    }
+
+    return ret;
+}
+
+static kmpc_thunk_t *
+__kmp_dequeue_task (kmp_int32 global_tid, kmpc_task_queue_t *queue, int in_parallel)
+{
+    kmpc_thunk_t *pt;
+    int           tid = __kmp_tid_from_gtid( global_tid );
+
+    KMP_DEBUG_ASSERT (queue->tq_nfull > 0);  /*  check queue not empty  */
+
+    if (queue->tq.tq_parent != NULL && in_parallel) {
+        int ct;
+        __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+        ct = ++(queue->tq_ref_count);
+        __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+        KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
+          __LINE__, global_tid, queue, ct));
+    }
+
+    pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk;
+
+    if (queue->tq_tail >= queue->tq_nslots)
+        queue->tq_tail = 0;
+
+    if (in_parallel) {
+        queue->tq_th_thunks[tid].ai_data++;
+
+        KMP_MB(); /* necessary so ai_data increment is propagated to other threads immediately (digital) */
+
+        KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding thunks from queue %p\n",
+            global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue));
+    }
+
+    (queue->tq_nfull)--;
+
+#ifdef KMP_DEBUG
+    KMP_MB();
+
+    /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is decremented */
+
+    KMP_DEBUG_ASSERT(queue->tq_nfull >= 0);
+
+    if (in_parallel) {
+        KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <= __KMP_TASKQ_THUNKS_PER_TH);
+    }
+#endif
+
+    return pt;
+}
+
+/*
+ * Find the next (non-null) task to dequeue and return it.
+ * This is never called unless in_parallel=TRUE
+ *
+ * Here are the rules for deciding which queue to take the task from:
+ * 1.  Walk up the task queue tree from the current queue's parent and look
+ *      on the way up (for loop, below).
+ * 2.  Do a depth-first search back down the tree from the root and
+ *      look (find_task_in_descendant_queue()).
+ *
+ * Here are the rules for deciding which task to take from a queue
+ * (__kmp_find_task_in_queue ()):
+ * 1.  Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task
+ *     must be staged to make sure we execute the last one with
+ *     TQF_IS_LAST_TASK at the end of task queue execution.
+ * 2.  If the queue length is below some high water mark and the taskq task
+ *     is enqueued, prefer running the taskq task.
+ * 3.  Otherwise, take a (normal) task from the queue.
+ *
+ * If we do all this and return pt == NULL at the bottom of this routine,
+ * this means there are no more tasks to execute (except possibly for
+ * TQF_IS_LASTPRIVATE).
+ */
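+/*
+ * In short, the decision order implemented below (illustrative summary):
+ *   1) re-dispatch the taskq task whenever it is enqueued and the queue is at
+ *      or below its high-water mark;
+ *   2) otherwise return NULL if the queue is empty or this thread already has
+ *      __KMP_TASKQ_THUNKS_PER_TH thunks outstanding;
+ *   3) otherwise dequeue a normal task, holding back the final task of a
+ *      TQF_IS_LASTPRIVATE queue until TQF_IS_LAST_TASK has been set.
+ */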
+
+static kmpc_thunk_t *
+__kmp_find_task_in_queue (kmp_int32 global_tid, kmpc_task_queue_t *queue)
+{
+    kmpc_thunk_t *pt  = NULL;
+    int           tid = __kmp_tid_from_gtid( global_tid );
+
+    /* To prevent deadlock from tq_queue_lck if queue already deallocated */
+    if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
+
+        __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
+
+        /* Check again to avoid race in __kmpc_end_taskq() */
+        if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
+
+            KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                       /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+            if ((queue->tq_taskq_slot != NULL) && (queue->tq_nfull <= queue->tq_hiwat)) {
+                /* if there's enough room in the queue and the dispatcher */
+                /* (taskq task) is available, schedule more tasks         */
+                pt = (kmpc_thunk_t *) queue->tq_taskq_slot;
+                queue->tq_taskq_slot = NULL;
+            }
+            else if (queue->tq_nfull == 0 ||
+                     queue->tq_th_thunks[tid].ai_data >= __KMP_TASKQ_THUNKS_PER_TH) {
+                /* do nothing if no thunks available or this thread can't */
+                /* run any because it already is executing too many       */
+
+                pt = NULL;
+            }
+            else if (queue->tq_nfull > 1) {
+                /*  always safe to schedule a task even if TQF_IS_LASTPRIVATE  */
+
+                pt = __kmp_dequeue_task (global_tid, queue, TRUE);
+            }
+            else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) {
+                /*  one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE  */
+
+                pt = __kmp_dequeue_task (global_tid, queue, TRUE);
+            }
+            else if (queue->tq_flags & TQF_IS_LAST_TASK) {
+                /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task()   */
+                /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
+                /* instrumentation does copy-out.                                  */
+
+                pt = __kmp_dequeue_task (global_tid, queue, TRUE);
+                pt->th_flags |= TQF_IS_LAST_TASK;  /* don't need test_then_or since already locked */
+            }
+        }
+
+        /* GEH - What happens here if it is lastprivate, but not the last task? */
+        __kmp_release_lock(& queue->tq_queue_lck, global_tid);
+    }
+
+    return pt;
+}
+
+/*
+ * Walk a tree of queues starting at queue's first child
+ * and return a non-NULL thunk if one can be scheduled.
+ * Must only be called when in_parallel=TRUE
+ */
+
+static kmpc_thunk_t *
+__kmp_find_task_in_descendant_queue (kmp_int32 global_tid, kmpc_task_queue_t *curr_queue)
+{
+    kmpc_thunk_t *pt = NULL;
+    kmpc_task_queue_t *queue = curr_queue;
+
+    if (curr_queue->tq_first_child != NULL) {
+        __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
+
+        KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                   /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+        queue = (kmpc_task_queue_t *) curr_queue->tq_first_child;
+        if (queue == NULL) {
+            __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
+            return NULL;
+        }
+
+        while (queue != NULL)  {
+            int ct;
+            kmpc_task_queue_t *next;
+
+            ct = ++(queue->tq_ref_count);
+            __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
+            KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
+              __LINE__, global_tid, queue, ct));
+
+            pt = __kmp_find_task_in_queue (global_tid, queue);
+
+            if (pt != NULL) {
+                int ct;
+
+                __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
+
+                KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                           /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+                ct = --(queue->tq_ref_count);
+                KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
+                  __LINE__, global_tid, queue, ct));
+                KMP_DEBUG_ASSERT( queue->tq_ref_count >= 0 );
+
+                __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
+
+                return pt;
+            }
+
+            /* although reference count stays active during descendant walk, shouldn't matter  */
+            /* since if children still exist, reference counts aren't being monitored anyway   */
+
+            pt = __kmp_find_task_in_descendant_queue (global_tid, queue);
+
+            if (pt != NULL) {
+                int ct;
+
+                __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
+
+                KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                           /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+                ct = --(queue->tq_ref_count);
+                KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
+                  __LINE__, global_tid, queue, ct));
+                KMP_DEBUG_ASSERT( ct >= 0 );
+
+                __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
+
+                return pt;
+            }
+
+            __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
+
+            KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                       /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+            next = queue->tq_next_child;
+
+            ct = --(queue->tq_ref_count);
+            KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
+              __LINE__, global_tid, queue, ct));
+            KMP_DEBUG_ASSERT( ct >= 0 );
+
+            queue = next;
+        }
+
+        __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
+    }
+
+    return pt;
+}
+
+/*
+ * Walk up the taskq tree looking for a task to execute.
+ * If we get to the root, search the tree for a descendent queue task.
+ * Must only be called when in_parallel=TRUE
+ */
+
+static kmpc_thunk_t *
+__kmp_find_task_in_ancestor_queue (kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue)
+{
+    kmpc_task_queue_t *queue;
+    kmpc_thunk_t      *pt;
+
+    pt = NULL;
+
+    if (curr_queue->tq.tq_parent != NULL) {
+        queue = curr_queue->tq.tq_parent;
+
+        while (queue != NULL) {
+            if (queue->tq.tq_parent != NULL) {
+                int ct;
+                __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+
+                KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                           /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+                ct = ++(queue->tq_ref_count);
+                __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+                KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
+                  __LINE__, global_tid, queue, ct));
+            }
+
+            pt = __kmp_find_task_in_queue (global_tid, queue);
+            if (pt != NULL) {
+                if (queue->tq.tq_parent != NULL) {
+                    int ct;
+                    __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+
+                    KMP_MB();  /* make sure data structures are in consistent state before querying them   */
+                               /* Seems to work without this call for digital/alpha, needed for IBM/RS6000 */
+
+                    ct = --(queue->tq_ref_count);
+                    KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
+                      __LINE__, global_tid, queue, ct));
+                    KMP_DEBUG_ASSERT( ct >= 0 );
+
+                    __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+                }
+
+                return pt;
+            }
+
+            if (queue->tq.tq_parent != NULL) {
+                int ct;
+                __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+
+                KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                           /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+                ct = --(queue->tq_ref_count);
+                KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
+                  __LINE__, global_tid, queue, ct));
+                KMP_DEBUG_ASSERT( ct >= 0 );
+            }
+            queue = queue->tq.tq_parent;
+
+            if (queue != NULL)
+                __kmp_release_lock(& queue->tq_link_lck, global_tid);
+        }
+
+    }
+
+    pt = __kmp_find_task_in_descendant_queue( global_tid, tq->tq_root );
+
+    return pt;
+}
+
+static int
+__kmp_taskq_tasks_finished (kmpc_task_queue_t *queue)
+{
+    int i;
+
+    /* KMP_MB(); *//* is this really necessary? */
+
+    for (i=0; i<queue->tq_nproc; i++) {
+        if (queue->tq_th_thunks[i].ai_data != 0)
+            return FALSE;
+    }
+
+    return TRUE;
+}
+
+static int
+__kmp_taskq_has_any_children (kmpc_task_queue_t *queue)
+{
+    return (queue->tq_first_child != NULL);
+}
+
+static void
+__kmp_remove_queue_from_tree( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue, int in_parallel )
+{
+#ifdef KMP_DEBUG
+    kmp_int32     i;
+    kmpc_thunk_t *thunk;
+#endif
+
+    KF_TRACE(50, ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
+    KF_DUMP(50, __kmp_dump_task_queue( tq, queue, global_tid ));
+
+    /*  sub-queue in a recursion, not the root task queue  */
+    KMP_DEBUG_ASSERT (queue->tq.tq_parent != NULL);
+
+    if (in_parallel) {
+        __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+
+        KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                   /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+    }
+
+    KMP_DEBUG_ASSERT (queue->tq_first_child == NULL);
+
+    /*  unlink queue from its siblings if any at this level  */
+    if (queue->tq_prev_child != NULL)
+        queue->tq_prev_child->tq_next_child = queue->tq_next_child;
+    if (queue->tq_next_child != NULL)
+        queue->tq_next_child->tq_prev_child = queue->tq_prev_child;
+    if (queue->tq.tq_parent->tq_first_child == queue)
+        queue->tq.tq_parent->tq_first_child = queue->tq_next_child;
+
+    queue->tq_prev_child = NULL;
+    queue->tq_next_child = NULL;
+
+    if (in_parallel) {
+        KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n",
+          __LINE__, global_tid, queue, queue->tq_ref_count));
+
+        /* wait until all other threads have stopped accessing this queue */
+        while (queue->tq_ref_count > 1) {
+            __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+
+            KMP_WAIT_YIELD((volatile kmp_uint32*)&queue->tq_ref_count, 1, KMP_LE, NULL);
+
+            __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+
+            KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                       /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+        }
+
+        __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+    }
+
+    KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p freeing queue\n",
+      __LINE__, global_tid, queue));
+
+#ifdef KMP_DEBUG
+    KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED);
+    KMP_DEBUG_ASSERT(queue->tq_nfull == 0);
+
+    for (i=0; i<queue->tq_nproc; i++) {
+        KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
+    }
+
+    i = 0;
+    for (thunk=queue->tq_free_thunks; thunk != NULL; thunk=thunk->th.th_next_free)
+        ++i;
+
+    KMP_ASSERT (i == queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH));
+#endif
+
+    /*  release storage for queue entry  */
+    __kmp_free_taskq ( tq, queue, TRUE, global_tid );
+
+    KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
+    KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
+}
+
+/*
+ * Starting from indicated queue, proceed downward through tree and
+ * remove all taskqs which are finished, but only go down to taskqs
+ * which have the "nowait" clause present.  Assume this is only called
+ * when in_parallel=TRUE.
+ */
+
+static void
+__kmp_find_and_remove_finished_child_taskq( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue )
+{
+    kmpc_task_queue_t *queue = curr_queue;
+
+    if (curr_queue->tq_first_child != NULL) {
+        __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
+
+        KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                   /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+        queue = (kmpc_task_queue_t *) curr_queue->tq_first_child;
+        if (queue == NULL) {
+            __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
+            return;
+        }
+
+        while (queue != NULL)  {
+            kmpc_task_queue_t *next;
+            int ct = ++(queue->tq_ref_count);
+            KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n",
+              __LINE__, global_tid, queue, ct));
+
+
+            /* although reference count stays active during descendant walk, */
+            /* shouldn't matter since if children still exist, reference     */
+            /* counts aren't being monitored anyway                          */
+
+            if (queue->tq_flags & TQF_IS_NOWAIT) {
+                __kmp_find_and_remove_finished_child_taskq ( tq, global_tid, queue );
+
+                if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) && (queue->tq_nfull == 0) &&
+                    __kmp_taskq_tasks_finished(queue) && ! __kmp_taskq_has_any_children(queue)) {
+
+                    /*
+                     Only remove this if we have not already marked it for deallocation.
+                     This should prevent multiple threads from trying to free this.
+                     */
+
+                    if ( __kmp_test_lock(& queue->tq_queue_lck, global_tid) ) {
+                        if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
+                            queue->tq_flags |= TQF_DEALLOCATED;
+                            __kmp_release_lock(& queue->tq_queue_lck, global_tid);
+
+                            __kmp_remove_queue_from_tree( tq, global_tid, queue, TRUE );
+
+                            /* Can't do any more here since we can't be sure where the sibling queue is, so just exit this level */
+                            return;
+                        }
+                        else {
+                            __kmp_release_lock(& queue->tq_queue_lck, global_tid);
+                        }
+                    }
+                    /* otherwise, just fall through and decrement reference count */
+                }
+            }
+
+            __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
+
+            KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                       /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+            next = queue->tq_next_child;
+
+            ct = --(queue->tq_ref_count);
+            KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
+              __LINE__, global_tid, queue, ct));
+            KMP_DEBUG_ASSERT( ct >= 0 );
+
+            queue = next;
+        }
+
+        __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
+    }
+}
+
+/*
+ * Starting from the indicated queue, proceed downward through the tree and
+ * remove all taskqs, assuming all are finished and
+ * assuming NO other threads are executing at this point.
+ */
+
+static void
+__kmp_remove_all_child_taskq( kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *queue )
+{
+    kmpc_task_queue_t *next_child;
+
+    queue = (kmpc_task_queue_t *) queue->tq_first_child;
+
+    while (queue != NULL)  {
+        __kmp_remove_all_child_taskq ( tq, global_tid, queue );
+
+        next_child = queue->tq_next_child;
+        queue->tq_flags |= TQF_DEALLOCATED;
+        __kmp_remove_queue_from_tree ( tq, global_tid, queue, FALSE );
+        queue = next_child;
+    }
+}
+
+static void
+__kmp_execute_task_from_queue( kmp_taskq_t *tq, ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, int in_parallel )
+{
+    kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue;
+    kmp_int32          tid   = __kmp_tid_from_gtid( global_tid );
+
+    KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid));
+    KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
+    KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid));
+    KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
+
+    /*
+     * For the taskq task, the curr_thunk push and pop pairs are set up as follows:
+     *
+     * happens exactly once:
+     * 1) __kmpc_taskq             : push (if returning thunk only)
+     * 4) __kmpc_end_taskq_task    : pop
+     *
+     * optionally happens *each* time taskq task is dequeued/enqueued:
+     * 2) __kmpc_taskq_task        : pop
+     * 3) __kmp_execute_task_from_queue  : push
+     *
+     * execution ordering:  1,(2,3)*,4
+     */
+
+    if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
+        kmp_int32 index = (queue == tq->tq_root) ? tid : 0;
+        thunk->th.th_shareds = (kmpc_shared_vars_t *) queue->tq_shareds[index].ai_data;
+
+        if ( __kmp_env_consistency_check ) {
+            __kmp_push_workshare( global_tid,
+                    (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered : ct_task,
+                    queue->tq_loc );
+        }
+    }
+    else {
+        if ( __kmp_env_consistency_check )
+            __kmp_push_workshare( global_tid, ct_taskq, queue->tq_loc );
+    }
+
+    if (in_parallel) {
+        thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
+        tq->tq_curr_thunk[tid] = thunk;
+
+        KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
+    }
+
+    KF_TRACE( 50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk, queue, global_tid));
+    thunk->th_task (global_tid, thunk);
+    KF_TRACE( 50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue, global_tid));
+
+    if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
+        if ( __kmp_env_consistency_check )
+            __kmp_pop_workshare( global_tid, (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered : ct_task,
+                                 queue->tq_loc );
+
+        if (in_parallel) {
+            tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
+            thunk->th_encl_thunk = NULL;
+            KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
+        }
+
+        if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) {
+            __kmp_taskq_check_ordered(global_tid, thunk);
+        }
+
+        __kmp_free_thunk (queue, thunk, in_parallel, global_tid);
+
+        KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n", global_tid, thunk));
+        KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
+
+        if (in_parallel) {
+            KMP_MB();   /* needed so thunk put on free list before outstanding thunk count is decremented */
+
+            KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1);
+
+            KF_TRACE( 200, ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n",
+                global_tid, queue->tq_th_thunks[tid].ai_data-1, queue));
+
+            queue->tq_th_thunks[tid].ai_data--;
+
+            /* KMP_MB(); */     /* is MB really necessary ? */
+        }
+
+        if (queue->tq.tq_parent != NULL && in_parallel) {
+            int ct;
+            __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+            ct = --(queue->tq_ref_count);
+            __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+            KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n",
+              __LINE__, global_tid, queue, ct));
+            KMP_DEBUG_ASSERT( ct >= 0 );
+        }
+    }
+}
+
+/* --------------------------------------------------------------------------- */
+
+/* starts a taskq; creates and returns a thunk for the taskq_task        */
+/* also, returns pointer to shared vars for this thread in "shareds" arg */
+
+kmpc_thunk_t *
+__kmpc_taskq( ident_t *loc, kmp_int32 global_tid, kmpc_task_t taskq_task,
+              size_t sizeof_thunk, size_t sizeof_shareds,
+              kmp_int32 flags, kmpc_shared_vars_t **shareds )
+{
+    int                  in_parallel;
+    kmp_int32            nslots, nthunks, nshareds, nproc;
+    kmpc_task_queue_t   *new_queue, *curr_queue;
+    kmpc_thunk_t        *new_taskq_thunk;
+    kmp_info_t          *th;
+    kmp_team_t          *team;
+    kmp_taskq_t         *tq;
+    kmp_int32            tid;
+
+    KE_TRACE( 10, ("__kmpc_taskq called (%d)\n", global_tid));
+
+    th = __kmp_threads[ global_tid ];
+    team = th -> th.th_team;
+    tq = & team -> t.t_taskq;
+    nproc = team -> t.t_nproc;
+    tid = __kmp_tid_from_gtid( global_tid );
+
+    /* find out whether this is a parallel taskq or serialized one. */
+    in_parallel = in_parallel_context( team );
+
+    if( ! tq->tq_root ) {
+        if (in_parallel) {
+            /* Vector ORDERED SECTION to taskq version */
+            th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
+
+            /* Vector ORDERED SECTION to taskq version */
+            th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
+        }
+
+        if (in_parallel) {
+            /* This shouldn't be a barrier region boundary; that would confuse the user. */
+            /* The boundary needs to be at the end of the taskq instead.                 */
+            if ( __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL )) {
+                /* Creating the active root queue, and we are not the master thread. */
+                /* The master thread below created the queue and tasks have been     */
+                /* enqueued, and the master thread released this barrier.  This      */
+                /* worker thread can now proceed and execute tasks.  See also the    */
+                /* TQF_RELEASE_WORKERS which is used to handle this case.            */
+
+                *shareds = (kmpc_shared_vars_t *) tq->tq_root->tq_shareds[tid].ai_data;
+
+                KE_TRACE( 10, ("__kmpc_taskq return (%d)\n", global_tid));
+
+                return NULL;
+            }
+        }
+
+        /* only the master thread executes this code */
+
+        if( tq->tq_curr_thunk_capacity < nproc ) {
+            if(tq->tq_curr_thunk)
+                __kmp_free(tq->tq_curr_thunk);
+            else {
+                /* only need to do this once at outer level, i.e. when tq_curr_thunk is still NULL */
+                __kmp_init_lock( & tq->tq_freelist_lck );
+            }
+
+            tq->tq_curr_thunk = (kmpc_thunk_t **) __kmp_allocate( nproc * sizeof(kmpc_thunk_t *) );
+            tq -> tq_curr_thunk_capacity = nproc;
+        }
+
+        if (in_parallel)
+            tq->tq_global_flags = TQF_RELEASE_WORKERS;
+    }
+
+    /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */
+    /*      on some heuristics (e.g., depth of queue nesting?).            */
+
+    nslots = (in_parallel) ? (2 * nproc) : 1;
+
+    /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */
+    /* jobs being executed by other threads, and one extra for the taskq slot  */
+
+    nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1) : nslots + 2;
+
+    /* Only the root taskq gets a per-thread array of shareds.       */
+    /* The rest of the taskq's only get one copy of the shared vars. */
+
+    nshareds = ( !tq->tq_root && in_parallel) ? nproc : 1;
+
+    /*  create overall queue data structure and its components that require allocation */
+
+    new_queue = __kmp_alloc_taskq ( tq, in_parallel, nslots, nthunks, nshareds, nproc,
+        sizeof_thunk, sizeof_shareds, &new_taskq_thunk, global_tid );
+
+    /*  rest of new_queue initializations  */
+
+    new_queue->tq_flags           = flags & TQF_INTERFACE_FLAGS;
+
+    if (in_parallel) {
+        new_queue->tq_tasknum_queuing  = 0;
+        new_queue->tq_tasknum_serving  = 0;
+        new_queue->tq_flags           |= TQF_PARALLEL_CONTEXT;
+    }
+
+    new_queue->tq_taskq_slot   = NULL;
+    new_queue->tq_nslots       = nslots;
+    new_queue->tq_hiwat        = HIGH_WATER_MARK (nslots);
+    new_queue->tq_nfull        = 0;
+    new_queue->tq_head         = 0;
+    new_queue->tq_tail         = 0;
+    new_queue->tq_loc          = loc;
+
+    if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) {
+        /* prepare to serve the first-queued task's ORDERED directive */
+        new_queue->tq_tasknum_serving = 1;
+
+        /* Vector ORDERED SECTION to taskq version */
+        th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
+
+        /* Vector ORDERED SECTION to taskq version */
+        th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
+    }
+
+    /*  create a new thunk for the taskq_task in the new_queue  */
+    *shareds = (kmpc_shared_vars_t *) new_queue->tq_shareds[0].ai_data;
+
+    new_taskq_thunk->th.th_shareds = *shareds;
+    new_taskq_thunk->th_task       = taskq_task;
+    new_taskq_thunk->th_flags      = new_queue->tq_flags | TQF_TASKQ_TASK;
+    new_taskq_thunk->th_status     = 0;
+
+    KMP_DEBUG_ASSERT (new_taskq_thunk->th_flags & TQF_TASKQ_TASK);
+
+    /* KMP_MB(); */ /* make sure these inits complete before threads start using this queue (necessary?) */
+
+    /* insert the new task queue into the tree, but only after all fields initialized */
+
+    if (in_parallel) {
+        if( ! tq->tq_root ) {
+            new_queue->tq.tq_parent   = NULL;
+            new_queue->tq_first_child = NULL;
+            new_queue->tq_next_child  = NULL;
+            new_queue->tq_prev_child  = NULL;
+            new_queue->tq_ref_count   = 1;
+            tq->tq_root = new_queue;
+        }
+        else {
+            curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
+            new_queue->tq.tq_parent   = curr_queue;
+            new_queue->tq_first_child = NULL;
+            new_queue->tq_prev_child  = NULL;
+            new_queue->tq_ref_count   = 1;      /* for the thread that built the queue */
+
+            KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n",
+              __LINE__, global_tid, new_queue, new_queue->tq_ref_count));
+
+            __kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
+
+            KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                       /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+            new_queue->tq_next_child = (struct kmpc_task_queue_t *) curr_queue->tq_first_child;
+
+            if (curr_queue->tq_first_child != NULL)
+                curr_queue->tq_first_child->tq_prev_child = new_queue;
+
+            curr_queue->tq_first_child = new_queue;
+
+            __kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
+        }
+
+        /* set up thunk stack only after code that determines curr_queue above */
+        new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
+        tq->tq_curr_thunk[tid] = new_taskq_thunk;
+
+        KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
+    }
+    else {
+        new_taskq_thunk->th_encl_thunk = 0;
+        new_queue->tq.tq_parent   = NULL;
+        new_queue->tq_first_child = NULL;
+        new_queue->tq_next_child  = NULL;
+        new_queue->tq_prev_child  = NULL;
+        new_queue->tq_ref_count   = 1;
+    }
+
+#ifdef KMP_DEBUG
+    KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid));
+    KF_DUMP(150, __kmp_dump_thunk( tq, new_taskq_thunk, global_tid ));
+
+    if (in_parallel) {
+        KF_TRACE(25, ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
+    } else {
+        KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
+    }
+
+    KF_DUMP(25, __kmp_dump_task_queue( tq, new_queue, global_tid ));
+
+    if (in_parallel) {
+        KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
+    }
+#endif /* KMP_DEBUG */
+
+    if ( __kmp_env_consistency_check )
+        __kmp_push_workshare( global_tid, ct_taskq, new_queue->tq_loc );
+
+    KE_TRACE( 10, ("__kmpc_taskq return (%d)\n", global_tid));
+
+    return new_taskq_thunk;
+}
+
+
+/*  ends a taskq; last thread out destroys the queue  */
+
+void
+__kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk)
+{
+#ifdef KMP_DEBUG
+    kmp_int32           i;
+#endif
+    kmp_taskq_t        *tq;
+    int                 in_parallel;
+    kmp_info_t         *th;
+    kmp_int32           is_outermost;
+    kmpc_task_queue_t  *queue;
+    kmpc_thunk_t       *thunk;
+    int                 nproc;
+
+    KE_TRACE( 10, ("__kmpc_end_taskq called (%d)\n", global_tid));
+
+    tq = & __kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
+    nproc = __kmp_threads[global_tid] -> th.th_team -> t.t_nproc;
+
+    /* For the outermost taskq only, all but one thread will have taskq_thunk == NULL */
+    queue = (taskq_thunk == NULL) ? tq->tq_root : taskq_thunk->th.th_shareds->sv_queue;
+
+    KE_TRACE( 50, ("__kmpc_end_taskq queue=%p (%d) \n", queue, global_tid));
+    is_outermost = (queue == tq->tq_root);
+    in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
+
+    if (in_parallel) {
+        kmp_uint32 spins;
+
+        /* this is just a safeguard to release the waiting threads if */
+        /* the outermost taskq never queues a task                    */
+
+        if (is_outermost && (KMP_MASTER_GTID( global_tid ))) {
+            if( tq->tq_global_flags & TQF_RELEASE_WORKERS ) {
+                /* no lock needed, workers are still in spin mode */
+                tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;
+
+                __kmp_end_split_barrier( bs_plain_barrier, global_tid );
+            }
+        }
+
+        /* keep dequeueing work until all tasks are queued and dequeued */
+
+        do {
+            /* wait until something is available to dequeue */
+            KMP_INIT_YIELD(spins);
+
+            while ( (queue->tq_nfull == 0)
+                 && (queue->tq_taskq_slot == NULL)
+                 && (! __kmp_taskq_has_any_children(queue) )
+                 && (! (queue->tq_flags & TQF_ALL_TASKS_QUEUED) )
+                  ) {
+                KMP_YIELD_WHEN( TRUE, spins );
+            }
+
+            /* check to see if we can execute tasks in the queue */
+            while ( ( (queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL) )
+                 && (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL
+                  ) {
+                KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk, queue, global_tid));
+                __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
+            }
+
+            /* see if work can be found in a descendant queue */
+            if ( (__kmp_taskq_has_any_children(queue))
+              && (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != NULL
+               ) {
+
+                KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
+                    thunk, thunk->th.th_shareds->sv_queue, queue, global_tid ));
+
+                __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
+            }
+
+        } while ( (! (queue->tq_flags & TQF_ALL_TASKS_QUEUED))
+               || (queue->tq_nfull != 0)
+                );
+
+        KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue, global_tid));
+
+        /* while not all tasks are finished, keep executing any work
+           found in descendant queues */
+
+        while ( (!__kmp_taskq_tasks_finished(queue))
+             && (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != NULL
+              ) {
+
+            KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
+                thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
+
+            __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
+        }
+
+        KF_TRACE(50, ("No work found in descendent queues or all work finished in queue: %p (%d)\n", queue, global_tid));
+
+        if (!is_outermost) {
+            /* need to return if NOWAIT present and not outermost taskq */
+
+            if (queue->tq_flags & TQF_IS_NOWAIT) {
+                __kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+                queue->tq_ref_count--;
+                KMP_DEBUG_ASSERT( queue->tq_ref_count >= 0 );
+                __kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
+
+                KE_TRACE( 10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid));
+
+                return;
+            }
+
+            __kmp_find_and_remove_finished_child_taskq( tq, global_tid, queue );
+
+            /* WAIT until all tasks are finished and no child queues exist before proceeding */
+            KMP_INIT_YIELD(spins);
+
+            while (!__kmp_taskq_tasks_finished(queue) || __kmp_taskq_has_any_children(queue)) {
+                thunk = __kmp_find_task_in_ancestor_queue( tq, global_tid, queue );
+
+                if (thunk != NULL) {
+                    KF_TRACE(50, ("Stole thunk: %p in ancestor queue: %p while waiting in queue: %p (%d)\n",
+                                  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
+                    __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
+                }
+
+                KMP_YIELD_WHEN( thunk == NULL, spins );
+
+                __kmp_find_and_remove_finished_child_taskq( tq, global_tid, queue );
+            }
+
+            __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
+            if ( !(queue->tq_flags & TQF_DEALLOCATED) ) {
+                queue->tq_flags |= TQF_DEALLOCATED;
+            }
+            __kmp_release_lock(& queue->tq_queue_lck, global_tid);
+
+            /* only the allocating thread can deallocate the queue */
+            if (taskq_thunk != NULL) {
+                __kmp_remove_queue_from_tree( tq, global_tid, queue, TRUE );
+            }
+
+            KE_TRACE( 10, ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n", global_tid));
+
+            return;
+        }
+
+        /* Outermost Queue: steal work from descendants until all tasks are finished */
+
+        KMP_INIT_YIELD(spins);
+
+        while (!__kmp_taskq_tasks_finished(queue)) {
+            thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
+
+            if (thunk != NULL) {
+                KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in queue: %p (%d)\n",
+                    thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
+
+                __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
+            }
+
+            KMP_YIELD_WHEN( thunk == NULL, spins );
+        }
+
+        /* Need this barrier to prevent destruction of queue before threads have all executed above code */
+        /* This may need to be done earlier when NOWAIT is implemented for the outermost level */
+
+        if ( !__kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL )) {
+            /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here;   */
+            /* for right now, everybody waits, and the master thread destroys the  */
+            /* remaining queues.                                                   */
+
+            __kmp_remove_all_child_taskq( tq, global_tid, queue );
+
+            /* Now destroy the root queue */
+            KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n", global_tid, queue ));
+            KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
+
+#ifdef KMP_DEBUG
+            /*  the root queue entry  */
+            KMP_DEBUG_ASSERT ((queue->tq.tq_parent == NULL) && (queue->tq_next_child == NULL));
+
+            /*  children must all be gone by now because of barrier above */
+            KMP_DEBUG_ASSERT (queue->tq_first_child == NULL);
+
+            for (i=0; i<nproc; i++) {
+                KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
+            }
+
+            for (i=0, thunk=queue->tq_free_thunks; thunk != NULL; i++, thunk=thunk->th.th_next_free);  /* count thunks on the free list */
+
+            KMP_DEBUG_ASSERT (i == queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH));
+
+            for (i = 0; i < nproc; i++) {
+                KMP_DEBUG_ASSERT( ! tq->tq_curr_thunk[i] );
+            }
+#endif
+            /*  unlink the root queue entry  */
+            tq -> tq_root =  NULL;
+
+            /*  release storage for root queue entry  */
+            KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue, global_tid));
+
+            queue->tq_flags |= TQF_DEALLOCATED;
+            __kmp_free_taskq ( tq, queue, in_parallel, global_tid );
+
+            KF_DUMP(50, __kmp_dump_task_queue_tree( tq, tq->tq_root, global_tid ));
+
+            /* release the workers now that the data structures are up to date */
+            __kmp_end_split_barrier( bs_plain_barrier, global_tid );
+        }
+
+        th = __kmp_threads[ global_tid ];
+
+        /* Reset ORDERED SECTION to parallel version */
+        th->th.th_dispatch->th_deo_fcn = 0;
+
+        /* Reset ORDERED SECTION to parallel version */
+        th->th.th_dispatch->th_dxo_fcn = 0;
+    }
+    else {
+        /* in serial execution context, dequeue the last task  */
+        /* and execute it, if there were any tasks encountered */
+
+        if (queue->tq_nfull > 0) {
+            KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
+
+            thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
+
+            if (queue->tq_flags & TQF_IS_LAST_TASK) {
+                /* TQF_IS_LASTPRIVATE is set and there is one task in the queue;      */
+                /* __kmpc_end_taskq_task() has already run, so this is the last task. */
+                /* Run it with TQF_IS_LAST_TASK so instrumentation does the copy-out. */
+
+                /* no need for test_then_or call since already locked */
+                thunk->th_flags |= TQF_IS_LAST_TASK;
+            }
+
+            KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, thunk, queue));
+
+            __kmp_execute_task_from_queue( tq, loc, global_tid, thunk, in_parallel );
+        }
+
+        /* destroy the unattached serial queue now that there is no more work to do */
+        KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n", queue, global_tid));
+        KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
+
+#ifdef KMP_DEBUG
+        i = 0;
+        for (thunk=queue->tq_free_thunks; thunk != NULL; thunk=thunk->th.th_next_free)
+            ++i;
+        KMP_DEBUG_ASSERT (i == queue->tq_nslots + 1);
+#endif
+        /*  release storage for unattached serial queue  */
+        KF_TRACE(50, ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid));
+
+        queue->tq_flags |= TQF_DEALLOCATED;
+        __kmp_free_taskq ( tq, queue, in_parallel, global_tid );
+    }
+
+    KE_TRACE( 10, ("__kmpc_end_taskq return (%d)\n", global_tid));
+}
+
+/*  Enqueues a task for a thunk previously created by __kmpc_task_buffer. */
+/*  Returns nonzero if the enqueue just filled up the queue.              */
+
+kmp_int32
+__kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk)
+{
+    kmp_int32          ret;
+    kmpc_task_queue_t *queue;
+    int                in_parallel;
+    kmp_taskq_t       *tq;
+
+    KE_TRACE( 10, ("__kmpc_task called (%d)\n", global_tid));
+
+    KMP_DEBUG_ASSERT (!(thunk->th_flags & TQF_TASKQ_TASK));  /*  thunk->th_task is a regular task  */
+
+    tq          = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
+    queue       = thunk->th.th_shareds->sv_queue;
+    in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
+
+    if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED))
+        thunk->th_tasknum = ++queue->tq_tasknum_queuing;
+
+    /* For serial execution dequeue the preceding task and execute it, if one exists */
+    /* This cannot be the last task.  That one is handled in __kmpc_end_taskq */
+
+    if (!in_parallel && queue->tq_nfull > 0) {
+        kmpc_thunk_t *prev_thunk;
+
+        KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
+
+        prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
+
+        KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, prev_thunk, queue));
+
+        __kmp_execute_task_from_queue( tq, loc, global_tid, prev_thunk, in_parallel );
+    }
+
+    /* The instrumentation sequence is:  __kmpc_task_buffer(), initialize private    */
+    /* variables, __kmpc_task().  The __kmpc_task_buffer routine checks that the     */
+    /* task queue is not full and allocates a thunk (which is then passed to         */
+    /* __kmpc_task()).  So, the enqueue below should never fail due to a full queue. */
+
+    KF_TRACE(100, ("After enqueueing this Task on (%d):\n", global_tid));
+    KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
+
+    ret = __kmp_enqueue_task ( tq, global_tid, queue, thunk, in_parallel );
+
+    KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid));
+    KF_DUMP(100, __kmp_dump_task_queue( tq, queue, global_tid ));
+
+    KE_TRACE( 10, ("__kmpc_task return (%d)\n", global_tid));
+
+    return ret;
+}
+
+/*  enqueues a taskq_task for thunk previously created by __kmpc_taskq  */
+/*  this should never be called unless in a parallel context            */
+
+void
+__kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, kmp_int32 status)
+{
+    kmpc_task_queue_t *queue;
+    kmp_taskq_t       *tq  = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
+    int                tid = __kmp_tid_from_gtid( global_tid );
+
+    KE_TRACE( 10, ("__kmpc_taskq_task called (%d)\n", global_tid));
+    KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid));
+    KF_DUMP(100, __kmp_dump_thunk( tq, thunk, global_tid ));
+
+    queue = thunk->th.th_shareds->sv_queue;
+
+    if ( __kmp_env_consistency_check )
+        __kmp_pop_workshare( global_tid, ct_taskq, loc );
+
+    /*  thunk->th_task is the taskq_task  */
+    KMP_DEBUG_ASSERT (thunk->th_flags & TQF_TASKQ_TASK);
+
+    /*  not supposed to call __kmpc_taskq_task if it's already enqueued  */
+    KMP_DEBUG_ASSERT (queue->tq_taskq_slot == NULL);
+
+    /* dequeue taskq thunk from curr_thunk stack */
+    tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
+    thunk->th_encl_thunk = NULL;
+
+    KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
+
+    thunk->th_status = status;
+
+    KMP_MB();  /*  flush thunk->th_status before taskq_task enqueued to avoid race condition  */
+
+    /*  enqueue taskq_task in thunk into special slot in queue     */
+    /* GEH - probably don't need to lock taskq slot since only one */
+    /*       thread enqueues & already a lock set at dequeue point */
+
+    queue->tq_taskq_slot = thunk;
+
+    KE_TRACE( 10, ("__kmpc_taskq_task return (%d)\n", global_tid));
+}
+
+/*  ends a taskq_task; done generating tasks  */
+
+void
+__kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk)
+{
+    kmp_taskq_t       *tq;
+    kmpc_task_queue_t *queue;
+    int                in_parallel;
+    int                tid;
+
+    KE_TRACE( 10, ("__kmpc_end_taskq_task called (%d)\n", global_tid));
+
+    tq          = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
+    queue       = thunk->th.th_shareds->sv_queue;
+    in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
+    tid         = __kmp_tid_from_gtid( global_tid );
+
+    if ( __kmp_env_consistency_check )
+        __kmp_pop_workshare( global_tid, ct_taskq, loc );
+
+    if (in_parallel) {
+#if KMP_ARCH_X86 || \
+    KMP_ARCH_X86_64
+
+        KMP_TEST_THEN_OR32( &queue->tq_flags, (kmp_int32) TQF_ALL_TASKS_QUEUED );
+#else
+        {
+            __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
+
+            KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                       /* Seems to work fine without this call for digital/alpha, needed for IBM/RS6000 */
+
+            queue->tq_flags |= TQF_ALL_TASKS_QUEUED;
+
+            __kmp_release_lock(& queue->tq_queue_lck, global_tid);
+        }
+#endif
+    }
+
+    if (thunk->th_flags & TQF_IS_LASTPRIVATE) {
+        /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task in the */
+        /* queue if TQF_IS_LASTPRIVATE so we can positively identify that last task      */
+        /* and run it with its TQF_IS_LAST_TASK bit turned on in th_flags.  When         */
+        /* __kmpc_end_taskq_task() is called we are done generating all the tasks, so    */
+        /* we know the last one in the queue is the lastprivate task.  Mark the queue    */
+        /* as having gotten to this state via tq_flags & TQF_IS_LAST_TASK; when that     */
+        /* task actually executes mark it via th_flags & TQF_IS_LAST_TASK (this th_flags */
+        /* bit signals the instrumented code to do copy-outs after execution).           */
+
+        if (! in_parallel) {
+            /* No synchronization needed for serial context */
+            queue->tq_flags |= TQF_IS_LAST_TASK;
+        }
+        else {
+#if KMP_ARCH_X86 || \
+    KMP_ARCH_X86_64
+
+            KMP_TEST_THEN_OR32( &queue->tq_flags, (kmp_int32) TQF_IS_LAST_TASK );
+#else
+            {
+                __kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
+
+                KMP_MB();  /* make sure data structures are in consistent state before querying them */
+                           /* Seems to work without this call for digital/alpha, needed for IBM/RS6000 */
+
+                queue->tq_flags |= TQF_IS_LAST_TASK;
+
+                __kmp_release_lock(& queue->tq_queue_lck, global_tid);
+            }
+#endif
+            /* to prevent race condition where last task is dequeued but */
+            /* flag isn't visible yet (not sure about this)              */
+            KMP_MB();
+        }
+    }
+
+    /* dequeue taskq thunk from curr_thunk stack */
+    if (in_parallel) {
+        tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
+        thunk->th_encl_thunk = NULL;
+
+        KF_DUMP( 200, __kmp_dump_thunk_stack( tq->tq_curr_thunk[tid], global_tid ));
+    }
+
+    KE_TRACE( 10, ("__kmpc_end_taskq_task return (%d)\n", global_tid));
+}
+
+/* returns thunk for a regular task based on taskq_thunk              */
+/* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK)  */
+
+kmpc_thunk_t *
+__kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk, kmpc_task_t task)
+{
+    kmp_taskq_t       *tq;
+    kmpc_task_queue_t *queue;
+    kmpc_thunk_t      *new_thunk;
+    int                in_parallel;
+
+    KE_TRACE( 10, ("__kmpc_task_buffer called (%d)\n", global_tid));
+
+    KMP_DEBUG_ASSERT (taskq_thunk->th_flags & TQF_TASKQ_TASK);  /*  taskq_thunk->th_task is the taskq_task  */
+
+    tq          = &__kmp_threads[global_tid] -> th.th_team -> t.t_taskq;
+    queue       = taskq_thunk->th.th_shareds->sv_queue;
+    in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
+
+    /* The instrumentation sequence is:  __kmpc_task_buffer(), initialize private */
+    /* variables, __kmpc_task().  The __kmpc_task_buffer routine checks that the  */
+    /* task queue is not full and allocates a thunk (which is then passed to      */
+    /* __kmpc_task()).  So, we can pre-allocate a thunk here assuming it will be  */
+    /* the next to be enqueued in __kmpc_task().                                  */
+
+    new_thunk = __kmp_alloc_thunk (queue, in_parallel, global_tid);
+    new_thunk->th.th_shareds = (kmpc_shared_vars_t *) queue->tq_shareds[0].ai_data;
+    new_thunk->th_encl_thunk = NULL;
+    new_thunk->th_task       = task;
+
+    /* GEH - shouldn't need to lock the read of tq_flags here */
+    new_thunk->th_flags      = queue->tq_flags & TQF_INTERFACE_FLAGS;
+
+    new_thunk->th_status     = 0;
+
+    KMP_DEBUG_ASSERT (!(new_thunk->th_flags & TQF_TASKQ_TASK));
+
+    KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid));
+    KF_DUMP(100, __kmp_dump_thunk( tq, new_thunk, global_tid ));
+
+    KE_TRACE( 10, ("__kmpc_task_buffer return (%d)\n", global_tid));
+
+    return new_thunk;
+}
+
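+/* Illustrative only: a hedged sketch (not part of this file) of the call
+ * sequence a compiler might emit for one TASKQ construct, assembled from the
+ * instrumentation-sequence comments above; my_taskq_fn and my_task_fn are
+ * hypothetical outlined routines:
+ *
+ *     kmpc_shared_vars_t *shareds;
+ *     kmpc_thunk_t *tq_thunk = __kmpc_taskq( loc, gtid, my_taskq_fn,
+ *         sizeof_thunk, sizeof_shareds, flags, &shareds );
+ *     if (tq_thunk != NULL) {       // queue creator; other threads get NULL
+ *         // inside my_taskq_fn, once per generated task:
+ *         kmpc_thunk_t *t = __kmpc_task_buffer( loc, gtid, tq_thunk, my_task_fn );
+ *         ... initialize t's private variables ...
+ *         __kmpc_task( loc, gtid, t );                  // enqueue the task
+ *         // (__kmpc_taskq_task() may re-enqueue the taskq task itself)
+ *         // after the last task has been generated:
+ *         __kmpc_end_taskq_task( loc, gtid, tq_thunk );
+ *     }
+ *     __kmpc_end_taskq( loc, gtid, tq_thunk );          // all threads
+ */
+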
+/* --------------------------------------------------------------------------- */
diff --git a/final/runtime/src/kmp_threadprivate.c b/final/runtime/src/kmp_threadprivate.c
new file mode 100644
index 0000000..240319f
--- /dev/null
+++ b/final/runtime/src/kmp_threadprivate.c
@@ -0,0 +1,733 @@
+/*
+ * kmp_threadprivate.c -- OpenMP threadprivate support library
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_itt.h"
+#include "kmp_i18n.h"
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#define USE_CHECKS_COMMON
+
+#define KMP_INLINE_SUBR         1
+
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void
+kmp_threadprivate_insert_private_data( int gtid, void *pc_addr, void *data_addr, size_t pc_size );
+struct private_common *
+kmp_threadprivate_insert( int gtid, void *pc_addr, void *data_addr, size_t pc_size );
+
+struct shared_table     __kmp_threadprivate_d_table;
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+static
+#ifdef KMP_INLINE_SUBR
+__forceinline
+#endif
+struct private_common *
+__kmp_threadprivate_find_task_common( struct common_table *tbl, int gtid, void *pc_addr )
+
+{
+    struct private_common *tn;
+
+#ifdef KMP_TASK_COMMON_DEBUG
+    KC_TRACE( 10, ( "__kmp_threadprivate_find_task_common: thread#%d, called with address %p\n",
+                    gtid, pc_addr ) );
+    dump_list();
+#endif
+
+    for (tn = tbl->data[ KMP_HASH(pc_addr) ]; tn; tn = tn->next) {
+        if (tn->gbl_addr == pc_addr) {
+#ifdef KMP_TASK_COMMON_DEBUG
+            KC_TRACE( 10, ( "__kmp_threadprivate_find_task_common: thread#%d, found node %p on list\n",
+                            gtid, pc_addr ) );
+#endif
+            return tn;
+        }
+    }
+    return 0;
+}
+
+static
+#ifdef KMP_INLINE_SUBR
+__forceinline
+#endif
+struct shared_common *
+__kmp_find_shared_task_common( struct shared_table *tbl, int gtid, void *pc_addr )
+{
+    struct shared_common *tn;
+
+    for (tn = tbl->data[ KMP_HASH(pc_addr) ]; tn; tn = tn->next) {
+        if (tn->gbl_addr == pc_addr) {
+#ifdef KMP_TASK_COMMON_DEBUG
+            KC_TRACE( 10, ( "__kmp_find_shared_task_common: thread#%d, found node %p on list\n",
+                            gtid, pc_addr ) );
+#endif
+            return tn;
+        }
+    }
+    return 0;
+}
+
+
+/*
+ *      Create a template for the data-initialized storage:
+ *      either the template's data pointer is NULL, indicating zero fill,
+ *      or it holds a copy of the original data.
+ */
+
+static struct private_data *
+__kmp_init_common_data( void *pc_addr, size_t pc_size )
+{
+    struct private_data *d;
+    size_t       i;
+    char        *p;
+
+    d = (struct private_data *) __kmp_allocate( sizeof( struct private_data ) );
+/*
+    d->data = 0;  // AC: commented out because __kmp_allocate zeroes the memory
+    d->next = 0;
+*/
+    d->size = pc_size;
+    d->more = 1;
+
+    p = (char*)pc_addr;
+
+    for (i = pc_size;  i > 0; --i) {
+        if (*p++ != '\0') {
+            d->data = __kmp_allocate( pc_size );
+            KMP_MEMCPY( d->data, pc_addr, pc_size );
+            break;
+        }
+    }
+
+    return d;
+}
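+
+/* For example, a common block whose initializer is entirely zero bytes
+ * keeps d->data == NULL (replayed later as a memset), while any nonzero
+ * byte makes the template hold a full copy of the original data. */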
+
+/*
+ *      Initialize the data area from the template: each template node is
+ *      replayed d->more times; a NULL data pointer means zero fill,
+ *      otherwise d->size bytes are copied.
+ */
+
+static void
+__kmp_copy_common_data( void *pc_addr, struct private_data *d )
+{
+    char *addr = (char *) pc_addr;
+    int   i, offset;
+
+    for (offset = 0; d != 0; d = d->next) {
+        for (i = d->more; i > 0; --i) {
+            if (d->data == 0)
+                memset( & addr[ offset ], '\0', d->size );
+            else
+                KMP_MEMCPY( & addr[ offset ], d->data, d->size );
+            offset += d->size;
+        }
+    }
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* we are called from __kmp_serial_initialize() with __kmp_initz_lock held. */
+void
+__kmp_common_initialize( void )
+{
+    if( ! TCR_4(__kmp_init_common) ) {
+        int q;
+#ifdef KMP_DEBUG
+        int gtid;
+#endif
+
+        __kmp_threadpriv_cache_list = NULL;
+
+#ifdef KMP_DEBUG
+        /* verify the uber masters were initialized */
+        for(gtid = 0 ; gtid < __kmp_threads_capacity; gtid++ )
+            if( __kmp_root[gtid] ) {
+                KMP_DEBUG_ASSERT( __kmp_root[gtid]->r.r_uber_thread );
+                for ( q = 0; q< KMP_HASH_TABLE_SIZE; ++q)
+                    KMP_DEBUG_ASSERT( !__kmp_root[gtid]->r.r_uber_thread->th.th_pri_common->data[q] );
+/*                    __kmp_root[ gtid ]-> r.r_uber_thread -> th.th_pri_common -> data[ q ] = 0;*/
+            }
+#endif /* KMP_DEBUG */
+
+        for (q = 0; q < KMP_HASH_TABLE_SIZE; ++q)
+            __kmp_threadprivate_d_table.data[ q ] = 0;
+
+        TCW_4(__kmp_init_common, TRUE);
+    }
+}
+
+/* Call all destructors for threadprivate data belonging to all threads.
+   Currently unused! */
+void
+__kmp_common_destroy( void )
+{
+    if( TCR_4(__kmp_init_common) ) {
+        int q;
+
+        TCW_4(__kmp_init_common, FALSE);
+
+        for (q = 0; q < KMP_HASH_TABLE_SIZE; ++q) {
+            int gtid;
+            struct private_common *tn;
+            struct shared_common  *d_tn;
+
+            /*  C++ destructors need to be called once per thread before exiting  */
+            /*  don't call destructors for master thread though unless we used copy constructor */
+
+            for (d_tn = __kmp_threadprivate_d_table.data[ q ]; d_tn; d_tn = d_tn->next) {
+                if (d_tn->is_vec) {
+                    if (d_tn->dt.dtorv != 0) {
+                        for (gtid = 0; gtid < __kmp_all_nth; ++gtid) {
+                            if( __kmp_threads[gtid] ) {
+                                if( (__kmp_foreign_tp) ? (! KMP_INITIAL_GTID (gtid)) :
+                                                         (! KMP_UBER_GTID (gtid)) ) {
+                                    tn = __kmp_threadprivate_find_task_common( __kmp_threads[ gtid ]->th.th_pri_common,
+                                                                               gtid, d_tn->gbl_addr );
+                                    if (tn) {
+                                        (*d_tn->dt.dtorv) (tn->par_addr, d_tn->vec_len);
+                                    }
+                                }
+                            }
+                        }
+                        if (d_tn->obj_init != 0) {
+                            (*d_tn->dt.dtorv) (d_tn->obj_init, d_tn->vec_len);
+                        }
+                    }
+                } else {
+                    if (d_tn->dt.dtor != 0) {
+                        for (gtid = 0; gtid < __kmp_all_nth; ++gtid) {
+                            if( __kmp_threads[gtid] ) {
+                                if( (__kmp_foreign_tp) ? (! KMP_INITIAL_GTID (gtid)) :
+                                                         (! KMP_UBER_GTID (gtid)) ) {
+                                    tn = __kmp_threadprivate_find_task_common( __kmp_threads[ gtid ]->th.th_pri_common,
+                                                                               gtid, d_tn->gbl_addr );
+                                    if (tn) {
+                                        (*d_tn->dt.dtor) (tn->par_addr);
+                                    }
+                                }
+                            }
+                        }
+                        if (d_tn->obj_init != 0) {
+                            (*d_tn->dt.dtor) (d_tn->obj_init);
+                        }
+                    }
+                }
+            }
+            __kmp_threadprivate_d_table.data[ q ] = 0;
+        }
+    }
+}
+
+/* Call all destructors for threadprivate data belonging to this thread */
+void
+__kmp_common_destroy_gtid( int gtid )
+{
+    struct private_common *tn;
+    struct shared_common *d_tn;
+
+    KC_TRACE( 10, ("__kmp_common_destroy_gtid: T#%d called\n", gtid ) );
+    if( (__kmp_foreign_tp) ? (! KMP_INITIAL_GTID (gtid)) :
+                             (! KMP_UBER_GTID (gtid)) ) {
+
+        if( TCR_4(__kmp_init_common) ) {
+
+            /* Cannot do this here since not all threads have destroyed their data */
+            /* TCW_4(__kmp_init_common, FALSE); */
+
+            for (tn = __kmp_threads[ gtid ]->th.th_pri_head; tn; tn = tn->link) {
+
+                d_tn = __kmp_find_shared_task_common( &__kmp_threadprivate_d_table,
+                                                      gtid, tn->gbl_addr );
+
+                KMP_DEBUG_ASSERT( d_tn );
+
+                if (d_tn->is_vec) {
+                    if (d_tn->dt.dtorv != 0) {
+                        (void) (*d_tn->dt.dtorv) (tn->par_addr, d_tn->vec_len);
+                    }
+                    if (d_tn->obj_init != 0) {
+                        (void) (*d_tn->dt.dtorv) (d_tn->obj_init, d_tn->vec_len);
+                    }
+                } else {
+                    if (d_tn->dt.dtor != 0) {
+                        (void) (*d_tn->dt.dtor) (tn->par_addr);
+                    }
+                    if (d_tn->obj_init != 0) {
+                        (void) (*d_tn->dt.dtor) (d_tn->obj_init);
+                    }
+                }
+            }
+            KC_TRACE( 30, ("__kmp_common_destroy_gtid: T#%d threadprivate destructors complete\n",
+                           gtid ) );
+        }
+    }
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#ifdef KMP_TASK_COMMON_DEBUG
+static void
+dump_list( void )
+{
+    int p, q;
+
+    for (p = 0; p < __kmp_all_nth; ++p) {
+        if( !__kmp_threads[p] ) continue;
+        for (q = 0; q < KMP_HASH_TABLE_SIZE; ++q) {
+            if (__kmp_threads[ p ]->th.th_pri_common->data[ q ]) {
+                struct private_common *tn;
+
+                KC_TRACE( 10, ( "\tdump_list: gtid:%d addresses\n", p ) );
+
+                for (tn = __kmp_threads[ p ]->th.th_pri_common->data[ q ]; tn; tn = tn->next) {
+                    KC_TRACE( 10, ( "\tdump_list: THREADPRIVATE: Serial %p -> Parallel %p\n",
+                                    tn->gbl_addr, tn->par_addr ) );
+                }
+            }
+        }
+    }
+}
+#endif /* KMP_TASK_COMMON_DEBUG */
+
+
+/*
+ * NOTE: this routine is to be called only from the serial part of the program.
+ */
+
+void
+kmp_threadprivate_insert_private_data( int gtid, void *pc_addr, void *data_addr, size_t pc_size )
+{
+    struct shared_common **lnk_tn, *d_tn;
+    KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] &&
+            __kmp_threads[ gtid ] -> th.th_root -> r.r_active == 0 );
+
+    d_tn = __kmp_find_shared_task_common( &__kmp_threadprivate_d_table,
+                                          gtid, pc_addr );
+
+    if (d_tn == 0) {
+        d_tn = (struct shared_common *) __kmp_allocate( sizeof( struct shared_common ) );
+
+        d_tn->gbl_addr = pc_addr;
+        d_tn->pod_init = __kmp_init_common_data( data_addr, pc_size );
+/*
+        d_tn->obj_init = 0;  // AC: commented out because __kmp_allocate zeroes the memory
+        d_tn->ct.ctor = 0;
+        d_tn->cct.cctor = 0;
+        d_tn->dt.dtor = 0;
+        d_tn->is_vec = FALSE;
+        d_tn->vec_len = 0L;
+*/
+        d_tn->cmn_size = pc_size;
+
+        __kmp_acquire_lock( &__kmp_global_lock, gtid );
+
+        lnk_tn = &(__kmp_threadprivate_d_table.data[ KMP_HASH(pc_addr) ]);
+
+        d_tn->next = *lnk_tn;
+        *lnk_tn = d_tn;
+
+        __kmp_release_lock( &__kmp_global_lock, gtid );
+    }
+}
+
+struct private_common *
+kmp_threadprivate_insert( int gtid, void *pc_addr, void *data_addr, size_t pc_size )
+{
+    struct private_common *tn, **tt;
+    struct shared_common  *d_tn;
+
+    /* +++++++++ START OF CRITICAL SECTION +++++++++ */
+
+    __kmp_acquire_lock( & __kmp_global_lock, gtid );
+
+    tn = (struct private_common *) __kmp_allocate( sizeof (struct private_common) );
+
+    tn->gbl_addr = pc_addr;
+
+    d_tn = __kmp_find_shared_task_common( &__kmp_threadprivate_d_table,
+                                          gtid, pc_addr );     /* Only the MASTER data table exists. */
+
+    if (d_tn != 0) {
+        /* This threadprivate variable has already been seen. */
+
+        if ( d_tn->pod_init == 0 && d_tn->obj_init == 0 ) {
+            d_tn->cmn_size = pc_size;
+
+            if (d_tn->is_vec) {
+                if (d_tn->ct.ctorv != 0) {
+                    /* Construct from scratch so no prototype exists */
+                    d_tn->obj_init = 0;
+                }
+                else if (d_tn->cct.cctorv != 0) {
+                    /* Data initialize the prototype now, since it was registered earlier */
+                    d_tn->obj_init = (void *) __kmp_allocate( d_tn->cmn_size );
+                    (void) (*d_tn->cct.cctorv) (d_tn->obj_init, pc_addr, d_tn->vec_len);
+                }
+                else {
+                    d_tn->pod_init = __kmp_init_common_data( data_addr, d_tn->cmn_size );
+                }
+            } else {
+                if (d_tn->ct.ctor != 0) {
+                    /* Construct from scratch so no prototype exists */
+                    d_tn->obj_init = 0;
+                }
+                else if (d_tn->cct.cctor != 0) {
+                    /* Data initialize the prototype now, since it was registered earlier */
+                    d_tn->obj_init = (void *) __kmp_allocate( d_tn->cmn_size );
+                    (void) (*d_tn->cct.cctor) (d_tn->obj_init, pc_addr);
+                }
+                else {
+                    d_tn->pod_init = __kmp_init_common_data( data_addr, d_tn->cmn_size );
+                }
+            }
+        }
+    }
+    else {
+        struct shared_common **lnk_tn;
+
+        d_tn = (struct shared_common *) __kmp_allocate( sizeof( struct shared_common ) );
+        d_tn->gbl_addr = pc_addr;
+        d_tn->cmn_size = pc_size;
+        d_tn->pod_init = __kmp_init_common_data( data_addr, pc_size );
+/*
+        d_tn->obj_init = 0;  // AC: commented out because __kmp_allocate zeroes the memory
+        d_tn->ct.ctor = 0;
+        d_tn->cct.cctor = 0;
+        d_tn->dt.dtor = 0;
+        d_tn->is_vec = FALSE;
+        d_tn->vec_len = 0L;
+*/
+        lnk_tn = &(__kmp_threadprivate_d_table.data[ KMP_HASH(pc_addr) ]);
+
+        d_tn->next = *lnk_tn;
+        *lnk_tn = d_tn;
+    }
+
+    tn->cmn_size = d_tn->cmn_size;
+
+    if ( (__kmp_foreign_tp) ? (KMP_INITIAL_GTID (gtid)) : (KMP_UBER_GTID (gtid)) ) {
+        tn->par_addr = (void *) pc_addr;
+    }
+    else {
+        tn->par_addr = (void *) __kmp_allocate( tn->cmn_size );
+    }
+
+    __kmp_release_lock( & __kmp_global_lock, gtid );
+
+    /* +++++++++ END OF CRITICAL SECTION +++++++++ */
+
+#ifdef USE_CHECKS_COMMON
+    if (pc_size > d_tn->cmn_size) {
+        KC_TRACE( 10, ( "__kmp_threadprivate_insert: THREADPRIVATE: %p (%"
+                        KMP_UINTPTR_SPEC " ,%" KMP_UINTPTR_SPEC ")\n",
+                        pc_addr, pc_size, d_tn->cmn_size ) );
+        KMP_FATAL( TPCommonBlocksInconsist );
+    }
+#endif /* USE_CHECKS_COMMON */
+
+    tt = &(__kmp_threads[ gtid ]->th.th_pri_common->data[ KMP_HASH(pc_addr) ]);
+
+#ifdef KMP_TASK_COMMON_DEBUG
+    if (*tt != 0) {
+        KC_TRACE( 10, ( "__kmp_threadprivate_insert: WARNING! thread#%d: collision on %p\n",
+                        gtid, pc_addr ) );
+    }
+#endif
+    tn->next = *tt;
+    *tt = tn;
+
+#ifdef KMP_TASK_COMMON_DEBUG
+    KC_TRACE( 10, ( "__kmp_threadprivate_insert: thread#%d, inserted node %p on list\n",
+                    gtid, pc_addr ) );
+    dump_list( );
+#endif
+
+    /* Link the node into a simple list */
+
+    tn->link = __kmp_threads[ gtid ]->th.th_pri_head;
+    __kmp_threads[ gtid ]->th.th_pri_head = tn;
+
+#ifdef BUILD_TV
+    __kmp_tv_threadprivate_store( __kmp_threads[ gtid ], tn->gbl_addr, tn->par_addr );
+#endif
+
+    if( (__kmp_foreign_tp) ? (KMP_INITIAL_GTID (gtid)) : (KMP_UBER_GTID (gtid)) )
+        return tn;
+
+    /*
+     * if C++ object with copy constructor, use it;
+     * else if C++ object with constructor, use it for the non-master copies only;
+     * else use pod_init and memcpy
+     *
+     * C++ constructors need to be called once for each non-master thread on allocate
+     * C++ copy constructors need to be called once for each thread on allocate
+     */
+
+    /*
+     * C++ object with constructors/destructors;
+     * don't call constructors for master thread though
+     */
+    if (d_tn->is_vec) {
+        if ( d_tn->ct.ctorv != 0) {
+            (void) (*d_tn->ct.ctorv) (tn->par_addr, d_tn->vec_len);
+        } else if (d_tn->cct.cctorv != 0) {
+            (void) (*d_tn->cct.cctorv) (tn->par_addr, d_tn->obj_init, d_tn->vec_len);
+        } else if (tn->par_addr != tn->gbl_addr) {
+            __kmp_copy_common_data( tn->par_addr, d_tn->pod_init );
+        }
+    } else {
+        if ( d_tn->ct.ctor != 0 ) {
+            (void) (*d_tn->ct.ctor) (tn->par_addr);
+        } else if (d_tn->cct.cctor != 0) {
+            (void) (*d_tn->cct.cctor) (tn->par_addr, d_tn->obj_init);
+        } else if (tn->par_addr != tn->gbl_addr) {
+            __kmp_copy_common_data( tn->par_addr, d_tn->pod_init );
+        }
+    }
+/* !BUILD_OPENMP_C
+    if (tn->par_addr != tn->gbl_addr)
+        __kmp_copy_common_data( tn->par_addr, d_tn->pod_init ); */
+
+    return tn;
+}
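+
+/* For orientation: two tables cooperate above.  The global
+ * __kmp_threadprivate_d_table holds one shared_common descriptor per
+ * threadprivate variable (ctor/cctor/dtor pointers and the init template),
+ * while each thread's th.th_pri_common table holds private_common nodes
+ * mapping the variable's global address to that thread's private copy. */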
+
+/* ------------------------------------------------------------------------ */
+/* We are currently parallel, and we know the thread id.                    */
+/* ------------------------------------------------------------------------ */
+
+/*!
+ @ingroup THREADPRIVATE
+
+ @param loc source location information 
+ @param data  pointer to data being privatized 
+ @param ctor  pointer to constructor function for data 
+ @param cctor  pointer to copy constructor function for data 
+ @param dtor  pointer to destructor function for data 
+
+ Register constructors and destructors for thread private data.
+ This function is called when executing in parallel, when we know the thread id.
+*/
+void
+__kmpc_threadprivate_register(ident_t *loc, void *data, kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor)
+{
+    struct shared_common *d_tn, **lnk_tn;
+
+    KC_TRACE( 10, ("__kmpc_threadprivate_register: called\n" ) );
+
+#ifdef USE_CHECKS_COMMON
+    /* copy constructor must be zero for current code gen (Nov 2002 - jph) */
+    KMP_ASSERT( cctor == 0);
+#endif /* USE_CHECKS_COMMON */
+
+    /* Only the global data table exists. */
+    d_tn = __kmp_find_shared_task_common( &__kmp_threadprivate_d_table, -1, data );
+
+    if (d_tn == 0) {
+        d_tn = (struct shared_common *) __kmp_allocate( sizeof( struct shared_common ) );
+        d_tn->gbl_addr = data;
+
+        d_tn->ct.ctor = ctor;
+        d_tn->cct.cctor = cctor;
+        d_tn->dt.dtor = dtor;
+/*
+        d_tn->is_vec = FALSE;  // AC: commented out because __kmp_allocate zeroes the memory
+        d_tn->vec_len = 0L;
+        d_tn->obj_init = 0;
+        d_tn->pod_init = 0;
+*/
+        lnk_tn = &(__kmp_threadprivate_d_table.data[ KMP_HASH(data) ]);
+
+        d_tn->next = *lnk_tn;
+        *lnk_tn = d_tn;
+    }
+}
+
+void *
+__kmpc_threadprivate(ident_t *loc, kmp_int32 global_tid, void *data, size_t size)
+{
+    void *ret;
+    struct private_common *tn;
+
+    KC_TRACE( 10, ("__kmpc_threadprivate: T#%d called\n", global_tid ) );
+
+#ifdef USE_CHECKS_COMMON
+    if (! __kmp_init_serial)
+        KMP_FATAL( RTLNotInitialized );
+#endif /* USE_CHECKS_COMMON */
+
+    if ( ! __kmp_threads[global_tid] -> th.th_root -> r.r_active && ! __kmp_foreign_tp ) {
+        /* The parallel address will NEVER overlap with the data_address */
+        /* dkp: 3rd arg to kmp_threadprivate_insert_private_data() is the data_address; use data_address = data */
+
+        KC_TRACE( 20, ("__kmpc_threadprivate: T#%d inserting private data\n", global_tid ) );
+        kmp_threadprivate_insert_private_data( global_tid, data, data, size );
+
+        ret = data;
+    }
+    else {
+        KC_TRACE( 50, ("__kmpc_threadprivate: T#%d try to find private data at address %p\n",
+                       global_tid, data ) );
+        tn = __kmp_threadprivate_find_task_common( __kmp_threads[ global_tid ]->th.th_pri_common, global_tid, data );
+
+        if ( tn ) {
+            KC_TRACE( 20, ("__kmpc_threadprivate: T#%d found data\n", global_tid ) );
+#ifdef USE_CHECKS_COMMON
+            if ((size_t) size > tn->cmn_size) {
+                KC_TRACE( 10, ( "THREADPRIVATE: %p (%" KMP_UINTPTR_SPEC " ,%" KMP_UINTPTR_SPEC ")\n",
+                                data, size, tn->cmn_size ) );
+                KMP_FATAL( TPCommonBlocksInconsist );
+            }
+#endif /* USE_CHECKS_COMMON */
+        }
+        else {
+            /* The parallel address will NEVER overlap with the data_address */
+            /* dkp: 3rd arg to kmp_threadprivate_insert() is the data_address; use data_address = data */
+            KC_TRACE( 20, ("__kmpc_threadprivate: T#%d inserting data\n", global_tid ) );
+            tn = kmp_threadprivate_insert( global_tid, data, data, size );
+        }
+
+        ret = tn->par_addr;
+    }
+    KC_TRACE( 10, ("__kmpc_threadprivate: T#%d exiting; return value = %p\n",
+                   global_tid, ret ) );
+
+    return ret;
+}
+
+/*!
+ @ingroup THREADPRIVATE
+ @param loc source location information 
+ @param global_tid  global thread number 
+ @param data  pointer to data to privatize 
+ @param size  size of data to privatize 
+ @param cache  pointer to cache 
+ @return pointer to private storage 
+
+ Allocate private storage for threadprivate data. 
+*/
+void *
+__kmpc_threadprivate_cached(
+    ident_t *  loc,
+    kmp_int32  global_tid,   // gtid.
+    void *     data,         // Pointer to original global variable.
+    size_t     size,         // Size of original global variable.
+    void ***   cache
+) {
+    KC_TRACE( 10, ("__kmpc_threadprivate_cached: T#%d called with cache: %p, address: %p, size: %"
+                   KMP_SIZE_T_SPEC "\n",
+                   global_tid, *cache, data, size ) );
+
+    if ( TCR_PTR(*cache) == 0) {
+        __kmp_acquire_lock( & __kmp_global_lock, global_tid );
+
+        if ( TCR_PTR(*cache) == 0) {
+            __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
+            __kmp_tp_cached = 1;
+            __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
+            void ** my_cache;
+            KMP_ITT_IGNORE(
+                my_cache = (void**) __kmp_allocate(
+                    sizeof( void * ) * __kmp_tp_capacity + sizeof( kmp_cached_addr_t ) );
+            );
+            // No need to zero the allocated memory; __kmp_allocate does that.
+            KC_TRACE( 50, ("__kmpc_threadprivate_cached: T#%d allocated cache at address %p\n",
+                           global_tid, my_cache ) );
+            
+            /* TODO: free all this memory in __kmp_common_destroy using __kmp_threadpriv_cache_list */
+            /* Add address of mycache to linked list for cleanup later  */
+            kmp_cached_addr_t *tp_cache_addr;
+
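+            /* The kmp_cached_addr_t record lives in the same allocation,   */
+            /* just past the __kmp_tp_capacity cache slots, so one          */
+            /* __kmp_allocate call covers both the cache and its cleanup    */
+            /* list node.                                                   */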
+            tp_cache_addr = (kmp_cached_addr_t *) & my_cache[__kmp_tp_capacity];
+            tp_cache_addr -> addr = my_cache;
+            tp_cache_addr -> next = __kmp_threadpriv_cache_list;
+            __kmp_threadpriv_cache_list = tp_cache_addr;
+
+            KMP_MB();
+
+            TCW_PTR( *cache, my_cache);
+
+            KMP_MB();
+        }
+
+        __kmp_release_lock( & __kmp_global_lock, global_tid );
+    }
+
+    void *ret;
+    if ((ret = TCR_PTR((*cache)[ global_tid ])) == 0) {
+        ret = __kmpc_threadprivate( loc, global_tid, data, (size_t) size);
+
+        TCW_PTR( (*cache)[ global_tid ], ret);
+    }
+    KC_TRACE( 10, ("__kmpc_threadprivate_cached: T#%d exiting; return value = %p\n",
+                   global_tid, ret ) );
+
+    return ret;
+}
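+
+/* Illustrative only: a hedged sketch of the calls a compiler might emit for
+ * a threadprivate access; "tp_cache" and "gbl" are hypothetical names.
+ *
+ *     static void **tp_cache;    // zero-initialized, one cache per variable
+ *     int *priv = (int *) __kmpc_threadprivate_cached(
+ *         &loc, gtid, (void *) &gbl, sizeof( gbl ), &tp_cache );
+ */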
+
+/*!
+ @ingroup THREADPRIVATE
+ @param loc source location information 
+ @param data  pointer to data being privatized 
+ @param ctor  pointer to constructor function for data 
+ @param cctor  pointer to copy constructor function for data 
+ @param dtor  pointer to destructor function for data 
+ @param vector_length length of the vector (number of elements)
+ Register vector constructors and destructors for thread private data.
+*/
+void
+__kmpc_threadprivate_register_vec( ident_t *loc, void *data, kmpc_ctor_vec ctor,
+                                   kmpc_cctor_vec cctor, kmpc_dtor_vec dtor,
+                                   size_t vector_length )
+{
+    struct shared_common *d_tn, **lnk_tn;
+
+    KC_TRACE( 10, ("__kmpc_threadprivate_register_vec: called\n" ) );
+
+#ifdef USE_CHECKS_COMMON
+    /* copy constructor must be zero for current code gen (Nov 2002 - jph) */
+    KMP_ASSERT( cctor == 0);
+#endif /* USE_CHECKS_COMMON */
+
+    d_tn = __kmp_find_shared_task_common( &__kmp_threadprivate_d_table,
+                                          -1, data );        /* Only the global data table exists. */
+
+    if (d_tn == 0) {
+        d_tn = (struct shared_common *) __kmp_allocate( sizeof( struct shared_common ) );
+        d_tn->gbl_addr = data;
+
+        d_tn->ct.ctorv = ctor;
+        d_tn->cct.cctorv = cctor;
+        d_tn->dt.dtorv = dtor;
+        d_tn->is_vec = TRUE;
+        d_tn->vec_len = (size_t) vector_length;
+/*
+        d_tn->obj_init = 0;  // AC: commented out because __kmp_allocate zeroes the memory
+        d_tn->pod_init = 0;
+*/
+        lnk_tn = &(__kmp_threadprivate_d_table.data[ KMP_HASH(data) ]);
+
+        d_tn->next = *lnk_tn;
+        *lnk_tn = d_tn;
+    }
+}
diff --git a/final/runtime/src/kmp_utility.c b/final/runtime/src/kmp_utility.c
new file mode 100644
index 0000000..7e9f07c
--- /dev/null
+++ b/final/runtime/src/kmp_utility.c
@@ -0,0 +1,436 @@
+/*
+ * kmp_utility.c -- Utility routines for the OpenMP support library.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_wrapper_getpid.h"
+#include "kmp_str.h"
+#include <float.h>
+#include "kmp_i18n.h"
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+static const char *unknown = "unknown";
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+/* NOTE: If called before serial_initialize (i.e. from runtime_initialize), then */
+/* the debugging package has not been initialized yet, and only "0" will print   */
+/* debugging output since the environment variables have not been read.          */
+
+#ifdef KMP_DEBUG
+static int trace_level = 5;
+#endif
+
+/*
+ * LOG_ID_BITS  = ( 1 + floor( log_2( max( log_per_phy - 1, 1 ))))
+ * APIC_ID      = (PHY_ID << LOG_ID_BITS) | LOG_ID
+ * PHY_ID       = APIC_ID >> LOG_ID_BITS
+ */
+int
+__kmp_get_physical_id( int log_per_phy, int apic_id )
+{
+   int index_lsb, index_msb, temp;
+
+   if (log_per_phy > 1) {
+	index_lsb = 0;
+	index_msb = 31;
+
+	temp = log_per_phy;
+        while ( (temp & 1) == 0 ) {
+	    temp  >>= 1;
+	    index_lsb++;
+	}
+
+	temp = log_per_phy;
+	while ( (temp & 0x80000000)==0 ) {
+	    temp <<= 1;
+	    index_msb--;
+	}
+
+	/* If >1 bits were set in log_per_phy, choose next higher power of 2 */
+	if (index_lsb != index_msb) index_msb++;
+
+	return ( (int) (apic_id >> index_msb) );
+   }
+
+   return apic_id;
+}
+
+
+/*
+ * LOG_ID_BITS  = ( 1 + floor( log_2( max( log_per_phy - 1, 1 ))))
+ * APIC_ID      = (PHY_ID << LOG_ID_BITS) | LOG_ID
+ * LOG_ID       = APIC_ID & (( 1 << LOG_ID_BITS ) - 1 )
+ */
+int
+__kmp_get_logical_id( int log_per_phy, int apic_id )
+{
+   unsigned current_bit;
+   int bits_seen;
+
+   if (log_per_phy <= 1) return ( 0 );
+
+   bits_seen = 0;
+
+   for (current_bit = 1; log_per_phy != 0; current_bit <<= 1) {
+	if ( log_per_phy & current_bit ) {
+	    log_per_phy &= ~current_bit;
+	    bits_seen++;
+	}
+   }
+
+   /* If exactly 1 bit was set in log_per_phy, choose next lower power of 2 */
+   if (bits_seen == 1) {
+	current_bit >>= 1;
+   }
+
+   return ( (int) ((current_bit - 1) & apic_id) );
+}
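+
+/* Illustrative sketch (values assumed, not from the original source): with log_per_phy = 4
+   the APIC ID carries two LOG_ID bits, so apic_id 0x0B splits into PHY_ID = 2, LOG_ID = 3: */
+#if 0   /* example only */
+    KMP_DEBUG_ASSERT( __kmp_get_physical_id( 4, 0x0B ) == 2 );   /* 0x0B >> 2  */
+    KMP_DEBUG_ASSERT( __kmp_get_logical_id(  4, 0x0B ) == 3 );   /* 0x0B & 0x3 */
+    /* A non-power-of-two log_per_phy (e.g. 3) is rounded up to the next power of two,
+       so the same two bits are used. */
+#endif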
+
+
+static
+kmp_uint64
+__kmp_parse_frequency(        // R: Frequency in Hz.
+    char const * frequency    // I: Float number and unit: MHz, GHz, or THz.
+) {
+
+    double       value  = 0.0;
+    char const * unit   = NULL;
+    kmp_uint64   result = ~ 0;
+
+    if ( frequency == NULL ) {
+        return result;
+    }; // if
+    value = strtod( frequency, (char * *) & unit ); // strtod() does not like "char const *".
+    if ( 0 < value && value <= DBL_MAX ) {          // Good value (not overflow, underflow, etc).
+        if ( strcmp( unit, "MHz" ) == 0 ) {
+            value = value * 1.0E+6;
+        } else if ( strcmp( unit, "GHz" ) == 0 ) {
+            value = value * 1.0E+9;
+        } else if ( strcmp( unit, "THz" ) == 0 ) {
+            value = value * 1.0E+12;
+        } else {                      // Wrong unit.
+            return result;
+        }; // if
+        result = value;
+    }; // if
+    return result;
+
+}; // func __kmp_parse_frequency
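+
+/* Illustrative sketch (input strings assumed): the brand-string tail is scaled by its unit: */
+#if 0   /* example only */
+    kmp_uint64 hz;
+    hz = __kmp_parse_frequency( "3.20GHz" );   /* -> 3200000000               */
+    hz = __kmp_parse_frequency( "800MHz"  );   /* -> 800000000                */
+    hz = __kmp_parse_frequency( "3.20XHz" );   /* unknown unit -> ~0 (error)  */
+#endif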
+
+void
+__kmp_query_cpuid( kmp_cpuinfo_t *p )
+{
+    struct kmp_cpuid buf;
+    int max_arg;
+    int log_per_phy;
+#ifdef KMP_DEBUG
+    int cflush_size;
+#endif
+
+    p->initialized = 1;
+
+    p->sse2 = 1; // Assume SSE2 by default.
+
+    __kmp_x86_cpuid( 0, 0, &buf );
+
+    KA_TRACE( trace_level, ("INFO: CPUID %d: EAX=0x%08X EBX=0x%08X ECX=0x%08X EDX=0x%08X\n",
+        0, buf.eax, buf.ebx, buf.ecx, buf.edx ) );
+
+    max_arg = buf.eax;
+
+    p->apic_id = -1;
+
+    if (max_arg >= 1) {
+        int i;
+        kmp_uint32 t, data[ 4 ];
+
+        __kmp_x86_cpuid( 1, 0, &buf );
+        KA_TRACE( trace_level, ("INFO: CPUID %d: EAX=0x%08X EBX=0x%08X ECX=0x%08X EDX=0x%08X\n",
+                                1, buf.eax, buf.ebx, buf.ecx, buf.edx ) );
+
+        {
+#define get_value(reg,lo,mask) ( ( ( reg ) >> ( lo ) ) & ( mask  ) )
+
+            p->signature = buf.eax;
+            p->family    =   get_value( buf.eax, 20, 0xff )        + get_value( buf.eax, 8, 0x0f );
+            p->model     = ( get_value( buf.eax, 16, 0x0f ) << 4 ) + get_value( buf.eax, 4, 0x0f );
+            p->stepping  =   get_value( buf.eax,  0, 0x0f );
+
+#undef get_value
+
+            KA_TRACE( trace_level, (" family = %d, model = %d, stepping = %d\n", p->family, p->model, p->stepping ) );
+        }
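+
+        /* Illustrative decode (signature value assumed): for EAX == 0x000306C3 the fields give
+           family = 0x0 + 0x6 = 6, model = (0x3 << 4) + 0xC = 60, stepping = 3; the extended
+           family/model bits are folded into the base values as above. */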
+
+        for ( t = buf.ebx, i = 0; i < 4; t >>= 8, ++i ) {
+            data[ i ] = (t & 0xff);
+        }; // for
+
+        p->sse2 = ( buf.edx >> 26 ) & 1;
+
+#ifdef KMP_DEBUG
+
+        if ( (buf.edx >> 4) & 1 ) {
+            /* TSC - Timestamp Counter Available */
+            KA_TRACE( trace_level, (" TSC" ) );
+        }
+        if ( (buf.edx >> 8) & 1 ) {
+            /* CX8 - CMPXCHG8B Instruction Available */
+            KA_TRACE( trace_level, (" CX8" ) );
+        }
+        if ( (buf.edx >> 9) & 1 ) {
+            /* APIC - Local APIC Present (multi-processor operation support) */
+            KA_TRACE( trace_level, (" APIC" ) );
+        }
+        if ( (buf.edx >> 15) & 1 ) {
+            /* CMOV - Conditional MOVe Instruction Available */
+            KA_TRACE( trace_level, (" CMOV" ) );
+        }
+        if ( (buf.edx >> 18) & 1 ) {
+            /* PSN - Processor Serial Number Available */
+            KA_TRACE( trace_level, (" PSN" ) );
+        }
+        if ( (buf.edx >> 19) & 1 ) {
+            /* CLFLUSH - Cache Flush Instruction Available */
+            cflush_size = data[ 1 ] * 8;    /* Bits 15-08: CLFLUSH line size = 8 (64 bytes) */
+            KA_TRACE( trace_level, (" CLFLUSH(%db)", cflush_size ) );
+
+        }
+        if ( (buf.edx >> 21) & 1 ) {
+            /* DTES - Debug Trace & EMON Store */
+            KA_TRACE( trace_level, (" DTES" ) );
+        }
+        if ( (buf.edx >> 22) & 1 ) {
+            /* ACPI - ACPI Support Available */
+            KA_TRACE( trace_level, (" ACPI" ) );
+        }
+        if ( (buf.edx >> 23) & 1 ) {
+            /* MMX - Multimedia Extensions */
+            KA_TRACE( trace_level, (" MMX" ) );
+        }
+        if ( (buf.edx >> 25) & 1 ) {
+            /* SSE - SSE Instructions */
+            KA_TRACE( trace_level, (" SSE" ) );
+        }
+        if ( (buf.edx >> 26) & 1 ) {
+            /* SSE2 - SSE2 Instructions */
+            KA_TRACE( trace_level, (" SSE2" ) );
+        }
+        if ( (buf.edx >> 27) & 1 ) {
+            /* SLFSNP - Self-Snooping Cache */
+            KA_TRACE( trace_level, (" SLFSNP" ) );
+        }
+#endif /* KMP_DEBUG */
+
+        if ( (buf.edx >> 28) & 1 ) {
+            /* Bits 23-16: Logical Processors per Physical Processor (1 for P4) */
+            log_per_phy = data[ 2 ];
+            p->apic_id     = data[ 3 ]; /* Bits 31-24: Processor Initial APIC ID (X) */
+            KA_TRACE( trace_level, (" HT(%d TPUs)", log_per_phy ) );
+
+            if( log_per_phy > 1 ) {
+                /* default to 1k for HT-enabled processors (4k on OS X*) */
+#if KMP_OS_DARWIN
+                p->cpu_stackoffset = 4 * 1024;
+#else
+                p->cpu_stackoffset = 1 * 1024;
+#endif
+            }
+
+            p->physical_id = __kmp_get_physical_id( log_per_phy, p->apic_id );
+            p->logical_id  = __kmp_get_logical_id( log_per_phy, p->apic_id );
+        }
+#ifdef KMP_DEBUG
+        if ( (buf.edx >> 29) & 1 ) {
+            /* ATHROTL - Automatic Throttle Control */
+            KA_TRACE( trace_level, (" ATHROTL" ) );
+        }
+        KA_TRACE( trace_level, (" ]\n" ) );
+
+        for (i = 2; i <= max_arg; ++i) {
+            __kmp_x86_cpuid( i, 0, &buf );
+            KA_TRACE( trace_level,
+                      ( "INFO: CPUID %d: EAX=0x%08X EBX=0x%08X ECX=0x%08X EDX=0x%08X\n",
+                        i, buf.eax, buf.ebx, buf.ecx, buf.edx ) );
+        }
+#endif
+#if KMP_USE_ADAPTIVE_LOCKS
+        p->rtm = 0;
+        if (max_arg >= 7)   // CPUID leaf 7 is available when the max leaf is at least 7.
+        {
+            /* RTM bit CPUID.07:EBX, bit 11 */
+            __kmp_x86_cpuid(7, 0, &buf);
+            p->rtm = (buf.ebx >> 11) & 1;
+            KA_TRACE( trace_level, (" RTM" ) );
+        }
+#endif
+    }; // if
+
+    { // Parse CPU brand string for frequency.
+
+        union kmp_cpu_brand_string {
+            struct kmp_cpuid buf[ 3 ];
+            char             string[ sizeof( struct kmp_cpuid ) * 3 + 1 ];
+        }; // union kmp_cpu_brand_string
+        union kmp_cpu_brand_string brand;
+        int i;
+
+        p->frequency = 0;
+
+        // Get CPU brand string.
+        for ( i = 0; i < 3; ++ i ) {
+            __kmp_x86_cpuid( 0x80000002 + i, 0, &brand.buf[ i ] );
+        }; // for
+        brand.string[ sizeof( brand.string ) - 1 ] = 0; // Just in case. ;-)
+        KA_TRACE( trace_level, ( "cpu brand string: \"%s\"\n", brand.string ) );
+
+        // Parse frequency.
+        p->frequency = __kmp_parse_frequency( strrchr( brand.string, ' ' ) );
+        KA_TRACE( trace_level, ( "cpu frequency from brand string: %" KMP_UINT64_SPEC "\n", p->frequency ) );
+    }
+}
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+/* ------------------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------------------ */
+
+void
+__kmp_expand_host_name( char *buffer, size_t size )
+{
+    KMP_DEBUG_ASSERT(size >= sizeof(unknown));
+#if KMP_OS_WINDOWS
+    {
+	DWORD	s = size;
+
+	if (! GetComputerNameA( buffer, & s ))
+	    KMP_STRCPY_S( buffer, size, unknown );
+    }
+#else
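+    /* buffer[size - 2] acts as a sentinel: if gethostname() overwrites it, the name
+       did not fit in the buffer and we fall back to "unknown". */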
+    buffer[size - 2] = 0;
+    if (gethostname( buffer, size ) || buffer[size - 2] != 0)
+	KMP_STRCPY_S( buffer, size, unknown );
+#endif
+}
+
+/* Expand the meta characters in the filename:
+ *
+ * Currently defined characters are:
+ *
+ * %H the hostname
+ * %P the number of threads used.
+ * %I the unique identifier for this run.
+ */
+
+void
+__kmp_expand_file_name( char *result, size_t rlen, char *pattern )
+{
+    char	*pos = result, *end = result + rlen - 1;
+    char	 buffer[256];
+    int		 default_cpu_width = 1;
+    int          snp_result;
+
+    KMP_DEBUG_ASSERT(rlen > 0);
+    *end = 0;
+    {
+	int i;
+	for(i = __kmp_xproc; i >= 10; i /= 10, ++default_cpu_width);
+    }
+
+    if (pattern != NULL) {
+	while (*pattern != '\0' && pos < end) {
+	    if (*pattern != '%') {
+		*pos++ = *pattern++;
+	    } else {
+		char *old_pattern = pattern;
+		int width = 1;
+		int cpu_width = default_cpu_width;
+
+		++pattern;
+
+		if (*pattern >= '0' && *pattern <= '9') {
+		    width = 0;
+		    do {
+			width = (width * 10) + *pattern++ - '0';
+		    } while (*pattern >= '0' && *pattern <= '9');
+		    if (width < 0 || width > 1024)
+			width = 1;
+
+		    cpu_width = width;
+		}
+
+		switch (*pattern) {
+		case 'H':
+		case 'h':
+		    {
+			__kmp_expand_host_name( buffer, sizeof( buffer ) );
+			KMP_STRNCPY( pos,  buffer, end - pos + 1);
+			if(*end == 0) {
+			    while ( *pos )
+				++pos;
+			    ++pattern;
+			} else
+			    pos = end;
+		    }
+		    break;
+		case 'P':
+		case 'p':
+		    {
+			snp_result = KMP_SNPRINTF( pos, end - pos + 1, "%0*d", cpu_width, __kmp_dflt_team_nth );
+			if(snp_result >= 0 && snp_result <= end - pos) {
+			    while ( *pos )
+				++pos;
+			    ++pattern;
+			} else
+			    pos = end;
+		    }
+		    break;
+		case 'I':
+		case 'i':
+		    {
+			pid_t id = getpid();
+			snp_result = KMP_SNPRINTF( pos, end - pos + 1, "%0*d", width, id );
+			if(snp_result >= 0 && snp_result <= end - pos) {
+			    while ( *pos )
+				++pos;
+			    ++pattern;
+			} else
+			    pos = end;
+			break;
+		    }
+		case '%':
+		    {
+			*pos++ = '%';
+			++pattern;
+			break;
+		    }
+		default:
+		    {
+			*pos++ = '%';
+			pattern = old_pattern + 1;
+			break;
+		    }
+		}
+	    }
+	}
+	/* TODO: How do we get rid of this? */
+	if(*pattern != '\0')
+	    KMP_FATAL( FileNameTooLong );
+    }
+
+    *pos = '\0';
+}
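+
+/* Illustrative sketch (host name, proc count, and pid are assumed): */
+#if 0   /* example only */
+    char name[ 64 ];
+    __kmp_expand_file_name( name, sizeof( name ), (char *) "dump.%H.%P.%4I" );
+    /* On host "node01" with __kmp_xproc == 16 (two-digit cpu width),
+       __kmp_dflt_team_nth == 8, and pid 42, this yields "dump.node01.08.0042". */
+#endif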
+
diff --git a/final/runtime/src/kmp_version.c b/final/runtime/src/kmp_version.c
new file mode 100644
index 0000000..1d0b9e4
--- /dev/null
+++ b/final/runtime/src/kmp_version.c
@@ -0,0 +1,209 @@
+/*
+ * kmp_version.c
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_io.h"
+#include "kmp_version.h"
+
+// Replace with snapshot date YYYYMMDD for promotion build.
+#define KMP_VERSION_BUILD    20140926
+
+// Helper macros to convert value of macro to string literal.
+#define _stringer( x ) #x
+#define stringer( x )  _stringer( x )
+
+// Detect compiler.
+#if KMP_COMPILER_ICC
+    #if   __INTEL_COMPILER == 1010
+        #define KMP_COMPILER "Intel C++ Compiler 10.1"
+    #elif __INTEL_COMPILER == 1100
+        #define KMP_COMPILER "Intel C++ Compiler 11.0"
+    #elif __INTEL_COMPILER == 1110
+        #define KMP_COMPILER "Intel C++ Compiler 11.1"
+    #elif __INTEL_COMPILER == 1200
+        #define KMP_COMPILER "Intel C++ Compiler 12.0"
+    #elif __INTEL_COMPILER == 1210
+        #define KMP_COMPILER "Intel C++ Compiler 12.1"
+    #elif __INTEL_COMPILER == 1300
+        #define KMP_COMPILER "Intel C++ Compiler 13.0"
+    #elif __INTEL_COMPILER == 1310
+        #define KMP_COMPILER "Intel C++ Compiler 13.1"
+    #elif __INTEL_COMPILER == 1400
+        #define KMP_COMPILER "Intel C++ Compiler 14.0"
+    #elif __INTEL_COMPILER == 1410
+        #define KMP_COMPILER "Intel C++ Compiler 14.1"
+    #elif __INTEL_COMPILER == 1500
+        #define KMP_COMPILER "Intel C++ Compiler 15.0"
+    #elif __INTEL_COMPILER == 9999
+        #define KMP_COMPILER "Intel C++ Compiler mainline"
+    #endif
+#elif KMP_COMPILER_CLANG
+    #define KMP_COMPILER "Clang " stringer( __clang_major__ ) "." stringer( __clang_minor__ )
+#elif KMP_COMPILER_GCC
+    #define KMP_COMPILER "GCC " stringer( __GNUC__ ) "." stringer( __GNUC_MINOR__ )
+#elif KMP_COMPILER_MSVC
+    #define KMP_COMPILER "MSVC " stringer( _MSC_FULL_VER )
+#endif
+#ifndef KMP_COMPILER
+    #warning "Unknown compiler"
+    #define KMP_COMPILER "unknown compiler"
+#endif
+
+// Detect library type (perf, stub).
+#ifdef KMP_STUB
+    #define KMP_LIB_TYPE "stub"
+#else
+    #define KMP_LIB_TYPE "performance"
+#endif // KMP_LIB_TYPE
+
+// Detect link type (static, dynamic).
+#ifdef KMP_DYNAMIC_LIB
+    #define KMP_LINK_TYPE "dynamic"
+#else
+    #define KMP_LINK_TYPE "static"
+#endif // KMP_LINK_TYPE
+
+// Finally, define strings.
+#define KMP_LIBRARY   KMP_LIB_TYPE " library (" KMP_LINK_TYPE ")"
+#define KMP_COPYRIGHT ""
+
+int const __kmp_version_major = KMP_VERSION_MAJOR;
+int const __kmp_version_minor = KMP_VERSION_MINOR;
+int const __kmp_version_build = KMP_VERSION_BUILD;
+int const __kmp_openmp_version =
+    #if OMP_40_ENABLED
+        201307;
+    #else
+        201107;
+    #endif
+
+/* Do NOT change the format of this string!  Intel(R) Thread Profiler checks for a
+   specific format; some changes in the recognition routine there need to
+   be made before this is changed.
+*/
+char const __kmp_copyright[] =
+    KMP_VERSION_PREFIX KMP_LIBRARY
+    " ver. " stringer( KMP_VERSION_MAJOR ) "." stringer( KMP_VERSION_MINOR )
+    "." stringer( KMP_VERSION_BUILD ) " "
+    KMP_COPYRIGHT;
+
+char const __kmp_version_copyright[]      = KMP_VERSION_PREFIX KMP_COPYRIGHT;
+char const __kmp_version_lib_ver[]        = KMP_VERSION_PREFIX "version: " stringer( KMP_VERSION_MAJOR ) "." stringer( KMP_VERSION_MINOR ) "." stringer( KMP_VERSION_BUILD );
+char const __kmp_version_lib_type[]       = KMP_VERSION_PREFIX "library type: " KMP_LIB_TYPE;
+char const __kmp_version_link_type[]      = KMP_VERSION_PREFIX "link type: " KMP_LINK_TYPE;
+char const __kmp_version_build_time[]     = KMP_VERSION_PREFIX "build time: " __DATE__ " " __TIME__;
+#if KMP_MIC2
+    char const __kmp_version_target_env[] = KMP_VERSION_PREFIX "target environment: MIC2";
+#endif
+char const __kmp_version_build_compiler[] = KMP_VERSION_PREFIX "build compiler: " KMP_COMPILER;
+
+//
+// Called at serial initialization time.
+//
+static int __kmp_version_1_printed = FALSE;
+
+void
+__kmp_print_version_1( void )
+{
+    if ( __kmp_version_1_printed ) {
+        return;
+    }; // if
+    __kmp_version_1_printed = TRUE;
+
+    #ifndef KMP_STUB
+        kmp_str_buf_t buffer;
+        __kmp_str_buf_init( & buffer );
+        // Print version strings skipping initial magic.
+        __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_lib_ver[ KMP_VERSION_MAGIC_LEN ] );
+        __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_lib_type[ KMP_VERSION_MAGIC_LEN ] );
+        __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_link_type[ KMP_VERSION_MAGIC_LEN ] );
+        __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_build_time[ KMP_VERSION_MAGIC_LEN ] );
+      #if KMP_MIC2
+        __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_target_env[ KMP_VERSION_MAGIC_LEN ] );
+      #endif
+        __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_build_compiler[ KMP_VERSION_MAGIC_LEN ] );
+        #if defined(KMP_GOMP_COMPAT)
+            __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_alt_comp[ KMP_VERSION_MAGIC_LEN ] );
+        #endif /* defined(KMP_GOMP_COMPAT) */
+        __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_omp_api[ KMP_VERSION_MAGIC_LEN ] );
+        __kmp_str_buf_print( & buffer, "%sdynamic error checking: %s\n", KMP_VERSION_PREF_STR, ( __kmp_env_consistency_check ? "yes" : "no" )  );
+        #ifdef KMP_DEBUG
+            for ( int i = bs_plain_barrier; i < bs_last_barrier; ++ i ) {
+                __kmp_str_buf_print(
+                    & buffer,
+                    "%s%s barrier branch bits: gather=%u, release=%u\n",
+                    KMP_VERSION_PREF_STR,
+                    __kmp_barrier_type_name[ i ],
+                    __kmp_barrier_gather_branch_bits[ i ],
+                    __kmp_barrier_release_branch_bits[ i ]
+                ); // __kmp_str_buf_print
+            }; // for i
+            for ( int i = bs_plain_barrier; i < bs_last_barrier; ++ i ) {
+                __kmp_str_buf_print(
+                    & buffer,
+                    "%s%s barrier pattern: gather=%s, release=%s\n",
+                    KMP_VERSION_PREF_STR,
+                    __kmp_barrier_type_name[ i ],
+                    __kmp_barrier_pattern_name[ __kmp_barrier_gather_pattern[ i ] ],
+                    __kmp_barrier_pattern_name[ __kmp_barrier_release_pattern[ i ] ]
+                ); // __kmp_str_buf_print
+            }; // for i
+            __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_lock[ KMP_VERSION_MAGIC_LEN ] );
+        #endif
+        __kmp_str_buf_print(
+            & buffer,
+            "%sthread affinity support: %s\n",
+            KMP_VERSION_PREF_STR,
+            #if KMP_AFFINITY_SUPPORTED
+                (
+                    KMP_AFFINITY_CAPABLE()
+                    ?
+                    (
+                        __kmp_affinity_type == affinity_none
+                        ?
+                        "not used"
+                        :
+                        "yes"
+                    )
+                    :
+                    "no"
+                )
+            #else
+                "no"
+            #endif
+        );
+        __kmp_printf( "%s", buffer.str );
+        __kmp_str_buf_free( & buffer );
+        K_DIAG( 1, ( "KMP_VERSION is true\n" ) );
+    #endif // KMP_STUB
+} // __kmp_print_version_1
+
+//
+// Called at parallel initialization time.
+//
+static int __kmp_version_2_printed = FALSE;
+
+void
+__kmp_print_version_2( void ) {
+    if ( __kmp_version_2_printed ) {
+        return;
+    }; // if
+    __kmp_version_2_printed = TRUE;
+
+    #ifndef KMP_STUB
+    #endif // KMP_STUB
+} // __kmp_print_version_2
+
+// end of file //
diff --git a/final/runtime/src/kmp_version.h b/final/runtime/src/kmp_version.h
new file mode 100644
index 0000000..212853b
--- /dev/null
+++ b/final/runtime/src/kmp_version.h
@@ -0,0 +1,68 @@
+/*
+ * kmp_version.h -- version number for this release
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_VERSION_H
+#define KMP_VERSION_H
+
+#ifdef __cplusplus
+    extern "C" {
+#endif // __cplusplus
+
+#ifndef KMP_VERSION_MAJOR
+    #error KMP_VERSION_MAJOR macro is not defined.
+#endif
+#define KMP_VERSION_MINOR       0
+/*
+    Using a "magic" prefix in all the version strings is rather convenient for getting static
+    version info from binaries with the standard utilities "strings" and "grep", e.g.:
+        $ strings libomp.so | grep "@(#)"
+    gives a clean list of all version strings in the library. The leading zero helps to keep the
+    version string separate from printable characters which may occur just before the version
+    string.
+*/
+#define KMP_VERSION_MAGIC_STR   "\x00@(#) "
+#define KMP_VERSION_MAGIC_LEN   6                // Length of KMP_VERSION_MAGIC_STR.
+#define KMP_VERSION_PREF_STR    "Intel(R) OMP "
+#define KMP_VERSION_PREFIX      KMP_VERSION_MAGIC_STR KMP_VERSION_PREF_STR
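+
+/*
+    Illustrative note (the major version value is assumed): a string built with
+    KMP_VERSION_PREFIX is stored as "\x00@(#) Intel(R) OMP version: 5.0.20140926",
+    so `strings` prints the readable part after the leading NUL byte.
+*/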
+
+/* declare all the version string constants for KMP_VERSION env. variable */
+extern int  const __kmp_version_major;
+extern int  const __kmp_version_minor;
+extern int  const __kmp_version_build;
+extern int  const __kmp_openmp_version;
+extern char const __kmp_copyright[];    // Old variable, kept for compatibility with ITC and ITP.
+extern char const __kmp_version_copyright[];
+extern char const __kmp_version_lib_ver[];
+extern char const __kmp_version_lib_type[];
+extern char const __kmp_version_link_type[];
+extern char const __kmp_version_build_time[];
+extern char const __kmp_version_target_env[];
+extern char const __kmp_version_build_compiler[];
+extern char const __kmp_version_alt_comp[];
+extern char const __kmp_version_omp_api[];
+// ??? extern char const __kmp_version_debug[];
+extern char const __kmp_version_lock[];
+extern char const __kmp_version_nested_stats_reporting[];
+extern char const __kmp_version_ftnstdcall[];
+extern char const __kmp_version_ftncdecl[];
+extern char const __kmp_version_ftnextra[];
+
+void __kmp_print_version_1( void );
+void __kmp_print_version_2( void );
+
+#ifdef __cplusplus
+    } // extern "C"
+#endif // __cplusplus
+
+#endif /* KMP_VERSION_H */
diff --git a/final/runtime/src/kmp_wait_release.cpp b/final/runtime/src/kmp_wait_release.cpp
new file mode 100644
index 0000000..d865bf6
--- /dev/null
+++ b/final/runtime/src/kmp_wait_release.cpp
@@ -0,0 +1,50 @@
+/*
+ * kmp_wait_release.cpp -- Wait/Release implementation
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp_wait_release.h"
+
+void __kmp_wait_32(kmp_info_t *this_thr, kmp_flag_32 *flag, int final_spin
+                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
+{
+    __kmp_wait_template(this_thr, flag, final_spin
+                        USE_ITT_BUILD_ARG(itt_sync_obj) );
+}
+
+void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, int final_spin
+                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
+{
+    __kmp_wait_template(this_thr, flag, final_spin
+                        USE_ITT_BUILD_ARG(itt_sync_obj) );
+}
+
+void __kmp_wait_oncore(kmp_info_t *this_thr, kmp_flag_oncore *flag, int final_spin
+                       USE_ITT_BUILD_ARG(void * itt_sync_obj) )
+{
+    __kmp_wait_template(this_thr, flag, final_spin
+                        USE_ITT_BUILD_ARG(itt_sync_obj) );
+}
+
+
+
+void __kmp_release_32(kmp_flag_32 *flag) {
+    __kmp_release_template(flag);
+}
+
+void __kmp_release_64(kmp_flag_64 *flag) {
+    __kmp_release_template(flag);
+}
+
+void __kmp_release_oncore(kmp_flag_oncore *flag) {
+    __kmp_release_template(flag);
+}
diff --git a/final/runtime/src/kmp_wait_release.h b/final/runtime/src/kmp_wait_release.h
new file mode 100644
index 0000000..dde1763
--- /dev/null
+++ b/final/runtime/src/kmp_wait_release.h
@@ -0,0 +1,548 @@
+/*
+ * kmp_wait_release.h -- Wait/Release implementation
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_WAIT_RELEASE_H
+#define KMP_WAIT_RELEASE_H
+
+#include "kmp.h"
+#include "kmp_itt.h"
+
+/*!
+@defgroup WAIT_RELEASE Wait/Release operations
+
+The definitions and functions here implement the lowest level thread
+synchronizations of suspending a thread and waking it. They are used
+to build higher level operations such as barriers and fork/join.
+*/
+
+/*!
+@ingroup WAIT_RELEASE
+@{
+*/
+
+/*! 
+ * The flag_type describes the storage used for the flag.
+ */
+enum flag_type {
+    flag32,        /**< 32 bit flags */
+    flag64,        /**< 64 bit flags */
+    flag_oncore    /**< special 64-bit flag for on-core barrier (hierarchical) */
+};
+
+/*!
+ * Base class for wait/release volatile flag
+ */
+template <typename P>
+class kmp_flag {
+    volatile P * loc;  /**< Pointer to the flag storage that is modified by another thread */
+    flag_type t;       /**< "Type" of the flag in loc */
+ public:
+    typedef P flag_t;
+    kmp_flag(volatile P *p, flag_type ft) : loc(p), t(ft) {}
+    /*!
+     * @result the pointer to the actual flag
+     */
+    volatile P * get() { return loc; }
+    /*!
+     * @result the flag_type
+     */
+    flag_type get_type() { return t; }
+    // Derived classes must provide the following:
+    /*
+    kmp_info_t * get_waiter(kmp_uint32 i);
+    kmp_uint32 get_num_waiters();
+    bool done_check();
+    bool done_check_val(P old_loc);
+    bool notdone_check();
+    P internal_release();
+    P set_sleeping();
+    P unset_sleeping();
+    bool is_sleeping();
+    bool is_sleeping_val(P old_loc);
+    */
+};
+
+/* Spin wait loop that first does pause, then yield, then sleep. A thread that calls __kmp_wait_*
+   must make certain that another thread calls __kmp_release to wake it back up to prevent deadlocks!  */
+template <class C>
+static inline void __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin
+                                       USE_ITT_BUILD_ARG(void * itt_sync_obj) )
+{
+    // NOTE: We may not belong to a team at this point.
+    volatile typename C::flag_t *spin = flag->get();
+    kmp_uint32 spins;
+    kmp_uint32 hibernate;
+    int th_gtid;
+    int tasks_completed = FALSE;
+
+    KMP_FSYNC_SPIN_INIT(spin, NULL);
+    if (flag->done_check()) {
+        KMP_FSYNC_SPIN_ACQUIRED(spin);
+        return;
+    }
+    th_gtid = this_thr->th.th_info.ds.ds_gtid;
+    KA_TRACE(20, ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag));
+
+#if OMPT_SUPPORT && OMPT_BLAME
+    if (ompt_status == ompt_status_track_callback) {
+        if (this_thr->th.ompt_thread_info.state == ompt_state_idle){
+            if (ompt_callbacks.ompt_callback(ompt_event_idle_begin)) {
+                ompt_callbacks.ompt_callback(ompt_event_idle_begin)(th_gtid + 1);
+            }
+        } else if (ompt_callbacks.ompt_callback(ompt_event_wait_barrier_begin)) {
+            KMP_DEBUG_ASSERT(this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_explicit);
+
+            ompt_lw_taskteam_t* team = this_thr->th.th_team->t.ompt_serialized_team_info;
+            ompt_parallel_id_t pId;
+            ompt_task_id_t tId;
+            if (team){
+                pId = team->ompt_team_info.parallel_id;
+                tId = team->ompt_task_info.task_id;
+            } else {
+                pId = this_thr->th.th_team->t.ompt_team_info.parallel_id;
+                tId = this_thr->th.th_current_task->ompt_task_info.task_id;
+            }
+            ompt_callbacks.ompt_callback(ompt_event_wait_barrier_begin)(pId, tId);
+        }
+    }
+#endif
+
+    // Setup for waiting
+    KMP_INIT_YIELD(spins);
+
+    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+        // The worker threads cannot rely on the team struct existing at this point.
+        // Use the bt values cached in the thread struct instead.
+#ifdef KMP_ADJUST_BLOCKTIME
+        if (__kmp_zero_bt && !this_thr->th.th_team_bt_set)
+            // Force immediate suspend if not set by user and more threads than available procs
+            hibernate = 0;
+        else
+            hibernate = this_thr->th.th_team_bt_intervals;
+#else
+        hibernate = this_thr->th.th_team_bt_intervals;
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+        /* If the blocktime is nonzero, we want to make sure that we spin wait for the entirety
+           of the specified #intervals, plus up to one interval more.  This increment make
+           certain that this thread doesn't go to sleep too soon.  */
+        if (hibernate != 0)
+            hibernate++;
+
+        // Add in the current time value.
+        hibernate += TCR_4(__kmp_global.g.g_time.dt.t_value);
+        KF_TRACE(20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n",
+                      th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate,
+                      hibernate - __kmp_global.g.g_time.dt.t_value));
+    }
+    KMP_MB();
+
+    // Main wait spin loop
+    while (flag->notdone_check()) {
+        int in_pool;
+
+        /* If the task team is NULL, it means one of three things:
+           1) A newly-created thread is first being released by __kmp_fork_barrier(), and
+              its task team has not been set up yet.
+           2) All tasks have been executed to completion, this thread has decremented the task
+              team's ref ct and possibly deallocated it, and should no longer reference it.
+           3) Tasking is off for this region.  This could be because we are in a serialized region
+              (perhaps the outer one), or else tasking was manually disabled (KMP_TASKING=0).  */
+        kmp_task_team_t * task_team = NULL;
+        if (__kmp_tasking_mode != tskm_immediate_exec) {
+            task_team = this_thr->th.th_task_team;
+            if (task_team != NULL) {
+                if (!TCR_SYNC_4(task_team->tt.tt_active)) {
+                    KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid));
+                    __kmp_unref_task_team(task_team, this_thr);
+                } else if (KMP_TASKING_ENABLED(task_team)) {
+                    flag->execute_tasks(this_thr, th_gtid, final_spin, &tasks_completed
+                                        USE_ITT_BUILD_ARG(itt_sync_obj), 0);
+                }
+            } // if
+        } // if
+
+        KMP_FSYNC_SPIN_PREPARE(spin);
+        if (TCR_4(__kmp_global.g.g_done)) {
+            if (__kmp_global.g.g_abort)
+                __kmp_abort_thread();
+            break;
+        }
+
+        // If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput), then yield
+        KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
+        // TODO: Should it be number of cores instead of thread contexts? Like:
+        // KMP_YIELD(TCR_4(__kmp_nth) > __kmp_ncores);
+        // Need performance improvement data to make the change...
+        KMP_YIELD_SPIN(spins);
+
+        // Check if this thread was transferred from a team
+        // to the thread pool (or vice-versa) while spinning.
+        in_pool = !!TCR_4(this_thr->th.th_in_pool);
+        if (in_pool != !!this_thr->th.th_active_in_pool) {
+            if (in_pool) { // Recently transferred from team to pool
+                KMP_TEST_THEN_INC32((kmp_int32 *)&__kmp_thread_pool_active_nth);
+                this_thr->th.th_active_in_pool = TRUE;
+                /* Here, we cannot assert that:
+                   KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) <= __kmp_thread_pool_nth);
+                   __kmp_thread_pool_nth is inc/dec'd by the master thread while the fork/join
+                   lock is held, whereas __kmp_thread_pool_active_nth is inc/dec'd asynchronously
+                   by the workers.  The two can get out of sync for brief periods of time.  */
+            }
+            else { // Recently transferred from pool to team
+                KMP_TEST_THEN_DEC32((kmp_int32 *) &__kmp_thread_pool_active_nth);
+                KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
+                this_thr->th.th_active_in_pool = FALSE;
+            }
+        }
+
+        // Don't suspend if KMP_BLOCKTIME is set to "infinite"
+        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
+            continue;
+
+        // Don't suspend if there is a likelihood of new tasks being spawned.
+        if ((task_team != NULL) && TCR_4(task_team->tt.tt_found_tasks))
+            continue;
+
+        // If we have waited a bit more, fall asleep
+        if (TCR_4(__kmp_global.g.g_time.dt.t_value) < hibernate)
+            continue;
+
+        KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid));
+
+        flag->suspend(th_gtid);
+
+        if (TCR_4(__kmp_global.g.g_done)) {
+            if (__kmp_global.g.g_abort)
+                __kmp_abort_thread();
+            break;
+        }
+        // TODO: If thread is done with work and times out, disband/free
+    }
+
+#if OMPT_SUPPORT && OMPT_BLAME
+    if (ompt_status == ompt_status_track_callback) {
+        if (this_thr->th.ompt_thread_info.state == ompt_state_idle){
+            if (ompt_callbacks.ompt_callback(ompt_event_idle_end)) {
+                ompt_callbacks.ompt_callback(ompt_event_idle_end)(th_gtid + 1);
+            }
+        } else if (ompt_callbacks.ompt_callback(ompt_event_wait_barrier_end)) {
+            KMP_DEBUG_ASSERT(this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit ||
+                             this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_explicit);
+
+            ompt_lw_taskteam_t* team = this_thr->th.th_team->t.ompt_serialized_team_info;
+            ompt_parallel_id_t pId;
+            ompt_task_id_t tId;
+            if (team){
+                pId = team->ompt_team_info.parallel_id;
+                tId = team->ompt_task_info.task_id;
+            } else {
+                pId = this_thr->th.th_team->t.ompt_team_info.parallel_id;
+                tId = this_thr->th.th_current_task->ompt_task_info.task_id;
+            }
+            ompt_callbacks.ompt_callback(ompt_event_wait_barrier_end)(pId, tId);
+        }
+    }
+#endif
+
+    KMP_FSYNC_SPIN_ACQUIRED(spin);
+}
+
+/* Release any threads specified as waiting on the flag by releasing the flag and resuming the
+   waiting thread(s) if indicated by the sleep bit(s). A thread that calls __kmp_wait_template
+   must call this function to wake up the potentially sleeping thread and prevent deadlocks!  */
+template <class C>
+static inline void __kmp_release_template(C *flag)
+{
+#ifdef KMP_DEBUG
+    // FIX ME
+    kmp_info_t * wait_thr = flag->get_waiter(0);
+    int target_gtid = wait_thr->th.th_info.ds.ds_gtid;
+    int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
+#endif
+    KF_TRACE(20, ("__kmp_release: T#%d releasing T#%d spin(%p)\n", gtid, target_gtid, flag->get()));
+    KMP_DEBUG_ASSERT(flag->get());
+    KMP_FSYNC_RELEASING(flag->get());
+
+    typename C::flag_t old_spin = flag->internal_release();
+
+    KF_TRACE(100, ("__kmp_release: T#%d old spin(%p)=%d, set new spin=%d\n",
+                   gtid, flag->get(), old_spin, *(flag->get())));
+
+    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+        // Only need to check sleep stuff if infinite block time not set
+        if (flag->is_sleeping_val(old_spin)) {
+            for (unsigned int i=0; i<flag->get_num_waiters(); ++i) {
+                kmp_info_t * waiter = flag->get_waiter(i);
+                int wait_gtid = waiter->th.th_info.ds.ds_gtid;
+                // Wake up thread if needed
+                KF_TRACE(50, ("__kmp_release: T#%d waking up thread T#%d since sleep spin(%p) set\n",
+                              gtid, wait_gtid, flag->get()));
+                flag->resume(wait_gtid);
+            }
+        } else {
+            KF_TRACE(50, ("__kmp_release: T#%d don't wake up thread T#%d since sleep spin(%p) not set\n",
+                          gtid, target_gtid, flag->get()));
+        }
+    }
+}
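+
+/* Illustrative wait/release pairing (a sketch; this_thr and waiter_thr are assumed thread
+   variables, and kmp_flag_64 with the b_go location follows the classes defined below): */
+#if 0   /* example only */
+    /* Waiter side: block until the b_go flag is bumped to the release value. */
+    kmp_flag_64 flag( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_go,
+                      (kmp_uint64) KMP_BARRIER_STATE_BUMP );
+    flag.wait( this_thr, TRUE USE_ITT_BUILD_ARG( NULL ) );
+
+    /* Releaser side: bump the same flag and resume the waiter if it went to sleep. */
+    kmp_flag_64 go( &waiter_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, waiter_thr );
+    go.release();
+#endif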
+
+template <typename FlagType>
+struct flag_traits {};
+
+template <>
+struct flag_traits<kmp_uint32> {
+    typedef kmp_uint32 flag_t;
+    static const flag_type t = flag32;
+    static inline flag_t tcr(flag_t f) { return TCR_4(f); }
+    static inline flag_t test_then_add4(volatile flag_t *f) { return KMP_TEST_THEN_ADD4_32((volatile kmp_int32 *)f); }
+    static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_OR32((volatile kmp_int32 *)f, v); }
+    static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_AND32((volatile kmp_int32 *)f, v); }
+};
+
+template <>
+struct flag_traits<kmp_uint64> {
+    typedef kmp_uint64 flag_t;
+    static const flag_type t = flag64;
+    static inline flag_t tcr(flag_t f) { return TCR_8(f); }
+    static inline flag_t test_then_add4(volatile flag_t *f) { return KMP_TEST_THEN_ADD4_64((volatile kmp_int64 *)f); }
+    static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_OR64((volatile kmp_int64 *)f, v); }
+    static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_AND64((volatile kmp_int64 *)f, v); }
+};
+
+template <typename FlagType>
+class kmp_basic_flag : public kmp_flag<FlagType> {
+    typedef flag_traits<FlagType> traits_type;
+    FlagType checker;  /**< Value to compare the flag against to check whether it has been released. */
+    kmp_info_t * waiting_threads[1];  /**< Array of threads sleeping on this flag. */
+    kmp_uint32 num_waiting_threads;       /**< Number of threads sleeping on this flag. */
+public:
+    kmp_basic_flag(volatile FlagType *p) : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(0) {}
+    kmp_basic_flag(volatile FlagType *p, kmp_info_t *thr) : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(1) {
+        waiting_threads[0] = thr; 
+    }
+    kmp_basic_flag(volatile FlagType *p, FlagType c) : kmp_flag<FlagType>(p, traits_type::t), checker(c), num_waiting_threads(0) {}
+    /*!
+     * @param i in   index into waiting_threads
+     * @result the thread that is waiting at index i
+     */
+    kmp_info_t * get_waiter(kmp_uint32 i) { 
+        KMP_DEBUG_ASSERT(i<num_waiting_threads);
+        return waiting_threads[i]; 
+    }
+    /*!
+     * @result num_waiting_threads
+     */
+    kmp_uint32 get_num_waiters() { return num_waiting_threads; }
+    /*!
+     * @param thr in   the thread which is now waiting
+     *
+     * Insert a waiting thread at index 0.
+     */
+    void set_waiter(kmp_info_t *thr) { 
+        waiting_threads[0] = thr; 
+        num_waiting_threads = 1;
+    }
+    /*!
+     * @result true if the flag object has been released.
+     */
+    bool done_check() { return traits_type::tcr(*(this->get())) == checker; }
+    /*!
+     * @param old_loc in   old value of flag
+     * @result true if the flag's old value indicates it was released.
+     */
+    bool done_check_val(FlagType old_loc) { return old_loc == checker; }
+    /*!
+     * @result true if the flag object is not yet released.
+     * Used in __kmp_wait_template like:
+     * @code
+     * while (flag.notdone_check()) { pause(); }
+     * @endcode
+     */
+    bool notdone_check() { return traits_type::tcr(*(this->get())) != checker; }
+    /*!
+     * @result Actual flag value before release was applied.
+     * Trigger all waiting threads to run by modifying flag to release state.
+     */
+    FlagType internal_release() {
+        return traits_type::test_then_add4((volatile FlagType *)this->get());
+    }
+    /*!
+     * @result Actual flag value before sleep bit(s) set.
+     * Notes that there is at least one thread sleeping on the flag by setting sleep bit(s).
+     */
+    FlagType set_sleeping() { 
+        return traits_type::test_then_or((volatile FlagType *)this->get(), KMP_BARRIER_SLEEP_STATE);
+    }
+    /*!
+     * @result Actual flag value before sleep bit(s) cleared.
+     * Notes that there are no longer threads sleeping on the flag by clearing sleep bit(s).
+     */
+    FlagType unset_sleeping() { 
+        return traits_type::test_then_and((volatile FlagType *)this->get(), ~KMP_BARRIER_SLEEP_STATE);
+    }
+    /*! 
+     * @param old_loc in   old value of flag
+     * Test whether there are threads sleeping on the flag's old value in old_loc.
+     */
+    bool is_sleeping_val(FlagType old_loc) { return old_loc & KMP_BARRIER_SLEEP_STATE; }
+    /*! 
+     * Test whether there are threads sleeping on the flag.
+     */
+    bool is_sleeping() { return is_sleeping_val(*(this->get())); }
+};
+
+class kmp_flag_32 : public kmp_basic_flag<kmp_uint32> {
+public:
+    kmp_flag_32(volatile kmp_uint32 *p) : kmp_basic_flag<kmp_uint32>(p) {}
+    kmp_flag_32(volatile kmp_uint32 *p, kmp_info_t *thr) : kmp_basic_flag<kmp_uint32>(p, thr) {}
+    kmp_flag_32(volatile kmp_uint32 *p, kmp_uint32 c) : kmp_basic_flag<kmp_uint32>(p, c) {}
+    void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); }
+    void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); }
+    int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished
+                      USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained) {
+        return __kmp_execute_tasks_32(this_thr, gtid, this, final_spin, thread_finished
+                                      USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+    }
+    void wait(kmp_info_t *this_thr, int final_spin
+              USE_ITT_BUILD_ARG(void * itt_sync_obj)) {
+        __kmp_wait_template(this_thr, this, final_spin
+                            USE_ITT_BUILD_ARG(itt_sync_obj));
+    }
+    void release() { __kmp_release_template(this); }
+};
+
+class kmp_flag_64 : public kmp_basic_flag<kmp_uint64> {
+public:
+    kmp_flag_64(volatile kmp_uint64 *p) : kmp_basic_flag<kmp_uint64>(p) {}
+    kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr) : kmp_basic_flag<kmp_uint64>(p, thr) {}
+    kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c) : kmp_basic_flag<kmp_uint64>(p, c) {}
+    void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); }
+    void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); }
+    int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished
+                      USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained) {
+        return __kmp_execute_tasks_64(this_thr, gtid, this, final_spin, thread_finished
+                                      USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+    }
+    void wait(kmp_info_t *this_thr, int final_spin
+              USE_ITT_BUILD_ARG(void * itt_sync_obj)) {
+        __kmp_wait_template(this_thr, this, final_spin
+                            USE_ITT_BUILD_ARG(itt_sync_obj));
+    }
+    void release() { __kmp_release_template(this); }
+};
+
+// Hierarchical 64-bit on-core barrier instantiation
+class kmp_flag_oncore : public kmp_flag<kmp_uint64> {
+    kmp_uint64 checker;
+    kmp_info_t * waiting_threads[1];
+    kmp_uint32 num_waiting_threads;
+    kmp_uint32 offset;      /**< Portion of flag that is of interest for an operation. */
+    bool flag_switch;       /**< Indicates a switch in flag location. */
+    enum barrier_type bt;   /**< Barrier type. */
+    kmp_info_t * this_thr;  /**< Thread that may be redirected to different flag location. */
+#if USE_ITT_BUILD
+    void *itt_sync_obj;     /**< ITT object that must be passed to new flag location. */
+#endif
+    unsigned char& byteref(volatile kmp_uint64* loc, size_t offset) { return ((unsigned char *)loc)[offset]; }
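+    /* Illustration (offset value assumed): byteref() views the 64-bit flag as 8 per-thread
+       bytes; with offset == 2, byteref( &mask, 2 ) = 1 sets only byte 2 of mask
+       (0x0000000000010000 on a little-endian target). */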
+public:
+    kmp_flag_oncore(volatile kmp_uint64 *p)
+        : kmp_flag<kmp_uint64>(p, flag_oncore), num_waiting_threads(0), flag_switch(false) {}
+    kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx)
+        : kmp_flag<kmp_uint64>(p, flag_oncore), num_waiting_threads(0), offset(idx), flag_switch(false) {}
+    kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx, enum barrier_type bar_t,
+                    kmp_info_t * thr
+#if USE_ITT_BUILD
+                    , void *itt
+#endif
+                    ) 
+        : kmp_flag<kmp_uint64>(p, flag_oncore), checker(c), num_waiting_threads(0), offset(idx),
+          flag_switch(false), bt(bar_t), this_thr(thr)
+#if USE_ITT_BUILD
+        , itt_sync_obj(itt)
+#endif
+        {}
+    kmp_info_t * get_waiter(kmp_uint32 i) { 
+        KMP_DEBUG_ASSERT(i<num_waiting_threads);
+        return waiting_threads[i]; 
+    }
+    kmp_uint32 get_num_waiters() { return num_waiting_threads; }
+    void set_waiter(kmp_info_t *thr) { 
+        waiting_threads[0] = thr; 
+        num_waiting_threads = 1;
+    }
+    bool done_check_val(kmp_uint64 old_loc) { return byteref(&old_loc,offset) == checker; }
+    bool done_check() { return done_check_val(*get()); }
+    bool notdone_check() { 
+        // Calculate flag_switch
+        if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG)
+            flag_switch = true;
+        if (byteref(get(),offset) != 1 && !flag_switch) 
+            return true;
+        else if (flag_switch) {
+            this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING;
+            kmp_flag_64 flag(&this_thr->th.th_bar[bt].bb.b_go, (kmp_uint64)KMP_BARRIER_STATE_BUMP);
+            __kmp_wait_64(this_thr, &flag, TRUE
+#if USE_ITT_BUILD
+                          , itt_sync_obj
+#endif
+                          );
+        }
+        return false;
+    }
+    kmp_uint64 internal_release() { 
+        kmp_uint64 old_val;
+        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
+            old_val = *get();
+            byteref(get(),offset) = 1;
+        }
+        else {
+            kmp_uint64 mask=0;
+            byteref(&mask,offset) = 1;
+            old_val = KMP_TEST_THEN_OR64((volatile kmp_int64 *)get(), mask);
+        }
+        return old_val;
+    }
+    kmp_uint64 set_sleeping() { 
+        return KMP_TEST_THEN_OR64((kmp_int64 volatile *)get(), KMP_BARRIER_SLEEP_STATE);
+    }
+    kmp_uint64 unset_sleeping() { 
+        return KMP_TEST_THEN_AND64((kmp_int64 volatile *)get(), ~KMP_BARRIER_SLEEP_STATE);
+    }
+    bool is_sleeping_val(kmp_uint64 old_loc) { return old_loc & KMP_BARRIER_SLEEP_STATE; }
+    bool is_sleeping() { return is_sleeping_val(*get()); }
+    void wait(kmp_info_t *this_thr, int final_spin
+              USE_ITT_BUILD_ARG(void * itt_sync_obj)) {
+        __kmp_wait_template(this_thr, this, final_spin
+                            USE_ITT_BUILD_ARG(itt_sync_obj));
+    }
+    void release() { __kmp_release_template(this); }
+    void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); }
+    void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); }
+    int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished
+                      USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained) {
+        return __kmp_execute_tasks_oncore(this_thr, gtid, this, final_spin, thread_finished
+                                      USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+    }
+};
+
+/*!
+@}
+*/
+
+#endif // KMP_WAIT_RELEASE_H
diff --git a/final/runtime/src/kmp_wrapper_getpid.h b/final/runtime/src/kmp_wrapper_getpid.h
new file mode 100644
index 0000000..61a046c
--- /dev/null
+++ b/final/runtime/src/kmp_wrapper_getpid.h
@@ -0,0 +1,56 @@
+/*
+ * kmp_wrapper_getpid.h -- getpid() declaration.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_WRAPPER_GETPID_H
+#define KMP_WRAPPER_GETPID_H
+
+#if KMP_OS_UNIX
+
+    // On Unix-like systems (Linux* OS and OS X*) getpid() is declared in standard headers.
+    #include <sys/types.h>
+    #include <unistd.h>
+
+#elif KMP_OS_WINDOWS
+
+    // On Windows* OS _getpid() returns int (not pid_t) and is declared in "process.h".
+    #include <process.h>
+    // Let us simulate Unix.
+    typedef int pid_t;
+    #define getpid _getpid
+
+#else
+
+    #error Unknown or unsupported OS.
+
+#endif
+
+/*
+    TODO: All the libomp source code uses the pid_t type for storing the result of getpid(), which
+    is good. But it is often printed as "%d", which is not good, because that ignores the pid_t
+    definition (pid_t may be longer than int). It seems all pid prints should be rewritten as
+
+        printf( "%" KMP_UINT64_SPEC, (kmp_uint64) pid );
+
+    or (at least) as
+
+        printf( "%" KMP_UINT32_SPEC, (kmp_uint32) pid );
+
+    (kmp_uint32, kmp_uint64, KMP_UINT64_SPEC, and KMP_UINT32_SPEC are defined in "kmp_os.h".)
+
+*/
+
+#endif // KMP_WRAPPER_GETPID_H
+
+// end of file //
diff --git a/final/runtime/src/kmp_wrapper_malloc.h b/final/runtime/src/kmp_wrapper_malloc.h
new file mode 100644
index 0000000..58a7ab7
--- /dev/null
+++ b/final/runtime/src/kmp_wrapper_malloc.h
@@ -0,0 +1,205 @@
+/*
+ * kmp_wrapper_malloc.h -- Wrappers for memory allocation routines
+ *                         (malloc(), free(), and others).
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_WRAPPER_MALLOC_H
+#define KMP_WRAPPER_MALLOC_H
+
+/*
+    This header serves three purposes:
+
+        1. Declaring standard memory allocation routines in an OS-independent way.
+        2. Passing source location info through memory allocation wrappers.
+        3. Enabling native memory debugging capabilities.
+
+
+    1. Declaring standard memory allocation routines in an OS-independent way.
+    ---------------------------------------------------------------------------
+
+    On Linux* OS, the alloca() function is declared in the <alloca.h> header, while on Windows* OS
+    there is no <alloca.h> header; the function _alloca() (note the underscore!) is declared in
+    <malloc.h>. This header eliminates these differences, so client code including
+    "kmp_wrapper_malloc.h" can rely on the following routines:
+
+        malloc
+        calloc
+        realloc
+        free
+        alloca
+
+    in an OS-independent way. It also enables memory tracking capabilities in debug builds.
+    (Currently this is available only on Windows* OS.)
+
+
+    2. Passing source location info through memory allocation wrappers.
+    -------------------------------------------------------------------
+
+    Some tools may help with debugging memory errors, for example, by reporting memory leaks.
+    However, memory allocation wrappers may obscure the original source location.
+
+    For example:
+
+        void * aligned_malloc( int size ) {
+            void * ptr = malloc( size ); // All the memory leaks will be reported at this line.
+            // some adjustments...
+            return ptr;
+        };
+
+        ptr = aligned_malloc( size );    // Memory leak will *not* be detected here. :-(
+
+    To overcome the problem, information about the original source location should be passed
+    through all the memory allocation wrappers, for example:
+
+        void * aligned_malloc( int size, char const * file, int line ) {
+            void * ptr = _malloc_dbg( size, file, line );
+            // some adjustments...
+            return ptr;
+        };
+
+        void * ptr = aligned_malloc( size, __FILE__, __LINE__ );
+
+    This is a good idea for debugging, but passing additional arguments impacts performance.
+    Disabling the extra arguments in the release version of the software would introduce too much
+    conditional compilation, which makes the code unreadable. This header defines a few macros and
+    functions to facilitate this:
+
+        void * _aligned_malloc( int size KMP_SRC_LOC_DECL ) {
+            void * ptr = malloc_src_loc( size KMP_SRC_LOC_PARM );
+            // some adjustments...
+            return ptr;
+        };
+        #define aligned_malloc( size ) _aligned_malloc( (size) KMP_SRC_LOC_CURR )
+            // Use macro instead of direct call to function.
+
+        void * ptr = aligned_malloc( size );  // Bingo! Memory leak will be reported at this line.
+
+
+    3. Enabling native memory debugging capabilities.
+    -------------------------------------------------
+
+    Some platforms may offer memory debugging capabilities. For example, the debug version of the
+    Microsoft RTL tracks all memory allocations and can report memory leaks. This header enables
+    this, and makes the report more useful (see "Passing source location info through memory
+    allocation wrappers").
+
+*/
+
+#include <stdlib.h>
+
+#include "kmp_os.h"
+
+// Include alloca() declaration.
+#if KMP_OS_WINDOWS
+    #include <malloc.h>        // Windows* OS: _alloca() declared in "malloc.h".
+    #define alloca _alloca     // Allow use of alloca() with no underscore.
+#elif KMP_OS_FREEBSD
+    // Declared in "stdlib.h".
+#elif KMP_OS_UNIX
+    #include <alloca.h>        // Linux* OS and OS X*: alloca() declared in <alloca.h>.
+#else
+    #error Unknown or unsupported OS.
+#endif
+
+/*
+    KMP_SRC_LOC_DECL -- Declares source location parameters, to be used in a function declaration.
+    KMP_SRC_LOC_PARM -- Source location parameters, to be used to pass them on to underlying
+        levels.
+    KMP_SRC_LOC_CURR -- Source location arguments describing the current location, to be used at
+        the top level.
+
+    Typical usage:
+
+        void * _aligned_malloc( int size KMP_SRC_LOC_DECL ) {
+            // Note: there is no comma before KMP_SRC_LOC_DECL -- the macro expansion begins with one.
+            KE_TRACE( 25, ( "called from %s:%d\n", KMP_SRC_LOC_PARM ) );
+            ...
+        }
+        #define aligned_malloc( size ) _aligned_malloc( (size) KMP_SRC_LOC_CURR )
+            // Use macro instead of direct call to function -- macro passes info about current
+            // source location to the func.
+*/
+#if KMP_DEBUG
+    #define KMP_SRC_LOC_DECL    , char const * _file_, int _line_
+    #define KMP_SRC_LOC_PARM    , _file_, _line_
+    #define KMP_SRC_LOC_CURR    , __FILE__, __LINE__
+#else
+    #define KMP_SRC_LOC_DECL
+    #define KMP_SRC_LOC_PARM
+    #define KMP_SRC_LOC_CURR
+#endif // KMP_DEBUG
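+
+/*
+    Illustration (a sketch of the expansion, following the definitions above): in a debug build
+
+        void * _aligned_malloc( int size KMP_SRC_LOC_DECL )
+
+    expands to
+
+        void * _aligned_malloc( int size , char const * _file_, int _line_ )
+
+    while in a release build all three KMP_SRC_LOC_* macros expand to nothing.
+*/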
+
+/*
+    malloc_src_loc() and free_src_loc() are pseudo-functions (really macros) which accept extra
+    arguments (source location info) in debug mode. They should be used in place of malloc() and
+    free(); this allows enabling native memory debugging capabilities (if any).
+
+    Typical usage:
+
+        ptr = malloc_src_loc( size KMP_SRC_LOC_PARM );
+            // Inside memory allocation wrapper, or
+        ptr = malloc_src_loc( size KMP_SRC_LOC_CURR );
+            // Outside of memory allocation wrapper.
+
+
+*/
+#define malloc_src_loc( args )    _malloc_src_loc( args )
+#define free_src_loc(   args )    _free_src_loc(   args )
+    /*
+        Depending on build mode (debug or release), malloc_src_loc is declared with 1 or 3
+        parameters, but calls to malloc_src_loc() are always the same:
+
+            ... malloc_src_loc( size KMP_SRC_LOC_PARM ); // or KMP_SRC_LOC_CURR
+
+        Otherwise the compiler would issue a "too few arguments in macro invocation" warning or
+        error. Declaring two macros, malloc_src_loc() and _malloc_src_loc(), overcomes the problem.
+    */
+
+#if KMP_DEBUG
+
+    #if KMP_OS_WINDOWS && _DEBUG
+        // KMP_DEBUG != _DEBUG. MS debug RTL is available only if _DEBUG is defined.
+
+        // Windows* OS has native memory debugging capabilities. Enable them.
+
+        #include <crtdbg.h>
+
+        #define KMP_MEM_BLOCK           _CLIENT_BLOCK
+        #define malloc( size )          _malloc_dbg( (size), KMP_MEM_BLOCK, __FILE__, __LINE__ )
+        #define calloc( num, size )     _calloc_dbg( (num), (size), KMP_MEM_BLOCK, __FILE__, __LINE__ )
+        #define realloc( ptr, size )    _realloc_dbg( (ptr), (size), KMP_MEM_BLOCK, __FILE__, __LINE__ )
+        #define free( ptr )             _free_dbg( (ptr), KMP_MEM_BLOCK )
+
+        #define _malloc_src_loc( size, file, line )  _malloc_dbg( (size), KMP_MEM_BLOCK, (file), (line) )
+        #define _free_src_loc(    ptr, file, line )  _free_dbg(   (ptr),  KMP_MEM_BLOCK                 )
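+
+        /*
+            A minimal sketch of what the mappings above enable (illustrative only, not part of
+            the library): blocks allocated through _malloc_dbg() and never freed are reported at
+            exit with the allocating file and line.
+
+                #include <crtdbg.h>
+
+                int main( void ) {
+                    _CrtSetDbgFlag( _CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF );
+                    void * p = _malloc_dbg( 16, _CLIENT_BLOCK, __FILE__, __LINE__ );
+                    (void) p;    // Leaked on purpose -- the leak report shows this file/line.
+                    return 0;
+                }
+        */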
+
+    #else
+
+        // Linux* OS, OS X*, or non-debug Windows* OS.
+
+        #define _malloc_src_loc( size, file, line )    malloc( (size) )
+        #define _free_src_loc( ptr, file, line )       free( (ptr) )
+
+    #endif
+
+#else
+
+    // In a release build, malloc_src_loc() and free_src_loc() do not have extra parameters.
+    #define _malloc_src_loc( size )    malloc( (size) )
+    #define _free_src_loc( ptr )       free( (ptr) )
+
+#endif // KMP_DEBUG
+
+#endif // KMP_WRAPPER_MALLOC_H
+
+// end of file //
diff --git a/final/runtime/src/libiomp.rc.var b/final/runtime/src/libiomp.rc.var
new file mode 100644
index 0000000..c5f81e8
--- /dev/null
+++ b/final/runtime/src/libiomp.rc.var
@@ -0,0 +1,70 @@
+// libiomp.rc.var
+
+//
+////===----------------------------------------------------------------------===//
+////
+////                     The LLVM Compiler Infrastructure
+////
+//// This file is dual licensed under the MIT and the University of Illinois Open
+//// Source Licenses. See LICENSE.txt for details.
+////
+////===----------------------------------------------------------------------===//
+//
+
+#include "winres.h"
+
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US    // English (U.S.) resources
+#pragma code_page(1252)
+
+VS_VERSION_INFO VERSIONINFO
+    // Parts of FILEVERSION and PRODUCTVERSION are 16-bit fields; the entire build date yyyymmdd
+    // does not fit into one version part, so we need to split it into yyyy and mmdd:
+    FILEVERSION    $KMP_VERSION_MAJOR,$KMP_VERSION_MINOR,${{ our $KMP_VERSION_BUILD; int( $KMP_VERSION_BUILD / 10000 ) . "," . ( $KMP_VERSION_BUILD % 10000 ) }}
+    PRODUCTVERSION $KMP_VERSION_MAJOR,$KMP_VERSION_MINOR,${{ our $KMP_VERSION_BUILD; int( $KMP_VERSION_BUILD / 10000 ) . "," . ( $KMP_VERSION_BUILD % 10000 ) }}
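+    // For example (illustrative value): KMP_VERSION_BUILD = 20150827 splits into
+    // 20150827 / 10000 = 2015 and 20150827 % 10000 = 827, i.e. version parts "2015,827".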
+    FILEFLAGSMASK  VS_FFI_FILEFLAGSMASK
+    FILEFLAGS      0
+    #if $KMP_DIAG || $KMP_DEBUG_INFO
+        | VS_FF_DEBUG
+    #endif
+    #if $KMP_VERSION_BUILD == 0
+        | VS_FF_PRIVATEBUILD | VS_FF_PRERELEASE
+    #endif
+    FILEOS          VOS_NT_WINDOWS32    // Windows* Server* 2003, XP*, 2000, or NT*
+    FILETYPE        VFT_DLL
+    BEGIN
+        BLOCK "StringFileInfo"
+        BEGIN
+            BLOCK "040904b0"            // U.S. English, Unicode (0x04b0 == 1200)
+            BEGIN
+
+                // FileDescription and LegalCopyright should be short.
+                VALUE "FileDescription",  "Intel(R) OpenMP* Runtime Library${{ our $MESSAGE_CATALOG; $MESSAGE_CATALOG ? " Message Catalog" : "" }}\0"
+                // The following values may be relatively long.
+                VALUE "CompanyName",      "Intel Corporation\0"
+                // VALUE "LegalTrademarks",  "\0"  // Not used for now.
+                VALUE "ProductName",      "Intel(R) OpenMP* Runtime Library\0"
+                VALUE "ProductVersion",   "$KMP_VERSION_MAJOR.$KMP_VERSION_MINOR\0"
+                VALUE "FileVersion",      "$KMP_VERSION_BUILD\0"
+                VALUE "InternalName",     "$KMP_FILE\0"
+                VALUE "OriginalFilename", "$KMP_FILE\0"
+                VALUE "Comments",
+                    "Intel(R) OpenMP* ${{ our ( $MESSAGE_CATALOG, $KMP_TYPE ); $MESSAGE_CATALOG ? "Runtime Library Message Catalog" : "$KMP_TYPE Library" }} "
+                    "version $KMP_VERSION_MAJOR.$KMP_VERSION_MINOR.$KMP_VERSION_BUILD "
+                    "for $KMP_ARCH architecture built on $KMP_BUILD_DATE.\0"
+                #if $KMP_VERSION_BUILD == 0
+                    VALUE "PrivateBuild",
+                        "This is a development build for internal testing purposes only. "
+                        "Do not distribute it outside of Intel.\0"
+                #endif
+                // VALUE "SpecialBuild",     "\0"    // Not used for now.
+
+            END
+        END
+        BLOCK "VarFileInfo"
+        BEGIN
+            VALUE "Translation", ${{ our ( $MESSAGE_CATALOG, $LANGUAGE ); $MESSAGE_CATALOG ? $LANGUAGE : 1033 }}, 1200
+            // 1033 -- U.S. English, 1200 -- Unicode
+        END
+    END
+
+// end of file //
diff --git a/final/runtime/src/libomp.rc.var b/final/runtime/src/libomp.rc.var
new file mode 100644
index 0000000..f78a6d1
--- /dev/null
+++ b/final/runtime/src/libomp.rc.var
@@ -0,0 +1,70 @@
+// libomp.rc.var
+
+//
+////===----------------------------------------------------------------------===//
+////
+////                     The LLVM Compiler Infrastructure
+////
+//// This file is dual licensed under the MIT and the University of Illinois Open
+//// Source Licenses. See LICENSE.txt for details.
+////
+////===----------------------------------------------------------------------===//
+//
+
+#include "winres.h"
+
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US    // English (U.S.) resources
+#pragma code_page(1252)
+
+VS_VERSION_INFO VERSIONINFO
+    // Parts of FILEVERSION and PRODUCTVERSION are 16-bit fields; the entire build date yyyymmdd
+    // does not fit into one version part, so we need to split it into yyyy and mmdd:
+    FILEVERSION    $KMP_VERSION_MAJOR,$KMP_VERSION_MINOR,${{ our $KMP_VERSION_BUILD; int( $KMP_VERSION_BUILD / 10000 ) . "," . ( $KMP_VERSION_BUILD % 10000 ) }}
+    PRODUCTVERSION $KMP_VERSION_MAJOR,$KMP_VERSION_MINOR,${{ our $KMP_VERSION_BUILD; int( $KMP_VERSION_BUILD / 10000 ) . "," . ( $KMP_VERSION_BUILD % 10000 ) }}
+    FILEFLAGSMASK  VS_FFI_FILEFLAGSMASK
+    FILEFLAGS      0
+    #if $KMP_DIAG || $KMP_DEBUG_INFO
+        | VS_FF_DEBUG
+    #endif
+    #if $KMP_VERSION_BUILD == 0
+        | VS_FF_PRIVATEBUILD | VS_FF_PRERELEASE
+    #endif
+    FILEOS          VOS_NT_WINDOWS32    // Windows* Server* 2003, XP*, 2000, or NT*
+    FILETYPE        VFT_DLL
+    BEGIN
+        BLOCK "StringFileInfo"
+        BEGIN
+            BLOCK "040904b0"            // U.S. English, Unicode (0x04b0 == 1200)
+            BEGIN
+
+                // FileDescription and LegalCopyright should be short.
+                VALUE "FileDescription",  "LLVM* OpenMP* Runtime Library${{ our $MESSAGE_CATALOG; $MESSAGE_CATALOG ? " Message Catalog" : "" }}\0"
+                // The following values may be relatively long.
+                VALUE "CompanyName",      "Intel Corporation\0"
+                // VALUE "LegalTrademarks",  "\0"  // Not used for now.
+                VALUE "ProductName",      "LLVM* OpenMP* Runtime Library\0"
+                VALUE "ProductVersion",   "$KMP_VERSION_MAJOR.$KMP_VERSION_MINOR\0"
+                VALUE "FileVersion",      "$KMP_VERSION_BUILD\0"
+                VALUE "InternalName",     "$KMP_FILE\0"
+                VALUE "OriginalFilename", "$KMP_FILE\0"
+                VALUE "Comments",
+                    "LLVM* OpenMP* ${{ our ( $MESSAGE_CATALOG, $KMP_TYPE ); $MESSAGE_CATALOG ? "Runtime Library Message Catalog" : "$KMP_TYPE Library" }} "
+                    "version $KMP_VERSION_MAJOR.$KMP_VERSION_MINOR.$KMP_VERSION_BUILD "
+                    "for $KMP_ARCH architecture built on $KMP_BUILD_DATE.\0"
+                #if $KMP_VERSION_BUILD == 0
+                    VALUE "PrivateBuild",
+                        "This is a development build for internal testing purposes only. "
+                        "Do not distribute it outside of Intel.\0"
+                #endif
+                // VALUE "SpecialBuild",     "\0"    // Not used for now.
+
+            END
+        END
+        BLOCK "VarFileInfo"
+        BEGIN
+            VALUE "Translation", ${{ our ( $MESSAGE_CATALOG, $LANGUAGE ); $MESSAGE_CATALOG ? $LANGUAGE : 1033 }}, 1200
+            // 1033 -- U.S. English, 1200 -- Unicode
+        END
+    END
+
+// end of file //
diff --git a/final/runtime/src/makefile.mk b/final/runtime/src/makefile.mk
new file mode 100644
index 0000000..0210102
--- /dev/null
+++ b/final/runtime/src/makefile.mk
@@ -0,0 +1,1565 @@
+# makefile.mk #
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Check and normalize LIBOMP_WORK.
+# This piece of code is common, but it cannot be moved to a common file.
+ifeq "$(LIBOMP_WORK)" ""
+    $(error LIBOMP_WORK environment variable must be set)
+endif
+ifneq "$(words $(LIBOMP_WORK))" "1"
+    $(error LIBOMP_WORK must not contain spaces)
+endif
+override LIBOMP_WORK := $(subst \,/,$(LIBOMP_WORK))
+ifeq "$(filter %/,$(LIBOMP_WORK))" ""
+    override LIBOMP_WORK := $(LIBOMP_WORK)/
+endif
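+# For example (illustrative path): LIBOMP_WORK=C:\libomp is normalized to "C:/libomp/" -- the
+# backslash becomes a forward slash and a trailing slash is appended if it is missing.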
+
+# Include definitions common for RTL and DSL.
+include $(LIBOMP_WORK)src/defs.mk
+
+src_dir      = $(LIBOMP_WORK)src/
+inc_dir      = $(LIBOMP_WORK)src/include/$(OMP_VERSION)/
+
+# --------------------------------------------------------------------------------------------------
+# Configuration options.
+# --------------------------------------------------------------------------------------------------
+
+# Build compiler
+BUILD_COMPILER := $(call check_variable,BUILD_COMPILER,icc gcc clang icl icl.exe)
+# Distribution type: com (commercial) or oss (open-source)
+DISTRIBUTION  := $(call check_variable,DISTRIBUTION,com oss)
+
+ifeq "$(c)" ""
+    c = $(BUILD_COMPILER)
+    ifeq "$(os)" "win"
+        c = icl.exe
+    endif
+endif
+ifeq "$(dist)" ""
+    dist = $(DISTRIBUTION)
+endif
+ifeq "$(dist)" ""
+    dist = com
+endif
+
+# Compile all C files as C++ source.
+CPLUSPLUS    := $(call check_variable,CPLUSPLUS,on)
+# Turn on instrumentation for code coverage.
+COVERAGE     := $(call check_variable,COVERAGE,off on)
+# Instruct compiler to emit debug information.
+DEBUG_INFO   := $(call check_variable,DEBUG_INFO,on off)
+# Turn on debug support in library code, assertions and traces.
+DIAG         := $(call check_variable,DIAG,on off)
+LIB_TYPE     := $(call check_variable,LIB_TYPE,norm prof stub)
+# Type of library: dynamic or static linking.
+LINK_TYPE    := $(call check_variable,LINK_TYPE,dyna stat)
+# Supported OpenMP version: 2.5, 3.0, 4.0, or 4.1.
+OMP_VERSION  := $(call check_variable,OMP_VERSION,41 40 30 25)
+# Generate optimized code.
+OPTIMIZATION := $(call check_variable,OPTIMIZATION,off on)
+# Library version: 4 -- legacy, 5 -- compat.
+VERSION      := $(call check_variable,VERSION,5 4)
+# quad precision floating point
+HAVE_QUAD     = 1
+
+VPATH += $(src_dir)
+VPATH += $(src_dir)i18n/
+VPATH += $(inc_dir)
+VPATH += $(src_dir)thirdparty/ittnotify/
+
+
+# Define config.
+define curr_config
+    CPLUSPLUS=$(CPLUSPLUS)
+    COVERAGE=$(COVERAGE)
+    DEBUG_INFO=$(DEBUG_INFO)
+    DIAG=$(DIAG)
+    LIB_TYPE=$(LIB_TYPE)
+    LINK_TYPE=$(LINK_TYPE)
+    OMP_VERSION=$(OMP_VERSION)
+    OPTIMIZATION=$(OPTIMIZATION)
+    VERSION=$(VERSION)
+    CPPFLAGS=$(subst $(space),_,$(CPPFLAGS))
+    CFLAGS=$(subst $(space),_,$(CFLAGS))
+    CXXFLAGS=$(subst $(space),_,$(CXXFLAGS))
+    FFLAGS=$(subst $(space),_,$(FFLAGS))
+    LDFLAGS=$(subst $(space),_,$(LDFLAGS))
+    OMPT_SUPPORT=$(OMPT_SUPPORT)
+    OMPT_BLAME=$(OMPT_BLAME)
+    OMPT_TRACE=$(OMPT_TRACE)
+endef
+# And check it.
+include $(tools_dir)src/common-checks.mk
+
+# Function to convert LIB_TYPE to printable one.
+legal_type = $(if $(filter norm,$(LIB_TYPE)),Performance,$(if $(filter prof,$(LIB_TYPE)),Profiling,Stub))
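+# For example, LIB_TYPE=norm maps to "Performance", prof maps to "Profiling", and stub maps to "Stub".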
+
+# Check the OS X* version (we need it to decide which tool to use for object accumulation)
+ifeq "$(os)" "mac"
+    mac_os_new := $(shell /bin/sh -c 'if [[ `sw_vers -productVersion` > 10.6 ]]; then echo "1"; else echo "0"; fi')
+endif
+
+
+# --------------------------------------------------------------------------------------------------
+# Dev tools and general options (like -fpic, -O2 or -g).
+# --------------------------------------------------------------------------------------------------
+include $(tools_dir)src/common-tools.mk
+
+# --------------------------------------------------------------------------------------------------
+# Project-specific tools options.
+# --------------------------------------------------------------------------------------------------
+
+# --- Assembler options ---
+
+ifeq "$(os)" "win"
+    ifeq "$(arch)" "32"
+        as-flags += -coff
+        as-flags += -D_M_IA32
+    endif
+    ifeq "$(arch)" "32e"
+        as-flags  += -D_M_AMD64
+    endif
+    ifeq "$(arch)" "64"
+    endif
+endif
+
+# --- C/C++ options ---
+
+# Enable _Quad type.
+ifneq "$(filter icc icl icl.exe,$(c))" ""
+    c-flags   += -Qoption,cpp,--extended_float_types
+    cxx-flags += -Qoption,cpp,--extended_float_types
+endif
+
+ifeq "$(c)" "gcc"
+    ifeq "$(arch)" "32"
+        c-flags += -m32 -msse
+        cxx-flags += -m32 -msse
+        fort-flags += -m32 -msse
+        ld-flags += -m32 -msse
+        as-flags += -m32 -msse
+    endif
+endif
+
+ifeq "$(c)" "clang"
+    c-flags += -Wno-unused-value -Wno-switch -Wno-deprecated-register
+    cxx-flags += -Wno-unused-value -Wno-switch -Wno-deprecated-register
+    ifeq "$(arch)" "32"
+        c-flags += -m32 -msse
+        cxx-flags += -m32 -msse
+        fort-flags += -m32 -msse
+        ld-flags += -m32 -msse
+        as-flags += -m32 -msse
+    endif
+    HAVE_QUAD = 0
+endif
+
+ifeq "$(LINK_TYPE)" "dyna"
+# debug-info
+    ifeq "$(os)" "win"
+        c-flags    += -Zi
+        cxx-flags  += -Zi
+        fort-flags += -Zi
+    else
+      ifneq "$(os)" "mac"
+        c-flags    += -g
+        cxx-flags  += -g
+        fort-flags += -g
+        ld-flags   += -g
+      endif
+    endif
+endif
+
+# Enable 80-bit "long double".
+# ??? In the original makefile, it was enabled for all files on win_32 and win_64, and only for
+# one file, kmp_atomic.c, on win_32e.
+ifeq "$(os)" "win"
+    c-flags   += -Qlong_double
+    cxx-flags += -Qlong_double
+endif
+
+# Enable saving compiler options and version in object files and libraries.
+ifeq "$(filter gcc clang,$(c))" ""
+    ifeq "$(os)" "win"
+        # Newer MS linker issues warnings if -Qsox is used:
+        # "warning LNK4224: /COMMENT is no longer supported;  ignored"
+        # so let us comment it out (and delete later).
+        # ifneq "$(arch)" "32e"
+        #     c-flags   += -Qsox
+        #     cxx-flags += -Qsox
+        # endif
+        fort-flags += -Qsox
+    else
+        # For an unknown reason, icc and ifort on mac do not accept this option.
+        ifneq "$(filter lin,$(os))" ""
+            c-flags    += -sox
+            cxx-flags  += -sox
+            fort-flags += -sox
+        endif
+    endif
+endif
+
+# On Linux and Windows Intel64 we need offload attribute for all Fortran entries
+# in order to support OpenMP function calls inside Device constructs
+ifeq "$(fort)" "ifort"
+    ifeq "$(os)_$(arch)" "lin_32e"
+        # TODO: change to -qoffload... when we stop supporting 14.0 compiler (-offload is deprecated)
+        fort-flags += -offload-attribute-target=mic
+    endif
+    ifeq "$(os)_$(arch)" "win_32e"
+        fort-flags += /Qoffload-attribute-target:mic
+    endif
+endif
+
+ifeq "$(arch)" "mic"
+    c-flags    += -mmic
+    cxx-flags  += -mmic
+    fort-flags += -mmic
+    ld-flags   += -mmic
+    as-flags   += -mmic
+    cpp-flags  += -mmic
+endif
+
+# Exception handling.
+ifeq "$(os)" "win"
+    # ??? Enable exception handling?
+    ifeq "$(LINK_TYPE)" "dyna"
+        c-flags   += -EHsc
+        cxx-flags += -EHsc
+    endif
+else
+    # Disable exception handling.
+    c-flags   += -fno-exceptions
+    cxx-flags += -fno-exceptions
+endif
+
+# Disable use of EBP as general purpose register.
+ifeq "$(os)" "win"
+    ifeq "$(arch)" "32"
+        c-flags   += -Oy-
+        cxx-flags += -Oy-
+    endif
+endif
+
+ifeq "$(os)" "lin"
+    ifneq "$(arch)" "mic"
+    c-flags   += -Wsign-compare
+    cxx-flags += -Wsign-compare
+    ld-flags  += -Wsign-compare
+    ifeq "$(filter gcc clang,$(c))" ""
+        c-flags   += -Werror
+        cxx-flags += -Werror
+        ld-flags  += -Werror
+    endif
+endif
+endif
+ifeq "$(os)" "win"
+    c-flags   += -WX
+    cxx-flags += -WX
+    ld-flags  += -WX:NO
+endif
+
+ifeq "$(arch)" "mic"
+    # With "-ftls-model=initial-exec" the compiler generates faster code for static TLS
+    # accesses, it generates slower calls to glibc otherwise. We don't use this
+    # feature on Linux because it prevents dynamic loading (use of dlopen) of the library.
+    # Reliable dynamic loading is more important than slightly faster access to TLS.
+    # On Intel(R) Xeon Phi(TM) coprocessor we haven't encountered dynamic loading problem yet, so use faster
+    # access to static TLS.
+    c-flags   += -ftls-model=initial-exec
+    cxx-flags += -ftls-model=initial-exec
+    # disable streaming stores in order to work on A0 Si
+    c-flags   += -opt-streaming-stores never
+    cxx-flags += -opt-streaming-stores never
+endif
+
+# Select C runtime.
+ifeq "$(os)" "win"
+    # Regardless of the following -Zl option, we should specify -MT or -MTd; otherwise test-touch
+    # will fail due to an unresolved reference "_errno".
+    ifeq "$(OPTIMIZATION)" "on"
+        c-flags   += -MT
+        cxx-flags += -MT
+    else
+        c-flags   += -MTd
+        cxx-flags += -MTd
+    endif
+    ifeq "$(LINK_TYPE)" "stat"
+        # Do not emit the C runtime library reference into object files. This allows linking the
+        # OpenMP RTL with either a static or dynamic C runtime. Windows* OS specific, applicable
+        # only to the static RTL.
+        c-flags   += -Zl
+        cxx-flags += -Zl
+    endif
+endif
+
+ifeq "$(os)" "win"
+    c-flags   += -W3
+    cxx-flags += -W3
+    # Disable warning: "... declared but never referenced"
+    # Disable remark #5082: Directive ignored - Syntax error, found IDENTIFIER 'LRB'...
+    fort-flags   += -Qdiag-disable:177,5082
+    c-flags      += -Qdiag-disable:177
+    cxx-flags    += -Qdiag-disable:177
+endif
+
+ifeq "$(CPLUSPLUS)" "on"
+    ifeq "$(os)" "win"
+        c-flags   += -TP
+    else ifeq "$(arch)" "ppc64"
+    # C++11 on ppc64 Linux removes the definition of preprocessor macros needed in the .h files
+      c-flags   += -x c++ -std=gnu++11
+    else
+        ifneq "$(filter gcc clang,$(c))" ""
+            c-flags   += -x c++ -std=c++11
+        else
+            c-flags   += -Kc++
+        endif
+    endif
+endif
+
+# --- Linker options ---
+
+ifeq "$(os)" "lin"
+    ifneq "$(arch)" "mic"
+    ifneq "$(LIB_TYPE)" "stub"
+        ifeq "$(ld)" "ld"
+            # Warn about non-PIC code presence
+            ld-flags += --warn-shared-textrel
+            ld-flags += -fini=__kmp_internal_end_fini
+            ld-flags += -lpthread
+        else # $(c) or $(cxx)
+            ld-flags += -Wl,--warn-shared-textrel
+            ld-flags += -Wl,-fini=__kmp_internal_end_fini
+            ld-flags += -pthread
+        endif
+    endif
+    ifeq "$(ld)" "$(c)"
+        ld-flags += -fPIC
+        ifeq "$(DEBUG_INFO)" "on"
+            ld-flags += -g
+        endif
+        ifeq "$(OPTIMIZATION)" "off"
+            ld-flags += -O0
+        endif
+        ld-flags += -Wl,--version-script=$(src_dir)exports_so.txt
+    else
+        ld-flags += --version-script=$(src_dir)exports_so.txt
+    endif
+    ifeq "$(ld)" "$(c)"
+        # to remove dependency on libimf, libsvml, libintlc:
+        ifeq "$(c)" "icc"
+            ld-flags-dll += -static-intel
+        endif
+        ld-flags-dll += -Wl,--as-needed
+        # to remove dependency on libgcc_s:
+        ifeq "$(c)" "gcc"
+            ld-flags-dll += -static-libgcc
+            # omp_os is non-empty only in the open-source code
+            ifneq "$(omp_os)" "freebsd"
+                ld-flags-extra += -Wl,-ldl
+            endif
+        endif
+        ifeq "$(c)" "clang"
+            ifneq "$(omp_os)" "freebsd"
+                ld-flags-extra += -Wl,-ldl
+            endif
+        endif
+        ifeq "$(arch)" "32"
+            ifeq "$(filter gcc clang,$(c))" ""
+            # to work around CQ215229, link libirc_pic manually
+            ld-flags-extra += -lirc_pic
+            endif
+        endif
+        ifeq "$(filter 32 32e 64 ppc64 ppc64le,$(arch))" ""
+            ld-flags-extra += $(shell pkg-config --libs libffi)
+        endif
+    else
+        ifeq "$(arch)" "32e"
+            # ???
+            ld-flags += -Bstatic -L/usr/lib64 -lc_nonshared -Bdynamic
+        endif
+    endif
+    else # Below are Intel(R) Many Integrated Core Architecture linker flags
+  ifeq "$(ld)" "ld"
+    ifneq "$(LIB_TYPE)" "stub"
+        ld-flags += -lthr
+        ld-flags += -fini=__kmp_internal_end_atexit
+        # Warn about non-PIC code presence
+        ld-flags += --warn-shared-textrel
+    endif
+    ld-flags += --version-script=$(src_dir)exports_so.txt
+  endif
+  ifeq "$(ld)" "$(c)"
+    ld-flags += -Wl,--warn-shared-textrel
+    ld-flags += -Wl,--version-script=$(src_dir)exports_so.txt
+    ld-flags += -static-intel
+    # Don't link libcilk*.
+    ld-flags += -no-intel-extensions
+    # Discard unneeded dependencies.
+    ld-flags += -Wl,--as-needed
+#    ld-flags += -nodefaultlibs
+    # To check which libraries the compiler links, comment out the line above and uncomment the
+    # line below.
+#    ld-flags += -\#
+    # Link libraries in the order the icc compiler uses (obtained using the "icc -shared -#"
+    # command line). Compiler 20101017 uses the "-lintlc -lthr -lc -lintlc -lirc_s" sequence,
+    # so we follow it:
+#    ld-flags += -lintlc
+    ifneq "$(LIB_TYPE)" "stub"
+        ld-flags += -pthread
+        ld-flags += -ldl
+    endif
+    # include the c++ library for stats-gathering code
+    ifeq "$(stats)" "on"
+        ld-flags-extra += -Wl,-lstdc++
+    endif
+  endif
+endif
+endif
+
+
+ifeq "$(os)" "mac"
+    ifeq "$(ld)" "icc"
+        ld-flags += -no-intel-extensions
+    endif
+    ld-flags += -single_module
+    ld-flags += -current_version $(VERSION).0 -compatibility_version $(VERSION).0
+endif
+
+ifeq "$(os)" "win"
+    ld-flags += -incremental:no
+    ld-flags += -version:$(VERSION).0
+endif
+
+# --------------------------------------------------------------------------------------------------
+# Project-specific preprocessor definitions.
+# --------------------------------------------------------------------------------------------------
+
+cpp-flags += -D KMP_ARCH_STR="\"$(call legal_arch,$(arch))\""
+
+ifeq "$(os)" "win"
+    cpp-flags += -D _WINDOWS -D _WINNT -D _WIN32_WINNT=0x0501
+    # 0x0501 means Windows* XP* OS or Windows* Server 2003* OS or later.
+    # We need this for the GetModuleHandleEx function.
+    ifeq "$(LINK_TYPE)" "dyna"
+        cpp-flags += -D _USRDLL
+    endif
+else # lin, mic or mac
+    cpp-flags += -D _GNU_SOURCE
+    cpp-flags += -D _REENTRANT
+endif
+
+# TODO: DIAG leads to DEBUG, which is a bit confusing. Rename KMP_DEBUG to KMP_DIAG?
+ifeq "$(DIAG)" "on"
+    cpp-flags += -D KMP_DEBUG
+endif
+ifeq "$(COVERAGE)" "on"
+    cpp-flags += -D COVER
+endif
+# Assertions in the OMP RTL code are controlled by two macros: KMP_DEBUG enables or disables
+# assertions iff KMP_USE_ASSERT is defined. If KMP_USE_ASSERT is not defined, assertions are
+# disabled regardless of KMP_DEBUG. It was implemented for code coverage -- to have a debug build
+# with no assertions -- but it does not have much effect. TODO: Remove the macro.
+ifeq "$(COVERAGE)" "off"
+    cpp-flags += -D KMP_USE_ASSERT
+endif
+
+cpp-flags += -D BUILD_I8
+ifneq "$(os)" "win"
+    cpp-flags += -D BUILD_TV
+endif
+cpp-flags += -D KMP_LIBRARY_FILE=\"$(lib_file)\"
+cpp-flags += -D KMP_VERSION_MAJOR=$(VERSION)
+
+# Customize the ppc64 and aarch64 cache line size to 128; use 64 otherwise.
+# Almost all data structures (kmp.h) are aligned to a cache line to reduce false sharing, thus
+# increasing performance.  For heavily accessed data structures (e.g., kmp_base_info), members
+# are grouped together according to their memory access pattern.  For example, read-only data is
+# put together on cache lines, while private data used by the working thread is put on its own,
+# separate cache lines.
+ifneq "$(filter aarch64 ppc64 ppc64le,$(arch))" ""
+	cpp-flags += -D CACHE_LINE=128
+else 
+	cpp-flags += -D CACHE_LINE=64
+endif
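+# A minimal C illustration of the idea (hypothetical field names; kmp.h uses its own alignment
+# and padding macros):
+#
+#     struct example {
+#         int  read_only_cfg;                    /* read-mostly, shared by all threads */
+#         char pad[CACHE_LINE - sizeof( int )];  /* fill up to the cache line boundary */
+#         int  hot_counter;                      /* written only by the owning thread  */
+#     };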
+
+cpp-flags += -D KMP_ADJUST_BLOCKTIME=1
+cpp-flags += -D BUILD_PARALLEL_ORDERED
+cpp-flags += -D KMP_ASM_INTRINS
+cpp-flags += -D KMP_USE_INTERNODE_ALIGNMENT=0
+# Linux and MIC compile with version symbols
+# ppc64 and ppc64le architectures don't compile with version symbols
+ifneq "$(filter lin,$(os))" ""
+ifeq "$(filter ppc64 ppc64le,$(arch))" ""
+    cpp-flags += -D KMP_USE_VERSION_SYMBOLS
+endif
+endif
+ifneq "$(arch)" "mic"
+    cpp-flags += -D USE_LOAD_BALANCE
+endif
+ifneq "$(os)" "win"
+    cpp-flags += -D USE_CBLKDATA
+    # ??? Windows* OS: USE_CBLKDATA defined in kmp.h.
+endif
+ifeq "$(os)" "win"
+    cpp-flags += -D KMP_WIN_CDECL
+endif
+ifeq "$(LINK_TYPE)" "dyna"
+    cpp-flags += -D KMP_DYNAMIC_LIB
+endif
+ifeq "$(LIB_TYPE)" "stub"
+    cpp-flags += -D KMP_STUB
+endif
+ifeq "$(VERSION)" "4"
+else # 5
+    ifeq "$(os)" "win"
+    else
+        cpp-flags += -D KMP_GOMP_COMPAT
+    endif
+endif
+cpp-flags += -D KMP_NESTED_HOT_TEAMS
+ifneq "$(filter 32 32e mic,$(arch))" ""
+cpp-flags += -D KMP_USE_ADAPTIVE_LOCKS=1 -D KMP_DEBUG_ADAPTIVE_LOCKS=0
+endif
+
+# is the std c++ library needed? (for stats-gathering, it is)
+std_cpp_lib=0
+ifneq "$(filter lin,$(os))" ""
+    ifeq "$(stats)" "on"
+        cpp-flags += -D KMP_STATS_ENABLED=1
+        std_cpp_lib=1
+    else
+        cpp-flags += -D KMP_STATS_ENABLED=0
+    endif
+else # no mac or windows support for stats-gathering
+    ifeq "$(stats)" "on"
+        $(error Statistics-gathering functionality not available on $(os) platform)
+    endif
+    cpp-flags += -D KMP_STATS_ENABLED=0
+endif
+
+# define compatibility with different OpenMP versions
+have_omp_50=0
+have_omp_41=0
+have_omp_40=0
+ifeq "$(OMP_VERSION)" "50"
+	have_omp_50=1
+	have_omp_41=1
+	have_omp_40=1
+endif
+ifeq "$(OMP_VERSION)" "41"
+	have_omp_50=0
+	have_omp_41=1
+	have_omp_40=1
+endif
+ifeq "$(OMP_VERSION)" "40"
+	have_omp_50=0
+	have_omp_41=0
+	have_omp_40=1
+endif
+ifeq "$(OMP_VERSION)" "30"
+	have_omp_50=0
+	have_omp_41=0
+	have_omp_40=0
+endif
+cpp-flags += -D OMP_50_ENABLED=$(have_omp_50) -D OMP_41_ENABLED=$(have_omp_41) -D OMP_40_ENABLED=$(have_omp_40)
+
+# Using ittnotify is enabled by default.
+USE_ITT_NOTIFY = 1
+ifeq "$(os)-$(arch)" "win-64"
+    USE_ITT_NOTIFY = 0
+endif
+ifeq "$(LINK_TYPE)" "stat"
+    USE_ITT_NOTIFY = 0
+endif
+cpp-flags += -D USE_ITT_NOTIFY=$(USE_ITT_NOTIFY)
+ifeq "$(USE_ITT_NOTIFY)" "0"
+    # Disable all ittnotify calls.
+    cpp-flags += -D INTEL_NO_ITTNOTIFY_API
+else
+    ifeq "$(os)" "win"
+        ittnotify_static$(obj) : cpp-flags += -D UNICODE
+    endif
+endif
+# Specify the prefix to be used for external symbols. The prefix is required even if ITT Notify
+# is turned off, because we have some functions with the __itt_ prefix (__itt_error_handler) and
+# want the prefix to be changed to __kmp_itt_.
+cpp-flags += -D INTEL_ITTNOTIFY_PREFIX=__kmp_itt_
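+# For example, with this prefix __itt_error_handler is referenced as __kmp_itt_error_handler.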
+
+
+# Linux* OS: __declspec(thread) TLS is still buggy on static builds.
+# Windows* OS: This define causes problems with LoadLibrary + declspec(thread) on Windows* OS. See CQ50564,
+#     tests kmp_load_library_lib*.c, and the following MSDN reference:
+#     http://support.microsoft.com/kb/118816
+ifneq "$(filter lin ,$(os))" ""
+    ifeq "$(LINK_TYPE)" "dyna"
+        cpp-flags += -D KMP_TDATA_GTID
+    else
+        # AC: allow __thread in static build for Intel(R) 64, looks like it is
+        # working there. It is broken on IA-32 architecture for RHEL4 and SLES9.
+        ifeq "$(arch)" "32e"
+            cpp-flags += -D KMP_TDATA_GTID
+        endif
+    endif
+endif
+
+# Defining KMP_BUILD_DATE for all files leads to an "incompatible redefinition" warning, because
+# the same macro is also defined in omp.h. To avoid the conflict, let us define a macro with a
+# different name, _KMP_BUILD_TIME.
+kmp_version$(obj) : cpp-flags += -D _KMP_BUILD_TIME="\"$(date)\""
+
+# --- Macros for generate-def.pl ---
+
+gd-flags += -D arch_$(arch)
+gd-flags += -D $(LIB_TYPE)
+ifeq "$(HAVE_QUAD)" "1"
+    gd-flags += -D HAVE_QUAD
+endif
+ifeq "$(OMP_VERSION)" "41"
+    gd-flags += -D OMP_41 -D OMP_40 -D OMP_30
+else
+    ifeq "$(OMP_VERSION)" "40"
+        gd-flags += -D OMP_40 -D OMP_30
+    else
+        ifeq "$(OMP_VERSION)" "30"
+            gd-flags += -D OMP_30
+        endif
+    endif
+endif
+ifneq "$(VERSION)" "4"
+    gd-flags += -D msvc_compat
+endif
+ifeq "$(DIAG)" "on"
+    gd-flags += -D KMP_DEBUG
+endif
+
+# --- Macro for expand-vars.pl ---
+
+# $Revision and $Date often occur in file headers, so define these variables to satisfy expand-vars.pl.
+ev-flags += -D Revision="\$$Revision" -D Date="\$$Date"
+
+# Various variables.
+ev-flags += -D KMP_TYPE="$(call legal_type,$(LIB_TYPE))" -D KMP_ARCH="$(call legal_arch,$(arch))"
+ev-flags += -D KMP_VERSION_MAJOR=$(VERSION) -D KMP_VERSION_MINOR=0 -D KMP_VERSION_BUILD=$(build)
+ev-flags += -D KMP_BUILD_DATE="$(date)"
+ev-flags += -D KMP_DIAG=$(if $(filter on,$(DIAG)),1,0)
+ev-flags += -D KMP_DEBUG_INFO=$(if $(filter on,$(DEBUG_INFO)),1,0)
+ifeq "$(OMP_VERSION)" "40"
+    ev-flags += -D OMP_VERSION=201307
+else
+    ifeq "$(OMP_VERSION)" "30"
+        ev-flags += -D OMP_VERSION=201107
+    else
+        ev-flags += -D OMP_VERSION=200505
+    endif
+endif
+
+# -- Options specified in command line ---
+
+cpp-flags  += $(CPPFLAGS)
+c-flags    += $(CFLAGS)
+cxx-flags  += $(CXXFLAGS)
+fort-flags += $(FFLAGS)
+ld-flags   += $(LDFLAGS)
+
+# --------------------------------------------------------------------------------------------------
+# Files.
+# --------------------------------------------------------------------------------------------------
+ifeq "$(OMPT_SUPPORT)" "on"
+    ompt_items = ompt-general
+    cpp-flags += -D OMPT_SUPPORT=1
+
+    ifeq "$(OMPT_BLAME)" "on"
+        cpp-flags += -D OMPT_BLAME=1
+    endif
+
+    ifeq "$(OMPT_TRACE)" "on"
+        cpp-flags += -D OMPT_TRACE=1
+    endif
+endif
+
+# Library files. These files participate in all kinds of libraries.
+lib_c_items :=      \
+    kmp_ftn_cdecl   \
+    kmp_ftn_extra   \
+    kmp_version     \
+    $(ompt_items)   \
+    $(empty)
+lib_cpp_items :=
+lib_asm_items :=
+
+# Files to be linked into import library.
+imp_c_items :=
+
+do_test_touch_mt := 1
+
+ifeq "$(LIB_TYPE)" "stub"
+    lib_c_items += kmp_stub
+else # norm or prof
+    lib_c_items +=                   \
+        kmp_alloc                    \
+        kmp_atomic                   \
+        kmp_csupport                 \
+        kmp_debug                    \
+        kmp_debugger                 \
+        kmp_itt                      \
+        $(empty)
+    ifeq "$(USE_ITT_NOTIFY)" "1"
+        lib_c_items +=  ittnotify_static
+    endif
+
+
+    lib_cpp_items +=                 \
+        kmp_environment              \
+        kmp_error                    \
+        kmp_global                   \
+        kmp_i18n                     \
+        kmp_io                       \
+        kmp_runtime                  \
+        kmp_wait_release             \
+        kmp_barrier                  \
+        kmp_settings                 \
+        kmp_str                      \
+        kmp_tasking                  \
+        kmp_taskq                    \
+        kmp_threadprivate            \
+        kmp_utility                  \
+        kmp_affinity                 \
+        kmp_dispatch                 \
+        kmp_lock                     \
+        kmp_sched                    \
+        $(empty)
+
+ifeq ($(OMP_VERSION),$(filter $(OMP_VERSION),40 41))
+    lib_cpp_items += kmp_taskdeps
+    lib_cpp_items += kmp_cancel
+endif
+ifeq "$(stats)" "on"
+    lib_cpp_items += kmp_stats
+    lib_cpp_items += kmp_stats_timing
+endif
+
+    # OS-specific files.
+    ifeq "$(os)" "win"
+        lib_c_items += z_Windows_NT_util
+        # Arch-specific files.
+        lib_c_items   += z_Windows_NT-586_util
+        lib_asm_items += z_Windows_NT-586_asm
+        ifeq "$(LINK_TYPE)" "dyna"
+            imp_c_items += kmp_import
+            # for win_32/win_32e dynamic libguide40.dll,
+            # build the shim lib instead
+            ifeq "$(VERSION)" "4"
+                ifneq "$(arch)" "64"
+                    ifeq "$(LIB_TYPE)" "norm"
+                        lib_c_items   = kmp_shim
+                        lib_cpp_items =
+                        lib_asm_items =
+                        gd-flags += -D shim
+                        # for some reason, test-touch-md is able to work with
+                        # the build compiler's version of libompmd.dll, but
+                        # test-touch-mt can't load it.
+                        do_test_touch_mt := 0
+                    endif
+                endif
+            endif
+        endif
+    else # lin, mic or mac
+        lib_c_items += z_Linux_util
+        # GCC Compatibility files
+        ifeq "$(VERSION)" "4"
+        else # 5
+            lib_c_items += kmp_gsupport
+        endif
+        lib_asm_items += z_Linux_asm
+    endif
+endif
+
+lib_obj_files := $(sort $(addsuffix $(obj),$(lib_c_items) $(lib_cpp_items) $(lib_asm_items)))
+imp_obj_files := $(sort $(addsuffix $(obj),$(imp_c_items) $(imp_cpp_items) $(imp_asm_items)))
+dep_files     := $(sort $(addsuffix .d,$(lib_c_items) $(lib_cpp_items) $(imp_c_items) $(imp_cpp_items)))
+i_files       := $(sort $(addsuffix .i,$(lib_c_items) $(lib_cpp_items) $(imp_c_items) $(imp_cpp_items)))
+
+
+# --- Construct library file name ---
+
+ifeq "$(VERSION)" "4"
+    ifeq "$(LIB_TYPE)" "stub"
+        _lib_item = libompstub
+    else # norm or prof
+        _lib_item = libguide
+    endif
+    ifeq "$(os)-$(LINK_TYPE)" "win-dyna"
+        _lib_item += 40
+    endif
+    ifeq "$(LIB_TYPE)" "prof"
+        _lib_item += _stats
+    endif
+else
+    _lib_item = libomp
+    ifeq "$(LIB_TYPE)" "prof"
+        _lib_item += prof
+    endif
+    ifeq "$(LIB_TYPE)" "stub"
+        _lib_item += stubs
+    endif
+    ifeq "$(os)" "win"
+        ifeq "$(LINK_TYPE)" "dyna"
+            _lib_item += md
+        else
+            _lib_item += mt
+        endif
+    endif
+endif
+# _lib_item is a list of space-separated name parts. Remove the spaces to form the final name.
+lib_item = $(subst $(space),,$(_lib_item))
+ifeq "$(LINK_TYPE)" "dyna"
+    lib_ext = $(dll)
+else
+    lib_ext = $(lib)
+endif
+lib_file  = $(lib_item)$(lib_ext)
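+# For example (illustrative; the $(dll)/$(lib) extensions come from defs.mk): with VERSION=5,
+# a normal dynamic library is libompmd.dll on Windows* OS and libomp.so on Linux* OS, while a
+# static stub library is libompstubsmt.lib on Windows* OS and libompstubs.a on Linux* OS.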
+ifeq "$(os)-$(LINK_TYPE)" "win-dyna"
+    imp_file  = $(lib_item)$(lib)
+    def_file  = $(lib_item).def
+    res_file  = $(lib_item).res
+    # A PDB file should be generated if: ( DEBUG_INFO is on ) OR ( we are building the 32-bit
+    # normal library AND the version is 5 ).
+    ifneq "$(filter on,$(DEBUG_INFO))$(filter norm-5,$(LIB_TYPE)-$(VERSION))" ""
+        pdb_file = $(lib_item).pdb
+    endif
+endif
+ifneq "$(filter lin,$(os))" ""
+    ifeq "$(LINK_TYPE)" "dyna"
+      ifneq "$(DEBUG_INFO)" "on"
+        dbg_file = $(lib_item).dbg
+      endif
+    else
+        dbg_strip = "on"
+    endif
+endif
+
+# --- Output files ---
+
+out_lib_files  = $(addprefix $(out_lib_dir),$(lib_file) $(imp_file) $(pdb_file) $(dbg_file))
+out_mod_files  = \
+    $(addprefix $(out_ptf_dir)include/,omp_lib.mod omp_lib_kinds.mod)
+out_cmn_files  = \
+    $(addprefix $(out_cmn_dir)include/,omp.h omp_lib.h omp_lib.f omp_lib.f90)
+ifeq "$(OMPT_SUPPORT)" "on"
+    out_cmn_files  += $(addprefix $(out_cmn_dir)include/,ompt.h)
+endif
+
+ifneq "$(out_lib_fat_dir)" ""
+    out_lib_fat_files  = $(addprefix $(out_lib_fat_dir),$(lib_file) $(imp_file))
+endif
+
+# --- Special dependencies ---
+
+# We have to encode the dependencies on omp.h manually, because automatic dependency generation
+# by the compiler produces a dependency on the omp.h located in the compiler include directory.
+kmp_csupport$(obj) : omp.h
+kmp_stub$(obj)     : omp.h
+
+# --------------------------------------------------------------------------------------------------
+# External libraries to link in.
+# --------------------------------------------------------------------------------------------------
+
+# We (actually, our customers) do not want the OpenMP RTL to depend on external libraries, so we
+# have to pick up some object files from the libirc library (the Intel compiler generates code
+# with calls to libirc) and link them into the OMP RTL.
+# libipgo is required only for collecting code coverage data, but it is convenient to link it
+# into the OMP RTL as well, so as not to depend on extra libs and paths.
+
+# libirc matters only if the Intel compiler is used.
+ifneq "$(filter icc icl icl.exe,$(c))" ""
+
+    ifneq "$(ICC_LIB_DIR)" ""
+        icc_lib_dir := $(ICC_LIB_DIR)
+    else
+        #
+        # Let us find the path to the Intel libraries first. (Don't use tabs in these lines!)
+        #
+        icc_path := $(shell which $(c))
+        $(call debug,icc_path)
+        ifeq "$(words $(icc_path))" "0"
+            $(error Path to "$(c)" not found, reported path: $(icc_path))
+        endif
+        ifneq "$(words $(icc_path))" "1"
+            $(error Path to "$(c)" contains spaces: "$(icc_path)")
+        endif
+        ifeq "$(os)" "win"  # Windows* OS specific.
+            # `which' can return a path with backslashes. Convert them.
+            icc_path := $(subst \,/,$(icc_path))
+            # icc's "bin/" directory may be named as "Bin/" or even "BIN/". Convert it to lower case.
+            icc_path := $(subst B,b,$(icc_path))
+            icc_path := $(subst I,i,$(icc_path))
+            icc_path := $(subst N,n,$(icc_path))
+            $(call debug,icc_path)
+        endif
+        # icc 10.x directory layout:
+        #         bin/
+        #         lib/
+        # icc 11.x directory layout:
+        #         bin/{ia32,intel64}/
+        #         lib/{ia32,intel64}/
+        # icc 12.x directory layout:
+        #         bin/{ia32,intel64}/
+        #         compiler/lib/{ia32,intel64}/
+        # Note: On OS X* fat libraries reside in lib/ directory. On other systems libraries are in
+        # lib/<arch>/.
+        icc_path_up1 := $(dir $(icc_path))
+        icc_path_up2 := $(dir $(patsubst %/,%,$(icc_path_up1)))
+        $(call debug,icc_path_up1)
+        $(call debug,icc_path_up2)
+        ifneq "$(filter %/bin/,$(icc_path_up1))" ""
+            # Looks like a 10.x compiler.
+            icc_lib_dir := $(patsubst %/bin/,%/lib/,$(icc_path_up1))
+        else
+            ifneq "$(filter %/bin/,$(icc_path_up2))" ""
+                # Looks like an 11.x or later compiler.
+                ifeq "$(os)" "mac"
+                    icc_lib12 := $(patsubst %/bin/,%/compiler/lib/,$(icc_path_up2))
+                    ifneq "$(wildcard $(icc_lib12)libirc*$(lib))" ""
+                        # 12.x
+                        icc_lib_dir := $(icc_lib12)
+                    else
+                        # 11.x
+                        icc_lib_dir := $(patsubst %/bin/,%/lib/,$(icc_path_up2))
+                    endif
+                else
+                    icc_lib12 := $(patsubst %/bin/,%/compiler/lib/,$(icc_path_up2))$(notdir $(patsubst %/,%,$(icc_path_up1)))/
+                    ifneq "$(wildcard $(icc_lib12)libirc*$(lib))" ""
+                        # 12.x
+                        icc_lib_dir := $(icc_lib12)
+                    else
+                        # 11.x
+                        icc_lib_dir := $(patsubst %/bin/,%/lib/,$(icc_path_up2))$(notdir $(patsubst %/,%,$(icc_path_up1)))/
+                    endif
+                endif
+            endif
+        endif
+        $(call debug,icc_lib_dir)
+        ifeq "$(icc_lib_dir)" ""
+            $(error Path to $(c) lib/ dir not found)
+        endif
+    endif
+
+    #
+    # Then select proper libraries.
+    #
+    ifeq "$(os)" "win"
+        libirc  = $(icc_lib_dir)\libircmt$(lib)
+        libipgo = $(icc_lib_dir)\libipgo$(lib)
+    else # lin, mic or mac
+        ifeq "$(LINK_TYPE)" "dyna"
+            # In the case of dynamic linking, prefer libi*_pic.a libraries -- they contain
+            # position-independent code.
+            libirc  = $(icc_lib_dir)libirc_pic$(lib)
+            libipgo = $(icc_lib_dir)libipgo_pic$(lib)
+            # If libi*_pic.a is not found (it is missing in older compilers), use libi*.a.
+            ifeq "$(wildcard $(libirc))" ""
+                libirc = $(icc_lib_dir)libirc$(lib)
+            endif
+            ifeq "$(wildcard $(libipgo))" ""
+                libipgo = $(icc_lib_dir)libipgo$(lib)
+            endif
+        else
+            libirc  = $(icc_lib_dir)libirc$(lib)
+            libipgo = $(icc_lib_dir)libipgo$(lib)
+        endif
+    endif
+
+    # OK, now let us decide when the libraries are linked in.
+    # Linux* OS:
+    # We link them into the static library only.
+    ifeq "$(os)-$(LINK_TYPE)" "lin-stat"
+        ifneq "$(arch)" "mic"
+        linked_in_libs += libirc
+    endif
+    endif
+    # OS X*:
+    # The trick is not required in the case of a dynamic library, but on the Intel(R) 64
+    # architecture we have a problem: libirc.a is fat, so the linker (libtool) produces a fat
+    # libguide.dylib... :-( (Only the functions from libirc are present for both architectures;
+    # the libguide functions are for Intel(R) 64 only.) To avoid this undesired effect, the
+    # libirc trick is enabled for both static and dynamic builds. Probably we could instruct
+    # libtool to produce a "thin" (not fat) library by using the -arch_only option...
+    ifeq "$(os)" "mac"
+        linked_in_libs += libirc
+    endif
+    # Windows* OS:
+    # The trick is required only in the case of a static OMP RTL. In the case of a dynamic OMP
+    # RTL the linker does the job.
+    ifeq "$(os)-$(LINK_TYPE)" "win-stat"
+        linked_in_libs += libirc
+    endif
+
+    ifeq "$(COVERAGE)" "on"
+        linked_in_libs += libipgo
+    endif
+
+endif
+
+# --------------------------------------------------------------------------------------------------
+# Main targets.
+# --------------------------------------------------------------------------------------------------
+
+all    : lib inc mod
+lib    : tests $(out_lib_files) libomp_aliases
+mod    : $(out_mod_files)
+clean  :
+	$(rm) $(out_lib_files) $(out_lib_fat_files)
+	$(rm) $(out_mod_files)
+
+# --------------------------------------------------------------------------------------------------
+# Building library.
+# --------------------------------------------------------------------------------------------------
+
+$(lib_file) : $(if $(dbg_file),stripped,unstripped)/$(lib_file)
+	$(target)
+	$(cp) $< $@
+
+ifneq "$(dbg_file)" ""
+    $(dbg_file) : unstripped/$(dbg_file)
+	$(target)
+	$(cp) $< $@
+endif
+
+ifneq "$(filter lin,$(os))" ""
+    lib_file_deps = $(if $(linked_in_libs),required/.objs,$(lib_obj_files))
+endif
+ifeq "$(os)" "mac"
+    lib_file_deps = omp$(obj)
+endif
+ifeq "$(os)" "win"
+    lib_file_deps = $(if $(linked_in_libs),wiped/.objs,$(lib_obj_files))
+endif
+
+# obj_deps_files -- object files explicitly specified in the dependency list. Other (non-object)
+# files are filtered out.
+obj_deps_files = $(filter %$(obj),$^)
+# obj_deps_flags -- object files corresponding to flags specified in the dependency list. A flag
+# is a special file like "required/.objs". The flag file is replaced with a list of all object
+# files in the flag's directory, for example, "required/*.o".
+obj_deps_flags = $(addsuffix *$(obj),$(dir $(filter %/.objs,$^)))
+# obj_deps_all -- the list of all object files specified in the dependency list, either
+# explicitly or found in flagged directories.
+obj_deps_all   = $(obj_deps_files) $(obj_deps_flags)
+
+unstripped/$(lib_file).lst : $(lib_file_deps) unstripped/.dir .rebuild
+	$(target)
+	echo $(obj_deps_all) > $@
+
+ifeq "$(os)-$(LINK_TYPE)" "lin-dyna"
+    $(lib_file) : exports_so.txt
+endif
+
+# Copy object files, wiping out references to the libirc library. Object files (ours and those
+# extracted from libirc.lib) contain a "-defaultlib:libirc.lib" linker directive, so the linker
+# will require libirc.lib regardless of the absence of a real dependency. Actually, this rule is
+# required only on Windows* OS, but there are no Windows* OS-specific commands in it, so I omit
+# the conditions to keep the code shorter and to be able to test the rule on Linux* OS.
+# Note: If we are not going to pick up objects from libirc, there is no point in wiping out
+# libirc references.
+# Addition: Also wipe references to the C++ runtime (libcpmt.lib) for the same reason: sometimes
+# C++ runtime routines are not actually used, but the compiler puts a "-defaultlib:libcpmt.lib"
+# directive into the object file. Wipe it out; if we have a real dependency on the C++ runtime,
+# test-touch will fail.
+wiped/.objs : required/.objs \
+    $(tools_dir)wipe-string.pl wiped/.dir .rebuild
+	$(target)
+	$(rm) $(dir $@)*$(obj)
+    ifeq "$(os)" "win"
+	$(perl) $(tools_dir)wipe-string.pl --quiet \
+	    --wipe-regexp="(-|/)(defaultlib|DEFAULTLIB):\"(libir|libc|LIBC|OLDN|libmm|libde|svml).*?\"" \
+	    --target-directory=$(dir $@) $(obj_deps_all)
+    else
+	$(perl) $(tools_dir)wipe-string.pl --quiet \
+	    --wipe-regexp="(-|/)(defaultlib|DEFAULTLIB):\"(libir|libc|LIBC|OLDN).*?\"" \
+	    --target-directory=$(dir $@) $(obj_deps_all)
+    endif
+	$(touch) $@
+
+# required-objects.pl uses the "objcopy" utility to rename symbols in object files. On Linux* OS
+# this is a standard utility (from the binutils package). On Windows* OS we provide a homebrew
+# implementation (very limited, but enough for our purposes).
+ifeq "$(os)" "win"
+    objcopy = objcopy$(exe)
+endif
+
+# required/ is a directory containing OMP RTL object files and the really required files from
+# external libraries. required/.objs is a flag file. If this file is present, it means all the
+# required objects are already in place. Note: required-objects.pl copies files to the specified
+# directory. This is necessary because object files are edited during copying -- symbols defined
+# in external object files are renamed.
+required/.objs : $(lib_obj_files) $(addsuffix /.objs,$(linked_in_libs)) \
+    $(tools_dir)required-objects.pl $(objcopy) required/.dir .rebuild
+	$(target)
+	$(rm) $(dir $@)*$(obj)
+	$(perl) $(tools_dir)required-objects.pl --quiet $(oa-opts) --prefix=__kmp_external_ \
+	    --base $(obj_deps_files) --extra $(obj_deps_flags) --copy-all=$(dir $@)
+	$(touch) $@
+
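+# For example (hypothetical symbol name): a helper defined in a libirc object, say __intel_memcpy,
+# would be renamed during the copy to __kmp_external___intel_memcpy, so it cannot clash with a
+# symbol of the same name elsewhere.
+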
+# Extract object files from libirc. The file "libirc/.objs" is a flag. If the file is present,
+# make thinks the files are already extracted.
+ifneq "$(libirc)" ""
+    libirc/.objs : $(libirc) \
+	$(tools_dir)extract-objects.pl libirc/.dir .rebuild
+	    $(target)
+	    $(rm) $(dir $@)*$(obj)
+	    $(perl) $(tools_dir)extract-objects.pl --quiet $(oa-opts) --output=$(dir $@) $<
+	    $(touch) $@
+endif
+
+# Extract object files from libipgo. The file "libipgo/.objs" is a flag. If the file is present,
+# make thinks the objects are already extracted.
+ifneq "$(libipgo)" ""
+    libipgo/.objs : $(libipgo) \
+	$(tools_dir)extract-objects.pl libipgo/.dir .rebuild
+	    $(target)
+	    $(rm) $(dir $@)*$(obj)
+	    $(perl) $(tools_dir)extract-objects.pl --quiet $(oa-opts) --output=$(dir $@) $<
+	    $(touch) $@
+endif
+
+
+stripped/$(lib_file) : unstripped/$(lib_file) $(dbg_file) stripped/.dir .rebuild
+	$(target)
+        ifeq "$(arch)" "mic"
+	    x86_64-k1om-linux-objcopy --strip-debug $< $@.tmp
+	    x86_64-k1om-linux-objcopy --add-gnu-debuglink=$(dbg_file) $@.tmp $@
+        else
+	objcopy --strip-debug $< $@.tmp
+	objcopy --add-gnu-debuglink=$(dbg_file) $@.tmp $@
+        endif
+
+ifeq "$(os)" "mac"
+
+    # These targets are conditional because of some OS X*-specific ld and nm options. For
+    # example, GNU nm does not accept -j, and GNU ld does not know -filelist.
+
+    # omp.o is a big object file that includes all the OMP RTL object files and the object files
+    # from external libraries (like libirc). It is partially linked: references to external
+    # symbols (e.g. references to libirc) are already resolved, and symbols defined in the
+    # external libraries are hidden by using the -unexported_symbols_list and
+    # -non_global_symbols_strip_list linker options (both options are required).
+    # AC: 2012-04-12: after the MAC machines upgrade the compiler fails to create the object, so
+    # use the linker instead.
+ifeq "$(mac_os_new)" "1"
+    omp$(obj) : $(lib_obj_files) external-symbols.lst external-objects.lst .rebuild
+	    $(target)
+	    ld -r -unexported_symbols_list external-symbols.lst \
+		-non_global_symbols_strip_list external-symbols.lst \
+		-filelist external-objects.lst \
+		-o $@ $(obj_deps_files)
+else
+    omp$(obj) : $(lib_obj_files) external-symbols.lst external-objects.lst .rebuild
+	    $(target)
+	    $(c) -r -nostartfiles -static-intel  -no-intel-extensions \
+		-Wl,-unexported_symbols_list,external-symbols.lst \
+		-Wl,-non_global_symbols_strip_list,external-symbols.lst \
+		-filelist external-objects.lst \
+		-o $@ $(obj_deps_files)
+endif
+
+    # external-objects.lst is a list of object files extracted from external libraries, which
+    # should be linked into omp.o. kmp_dummy.o is added to the list to avoid an empty list -- the
+    # OS X* utilities nm and ld do not like empty lists.
+    external-objects.lst : $(lib_obj_files) $(addsuffix /.objs,$(linked_in_libs)) kmp_dummy$(obj) \
+	$(tools_dir)required-objects.pl .rebuild
+	    $(target)
+	    $(perl) $(tools_dir)required-objects.pl $(oa-opts) \
+		--base $(obj_deps_files) --extra $(obj_deps_flags) --print-extra > $@.tmp
+	    echo "kmp_dummy$(obj)" >> $@.tmp
+	    mv $@.tmp $@
+
+    # Prepare a list of symbols from the external object files. We will hide all these symbols.
+    # Note: -j is a non-GNU option which means "Just display the symbol names (no value or type)."
+    external-symbols.lst : external-objects.lst .rebuild
+	    $(target)
+	    nm -goj $$(cat external-objects.lst) > $@.0.tmp
+	    cut -f2 -d" " $@.0.tmp > $@.1.tmp
+	    mv $@.1.tmp $@
+
+endif # mac
+
+# Import library tricks are Windows* OS-specific.
+ifeq "$(os)" "win"
+
+    import$(lib) : $(lib_item)$(dll)
+
+    # Change the name of the import library produced by the linker; we will combine it with some
+    # more object files to produce an "extended import lib".
+    $(lib_item)$(dll) : imp_file := import$(lib)
+
+    # Default rule "c to obj" will compile sources with -MT option, which is not desired.
+    # Cancel effect of -MT with -Zl.
+    # AC: Currently we only have one object that does not need any special
+    #     compiler options, so use minimal set. With standard set of options we used
+    #     there were problems with debug info leaked into the import library
+    #     with this object (bug report #334565).
+    $(imp_obj_files) : c-flags := -Zl -nologo -c
+
+    $(imp_file).lst : $(imp_obj_files) import$(lib) .rebuild
+	    $(target)
+	    echo $(filter-out .rebuild,$^) > $@
+
+endif
+
+kmp_i18n_id.inc : en_US.txt \
+    $(tools_dir)message-converter.pl .rebuild
+	$(target)
+	$(perl) $(tools_dir)message-converter.pl $(oa-opts) --prefix=kmp_i18n --enum=$@ $<
+
+kmp_i18n_default.inc : en_US.txt \
+    $(tools_dir)message-converter.pl .rebuild
+	$(target)
+	$(perl) $(tools_dir)message-converter.pl $(oa-opts) --prefix=kmp_i18n --default=$@ $<
+
+# Rebuild kmp_version.o on any change so that the build time string is always up to date.
+kmp_version$(obj): $(filter-out kmp_version$(obj),$(lib_obj_files) $(imp_obj_files))
+
+$(def_file) : dllexports \
+    $(tools_dir)generate-def.pl .rebuild
+	$(target)
+	$(perl) $(tools_dir)generate-def.pl $(gd-flags) -o $@ $<
+
+libomp.rc : libomp.rc.var kmp_version.c
+libomp.rc : ev-flags += -D KMP_FILE=$(lib_file)
+
+kmp_dummy.c : .rebuild
+	$(target)
+	echo "void __kmp_dummy() {}" > $@
+
+# --------------------------------------------------------------------------------------------------
+# Tests.
+# --------------------------------------------------------------------------------------------------
+
+# --- test-touch ---
+
+# test-touch is not available for mic.
+ifneq "$(arch)" "mic"
+
+    # Compile a simple C test and link it with the library. Do it two times: the first link gives
+    # us a clear message if there are any problems; the second link runs in verbose mode, and the
+    # linker output is searched for the "libirc" string -- there should not be any libirc
+    # references. Finally, the test executable is run with KMP_VERSION=1.
+
+    ifeq "$(os)" "win"
+        ifneq "$(do_test_touch_mt)" "0"
+            test_touch_items += test-touch-md test-touch-mt
+        else
+            test_touch_items += test-touch-md
+        endif
+    else
+        test_touch_items += test-touch-rt
+    endif
+
+    force-test-touch : $(addsuffix /.force,$(test_touch_items)) $(addsuffix /.test,$(test_touch_items))
+    test-touch       : $(addsuffix /.test,$(test_touch_items))
+
+    tt-exe-file = $(dir $@)test-touch$(exe)
+    ifeq "$(os)" "win"
+        # On Windows* OS the test quality is problematic, because the LIB environment variable is
+        # set up for the Intel compiler, so the Microsoft compiler is able to find libirc if it
+        # is specified in a defaultlib directive within an object file... This disadvantage is
+        # compensated for by grepping the verbose link output for the "libirc" string.
+        tt-c            = cl
+        tt-c-flags     += -nologo
+        ifeq "$(OPTIMIZATION)" "on"
+            tt-c-flags-mt = -MT
+            tt-c-flags-md = -MD
+        else
+            tt-c-flags-mt = -MTd
+            tt-c-flags-md = -MDd
+        endif
+        ifeq "$(LINK_TYPE)" "stat"
+            tt-libs  += $(lib_file)
+        else
+            tt-libs  += $(imp_file)
+        endif
+        ifneq "$(arch)" "32"
+            # To successfully build with VS2008
+            # tt-libs += bufferoverflowu.lib
+            # Preventing "unresolved external symbol __security_cookie" (and
+            # "... __security_check_cookie") linker errors on win_32e and win_64.
+        endif
+        tt-c-flags  += -Fo$(dir $@)test-touch$(obj) -Fe$(tt-exe-file)
+        tt-ld-flags += -link
+        # Some POSIX but non-ISO functions (like strdup) are defined in oldnames.lib, which is
+        # used implicitly. Drop the oldnames.lib library so we can catch any accidental
+        # dependency on it.
+        tt-ld-flags += -nodefaultlib:oldnames
+        ifeq "$(arch)" "32"
+            tt-ld-flags += -safeseh
+        endif
+        tt-ld-flags-v += -verbose
+    else # lin or mac
+        # On Linux* OS and OS X* the test is good enough because the GNU compiler knows nothing
+        # about libirc and the Intel compiler's private lib directories, but we will grep the
+        # verbose linker output just in case.
+        # Use clang on OS X* because of the discontinued support of GNU compilers.
+        ifeq "$(os)" "mac"
+            ifeq "$(std_cpp_lib)" "1"
+                tt-c        = clang++
+            else
+                tt-c        = clang
+            endif
+        else # lin
+            ifeq "$(std_cpp_lib)" "1"
+                tt-c        = g++
+            else
+                tt-c        = gcc
+            endif
+            # GCC on OS X* does not recognize -pthread.
+            tt-c-flags  += -pthread
+        endif
+        tt-c-flags += -o $(tt-exe-file)
+        ifneq "$(filter 32 32e 64,$(arch))" ""
+            tt-c-flags += $(if $(filter 64,$(arch)),,$(if $(filter 32,$(arch)),-m32,-m64))
+        endif
+        tt-libs    += $(lib_file)
+        ifeq "$(os)-$(COVERAGE)-$(LINK_TYPE)" "lin-on-stat"
+            # Static coverage build on Linux* OS fails due to unresolved symbols dlopen, dlsym, dlclose.
+            # Explicitly add dl library to avoid failure.
+            tt-ld-flags += -ldl
+        endif
+        ifeq "$(os)" "mac"
+            tt-ld-flags-v += -Wl,-t
+            tt-env        += DYLD_LIBRARY_PATH=".:$(DYLD_LIBRARY_PATH)"
+        else # lin
+            tt-ld-flags-v += -Wl,--verbose
+            tt-env        += LD_LIBRARY_PATH=".:$(LD_LIBRARY_PATH)"
+        endif
+    endif
+    tt-c-flags += $(tt-c-flags-rt)
+    tt-env     += KMP_VERSION=1
+    tt-i        = $(if $(filter off,$(TEST_TOUCH)),-)
+
+    ifndef test-touch-commands
+        # The first build gives a short and clear error message in case of any problem.
+        # The second build runs the linker in verbose mode and saves the linker output for
+        # grepping. (grep exits with 1 when nothing matches, so the recipe fails if any libirc
+        # reference is found.)
+      define test-touch-commands
+	    $(rm) $(dir $@)*
+	    $(tt-i)$(tt-c) $(tt-c-flags) $< $(tt-libs) $(tt-ld-flags)
+	    $(rm) $(tt-exe-file)
+	    $(tt-i)$(tt-c) $(tt-c-flags) \
+		$< $(tt-libs) \
+		$(tt-ld-flags) $(tt-ld-flags-v) \
+		> $(dir $@)build.log 2>&1
+	    $(tt-i)$(tt-env) $(tt-exe-file)
+	    $(tt-i)grep -i -e "[^_]libirc" $(dir $@)build.log > $(dir $@)libirc.log; \
+		[ $$? -eq 1 ]
+      endef
+    endif
+
+    test-touch-rt/.test : tt-c-flags-rt =
+    test-touch-mt/.test : tt-c-flags-rt = $(tt-c-flags-mt)
+    test-touch-md/.test : tt-c-flags-rt = $(tt-c-flags-md)
+
+    test-touch-rt/.test : test-touch.c $(lib_file) test-touch-rt/.dir .rebuild
+	    $(target)
+	    $(test-touch-commands)
+	    $(touch) $@
+    test-touch-mt/.test : test-touch.c $(lib_file) $(imp_file) test-touch-mt/.dir .rebuild
+	    $(target)
+	    $(test-touch-commands)
+	    $(touch) $@
+    test-touch-md/.test : test-touch.c $(lib_file) $(imp_file) test-touch-md/.dir .rebuild
+	    $(target)
+	    $(test-touch-commands)
+	    $(touch) $@
+
+endif
+
+# --- test-relo ---
+
+# test-relo does actual work only on Linux* OS (including
+# Intel(R) Many Integrated Core Architecture) in case of dynamic linking.
+ifeq "$(if $(filter lin,$(os)),os)-$(LINK_TYPE)" "os-dyna"
+
+    # Make sure the dynamic library does not contain position-dependent code: readelf -d reports
+    # a TEXTREL dynamic entry if text relocations are present, so the grep below must find none.
+    force-test-relo : test-relo/.force test-relo/.test
+    test-relo       : test-relo/.test
+
+    test-relo/.test : $(lib_item)$(dll) test-relo/.dir .rebuild
+	    $(target)
+        ifeq "$(arch)" "mic"
+	    x86_64-k1om-linux-readelf -d $< > $(dir $@)readelf.log
+        else
+	    readelf -d $< > $(dir $@)readelf.log
+        endif
+	    grep -e TEXTREL $(dir $@)readelf.log; [ $$? -eq 1 ]
+	    $(touch) $@
+
+endif
+
+# --- test-execstack ---
+
+# test-execstack does actual work only on Linux* OS in case of dynamic linking.
+ifeq "$(if $(filter lin,$(os)),os)-$(LINK_TYPE)" "os-dyna"
+    tests += test-execstack
+
+    # Make sure stack is not executable.
+    force-test-execstack : test-execstack/.force test-execstack/.test
+    test-execstack       : test-execstack/.test
+
+    test-execstack/.test : $(lib_item)$(dll) test-execstack/.dir .rebuild
+	    $(target)
+		$(perl) $(tools_dir)check-execstack.pl $(oa-opts) $<
+	    $(touch) $@
+endif
+
+# --- test-instr ---
+
+# test-instr does actual work only on Intel(R) Many Integrated Core Architecture.
+ifeq "$(arch)" "mic"
+
+    # Make sure the library contains only instructions valid for the target MIC architecture.
+    force-test-instr : test-instr/.force test-instr/.test
+    test-instr       : test-instr/.test
+
+    test-instr/.test : $(lib_file) $(tools_dir)check-instruction-set.pl test-instr/.dir .rebuild
+	    $(target)
+		$(perl) $(tools_dir)check-instruction-set.pl $(oa-opts) --show --mic-arch=$(MIC_ARCH) $<
+	    $(touch) $@
+
+endif
+
+# --- test-deps ---
+
+# test-deps does actual work for dynamic linking (all OSes), and Windows* OS (all linking types).
+ifneq "$(filter %-dyna win-%,$(os)-$(LINK_TYPE))" ""
+
+    force-test-deps : test-deps/.force test-deps/.test
+    test-deps       : test-deps/.test
+
+    td_exp =
+    ifeq "$(os)" "lin"
+        ifeq "$(arch)" "32"
+            td_exp += libc.so.6
+            td_exp += ld-linux.so.2
+            td_exp += libgcc_s.so.1
+        endif
+        ifeq "$(arch)" "32e"
+            td_exp += libc.so.6
+            td_exp += ld-linux-x86-64.so.2
+            td_exp += libgcc_s.so.1
+        endif
+        ifeq "$(arch)" "64"
+            td_exp += libc.so.6.1
+            td_exp += libgcc_s.so.1
+        endif
+        ifeq "$(arch)" "arm"
+            td_exp += libc.so.6
+            td_exp += ld-linux-armhf.so.3
+            td_exp += libgcc_s.so.1
+        endif
+        ifneq "$(filter ppc64 ppc64le,$(arch))" ""
+            td_exp += libc.so.6
+            td_exp += ld64.so.1
+            # warning: this is for ppc64le, but as we do not currently
+            # distinguish it from ppc64, we need to add this dep here
+            td_exp += ld64.so.2
+            td_exp += libgcc_s.so.1
+        endif
+        ifeq "$(arch)" "aarch"
+            td_exp += libc.so.6
+            td_exp += ld-linux-aarch64.so.1
+        endif
+        ifeq "$(arch)-$(MIC_ARCH)" "mic-knf"
+            td_exp += ld-linux-l1om.so.2
+            td_exp += libc.so.6
+            td_exp += libgcc_s.so.1
+        endif
+        ifeq "$(arch)-$(MIC_ARCH)" "mic-knc"
+            td_exp += ld-linux-k1om.so.2
+            td_exp += libc.so.6
+        endif
+        ifeq "$(std_cpp_lib)" "1"
+            td_exp += libstdc++.so.6
+        endif
+
+        td_exp += libdl.so.2
+        ifeq "$(filter 32 32e 64 ppc64 ppc64le mic,$(arch))" ""
+            td_exp += libffi.so.6
+            td_exp += libffi.so.5
+        endif
+        ifneq "$(LIB_TYPE)" "stub"
+            td_exp += libpthread.so.0
+        endif
+    endif
+    ifeq "$(os)" "mac"
+#        td_exp += /usr/lib/libgcc_s.1.dylib
+        td_exp += /usr/lib/libSystem.B.dylib
+    endif
+    ifeq "$(os)" "win"
+        ifeq "$(LINK_TYPE)" "dyna"
+            td_exp += kernel32.dll
+        else
+            td_exp += uuid
+        endif
+    endif
+
+    ifeq "$(omp_os)" "freebsd"
+        td_exp = 
+        td_exp += libc.so.7
+        td_exp += libthr.so.3
+        td_exp += libunwind.so.5
+    endif
+
+    test-deps/.test : $(lib_file) $(tools_dir)check-depends.pl test-deps/.dir .rebuild
+	    $(target)
+	    $(td-i)$(perl) $(tools_dir)check-depends.pl $(oa-opts) \
+		$(if $(td_exp),--expected="$(subst $(space),$(comma),$(td_exp))") $<
+	    $(touch) $@
+
+endif
+
+
+# --------------------------------------------------------------------------------------------------
+# Fortran files.
+# --------------------------------------------------------------------------------------------------
+omp_lib_f = omp_lib.f90
+omp_lib_kinds.mod : $(omp_lib_f) .rebuild
+	$(target)
+	$(fort) $(fort-flags) $<
+omp_lib.mod : omp_lib_kinds.mod
+
+omp_lib.h  : ev-flags += -D KMP_INT_PTR_KIND="int_ptr_kind()"
+
+# --------------------------------------------------------------------------------------------------
+# Common files.
+# --------------------------------------------------------------------------------------------------
+
+common : $(out_cmn_files)
+
+clean-common :
+	$(rm) $(out_cmn_files)
+
+# --------------------------------------------------------------------------------------------------
+# Dependency files and common rules.
+# --------------------------------------------------------------------------------------------------
+
+.PHONY : dep
+dep    : $(dep_files)
+	$(target)
+
+include $(LIBOMP_WORK)src/rules.mk
+
+# Initiate a rebuild if any of the makefiles or the build script is changed.
+# When developing makefiles, it is useful to comment this out; otherwise make will perform a full
+# rebuild on every change to the makefiles.
+.rebuild : $(MAKEFILE_LIST) $(tools_dir)build.pl $(tools_dir)lib/Build.pm
+
+ifeq "$(clean)" ""
+    # Do not include dependency files if "clean" goal is specified.
+    -include $(dep_files)
+endif
+
+# end of file #
diff --git a/final/runtime/src/ompt-event-specific.h b/final/runtime/src/ompt-event-specific.h
new file mode 100644
index 0000000..f05d809
--- /dev/null
+++ b/final/runtime/src/ompt-event-specific.h
@@ -0,0 +1,144 @@
+#ifndef  __OMPT_EVENT_SPECIFIC_H__
+#define  __OMPT_EVENT_SPECIFIC_H__
+
+/******************************************************************************
+ * File: ompt-event-specific.h
+ *
+ * Description:
+ *
+ *   Specify which of the OMPT events are implemented by this runtime system
+ *   and the level of their implementation.
+ *****************************************************************************/
+
+#define _ompt_tokenpaste_helper(x,y)        x ## y
+#define _ompt_tokenpaste(x,y)               _ompt_tokenpaste_helper(x,y)
+#define ompt_event_implementation_status(e) _ompt_tokenpaste(e,_implemented)
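+
+/* Worked example of the token pasting: for the mandatory parallel-begin event,
+ *
+ *   ompt_event_implementation_status(ompt_event_parallel_begin)
+ *     => _ompt_tokenpaste(ompt_event_parallel_begin, _implemented)
+ *     => ompt_event_parallel_begin_implemented
+ *     => ompt_event_MAY_ALWAYS                (per the definition below)
+ *
+ * so the per-event *_implemented macros below double as the return values of
+ * ompt_set_callback.
+ */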
+
+
+/*----------------------------------------------------------------------------
+ | Specify whether an event may occur or not, and whether event callbacks
+ | never, sometimes, or always occur.
+ |
+ | The values for these constants are defined in section 6.1.2 of
+ | the OMPT TR. They are exposed to tools through ompt_set_callback.
+ +--------------------------------------------------------------------------*/
+
+#define ompt_event_NEVER             ompt_set_result_event_never_occurs
+#define ompt_event_UNIMPLEMENTED     ompt_set_result_event_may_occur_no_callback
+#define ompt_event_MAY_CONVENIENT    ompt_set_result_event_may_occur_callback_some
+#define ompt_event_MAY_ALWAYS        ompt_set_result_event_may_occur_callback_always
+
+#if OMPT_TRACE
+#define ompt_event_MAY_ALWAYS_TRACE   ompt_event_MAY_ALWAYS
+#else
+#define ompt_event_MAY_ALWAYS_TRACE   ompt_event_UNIMPLEMENTED
+#endif
+
+#if OMPT_BLAME
+#define ompt_event_MAY_ALWAYS_BLAME   ompt_event_MAY_ALWAYS
+#else
+#define ompt_event_MAY_ALWAYS_BLAME   ompt_event_UNIMPLEMENTED
+#endif
+
+/*----------------------------------------------------------------------------
+ | Mandatory Events
+ +--------------------------------------------------------------------------*/
+
+#define ompt_event_parallel_begin_implemented           ompt_event_MAY_ALWAYS
+#define ompt_event_parallel_end_implemented             ompt_event_MAY_ALWAYS
+
+#define ompt_event_task_begin_implemented               ompt_event_MAY_ALWAYS
+#define ompt_event_task_end_implemented                 ompt_event_MAY_ALWAYS
+
+#define ompt_event_thread_begin_implemented             ompt_event_MAY_ALWAYS
+#define ompt_event_thread_end_implemented               ompt_event_MAY_ALWAYS
+
+#define ompt_event_control_implemented                  ompt_event_MAY_ALWAYS
+
+#define ompt_event_runtime_shutdown_implemented         ompt_event_MAY_ALWAYS
+
+
+/*----------------------------------------------------------------------------
+ | Optional Events (blame shifting)
+ +--------------------------------------------------------------------------*/
+
+#define ompt_event_idle_begin_implemented               ompt_event_MAY_ALWAYS_BLAME
+#define ompt_event_idle_end_implemented                 ompt_event_MAY_ALWAYS_BLAME
+
+#define ompt_event_wait_barrier_begin_implemented       ompt_event_MAY_ALWAYS_BLAME
+#define ompt_event_wait_barrier_end_implemented         ompt_event_MAY_ALWAYS_BLAME
+
+#define ompt_event_wait_taskwait_begin_implemented      ompt_event_UNIMPLEMENTED
+#define ompt_event_wait_taskwait_end_implemented        ompt_event_UNIMPLEMENTED
+
+#define ompt_event_wait_taskgroup_begin_implemented     ompt_event_UNIMPLEMENTED
+#define ompt_event_wait_taskgroup_end_implemented       ompt_event_UNIMPLEMENTED
+
+#define ompt_event_release_lock_implemented             ompt_event_MAY_ALWAYS_BLAME
+#define ompt_event_release_nest_lock_last_implemented   ompt_event_MAY_ALWAYS_BLAME
+#define ompt_event_release_critical_implemented         ompt_event_MAY_ALWAYS_BLAME
+#define ompt_event_release_atomic_implemented           ompt_event_MAY_ALWAYS_BLAME
+#define ompt_event_release_ordered_implemented          ompt_event_MAY_ALWAYS_BLAME
+
+
+/*----------------------------------------------------------------------------
+ | Optional Events (synchronous events)
+ +--------------------------------------------------------------------------*/
+
+#define ompt_event_implicit_task_begin_implemented      ompt_event_MAY_ALWAYS_TRACE
+#define ompt_event_implicit_task_end_implemented        ompt_event_MAY_ALWAYS_TRACE
+
+#define ompt_event_initial_task_begin_implemented       ompt_event_UNIMPLEMENTED
+#define ompt_event_initial_task_end_implemented         ompt_event_UNIMPLEMENTED
+
+#define ompt_event_task_switch_implemented              ompt_event_UNIMPLEMENTED
+
+#define ompt_event_loop_begin_implemented               ompt_event_MAY_ALWAYS_TRACE
+#define ompt_event_loop_end_implemented                 ompt_event_MAY_ALWAYS_TRACE
+
+#define ompt_event_sections_begin_implemented           ompt_event_UNIMPLEMENTED
+#define ompt_event_sections_end_implemented             ompt_event_UNIMPLEMENTED
+
+#define ompt_event_single_in_block_begin_implemented    ompt_event_MAY_ALWAYS_TRACE
+#define ompt_event_single_in_block_end_implemented      ompt_event_MAY_ALWAYS_TRACE
+#define ompt_event_single_others_begin_implemented      ompt_event_MAY_ALWAYS_TRACE
+#define ompt_event_single_others_end_implemented        ompt_event_MAY_ALWAYS_TRACE
+
+#define ompt_event_workshare_begin_implemented          ompt_event_UNIMPLEMENTED
+#define ompt_event_workshare_end_implemented            ompt_event_UNIMPLEMENTED
+
+#define ompt_event_master_begin_implemented             ompt_event_MAY_ALWAYS_TRACE
+#define ompt_event_master_end_implemented               ompt_event_MAY_ALWAYS_TRACE
+
+#define ompt_event_barrier_begin_implemented            ompt_event_MAY_ALWAYS_TRACE
+#define ompt_event_barrier_end_implemented              ompt_event_MAY_ALWAYS_TRACE
+
+#define ompt_event_taskwait_begin_implemented           ompt_event_UNIMPLEMENTED
+#define ompt_event_taskwait_end_implemented             ompt_event_UNIMPLEMENTED
+
+#define ompt_event_taskgroup_begin_implemented          ompt_event_UNIMPLEMENTED
+#define ompt_event_taskgroup_end_implemented            ompt_event_UNIMPLEMENTED
+
+#define ompt_event_release_nest_lock_prev_implemented   ompt_event_MAY_ALWAYS_TRACE
+#define ompt_event_wait_lock_implemented                ompt_event_UNIMPLEMENTED
+#define ompt_event_wait_nest_lock_implemented           ompt_event_UNIMPLEMENTED
+#define ompt_event_wait_critical_implemented            ompt_event_UNIMPLEMENTED
+#define ompt_event_wait_atomic_implemented              ompt_event_MAY_ALWAYS_TRACE
+#define ompt_event_wait_ordered_implemented             ompt_event_MAY_ALWAYS_TRACE
+
+#define ompt_event_acquired_lock_implemented            ompt_event_UNIMPLEMENTED
+#define ompt_event_acquired_nest_lock_first_implemented ompt_event_UNIMPLEMENTED
+#define ompt_event_acquired_nest_lock_next_implemented  ompt_event_UNIMPLEMENTED
+#define ompt_event_acquired_critical_implemented        ompt_event_UNIMPLEMENTED
+#define ompt_event_acquired_atomic_implemented          ompt_event_MAY_ALWAYS_TRACE
+#define ompt_event_acquired_ordered_implemented         ompt_event_MAY_ALWAYS_TRACE
+
+#define ompt_event_init_lock_implemented                ompt_event_UNIMPLEMENTED
+#define ompt_event_init_nest_lock_implemented           ompt_event_UNIMPLEMENTED
+
+#define ompt_event_destroy_lock_implemented             ompt_event_UNIMPLEMENTED
+#define ompt_event_destroy_nest_lock_implemented        ompt_event_UNIMPLEMENTED
+
+#define ompt_event_flush_implemented                    ompt_event_UNIMPLEMENTED
+
+#endif
diff --git a/final/runtime/src/ompt-general.c b/final/runtime/src/ompt-general.c
new file mode 100644
index 0000000..2e5d4ff
--- /dev/null
+++ b/final/runtime/src/ompt-general.c
@@ -0,0 +1,404 @@
+/*****************************************************************************
+ * system include files
+ ****************************************************************************/
+
+#include <assert.h>
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+
+/*****************************************************************************
+ * ompt include files
+ ****************************************************************************/
+
+#include "ompt-internal.h"
+#include "ompt-specific.c"
+
+
+
+/*****************************************************************************
+ * macros
+ ****************************************************************************/
+
+#define ompt_get_callback_success 1
+#define ompt_get_callback_failure 0
+
+#define no_tool_present 0
+
+#define OMPT_API_ROUTINE static
+
+
+
+/*****************************************************************************
+ * types
+ ****************************************************************************/
+
+typedef struct {
+    const char *state_name;
+    ompt_state_t  state_id;
+} ompt_state_info_t;
+
+
+
+/*****************************************************************************
+ * global variables
+ ****************************************************************************/
+
+ompt_status_t ompt_status = ompt_status_ready;
+
+
+ompt_state_info_t ompt_state_info[] = {
+#define ompt_state_macro(state, code) { # state, state },
+    FOREACH_OMPT_STATE(ompt_state_macro)
+#undef ompt_state_macro
+};
+
+
+ompt_callbacks_t ompt_callbacks;
+
+
+
+/*****************************************************************************
+ * forward declarations
+ ****************************************************************************/
+
+static ompt_interface_fn_t ompt_fn_lookup(const char *s);
+
+
+/*****************************************************************************
+ * state
+ ****************************************************************************/
+
+OMPT_API_ROUTINE int ompt_enumerate_state(int current_state, int *next_state,
+                                          const char **next_state_name)
+{
+    static const int len = sizeof(ompt_state_info) / sizeof(ompt_state_info_t);
+    int i = 0;
+
+    for (i = 0; i < len - 1; i++) {
+        if (ompt_state_info[i].state_id == current_state) {
+            *next_state = ompt_state_info[i+1].state_id;
+            *next_state_name = ompt_state_info[i+1].state_name;
+            return 1;
+        }
+    }
+
+    return 0;
+}
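+
+/* A tool can enumerate all runtime states by chaining the calls, e.g.
+ * (a minimal sketch, assuming the ompt_state_first sentinel from ompt.h):
+ *
+ *   int state = ompt_state_first;
+ *   const char *name;
+ *   while (ompt_enumerate_state(state, &state, &name)) {
+ *       printf("state 0x%x: %s\n", state, name);
+ *   }
+ */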
+
+
+
+/*****************************************************************************
+ * callbacks
+ ****************************************************************************/
+
+OMPT_API_ROUTINE int ompt_set_callback(ompt_event_t evid, ompt_callback_t cb)
+{
+    switch (evid) {
+
+#define ompt_event_macro(event_name, callback_type, event_id)                  \
+    case event_name:                                                           \
+        if (ompt_event_implementation_status(event_name)) {                    \
+            ompt_callbacks.ompt_callback(event_name) = (callback_type) cb;     \
+        }                                                                      \
+        return ompt_event_implementation_status(event_name);
+
+    FOREACH_OMPT_EVENT(ompt_event_macro)
+
+#undef ompt_event_macro
+
+    default: return ompt_set_result_registration_error;
+    }
+}
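+
+/* For a single event, the ompt_event_macro above expands roughly to:
+ *
+ *   case ompt_event_parallel_begin:
+ *       if (ompt_event_implementation_status(ompt_event_parallel_begin)) {
+ *           ompt_callbacks.ompt_event_parallel_begin_callback =
+ *               (ompt_new_parallel_callback_t) cb;
+ *       }
+ *       return ompt_event_implementation_status(ompt_event_parallel_begin);
+ *
+ * The callback type comes from the FOREACH_OMPT_EVENT table in ompt.h;
+ * ompt_new_parallel_callback_t above is illustrative.
+ */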
+
+
+OMPT_API_ROUTINE int ompt_get_callback(ompt_event_t evid, ompt_callback_t *cb)
+{
+    switch (evid) {
+
+#define ompt_event_macro(event_name, callback_type, event_id)                  \
+    case event_name:                                                           \
+        if (ompt_event_implementation_status(event_name)) {                    \
+            ompt_callback_t mycb =                                             \
+                (ompt_callback_t) ompt_callbacks.ompt_callback(event_name);    \
+            if (mycb) {                                                        \
+                *cb = mycb;                                                    \
+                return ompt_get_callback_success;                              \
+            }                                                                  \
+        }                                                                      \
+        return ompt_get_callback_failure;
+
+    FOREACH_OMPT_EVENT(ompt_event_macro)
+
+#undef ompt_event_macro
+
+    default: return ompt_get_callback_failure;
+    }
+}
+
+
+
+/*****************************************************************************
+ * initialization/finalization
+ ****************************************************************************/
+
+_OMP_EXTERN __attribute__ (( weak ))
+int ompt_initialize(ompt_function_lookup_t ompt_fn_lookup, const char *version,
+                    unsigned int ompt_version)
+{
+    return no_tool_present;
+}
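+
+/* A tool enables OMPT by providing a strong definition of ompt_initialize that
+ * the linker resolves in place of the weak stub above. A minimal sketch, where
+ * ompt_set_callback_t is assumed to be the inquiry-function typedef from
+ * ompt.h and my_parallel_begin is a hypothetical tool callback:
+ *
+ *   int ompt_initialize(ompt_function_lookup_t lookup, const char *version,
+ *                       unsigned int ompt_version)
+ *   {
+ *       ompt_set_callback_t set_cb =
+ *           (ompt_set_callback_t) lookup("ompt_set_callback");
+ *       set_cb(ompt_event_parallel_begin, (ompt_callback_t) my_parallel_begin);
+ *       return 1;   // nonzero return: tool present, runtime tracks callbacks
+ *   }
+ */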
+
+enum tool_setting_e {
+    omp_tool_error,
+    omp_tool_unset,
+    omp_tool_disabled,
+    omp_tool_enabled
+};
+
+void ompt_init()
+{
+    static int ompt_initialized = 0;
+
+    if (ompt_initialized) return;
+
+    const char *ompt_env_var = getenv("OMP_TOOL");
+    enum tool_setting_e tool_setting = omp_tool_error;
+
+    if (!ompt_env_var  || !strcmp(ompt_env_var, ""))
+        tool_setting = omp_tool_unset;
+    else if (!strcmp(ompt_env_var, "disabled"))
+        tool_setting = omp_tool_disabled;
+    else if (!strcmp(ompt_env_var, "enabled"))
+        tool_setting = omp_tool_enabled;
+
+    switch(tool_setting) {
+    case omp_tool_disabled:
+        ompt_status = ompt_status_disabled;
+        break;
+
+    case omp_tool_unset:
+    case omp_tool_enabled:
+    {
+        const char *runtime_version = __ompt_get_runtime_version_internal();
+        int ompt_init_val =
+            ompt_initialize(ompt_fn_lookup, runtime_version, OMPT_VERSION);
+
+        if (ompt_init_val) {
+            ompt_status = ompt_status_track_callback;
+            __ompt_init_internal();
+        }
+        break;
+    }
+
+    case omp_tool_error:
+        fprintf(stderr,
+            "Warning: OMP_TOOL has invalid value \"%s\".\n"
+            "  legal values are (NULL,\"\",\"disabled\","
+            "\"enabled\").\n", ompt_env_var);
+        break;
+    }
+
+    ompt_initialized = 1;
+}
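+
+/* In practice the switch above lets a user control an attached tool without
+ * relinking, e.g. from the shell:
+ *
+ *   OMP_TOOL=disabled ./app    # runtime never calls into the tool
+ *   OMP_TOOL=enabled  ./app    # same path as leaving OMP_TOOL unset
+ */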
+
+
+void ompt_fini()
+{
+    if (ompt_status == ompt_status_track_callback) {
+        if (ompt_callbacks.ompt_callback(ompt_event_runtime_shutdown)) {
+            ompt_callbacks.ompt_callback(ompt_event_runtime_shutdown)();
+        }
+    }
+
+    ompt_status = ompt_status_disabled;
+}
+
+
+
+/*****************************************************************************
+ * parallel regions
+ ****************************************************************************/
+
+OMPT_API_ROUTINE ompt_parallel_id_t ompt_get_parallel_id(int ancestor_level)
+{
+    return __ompt_get_parallel_id_internal(ancestor_level);
+}
+
+
+OMPT_API_ROUTINE int ompt_get_parallel_team_size(int ancestor_level)
+{
+    return __ompt_get_parallel_team_size_internal(ancestor_level);
+}
+
+
+OMPT_API_ROUTINE void *ompt_get_parallel_function(int ancestor_level)
+{
+    return __ompt_get_parallel_function_internal(ancestor_level);
+}
+
+
+OMPT_API_ROUTINE ompt_state_t ompt_get_state(ompt_wait_id_t *ompt_wait_id)
+{
+    ompt_state_t thread_state = __ompt_get_state_internal(ompt_wait_id);
+
+    if (thread_state == ompt_state_undefined) {
+        thread_state = ompt_state_work_serial;
+    }
+
+    return thread_state;
+}
+
+
+
+/*****************************************************************************
+ * threads
+ ****************************************************************************/
+
+
+OMPT_API_ROUTINE void *ompt_get_idle_frame()
+{
+    return __ompt_get_idle_frame_internal();
+}
+
+
+
+/*****************************************************************************
+ * tasks
+ ****************************************************************************/
+
+
+OMPT_API_ROUTINE ompt_thread_id_t ompt_get_thread_id(void)
+{
+    return __ompt_get_thread_id_internal();
+}
+
+OMPT_API_ROUTINE ompt_task_id_t ompt_get_task_id(int depth)
+{
+    return __ompt_get_task_id_internal(depth);
+}
+
+
+OMPT_API_ROUTINE ompt_frame_t *ompt_get_task_frame(int depth)
+{
+    return __ompt_get_task_frame_internal(depth);
+}
+
+
+OMPT_API_ROUTINE void *ompt_get_task_function(int depth)
+{
+    return __ompt_get_task_function_internal(depth);
+}
+
+
+/*****************************************************************************
+ * placeholders
+ ****************************************************************************/
+
+// Don't define this as static. The loader may choose to eliminate the symbol
+// even though it is needed by tools.  
+#define OMPT_API_PLACEHOLDER 
+
+// Ensure that placeholders don't have mangled names in the symbol table.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+OMPT_API_PLACEHOLDER void ompt_idle(void)  
+{
+    // This function is a placeholder used to represent the calling context of
+    // idle OpenMP worker threads. It is not meant to be invoked.
+    assert(0);
+}
+
+
+OMPT_API_PLACEHOLDER void ompt_overhead(void)
+{
+    // This function is a placeholder used to represent the OpenMP context of
+    // threads working in the OpenMP runtime.  It is not meant to be invoked.
+    assert(0);
+}
+
+
+OMPT_API_PLACEHOLDER void ompt_barrier_wait(void)
+{
+    // This function is a placeholder used to represent the OpenMP context of
+    // threads waiting for a barrier in the OpenMP runtime. It is not meant
+    // to be invoked.
+    assert(0);
+}
+
+
+OMPT_API_PLACEHOLDER void ompt_task_wait(void)
+{
+    // This function is a placeholder used to represent the OpenMP context of
+    // threads waiting for a task in the OpenMP runtime. It is not meant
+    // to be invoked.
+    assert(0);
+}
+
+
+OMPT_API_PLACEHOLDER void ompt_mutex_wait(void)
+{
+    // This function is a placeholder used to represent the OpenMP context of
+    // threads waiting for a mutex in the OpenMP runtime. It is not meant
+    // to be invoked.
+    assert(0);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+
+/*****************************************************************************
+ * compatibility
+ ****************************************************************************/
+
+OMPT_API_ROUTINE int ompt_get_ompt_version()
+{
+    return OMPT_VERSION;
+}
+
+
+
+/*****************************************************************************
+ * application-facing API
+ ****************************************************************************/
+
+
+/*----------------------------------------------------------------------------
+ | control
+ ---------------------------------------------------------------------------*/
+
+_OMP_EXTERN void ompt_control(uint64_t command, uint64_t modifier)
+{
+    if (ompt_status == ompt_status_track_callback &&
+        ompt_callbacks.ompt_callback(ompt_event_control)) {
+        ompt_callbacks.ompt_callback(ompt_event_control)(command, modifier);
+    }
+}
+
+
+
+/*****************************************************************************
+ * API inquiry for tool
+ ****************************************************************************/
+
+static ompt_interface_fn_t ompt_fn_lookup(const char *s)
+{
+
+#define ompt_interface_fn(fn) \
+    if (strcmp(s, #fn) == 0) return (ompt_interface_fn_t) fn;
+
+    FOREACH_OMPT_INQUIRY_FN(ompt_interface_fn)
+
+    FOREACH_OMPT_PLACEHOLDER_FN(ompt_interface_fn)
+
+    return (ompt_interface_fn_t) 0;
+}
diff --git a/final/runtime/src/ompt-internal.h b/final/runtime/src/ompt-internal.h
new file mode 100644
index 0000000..eae0577
--- /dev/null
+++ b/final/runtime/src/ompt-internal.h
@@ -0,0 +1,85 @@
+#ifndef __OMPT_INTERNAL_H__
+#define __OMPT_INTERNAL_H__
+
+#include "ompt.h"
+#include "ompt-event-specific.h"
+
+#define OMPT_VERSION 1
+
+#define _OMP_EXTERN extern "C"
+
+
+
+#define ompt_callback(e) e ## _callback
+
+/* track and track_callback share a bit so that one can test whether either is
+ * set by ANDing with that bit.
+ */
+typedef enum {
+    ompt_status_disabled       = 0x0,
+    ompt_status_ready          = 0x1,
+    ompt_status_track          = 0x2,
+    ompt_status_track_callback = 0x6,
+} ompt_status_t;
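+
+/* Example of the shared-bit test (used in ompt-specific.c):
+ *
+ *   if (ompt_status & ompt_status_track) {
+ *       // reached for both ompt_status_track (0x2) and
+ *       // ompt_status_track_callback (0x6)
+ *   }
+ */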
+
+
+typedef struct ompt_callbacks_s {
+#define ompt_event_macro(event, callback, eventid) callback ompt_callback(event);
+
+    FOREACH_OMPT_EVENT(ompt_event_macro)
+
+#undef ompt_event_macro
+} ompt_callbacks_t;
+
+
+
+typedef struct {
+    ompt_frame_t        frame;
+    void*               function;
+    ompt_task_id_t      task_id;
+} ompt_task_info_t;
+
+
+typedef struct {
+    ompt_parallel_id_t  parallel_id;
+    void                *microtask;
+} ompt_team_info_t;
+
+
+typedef struct ompt_lw_taskteam_s {
+    ompt_team_info_t    ompt_team_info;
+    ompt_task_info_t    ompt_task_info;
+    struct ompt_lw_taskteam_s *parent;
+} ompt_lw_taskteam_t;
+
+
+typedef struct ompt_parallel_info_s {
+    ompt_task_id_t parent_task_id;    /* id of parent task            */
+    ompt_parallel_id_t parallel_id;   /* id of parallel region        */
+    ompt_frame_t *parent_task_frame;  /* frame data of parent task    */
+    void *parallel_function;          /* pointer to outlined function */
+} ompt_parallel_info_t;
+
+
+typedef struct {
+    ompt_state_t        state;
+    ompt_wait_id_t      wait_id;
+    void                *idle_frame;
+} ompt_thread_info_t;
+
+
+extern ompt_status_t ompt_status;
+extern ompt_callbacks_t ompt_callbacks;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ompt_init(void);
+void ompt_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/final/runtime/src/ompt-specific.c b/final/runtime/src/ompt-specific.c
new file mode 100644
index 0000000..8339187
--- /dev/null
+++ b/final/runtime/src/ompt-specific.c
@@ -0,0 +1,374 @@
+//******************************************************************************
+// include files
+//******************************************************************************
+
+#include "kmp.h"
+#include "ompt-internal.h"
+#include "ompt-specific.h"
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+#define GTID_TO_OMPT_THREAD_ID(id) ((ompt_thread_id_t) ((id) >= 0 ? (id) + 1 : 0))
+
+#define LWT_FROM_TEAM(team) ((team)->t.ompt_serialized_team_info)
+
+#define OMPT_THREAD_ID_BITS 16
+
+// 2013 08 24 - John Mellor-Crummey
+//   ideally, a thread should assign its own ids based on thread private data.
+//   however, the way the intel runtime reinitializes thread data structures
+//   when it creates teams makes it difficult to maintain persistent thread
+//   data. using a shared variable instead is simple. I leave it to intel to
+//   sort out how to implement a higher performance version in their runtime.
+
+// when using fetch_and_add to generate the IDs, there isn't any reason to waste
+// bits for thread id.
+#if 0
+#define NEXT_ID(id_ptr,tid) \
+  ((KMP_TEST_THEN_INC64(id_ptr) << OMPT_THREAD_ID_BITS) | (tid))
+#else
+#define NEXT_ID(id_ptr,tid) (KMP_TEST_THEN_INC64(id_ptr))
+#endif
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+//----------------------------------------------------------
+// traverse the team and task hierarchy
+// note: __ompt_get_teaminfo and __ompt_get_taskinfo
+//       traverse the hierarchy similarly and need to be
+//       kept consistent
+//----------------------------------------------------------
+
+ompt_team_info_t *
+__ompt_get_teaminfo(int depth, int *size)
+{
+    kmp_info_t *thr = ompt_get_thread();
+
+    if (thr) {
+        kmp_team *team = thr->th.th_team;
+        ompt_lw_taskteam_t *lwt = LWT_FROM_TEAM(team);
+
+        while(depth > 0) {
+            // next lightweight team (if any)
+            if (lwt) lwt = lwt->parent;
+
+            // next heavyweight team (if any) after
+            // lightweight teams are exhausted
+            if (!lwt && team) team=team->t.t_parent;
+
+            depth--;
+        }
+
+        if (lwt) {
+            // lightweight teams have one task
+            if (size) *size = 1;
+
+            // return team info for lightweight team
+            return &lwt->ompt_team_info;
+        } else if (team) {
+            // extract size from heavyweight team
+            if (size) *size = team->t.t_nproc;
+
+            // return team info for heavyweight team
+            return &team->t.ompt_team_info;
+        }
+    }
+
+    return NULL;
+}
+
+
+ompt_task_info_t *
+__ompt_get_taskinfo(int depth)
+{
+    ompt_task_info_t *info = NULL;
+    kmp_info_t *thr = ompt_get_thread();
+
+    if (thr) {
+        kmp_taskdata_t  *taskdata = thr->th.th_current_task;
+        ompt_lw_taskteam_t *lwt = LWT_FROM_TEAM(taskdata->td_team);
+
+        while (depth > 0) {
+            // next lightweight team (if any)
+            if (lwt) lwt = lwt->parent;
+
+            // next heavyweight team (if any) after
+            // lightweight teams are exhausted
+            if (!lwt && taskdata) {
+                taskdata = taskdata->td_parent;
+                if (taskdata) {
+                    lwt = LWT_FROM_TEAM(taskdata->td_team);
+                }
+            }
+            depth--;
+        }
+
+        if (lwt) {
+            info = &lwt->ompt_task_info;
+        } else if (taskdata) {
+            info = &taskdata->ompt_task_info;
+        }
+    }
+
+    return info;
+}
+
+
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+//----------------------------------------------------------
+// initialization support
+//----------------------------------------------------------
+
+void
+__ompt_init_internal()
+{
+    if (ompt_status & ompt_status_track) {
+        // initialize initial thread for OMPT
+        kmp_info_t *root_thread = ompt_get_thread();
+        __kmp_task_init_ompt(
+            root_thread->th.th_team->t.t_implicit_task_taskdata, 0);
+        __kmp_task_init_ompt(
+            root_thread->th.th_serial_team->t.t_implicit_task_taskdata, 0);
+
+        // make mandatory callback for creation of initial thread
+        // this needs to occur here rather than in __kmp_register_root because
+        // __kmp_register_root is called before ompt_initialize
+        int gtid = __kmp_get_gtid();
+        if (KMP_UBER_GTID(gtid)) {
+            // initialize the initial thread's idle frame and state
+            root_thread->th.ompt_thread_info.idle_frame = 0;
+            root_thread->th.ompt_thread_info.state = ompt_state_overhead;
+            if ((ompt_status == ompt_status_track_callback) &&
+                ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
+                __ompt_thread_begin(ompt_thread_initial, gtid);
+            }
+            root_thread->th.ompt_thread_info.state = ompt_state_work_serial;
+        }
+    }
+}
+
+
+//----------------------------------------------------------
+// thread support
+//----------------------------------------------------------
+
+ompt_parallel_id_t
+__ompt_thread_id_new()
+{
+    static uint64_t ompt_thread_id = 1;
+    return NEXT_ID(&ompt_thread_id, 0);
+}
+
+void
+__ompt_thread_begin(ompt_thread_type_t thread_type, int gtid)
+{
+    ompt_callbacks.ompt_callback(ompt_event_thread_begin)(
+        thread_type, GTID_TO_OMPT_THREAD_ID(gtid));
+}
+
+
+void
+__ompt_thread_end(ompt_thread_type_t thread_type, int gtid)
+{
+    ompt_callbacks.ompt_callback(ompt_event_thread_end)(
+        thread_type, GTID_TO_OMPT_THREAD_ID(gtid));
+}
+
+
+ompt_thread_id_t
+__ompt_get_thread_id_internal()
+{
+    // FIXME
+    // until we have a better way of assigning ids, use __kmp_get_gtid.
+    // since the return value might be negative, we need to test it before
+    // assigning it to an ompt_thread_id_t, which is unsigned.
+    int id = __kmp_get_gtid();
+    assert(id >= 0);
+
+    return GTID_TO_OMPT_THREAD_ID(id);
+}
+
+//----------------------------------------------------------
+// state support
+//----------------------------------------------------------
+
+void
+__ompt_thread_assign_wait_id(void *variable)
+{
+    int gtid = __kmp_gtid_get_specific();
+    kmp_info_t *ti = ompt_get_thread_gtid(gtid);
+
+    ti->th.ompt_thread_info.wait_id = (ompt_wait_id_t) variable;
+}
+
+ompt_state_t
+__ompt_get_state_internal(ompt_wait_id_t *ompt_wait_id)
+{
+    kmp_info_t *ti = ompt_get_thread();
+
+    if (ti) {
+        if (ompt_wait_id)
+            *ompt_wait_id = ti->th.ompt_thread_info.wait_id;
+        return ti->th.ompt_thread_info.state;
+    }
+    return ompt_state_undefined;
+}
+
+//----------------------------------------------------------
+// idle frame support
+//----------------------------------------------------------
+
+void *
+__ompt_get_idle_frame_internal(void)
+{
+    kmp_info_t *ti = ompt_get_thread();
+    return ti ? ti->th.ompt_thread_info.idle_frame : NULL;
+}
+
+
+//----------------------------------------------------------
+// parallel region support
+//----------------------------------------------------------
+
+ompt_parallel_id_t
+__ompt_parallel_id_new(int gtid)
+{
+    static uint64_t ompt_parallel_id = 1;
+    return gtid >= 0 ? NEXT_ID(&ompt_parallel_id, gtid) : 0;
+}
+
+
+void *
+__ompt_get_parallel_function_internal(int depth)
+{
+    ompt_team_info_t *info = __ompt_get_teaminfo(depth, NULL);
+    void *function = info ? info->microtask : NULL;
+    return function;
+}
+
+
+ompt_parallel_id_t
+__ompt_get_parallel_id_internal(int depth)
+{
+    ompt_team_info_t *info = __ompt_get_teaminfo(depth, NULL);
+    ompt_parallel_id_t id = info ? info->parallel_id : 0;
+    return id;
+}
+
+
+int
+__ompt_get_parallel_team_size_internal(int depth)
+{
+    // initialize the return value with the error value.
+    // if there is a team at the specified depth, the default
+    // value will be overwritten with the size of that team.
+    int size = -1;
+    (void) __ompt_get_teaminfo(depth, &size);
+    return size;
+}
+
+
+//----------------------------------------------------------
+// lightweight task team support
+//----------------------------------------------------------
+
+void
+__ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
+                        int gtid, void *microtask,
+                        ompt_parallel_id_t ompt_pid)
+{
+    lwt->ompt_team_info.parallel_id = ompt_pid;
+    lwt->ompt_team_info.microtask = microtask;
+    lwt->ompt_task_info.task_id = 0;
+    lwt->ompt_task_info.frame.reenter_runtime_frame = 0;
+    lwt->ompt_task_info.frame.exit_runtime_frame = 0;
+    lwt->ompt_task_info.function = NULL;
+    lwt->parent = 0;
+}
+
+
+void
+__ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt,  kmp_info_t *thr)
+{
+    ompt_lw_taskteam_t *my_parent = thr->th.th_team->t.ompt_serialized_team_info;
+    lwt->parent = my_parent;
+    thr->th.th_team->t.ompt_serialized_team_info = lwt;
+}
+
+
+ompt_lw_taskteam_t *
+__ompt_lw_taskteam_unlink(kmp_info_t *thr)
+{
+    ompt_lw_taskteam_t *lwtask = thr->th.th_team->t.ompt_serialized_team_info;
+    if (lwtask) thr->th.th_team->t.ompt_serialized_team_info = lwtask->parent;
+    return lwtask;
+}
+
+
+//----------------------------------------------------------
+// task support
+//----------------------------------------------------------
+
+ompt_task_id_t
+__ompt_task_id_new(int gtid)
+{
+    static uint64_t ompt_task_id = 1;
+    return NEXT_ID(&ompt_task_id, gtid);
+}
+
+
+ompt_task_id_t
+__ompt_get_task_id_internal(int depth)
+{
+    ompt_task_info_t *info = __ompt_get_taskinfo(depth);
+    ompt_task_id_t task_id = info ?  info->task_id : 0;
+    return task_id;
+}
+
+
+void *
+__ompt_get_task_function_internal(int depth)
+{
+    ompt_task_info_t *info = __ompt_get_taskinfo(depth);
+    void *function = info ? info->function : NULL;
+    return function;
+}
+
+
+ompt_frame_t *
+__ompt_get_task_frame_internal(int depth)
+{
+    ompt_task_info_t *info = __ompt_get_taskinfo(depth);
+    ompt_frame_t *frame = info ? &info->frame : NULL;
+    return frame;
+}
+
+
+//----------------------------------------------------------
+// team support
+//----------------------------------------------------------
+
+void
+__ompt_team_assign_id(kmp_team_t *team, ompt_parallel_id_t ompt_pid)
+{
+    team->t.ompt_team_info.parallel_id = ompt_pid;
+}
+
+
+//----------------------------------------------------------
+// runtime version support
+//----------------------------------------------------------
+
+const char *
+__ompt_get_runtime_version_internal()
+{
+    return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN];
+}
diff --git a/final/runtime/src/ompt-specific.h b/final/runtime/src/ompt-specific.h
new file mode 100644
index 0000000..ed679da
--- /dev/null
+++ b/final/runtime/src/ompt-specific.h
@@ -0,0 +1,49 @@
+#ifndef OMPT_SPECIFIC_H
+#define OMPT_SPECIFIC_H
+
+#include "kmp.h"
+
+void __ompt_team_assign_id(kmp_team_t *team, ompt_parallel_id_t ompt_pid);
+void __ompt_thread_assign_wait_id(void *variable);
+
+void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
+                             int gtid, void *microtask,
+                             ompt_parallel_id_t ompt_pid);
+
+void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt,  kmp_info_t *thr);
+
+ompt_lw_taskteam_t * __ompt_lw_taskteam_unlink(kmp_info_t *thr);
+
+ompt_parallel_id_t __ompt_parallel_id_new(int gtid);
+ompt_task_id_t __ompt_task_id_new(int gtid);
+
+ompt_team_info_t *__ompt_get_teaminfo(int depth, int *size);
+
+ompt_task_info_t *__ompt_get_taskinfo(int depth);
+
+inline kmp_info_t *
+ompt_get_thread_gtid(int gtid)
+{
+    return (gtid >= 0) ? __kmp_thread_from_gtid(gtid) : NULL;
+}
+
+inline kmp_info_t *
+ompt_get_thread()
+{
+    int gtid = __kmp_gtid_get_specific();
+    return ompt_get_thread_gtid(gtid);
+}
+
+
+void __ompt_thread_begin(ompt_thread_type_t thread_type, int gtid);
+
+void __ompt_thread_end(ompt_thread_type_t thread_type, int gtid);
+
+
+int __ompt_get_parallel_team_size_internal(int ancestor_level);
+
+ompt_task_id_t __ompt_get_task_id_internal(int depth);
+
+ompt_frame_t *__ompt_get_task_frame_internal(int depth);
+
+#endif
diff --git a/final/runtime/src/rules.mk b/final/runtime/src/rules.mk
new file mode 100644
index 0000000..b5b1b7c
--- /dev/null
+++ b/final/runtime/src/rules.mk
@@ -0,0 +1,100 @@
+# rules.mk #
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# --- Copy files to out directories ---
+
+$(out_cmn_dir)include/% : % $(out_cmn_dir)include/.dir .rebuild
+	$(target)
+	$(cp) $< $@
+
+$(out_cmn_dir)include_compat/% : % $(out_cmn_dir)include_compat/.dir .rebuild
+	$(target)
+	$(cp) $< $@
+
+# Fat: touch the .touch file on every update in $(out_lib_dir), so we know whether the fat
+# goal needs to be updated or not.
+$(out_lib_dir)% : % $(out_lib_dir).dir .rebuild
+	$(target)
+	$(cp) $< $@
+        ifneq "$(out_lib_fat_dir)" ""
+	    $(touch) $(dir $@).touch
+        endif
+
+.PHONY: libomp_aliases
+libomp_aliases: $(out_lib_dir).dir .rebuild $(out_lib_dir)$(lib_file)
+	$(target)
+ifeq "$(os)" "win"
+	cd $(out_lib_dir) ; $(cp) $(lib_file) libiomp5md$(dll) ; $(cp) $(imp_file) libiomp5md$(lib)
+else
+	cd $(out_lib_dir) ; ln -sf $(lib_file) libiomp5$(dll)
+endif
+
+$(out_ptf_dir)include/% : % $(out_ptf_dir)include/.dir .rebuild
+	$(target)
+	$(cp) $< $@
+
+$(out_ptf_dir)include_compat/% : % $(out_ptf_dir)include_compat/.dir .rebuild
+	$(target)
+	$(cp) $< $@
+
+$(out_l10n_dir)%/$(cat_file) : l10n/%/$(cat_file) $(out_l10n_dir)%/.dir .rebuild
+	$(target)
+	$(cp) $< $@
+
+ifeq "$(os)" "mac"
+    $(out_l10n_fat_dir)%/$(cat_file) : l10n/%/$(cat_file) $(out_l10n_fat_dir)%/.dir .rebuild
+	    $(target)
+	    $(cp) $< $@
+endif
+
+# --- Include really common rules ---
+
+include $(LIBOMP_WORK)tools/src/common-rules.mk
+
+# --- Building helper tools from sources ---
+
+.PRECIOUS: %$(exe)                     # Do not delete automatically created files.
+
+%$(exe) : $(tools_dir)%.cpp .rebuild
+	$(target)
+	$(cxx) $(cxx-out)$@ $<
+
+# --- Fat libraries ---
+
+# Every time a new file is copied to the $(out_lib_dir) directory we update the
+# $(out_lib_dir).touch file, so we know whether the fat libraries should be rebuilt or not.
+
+# Note: The original implementation built fat libraries in the mac_32 directory, then copied all
+# the libraries from mac_32 to the mac_32e directory. However, this may work incorrectly if
+# exports/mac_*/lib/ contains other libraries. So now we build fat libraries twice: in both the
+# mac_32 and mac_32e directories.
+
+ifeq "$(platform)" "mac_32e"
+
+    .PHONY : fat
+    fat    : $(call _out_lib_fat_dir,mac_32).done $(call _out_lib_fat_dir,mac_32e).done
+
+    $(call _out_lib_fat_dir,mac_32).done \
+    $(call _out_lib_fat_dir,mac_32e).done : \
+        $(call _out_lib_dir,mac_32).touch \
+        $(call _out_lib_dir,mac_32e).touch \
+        $(tools_dir)make-fat-binaries.pl \
+        $(call _out_lib_fat_dir,mac_32).dir $(call _out_lib_fat_dir,mac_32e).dir .rebuild
+	    $(target)
+	    $(perl) $(tools_dir)make-fat-binaries.pl \
+	        --output=$(dir $@) $(call _out_lib_dir,mac_32) $(call _out_lib_dir,mac_32e)
+	    $(touch) $@
+
+endif
+
+# end of file #
diff --git a/final/runtime/src/test-touch.c b/final/runtime/src/test-touch.c
new file mode 100644
index 0000000..6ce529a
--- /dev/null
+++ b/final/runtime/src/test-touch.c
@@ -0,0 +1,31 @@
+// test-touch.c //
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern double omp_get_wtime();
+extern int    omp_get_num_threads();
+extern int    omp_get_max_threads();
+#ifdef __cplusplus
+}
+#endif
+
+int main() {
+    omp_get_wtime();
+    omp_get_num_threads();
+    omp_get_max_threads();
+    return 0;
+}
+
+// end of file //
diff --git a/final/runtime/src/thirdparty/ittnotify/disable_warnings.h b/final/runtime/src/thirdparty/ittnotify/disable_warnings.h
new file mode 100644
index 0000000..4b242fd
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/disable_warnings.h
@@ -0,0 +1,29 @@
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ittnotify_config.h"
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+
+#pragma warning (disable: 593)   /* parameter "XXXX" was set but never used                 */
+#pragma warning (disable: 344)   /* typedef name has already been declared (with same type) */
+#pragma warning (disable: 174)   /* expression has no effect                                */
+#pragma warning (disable: 4127)  /* conditional expression is constant                      */
+#pragma warning (disable: 4306)  /* conversion from '?' to '?' of greater size              */
+
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if defined __INTEL_COMPILER
+
+#pragma warning (disable: 869)  /* parameter "XXXXX" was never referenced                  */
+#pragma warning (disable: 1418) /* external function definition with no prior declaration  */
+#pragma warning (disable: 1419) /* external declaration in primary source file             */
+
+#endif /* __INTEL_COMPILER */
diff --git a/final/runtime/src/thirdparty/ittnotify/ittnotify.h b/final/runtime/src/thirdparty/ittnotify/ittnotify.h
new file mode 100644
index 0000000..d05d8b7
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/ittnotify.h
@@ -0,0 +1,3804 @@
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _ITTNOTIFY_H_
+#define _ITTNOTIFY_H_
+
+/**
+@file
+@brief Public User API functions and types
+@mainpage
+
+The ITT API is used to annotate a user's program with additional information
+that can be used by correctness and performance tools. The user inserts
+calls in their program. Those calls generate information that is collected
+at runtime, and used by Intel(R) Threading Tools.
+
+@section API Concepts
+The following general concepts are used throughout the API.
+
+@subsection Unicode Support
+Many API functions take character string arguments. On Windows, there
+are two versions of each such function. The function name is suffixed
+by W if Unicode support is enabled, and by A otherwise. Any API function
+that takes a character string argument adheres to this convention.
+
+@subsection Conditional Compilation
+Many users prefer having an option to modify ITT API code when linking it
+inside their runtimes. The ITT API header file provides a mechanism to replace
+ITT API function names inside your code with empty strings. To do this,
+define the macro INTEL_NO_ITTNOTIFY_API during compilation and remove the
+static library from the linker script.
+
+@subsection Domains
+[see domains]
+Domains provide a way to separate notification for different modules or
+libraries in a program. Domains are specified by dotted character strings,
+e.g. TBB.Internal.Control.
+
+A mechanism (to be specified) is provided to enable and disable
+domains. By default, all domains are enabled.
+@subsection Named Entities and Instances
+Named entities (frames, regions, tasks, and markers) communicate
+information about the program to the analysis tools. A named entity often
+refers to a section of program code, or to some set of logical concepts
+that the programmer wants to group together.
+
+Named entities relate to the programmer's static view of the program. When
+the program actually executes, many instances of a given named entity
+may be created.
+
+The API annotations denote instances of named entities. The actual
+named entities are displayed using the analysis tools. In other words,
+the named entities come into existence when instances are created.
+
+Instances of named entities may have instance identifiers (IDs). Some
+API calls use instance identifiers to create relationships between
+different instances of named entities. Other API calls associate data
+with instances of named entities.
+
+Some named entities must always have instance IDs. In particular, regions
+and frames always have IDs. Task and markers need IDs only if the ID is
+needed in another API call (such as adding a relation or metadata).
+
+The lifetime of instance IDs is distinct from the lifetime of
+instances. This allows various relationships to be specified separate
+from the actual execution of instances. This flexibility comes at the
+expense of extra API calls.
+
+The same ID may not be reused for different instances, unless a previous
+[ref] __itt_id_destroy call for that ID has been issued.
+*/
+
+/** @cond exclude_from_documentation */
+#ifndef ITT_OS_WIN
+#  define ITT_OS_WIN   1
+#endif /* ITT_OS_WIN */
+
+#ifndef ITT_OS_LINUX
+#  define ITT_OS_LINUX 2
+#endif /* ITT_OS_LINUX */
+
+#ifndef ITT_OS_MAC
+#  define ITT_OS_MAC   3
+#endif /* ITT_OS_MAC */
+
+#ifndef ITT_OS
+#  if defined WIN32 || defined _WIN32
+#    define ITT_OS ITT_OS_WIN
+#  elif defined( __APPLE__ ) && defined( __MACH__ )
+#    define ITT_OS ITT_OS_MAC
+#  else
+#    define ITT_OS ITT_OS_LINUX
+#  endif
+#endif /* ITT_OS */
+
+#ifndef ITT_PLATFORM_WIN
+#  define ITT_PLATFORM_WIN 1
+#endif /* ITT_PLATFORM_WIN */
+
+#ifndef ITT_PLATFORM_POSIX
+#  define ITT_PLATFORM_POSIX 2
+#endif /* ITT_PLATFORM_POSIX */
+
+#ifndef ITT_PLATFORM_MAC
+#  define ITT_PLATFORM_MAC 3
+#endif /* ITT_PLATFORM_MAC */
+
+#ifndef ITT_PLATFORM
+#  if ITT_OS==ITT_OS_WIN
+#    define ITT_PLATFORM ITT_PLATFORM_WIN
+#  elif ITT_OS==ITT_OS_MAC
+#    define ITT_PLATFORM ITT_PLATFORM_MAC
+#  else
+#    define ITT_PLATFORM ITT_PLATFORM_POSIX
+#  endif
+#endif /* ITT_PLATFORM */
+
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+
+#include <stddef.h>
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <tchar.h>
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdint.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE || _UNICODE */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef CDECL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define CDECL __cdecl
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__ 
+#      define CDECL __attribute__ ((cdecl))
+#    else  /* _M_IX86 || __i386__ */
+#      define CDECL /* supported only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* CDECL */
+
+#ifndef STDCALL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define STDCALL __stdcall
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__
+#      define STDCALL __attribute__ ((stdcall)) 
+#    else  /* _M_IX86 || __i386__ */
+#      define STDCALL /* supported only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* STDCALL */
+
+#define ITTAPI    CDECL
+#define LIBITTAPI CDECL
+
+/* TODO: Temporary for compatibility! */
+#define ITTAPI_CALL    CDECL
+#define LIBITTAPI_CALL CDECL
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* use __forceinline (VC++ specific) */
+#define ITT_INLINE           __forceinline
+#define ITT_INLINE_ATTRIBUTE /* nothing */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/*
+ * Generally, functions are not inlined unless optimization is specified.
+ * For functions declared inline, this attribute inlines the function even
+ * if no optimization level was specified.
+ */
+#ifdef __STRICT_ANSI__
+#define ITT_INLINE           static
+#else  /* __STRICT_ANSI__ */
+#define ITT_INLINE           static inline
+#endif /* __STRICT_ANSI__ */
+#define ITT_INLINE_ATTRIBUTE __attribute__ ((always_inline, unused))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @endcond */
+
+#ifdef INTEL_ITTNOTIFY_ENABLE_LEGACY
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    pragma message("WARNING!!! Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro")
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    warning "Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro"
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#  include "legacy/ittnotify.h"
+#endif /* INTEL_ITTNOTIFY_ENABLE_LEGACY */
+
+/** @cond exclude_from_documentation */
+/* Helper macro for joining tokens */
+#define ITT_JOIN_AUX(p,n) p##n
+#define ITT_JOIN(p,n)     ITT_JOIN_AUX(p,n)
+
+#ifdef ITT_MAJOR
+#undef ITT_MAJOR
+#endif
+#ifdef ITT_MINOR
+#undef ITT_MINOR
+#endif
+#define ITT_MAJOR     3
+#define ITT_MINOR     0
+
+/* Standard versioning of a token with major and minor version numbers */
+#define ITT_VERSIONIZE(x)    \
+    ITT_JOIN(x,              \
+    ITT_JOIN(_,              \
+    ITT_JOIN(ITT_MAJOR,      \
+    ITT_JOIN(_, ITT_MINOR))))
+
+#ifndef INTEL_ITTNOTIFY_PREFIX
+#  define INTEL_ITTNOTIFY_PREFIX __itt_
+#endif /* INTEL_ITTNOTIFY_PREFIX */
+#ifndef INTEL_ITTNOTIFY_POSTFIX
+#  define INTEL_ITTNOTIFY_POSTFIX _ptr_
+#endif /* INTEL_ITTNOTIFY_POSTFIX */
+
+#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n)
+#define ITTNOTIFY_NAME(n)     ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX)))
+
+#define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)
+#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)
+
+#define ITTNOTIFY_VOID_D0(n,d)       (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_VOID_D1(n,d,x)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_VOID_D2(n,d,x,y)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_DATA_D0(n,d)       (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_DATA_D1(n,d,x)     (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_DATA_D2(n,d,x,y)   (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a)     (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+
+#ifdef ITT_STUB
+#undef ITT_STUB
+#endif
+#ifdef ITT_STUBV
+#undef ITT_STUBV
+#endif
+#define ITT_STUBV(api,type,name,args)                             \
+    typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args;   \
+    extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name);
+#define ITT_STUB ITT_STUBV
+/** @endcond */
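+
+/* Illustrative note (not part of the original header): with the defaults
+ * above, ITTNOTIFY_NAME(pause) expands to __itt_pause_ptr__3_0 -- the
+ * INTEL_ITTNOTIFY_PREFIX, the API name joined with the _ptr_ postfix, and
+ * the _3_0 version suffix.  ITTNOTIFY_VOID/ITTNOTIFY_DATA then call through
+ * that pointer only when it is non-NULL, so every entry point collapses to
+ * a cheap NULL check when no collector is attached. */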
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/** @cond exclude_from_gpa_documentation */
+/**
+ * @defgroup public Public API
+ * @{
+ * @}
+ */
+
+/**
+ * @defgroup control Collection Control
+ * @ingroup public
+ * General behavior: the application continues to run, but no profiling information is collected
+ *
+ * Pausing occurs not only for the current thread but for the whole process, as well as for any spawned processes
+ * - Intel(R) Parallel Inspector and Intel(R) Inspector XE:
+ *   - Does not analyze or report errors that involve memory access.
+ *   - Other errors are reported as usual. Pausing data collection in
+ *     Intel(R) Parallel Inspector and Intel(R) Inspector XE
+ *     only pauses tracing and analyzing memory access.
+ *     It does not pause tracing or analyzing threading APIs.
+ *   .
+ * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE:
+ *   - Continues to record when new threads are started.
+ *   .
+ * - Other effects:
+ *   - Possible reduction of runtime overhead.
+ *   .
+ * @{
+ */
+/** @brief Pause collection */
+void ITTAPI __itt_pause(void);
+/** @brief Resume collection */
+void ITTAPI __itt_resume(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, pause,  (void))
+ITT_STUBV(ITTAPI, void, resume, (void))
+#define __itt_pause      ITTNOTIFY_VOID(pause)
+#define __itt_pause_ptr  ITTNOTIFY_NAME(pause)
+#define __itt_resume     ITTNOTIFY_VOID(resume)
+#define __itt_resume_ptr ITTNOTIFY_NAME(resume)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_pause()
+#define __itt_pause_ptr  0
+#define __itt_resume()
+#define __itt_resume_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_pause_ptr  0
+#define __itt_resume_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} control group */
+/** @endcond */
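+
+/* Example (illustrative sketch, not part of the original header): skipping
+ * collection during an uninteresting setup phase.  expensive_setup() and
+ * hot_loop() are hypothetical user functions.
+ *
+ *     __itt_pause();       suspend profiling during setup
+ *     expensive_setup();
+ *     __itt_resume();      profile only the region of interest
+ *     hot_loop();
+ */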
+
+/**
+ * @defgroup threads Threads
+ * @ingroup public
+ * Give names to threads
+ * @{
+ */
+/**
+ * @brief Sets the thread name for the calling thread
+ * @param[in] name - name of the thread
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_thread_set_nameA(const char    *name);
+void ITTAPI __itt_thread_set_nameW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_thread_set_name     __itt_thread_set_nameW
+#  define __itt_thread_set_name_ptr __itt_thread_set_nameW_ptr
+#else /* UNICODE */
+#  define __itt_thread_set_name     __itt_thread_set_nameA
+#  define __itt_thread_set_name_ptr __itt_thread_set_nameA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_thread_set_name(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char    *name))
+ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, thread_set_name,  (const char    *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thread_set_nameA     ITTNOTIFY_VOID(thread_set_nameA)
+#define __itt_thread_set_nameA_ptr ITTNOTIFY_NAME(thread_set_nameA)
+#define __itt_thread_set_nameW     ITTNOTIFY_VOID(thread_set_nameW)
+#define __itt_thread_set_nameW_ptr ITTNOTIFY_NAME(thread_set_nameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thread_set_name     ITTNOTIFY_VOID(thread_set_name)
+#define __itt_thread_set_name_ptr ITTNOTIFY_NAME(thread_set_name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thread_set_nameA(name)
+#define __itt_thread_set_nameA_ptr 0
+#define __itt_thread_set_nameW(name)
+#define __itt_thread_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thread_set_name(name)
+#define __itt_thread_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thread_set_nameA_ptr 0
+#define __itt_thread_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thread_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
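+
+/* Example (illustrative, not part of the original header): naming the
+ * calling thread so it is easy to identify in the analysis GUI.  On
+ * Windows the call maps to the A or W variant depending on UNICODE.
+ *
+ *     __itt_thread_set_name("worker pool thread");
+ */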
+
+/** @cond exclude_from_gpa_documentation */
+
+/**
+ * @brief Mark current thread as ignored from this point on, for the duration of its existence.
+ */
+void ITTAPI __itt_thread_ignore(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, thread_ignore, (void))
+#define __itt_thread_ignore     ITTNOTIFY_VOID(thread_ignore)
+#define __itt_thread_ignore_ptr ITTNOTIFY_NAME(thread_ignore)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_thread_ignore()
+#define __itt_thread_ignore_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_thread_ignore_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} threads group */
+
+/**
+ * @defgroup suppress Error suppression
+ * @ingroup public
+ * General behavior: application continues to run, but errors are suppressed
+ *
+ * @{
+ */
+
+/*****************************************************************//**
+ * @name group of functions used for error suppression in correctness tools
+ *********************************************************************/
+/** @{ */
+/**
+ * @hideinitializer 
+ * @brief possible value for suppression mask
+ */
+#define __itt_suppress_all_errors 0x7fffffff
+
+/**
+ * @hideinitializer 
+ * @brief possible value for suppression mask (suppresses errors from threading analysis)
+ */
+#define __itt_suppress_threading_errors 0x000000ff
+
+/**
+ * @hideinitializer 
+ * @brief possible value for suppression mask (suppresses errors from memory analysis)
+ */
+#define __itt_suppress_memory_errors 0x0000ff00
+
+/**
+ * @brief Start suppressing errors identified in mask on this thread
+ */
+void ITTAPI __itt_suppress_push(unsigned int mask);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask))
+#define __itt_suppress_push     ITTNOTIFY_VOID(suppress_push)
+#define __itt_suppress_push_ptr ITTNOTIFY_NAME(suppress_push)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_suppress_push(mask)
+#define __itt_suppress_push_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_push_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Undo the effects of the matching call to __itt_suppress_push  
+ */
+void ITTAPI __itt_suppress_pop(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_pop, (void))
+#define __itt_suppress_pop     ITTNOTIFY_VOID(suppress_pop)
+#define __itt_suppress_pop_ptr ITTNOTIFY_NAME(suppress_pop)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_suppress_pop()
+#define __itt_suppress_pop_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_pop_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
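+
+/* Example (illustrative, not part of the original header): suppressing
+ * threading errors around an intentionally racy but benign update;
+ * benign_racy_update() is a hypothetical user function.
+ *
+ *     __itt_suppress_push(__itt_suppress_threading_errors);
+ *     benign_racy_update();
+ *     __itt_suppress_pop();
+ */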
+
+/**
+ * @enum __itt_suppress_mode
+ * @brief Enumerator for the suppression modes
+ */
+typedef enum __itt_suppress_mode {
+    __itt_unsuppress_range,
+    __itt_suppress_range
+} __itt_suppress_mode_t;
+
+/**
+ * @brief Mark a range of memory for error suppression or unsuppression for error types included in mask
+ */
+void ITTAPI __itt_suppress_mark_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_mark_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size))
+#define __itt_suppress_mark_range     ITTNOTIFY_VOID(suppress_mark_range)
+#define __itt_suppress_mark_range_ptr ITTNOTIFY_NAME(suppress_mark_range)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_suppress_mark_range(mode, mask, address, size)
+#define __itt_suppress_mark_range_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_mark_range_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Undo the effect of a matching call to __itt_suppress_mark_range.  If no matching
+ *        call is found, nothing is changed.
+ */
+void ITTAPI __itt_suppress_clear_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_clear_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size))
+#define __itt_suppress_clear_range     ITTNOTIFY_VOID(suppress_clear_range)
+#define __itt_suppress_clear_range_ptr ITTNOTIFY_NAME(suppress_clear_range)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_suppress_clear_range(mode, mask, address, size)
+#define __itt_suppress_clear_range_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_clear_range_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
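+
+/* Example (illustrative, not part of the original header): suppressing
+ * memory errors for a buffer that is deliberately accessed out of band,
+ * then restoring normal checking; buf and buf_size are hypothetical.
+ *
+ *     __itt_suppress_mark_range(__itt_suppress_range,
+ *                               __itt_suppress_memory_errors, buf, buf_size);
+ *     ...
+ *     __itt_suppress_clear_range(__itt_suppress_range,
+ *                                __itt_suppress_memory_errors, buf, buf_size);
+ */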
+/** @} */
+/** @} suppress group */
+
+/**
+ * @defgroup sync Synchronization
+ * @ingroup public
+ * Indicate user-written synchronization code
+ * @{
+ */
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_barrier 1
+
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_mutex   2
+
+/**
+@brief Name a synchronization object
+@param[in] addr       Handle for the synchronization object. You should
+use a real address to uniquely identify the synchronization object.
+@param[in] objtype    null-terminated object type string. If NULL is
+passed, the name will be "User Synchronization".
+@param[in] objname    null-terminated object name string. If NULL,
+no name will be assigned to the object.
+@param[in] attribute  one of [#__itt_attr_barrier, #__itt_attr_mutex]
+ */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_createA(void *addr, const char    *objtype, const char    *objname, int attribute);
+void ITTAPI __itt_sync_createW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_sync_create     __itt_sync_createW
+#  define __itt_sync_create_ptr __itt_sync_createW_ptr
+#else /* UNICODE */
+#  define __itt_sync_create     __itt_sync_createA
+#  define __itt_sync_create_ptr __itt_sync_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_sync_create (void *addr, const char *objtype, const char *objname, int attribute);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char    *objtype, const char    *objname, int attribute))
+ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_create,  (void *addr, const char*    objtype, const char*    objname, int attribute))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_createA     ITTNOTIFY_VOID(sync_createA)
+#define __itt_sync_createA_ptr ITTNOTIFY_NAME(sync_createA)
+#define __itt_sync_createW     ITTNOTIFY_VOID(sync_createW)
+#define __itt_sync_createW_ptr ITTNOTIFY_NAME(sync_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_create     ITTNOTIFY_VOID(sync_create)
+#define __itt_sync_create_ptr ITTNOTIFY_NAME(sync_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_createA(addr, objtype, objname, attribute)
+#define __itt_sync_createA_ptr 0
+#define __itt_sync_createW(addr, objtype, objname, attribute)
+#define __itt_sync_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_create(addr, objtype, objname, attribute)
+#define __itt_sync_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_createA_ptr 0
+#define __itt_sync_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+@brief Rename a synchronization object
+
+You can use the rename call to assign or reassign a name to a given
+synchronization object.
+@param[in] addr  handle for the synchronization object.
+@param[in] name  null-terminated object name string.
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_renameA(void *addr, const char    *name);
+void ITTAPI __itt_sync_renameW(void *addr, const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_sync_rename     __itt_sync_renameW
+#  define __itt_sync_rename_ptr __itt_sync_renameW_ptr
+#else /* UNICODE */
+#  define __itt_sync_rename     __itt_sync_renameA
+#  define __itt_sync_rename_ptr __itt_sync_renameA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_sync_rename(void *addr, const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char    *name))
+ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_rename,  (void *addr, const char    *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_renameA     ITTNOTIFY_VOID(sync_renameA)
+#define __itt_sync_renameA_ptr ITTNOTIFY_NAME(sync_renameA)
+#define __itt_sync_renameW     ITTNOTIFY_VOID(sync_renameW)
+#define __itt_sync_renameW_ptr ITTNOTIFY_NAME(sync_renameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_rename     ITTNOTIFY_VOID(sync_rename)
+#define __itt_sync_rename_ptr ITTNOTIFY_NAME(sync_rename)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_renameA(addr, name)
+#define __itt_sync_renameA_ptr 0
+#define __itt_sync_renameW(addr, name)
+#define __itt_sync_renameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_rename(addr, name)
+#define __itt_sync_rename_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_renameA_ptr 0
+#define __itt_sync_renameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_rename_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ @brief Destroy a synchronization object.
+ @param addr Handle for the synchronization object.
+ */
+void ITTAPI __itt_sync_destroy(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_destroy, (void *addr))
+#define __itt_sync_destroy     ITTNOTIFY_VOID(sync_destroy)
+#define __itt_sync_destroy_ptr ITTNOTIFY_NAME(sync_destroy)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_destroy(addr)
+#define __itt_sync_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/*****************************************************************//**
+ * @name group of functions used for performance measurement tools
+ *********************************************************************/
+/** @{ */
+/**
+ * @brief Enter spin loop on user-defined sync object
+ */
+void ITTAPI __itt_sync_prepare(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_prepare, (void *addr))
+#define __itt_sync_prepare     ITTNOTIFY_VOID(sync_prepare)
+#define __itt_sync_prepare_ptr ITTNOTIFY_NAME(sync_prepare)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_prepare(addr)
+#define __itt_sync_prepare_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_prepare_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Quit spin loop without acquiring spin object
+ */
+void ITTAPI __itt_sync_cancel(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_cancel, (void *addr))
+#define __itt_sync_cancel     ITTNOTIFY_VOID(sync_cancel)
+#define __itt_sync_cancel_ptr ITTNOTIFY_NAME(sync_cancel)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_cancel(addr)
+#define __itt_sync_cancel_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_cancel_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Successful spin loop completion (sync object acquired)
+ */
+void ITTAPI __itt_sync_acquired(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_acquired, (void *addr))
+#define __itt_sync_acquired     ITTNOTIFY_VOID(sync_acquired)
+#define __itt_sync_acquired_ptr ITTNOTIFY_NAME(sync_acquired)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_acquired(addr)
+#define __itt_sync_acquired_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_acquired_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Start sync object releasing code.  Called before the lock release call.
+ */
+void ITTAPI __itt_sync_releasing(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_releasing, (void *addr))
+#define __itt_sync_releasing     ITTNOTIFY_VOID(sync_releasing)
+#define __itt_sync_releasing_ptr ITTNOTIFY_NAME(sync_releasing)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_releasing(addr)
+#define __itt_sync_releasing_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_releasing_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} */
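+
+/* Example (illustrative sketch, not part of the original header): a user
+ * spin lock annotated so that tools can attribute contention correctly;
+ * try_acquire() and do_release() are hypothetical locking primitives.
+ *
+ *     __itt_sync_create(&lock, "MyLib Spin Lock", "queue lock", __itt_attr_mutex);
+ *
+ *     __itt_sync_prepare(&lock);       entering the spin loop
+ *     while (!try_acquire(&lock))
+ *         ;
+ *     __itt_sync_acquired(&lock);      lock obtained
+ *     ...
+ *     __itt_sync_releasing(&lock);     about to release
+ *     do_release(&lock);
+ */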
+
+/** @} sync group */
+
+/**************************************************************//**
+ * @name group of functions used for correctness checking tools
+ ******************************************************************/
+/** @{ */
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ *   there is no spin but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in static library and does not have corresponding function
+ *   in dynamic library.
+ * @see void __itt_sync_prepare(void* addr);
+ */
+void ITTAPI __itt_fsync_prepare(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_prepare, (void *addr))
+#define __itt_fsync_prepare     ITTNOTIFY_VOID(fsync_prepare)
+#define __itt_fsync_prepare_ptr ITTNOTIFY_NAME(fsync_prepare)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_prepare(addr)
+#define __itt_fsync_prepare_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_prepare_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ *   there is no spin but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in static library and does not have corresponding function
+ *   in dynamic library.
+ * @see void __itt_sync_cancel(void *addr);
+ */
+void ITTAPI __itt_fsync_cancel(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr))
+#define __itt_fsync_cancel     ITTNOTIFY_VOID(fsync_cancel)
+#define __itt_fsync_cancel_ptr ITTNOTIFY_NAME(fsync_cancel)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_cancel(addr)
+#define __itt_fsync_cancel_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_cancel_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ *   there is no spin but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in static library and does not have corresponding function
+ *   in dynamic library.
+ * @see void __itt_sync_acquired(void *addr);
+ */
+void ITTAPI __itt_fsync_acquired(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr))
+#define __itt_fsync_acquired     ITTNOTIFY_VOID(fsync_acquired)
+#define __itt_fsync_acquired_ptr ITTNOTIFY_NAME(fsync_acquired)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_acquired(addr)
+#define __itt_fsync_acquired_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_acquired_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ *   there is no spin but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in static library and does not have corresponding function
+ *   in dynamic library.
+ * @see void __itt_sync_releasing(void* addr);
+ */
+void ITTAPI __itt_fsync_releasing(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_releasing, (void *addr))
+#define __itt_fsync_releasing     ITTNOTIFY_VOID(fsync_releasing)
+#define __itt_fsync_releasing_ptr ITTNOTIFY_NAME(fsync_releasing)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_releasing(addr)
+#define __itt_fsync_releasing_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_releasing_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} */
+
+/**
+ * @defgroup model Modeling by Intel(R) Parallel Advisor
+ * @ingroup public
+ * This is the subset of itt used for modeling by Intel(R) Parallel Advisor.
+ * This API is called ONLY using annotate.h, by "Annotation" macros
+ * the user places in their sources during the parallelism modeling steps.
+ *
+ * site_begin/end and task_begin/end take the address of handle variables,
+ * which are writeable by the API.  Handles must be 0 initialized prior
+ * to the first call to begin, or a run-time failure may result.
+ * The handles are initialized in a multi-thread safe way by the API if
+ * the handle is 0.  The commonly expected idiom is one static handle to
+ * identify a site or task.  If a site or task of the same name has already
+ * been started during this collection, the same handle MAY be returned,
+ * but is not required to be - it is unspecified if data merging is done
+ * based on name.  These routines also take an instance variable.  Like
+ * the lexical instance, these must be 0 initialized.  Unlike the lexical
+ * instance, this is used to track a single dynamic instance.
+ *
+ * API used by the Intel(R) Parallel Advisor to describe potential concurrency
+ * and related activities. User-added source annotations expand to calls
+ * to these procedures to enable modeling of a hypothetical concurrent
+ * execution serially.
+ * @{
+ */
+#if !defined(_ADVISOR_ANNOTATE_H_) || defined(ANNOTATE_EXPAND_NULL)
+
+typedef void* __itt_model_site;             /*!< @brief handle for lexical site     */
+typedef void* __itt_model_site_instance;    /*!< @brief handle for dynamic instance */
+typedef void* __itt_model_task;             /*!< @brief handle for lexical task     */
+typedef void* __itt_model_task_instance;    /*!< @brief handle for dynamic instance */
+
+/**
+ * @enum __itt_model_disable
+ * @brief Enumerator for the disable methods
+ */
+typedef enum {
+    __itt_model_disable_observation,
+    __itt_model_disable_collection
+} __itt_model_disable;
+
+#endif /* !_ADVISOR_ANNOTATE_H_ || ANNOTATE_EXPAND_NULL */
+
+/**
+ * @brief ANNOTATE_SITE_BEGIN/ANNOTATE_SITE_END support.
+ *
+ * site_begin/end model a potential concurrency site.
+ * site instances may be recursively nested with themselves.
+ * site_end exits the most recently started but unended site for the current
+ * thread.  The handle passed to end may be used to validate structure.
+ * Instances of a site encountered on different threads concurrently
+ * are considered completely distinct. If the site name for two different
+ * lexical sites match, it is unspecified whether they are treated as the
+ * same or different for data presentation.
+ */
+void ITTAPI __itt_model_site_begin(__itt_model_site *site, __itt_model_site_instance *instance, const char *name);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_model_site_beginW(const wchar_t *name);
+#endif
+void ITTAPI __itt_model_site_beginA(const char *name);
+void ITTAPI __itt_model_site_beginAL(const char *name, size_t siteNameLen);
+void ITTAPI __itt_model_site_end  (__itt_model_site *site, __itt_model_site_instance *instance);
+void ITTAPI __itt_model_site_end_2(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_site_begin,  (__itt_model_site *site, __itt_model_site_instance *instance, const char *name))
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_site_beginW,  (const wchar_t *name))
+#endif
+ITT_STUBV(ITTAPI, void, model_site_beginA,  (const char *name))
+ITT_STUBV(ITTAPI, void, model_site_beginAL,  (const char *name, size_t siteNameLen))
+ITT_STUBV(ITTAPI, void, model_site_end,    (__itt_model_site *site, __itt_model_site_instance *instance))
+ITT_STUBV(ITTAPI, void, model_site_end_2,  (void))
+#define __itt_model_site_begin      ITTNOTIFY_VOID(model_site_begin)
+#define __itt_model_site_begin_ptr  ITTNOTIFY_NAME(model_site_begin)
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_site_beginW      ITTNOTIFY_VOID(model_site_beginW)
+#define __itt_model_site_beginW_ptr  ITTNOTIFY_NAME(model_site_beginW)
+#endif
+#define __itt_model_site_beginA      ITTNOTIFY_VOID(model_site_beginA)
+#define __itt_model_site_beginA_ptr  ITTNOTIFY_NAME(model_site_beginA)
+#define __itt_model_site_beginAL      ITTNOTIFY_VOID(model_site_beginAL)
+#define __itt_model_site_beginAL_ptr  ITTNOTIFY_NAME(model_site_beginAL)
+#define __itt_model_site_end        ITTNOTIFY_VOID(model_site_end)
+#define __itt_model_site_end_ptr    ITTNOTIFY_NAME(model_site_end)
+#define __itt_model_site_end_2        ITTNOTIFY_VOID(model_site_end_2)
+#define __itt_model_site_end_2_ptr    ITTNOTIFY_NAME(model_site_end_2)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_site_begin(site, instance, name)
+#define __itt_model_site_begin_ptr  0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_site_beginW(name)
+#define __itt_model_site_beginW_ptr  0
+#endif
+#define __itt_model_site_beginA(name)
+#define __itt_model_site_beginA_ptr  0
+#define __itt_model_site_beginAL(name, siteNameLen)
+#define __itt_model_site_beginAL_ptr  0
+#define __itt_model_site_end(site, instance)
+#define __itt_model_site_end_ptr    0
+#define __itt_model_site_end_2()
+#define __itt_model_site_end_2_ptr    0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_site_begin_ptr  0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_site_beginW_ptr  0
+#endif
+#define __itt_model_site_beginA_ptr  0
+#define __itt_model_site_beginAL_ptr  0
+#define __itt_model_site_end_ptr    0
+#define __itt_model_site_end_2_ptr    0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_TASK_BEGIN/ANNOTATE_TASK_END support
+ *
+ * task_begin/end model a potential task, which is contained within the most
+ * closely enclosing dynamic site.  task_end exits the most recently started
+ * but unended task.  The handle passed to end may be used to validate
+ * structure.  It is unspecified if bad dynamic nesting is detected.  If it
+ * is, it should be encoded in the resulting data collection.  The collector
+ * should not fail due to construct nesting issues, nor attempt to directly
+ * indicate the problem.
+ */
+void ITTAPI __itt_model_task_begin(__itt_model_task *task, __itt_model_task_instance *instance, const char *name);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_model_task_beginW(const wchar_t *name);
+void ITTAPI __itt_model_iteration_taskW(const wchar_t *name);
+#endif
+void ITTAPI __itt_model_task_beginA(const char *name);
+void ITTAPI __itt_model_task_beginAL(const char *name, size_t taskNameLen);
+void ITTAPI __itt_model_iteration_taskA(const char *name);
+void ITTAPI __itt_model_iteration_taskAL(const char *name, size_t taskNameLen);
+void ITTAPI __itt_model_task_end  (__itt_model_task *task, __itt_model_task_instance *instance);
+void ITTAPI __itt_model_task_end_2(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_task_begin,  (__itt_model_task *task, __itt_model_task_instance *instance, const char *name))
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_task_beginW,  (const wchar_t *name))
+ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name))
+#endif
+ITT_STUBV(ITTAPI, void, model_task_beginA,  (const char *name))
+ITT_STUBV(ITTAPI, void, model_task_beginAL,  (const char *name, size_t taskNameLen))
+ITT_STUBV(ITTAPI, void, model_iteration_taskA,  (const char *name))
+ITT_STUBV(ITTAPI, void, model_iteration_taskAL,  (const char *name, size_t taskNameLen))
+ITT_STUBV(ITTAPI, void, model_task_end,    (__itt_model_task *task, __itt_model_task_instance *instance))
+ITT_STUBV(ITTAPI, void, model_task_end_2,  (void))
+#define __itt_model_task_begin      ITTNOTIFY_VOID(model_task_begin)
+#define __itt_model_task_begin_ptr  ITTNOTIFY_NAME(model_task_begin)
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_task_beginW     ITTNOTIFY_VOID(model_task_beginW)
+#define __itt_model_task_beginW_ptr ITTNOTIFY_NAME(model_task_beginW)
+#define __itt_model_iteration_taskW     ITTNOTIFY_VOID(model_iteration_taskW)
+#define __itt_model_iteration_taskW_ptr ITTNOTIFY_NAME(model_iteration_taskW)
+#endif
+#define __itt_model_task_beginA    ITTNOTIFY_VOID(model_task_beginA)
+#define __itt_model_task_beginA_ptr ITTNOTIFY_NAME(model_task_beginA)
+#define __itt_model_task_beginAL    ITTNOTIFY_VOID(model_task_beginAL)
+#define __itt_model_task_beginAL_ptr ITTNOTIFY_NAME(model_task_beginAL)
+#define __itt_model_iteration_taskA    ITTNOTIFY_VOID(model_iteration_taskA)
+#define __itt_model_iteration_taskA_ptr ITTNOTIFY_NAME(model_iteration_taskA)
+#define __itt_model_iteration_taskAL    ITTNOTIFY_VOID(model_iteration_taskAL)
+#define __itt_model_iteration_taskAL_ptr ITTNOTIFY_NAME(model_iteration_taskAL)
+#define __itt_model_task_end        ITTNOTIFY_VOID(model_task_end)
+#define __itt_model_task_end_ptr    ITTNOTIFY_NAME(model_task_end)
+#define __itt_model_task_end_2        ITTNOTIFY_VOID(model_task_end_2)
+#define __itt_model_task_end_2_ptr    ITTNOTIFY_NAME(model_task_end_2)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_task_begin(task, instance, name)
+#define __itt_model_task_begin_ptr  0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_task_beginW(name)
+#define __itt_model_task_beginW_ptr  0
+#endif
+#define __itt_model_task_beginA(name)
+#define __itt_model_task_beginA_ptr  0
+#define __itt_model_task_beginAL(name, taskNameLen)
+#define __itt_model_task_beginAL_ptr  0
+#define __itt_model_iteration_taskA(name)
+#define __itt_model_iteration_taskA_ptr  0
+#define __itt_model_iteration_taskAL(name, taskNameLen)
+#define __itt_model_iteration_taskAL_ptr  0
+#define __itt_model_task_end(task, instance)
+#define __itt_model_task_end_ptr    0
+#define __itt_model_task_end_2()
+#define __itt_model_task_end_2_ptr    0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_task_begin_ptr  0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_task_beginW_ptr 0
+#endif
+#define __itt_model_task_beginA_ptr  0
+#define __itt_model_task_beginAL_ptr  0
+#define __itt_model_iteration_taskA_ptr    0
+#define __itt_model_iteration_taskAL_ptr    0
+#define __itt_model_task_end_ptr    0
+#define __itt_model_task_end_2_ptr    0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
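+
+/* Example (illustrative, not part of the original header): modeling a loop
+ * whose iterations are candidate parallel tasks; work() is a hypothetical
+ * user function.  The *_2 end calls close the most recently begun
+ * site/task on the current thread.
+ *
+ *     __itt_model_site_beginA("my_site");
+ *     for (i = 0; i < n; i++) {
+ *         __itt_model_task_beginA("my_task");
+ *         work(i);
+ *         __itt_model_task_end_2();
+ *     }
+ *     __itt_model_site_end_2();
+ */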
+
+/**
+ * @brief ANNOTATE_LOCK_ACQUIRE/ANNOTATE_LOCK_RELEASE support
+ *
+ * lock_acquire/release model a potential lock for both lockset and
+ * performance modeling.  Each unique address is modeled as a separate
+ * lock, with invalid addresses being valid lock IDs.  Specifically:
+ * no storage is accessed by the API at the specified address - it is only
+ * used for lock identification.  Lock acquires may be self-nested and are
+ * unlocked by a corresponding number of releases.
+ * (These closely correspond to __itt_sync_acquired/__itt_sync_releasing,
+ * but may not have identical semantics.)
+ */
+void ITTAPI __itt_model_lock_acquire(void *lock);
+void ITTAPI __itt_model_lock_acquire_2(void *lock);
+void ITTAPI __itt_model_lock_release(void *lock);
+void ITTAPI __itt_model_lock_release_2(void *lock);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_lock_acquire, (void *lock))
+ITT_STUBV(ITTAPI, void, model_lock_acquire_2, (void *lock))
+ITT_STUBV(ITTAPI, void, model_lock_release, (void *lock))
+ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock))
+#define __itt_model_lock_acquire     ITTNOTIFY_VOID(model_lock_acquire)
+#define __itt_model_lock_acquire_ptr ITTNOTIFY_NAME(model_lock_acquire)
+#define __itt_model_lock_acquire_2     ITTNOTIFY_VOID(model_lock_acquire_2)
+#define __itt_model_lock_acquire_2_ptr ITTNOTIFY_NAME(model_lock_acquire_2)
+#define __itt_model_lock_release     ITTNOTIFY_VOID(model_lock_release)
+#define __itt_model_lock_release_ptr ITTNOTIFY_NAME(model_lock_release)
+#define __itt_model_lock_release_2     ITTNOTIFY_VOID(model_lock_release_2)
+#define __itt_model_lock_release_2_ptr ITTNOTIFY_NAME(model_lock_release_2)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_lock_acquire(lock)
+#define __itt_model_lock_acquire_ptr 0
+#define __itt_model_lock_acquire_2(lock)
+#define __itt_model_lock_acquire_2_ptr 0
+#define __itt_model_lock_release(lock)
+#define __itt_model_lock_release_ptr 0
+#define __itt_model_lock_release_2(lock)
+#define __itt_model_lock_release_2_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_lock_acquire_ptr 0
+#define __itt_model_lock_acquire_2_ptr 0
+#define __itt_model_lock_release_ptr 0
+#define __itt_model_lock_release_2_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
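+
+/* Example (illustrative, not part of the original header): modeling a lock
+ * around a candidate critical section.  Only the address identifies the
+ * lock, so any unique address will do; my_lock is hypothetical.
+ *
+ *     __itt_model_lock_acquire(&my_lock);
+ *     ... critical section ...
+ *     __itt_model_lock_release(&my_lock);
+ */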
+
+/**
+ * @brief ANNOTATE_RECORD_ALLOCATION/ANNOTATE_RECORD_DEALLOCATION support
+ *
+ * record_allocation/deallocation describe user-defined memory allocator
+ * behavior, which may be required for correctness modeling to understand
+ * when storage is not expected to be actually reused across threads.
+ */
+void ITTAPI __itt_model_record_allocation  (void *addr, size_t size);
+void ITTAPI __itt_model_record_deallocation(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_record_allocation,   (void *addr, size_t size))
+ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr))
+#define __itt_model_record_allocation       ITTNOTIFY_VOID(model_record_allocation)
+#define __itt_model_record_allocation_ptr   ITTNOTIFY_NAME(model_record_allocation)
+#define __itt_model_record_deallocation     ITTNOTIFY_VOID(model_record_deallocation)
+#define __itt_model_record_deallocation_ptr ITTNOTIFY_NAME(model_record_deallocation)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_record_allocation(addr, size)
+#define __itt_model_record_allocation_ptr   0
+#define __itt_model_record_deallocation(addr)
+#define __itt_model_record_deallocation_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_record_allocation_ptr   0
+#define __itt_model_record_deallocation_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_INDUCTION_USES support
+ *
+ * Note that particular storage is inductive through the end of the current site
+ */
+void ITTAPI __itt_model_induction_uses(void* addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_induction_uses, (void *addr, size_t size))
+#define __itt_model_induction_uses     ITTNOTIFY_VOID(model_induction_uses)
+#define __itt_model_induction_uses_ptr ITTNOTIFY_NAME(model_induction_uses)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_induction_uses(addr, size)
+#define __itt_model_induction_uses_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_induction_uses_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_REDUCTION_USES support
+ *
+ * Note that particular storage is used for reduction through the end
+ * of the current site
+ */
+void ITTAPI __itt_model_reduction_uses(void* addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_reduction_uses, (void *addr, size_t size))
+#define __itt_model_reduction_uses     ITTNOTIFY_VOID(model_reduction_uses)
+#define __itt_model_reduction_uses_ptr ITTNOTIFY_NAME(model_reduction_uses)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_reduction_uses(addr, size)
+#define __itt_model_reduction_uses_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_reduction_uses_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_OBSERVE_USES support
+ *
+ * Have correctness modeling record observations about uses of storage
+ * through the end of the current site
+ */
+void ITTAPI __itt_model_observe_uses(void* addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_observe_uses, (void *addr, size_t size))
+#define __itt_model_observe_uses     ITTNOTIFY_VOID(model_observe_uses)
+#define __itt_model_observe_uses_ptr ITTNOTIFY_NAME(model_observe_uses)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_observe_uses(addr, size)
+#define __itt_model_observe_uses_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_observe_uses_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_CLEAR_USES support
+ *
+ * Clear the special handling of a piece of storage related to induction,
+ * reduction or observe_uses
+ */
+void ITTAPI __itt_model_clear_uses(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_clear_uses, (void *addr))
+#define __itt_model_clear_uses     ITTNOTIFY_VOID(model_clear_uses)
+#define __itt_model_clear_uses_ptr ITTNOTIFY_NAME(model_clear_uses)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_clear_uses(addr)
+#define __itt_model_clear_uses_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_clear_uses_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
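+
+/* Example (illustrative, not part of the original header): telling the
+ * modeler that sum is a reduction for the remainder of the current site,
+ * then clearing the special handling; sum is a hypothetical accumulator.
+ *
+ *     __itt_model_reduction_uses(&sum, sizeof(sum));
+ *     ... modeled loop accumulating into sum ...
+ *     __itt_model_clear_uses(&sum);
+ */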
+
+/**
+ * @brief ANNOTATE_DISABLE_*_PUSH/ANNOTATE_DISABLE_*_POP support
+ *
+ * disable_push/disable_pop push and pop disabling based on a parameter.
+ * Disabling observations stops processing of memory references during
+ * correctness modeling, and all annotations that occur in the disabled
+ * region.  This allows description of code that is expected to be handled
+ * specially during conversion to parallelism or that is not recognized
+ * by tools (e.g. some kinds of synchronization operations.)
+ * This mechanism causes all annotations in the disabled region, other
+ * than disable_push and disable_pop, to be ignored.  (For example, this
+ * might validly be used to disable an entire parallel site and the contained
+ * tasks and locking in it for data collection purposes.)
+ * The disable for collection is a more expensive operation, but reduces
+ * collector overhead significantly.  This applies to BOTH correctness data
+ * collection and performance data collection.  For example, a site
+ * containing a task might only enable data collection for the first 10
+ * iterations.  Both performance and correctness data should reflect this,
+ * and the program should run as close to full speed as possible when
+ * collection is disabled.
+ */
+void ITTAPI __itt_model_disable_push(__itt_model_disable x);
+void ITTAPI __itt_model_disable_pop(void);
+void ITTAPI __itt_model_aggregate_task(size_t x);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_disable_push, (__itt_model_disable x))
+ITT_STUBV(ITTAPI, void, model_disable_pop,  (void))
+ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t x))
+#define __itt_model_disable_push     ITTNOTIFY_VOID(model_disable_push)
+#define __itt_model_disable_push_ptr ITTNOTIFY_NAME(model_disable_push)
+#define __itt_model_disable_pop      ITTNOTIFY_VOID(model_disable_pop)
+#define __itt_model_disable_pop_ptr  ITTNOTIFY_NAME(model_disable_pop)
+#define __itt_model_aggregate_task      ITTNOTIFY_VOID(model_aggregate_task)
+#define __itt_model_aggregate_task_ptr  ITTNOTIFY_NAME(model_aggregate_task)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_disable_push(x)
+#define __itt_model_disable_push_ptr 0
+#define __itt_model_disable_pop()
+#define __itt_model_disable_pop_ptr 0
+#define __itt_model_aggregate_task(x)
+#define __itt_model_aggregate_task_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_disable_push_ptr 0
+#define __itt_model_disable_pop_ptr 0
+#define __itt_model_aggregate_task_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
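+
+/* Example (illustrative, not part of the original header): disabling
+ * observation around synchronization the tools would otherwise
+ * misinterpret; annotations between push and pop are ignored.
+ * custom_synchronization() is a hypothetical user function.
+ *
+ *     __itt_model_disable_push(__itt_model_disable_observation);
+ *     custom_synchronization();
+ *     __itt_model_disable_pop();
+ */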
+/** @} model group */
+
+/**
+ * @defgroup heap Heap
+ * @ingroup public
+ * Heap group
+ * @{
+ */
+
+typedef void* __itt_heap_function;
+
+/**
+ * @brief Create an identifier for a heap function
+ * @return non-zero identifier or NULL
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_heap_function ITTAPI __itt_heap_function_createA(const char*    name, const char*    domain);
+__itt_heap_function ITTAPI __itt_heap_function_createW(const wchar_t* name, const wchar_t* domain);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_heap_function_create     __itt_heap_function_createW
+#  define __itt_heap_function_create_ptr __itt_heap_function_createW_ptr
+#else
+#  define __itt_heap_function_create     __itt_heap_function_createA
+#  define __itt_heap_function_create_ptr __itt_heap_function_createA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_heap_function ITTAPI __itt_heap_function_create(const char* name, const char* domain);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char*    name, const char*    domain))
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t* name, const wchar_t* domain))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create,  (const char*    name, const char*    domain))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_heap_function_createA     ITTNOTIFY_DATA(heap_function_createA)
+#define __itt_heap_function_createA_ptr ITTNOTIFY_NAME(heap_function_createA)
+#define __itt_heap_function_createW     ITTNOTIFY_DATA(heap_function_createW)
+#define __itt_heap_function_createW_ptr ITTNOTIFY_NAME(heap_function_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_heap_function_create      ITTNOTIFY_DATA(heap_function_create)
+#define __itt_heap_function_create_ptr  ITTNOTIFY_NAME(heap_function_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_heap_function_createA(name, domain) (__itt_heap_function)0
+#define __itt_heap_function_createA_ptr 0
+#define __itt_heap_function_createW(name, domain) (__itt_heap_function)0
+#define __itt_heap_function_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_heap_function_create(name, domain)  (__itt_heap_function)0
+#define __itt_heap_function_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_heap_function_createA_ptr 0
+#define __itt_heap_function_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_heap_function_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an allocation begin occurrence.
+ */
+void ITTAPI __itt_heap_allocate_begin(__itt_heap_function h, size_t size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_allocate_begin, (__itt_heap_function h, size_t size, int initialized))
+#define __itt_heap_allocate_begin     ITTNOTIFY_VOID(heap_allocate_begin)
+#define __itt_heap_allocate_begin_ptr ITTNOTIFY_NAME(heap_allocate_begin)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_allocate_begin(h, size, initialized)
+#define __itt_heap_allocate_begin_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_allocate_begin_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an allocation end occurrence.
+ */
+void ITTAPI __itt_heap_allocate_end(__itt_heap_function h, void** addr, size_t size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function h, void** addr, size_t size, int initialized))
+#define __itt_heap_allocate_end     ITTNOTIFY_VOID(heap_allocate_end)
+#define __itt_heap_allocate_end_ptr ITTNOTIFY_NAME(heap_allocate_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_allocate_end(h, addr, size, initialized)
+#define __itt_heap_allocate_end_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_allocate_end_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record a free begin occurrence.
+ */
+void ITTAPI __itt_heap_free_begin(__itt_heap_function h, void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr))
+#define __itt_heap_free_begin     ITTNOTIFY_VOID(heap_free_begin)
+#define __itt_heap_free_begin_ptr ITTNOTIFY_NAME(heap_free_begin)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_free_begin(h, addr)
+#define __itt_heap_free_begin_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_free_begin_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record a free end occurrence.
+ */
+void ITTAPI __itt_heap_free_end(__itt_heap_function h, void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr))
+#define __itt_heap_free_end     ITTNOTIFY_VOID(heap_free_end)
+#define __itt_heap_free_end_ptr ITTNOTIFY_NAME(heap_free_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_free_end(h, addr)
+#define __itt_heap_free_end_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_free_end_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record a reallocation begin occurrence.
+ */
+void ITTAPI __itt_heap_reallocate_begin(__itt_heap_function h, void* addr, size_t new_size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* addr, size_t new_size, int initialized))
+#define __itt_heap_reallocate_begin     ITTNOTIFY_VOID(heap_reallocate_begin)
+#define __itt_heap_reallocate_begin_ptr ITTNOTIFY_NAME(heap_reallocate_begin)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_reallocate_begin(h, addr, new_size, initialized)
+#define __itt_heap_reallocate_begin_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_reallocate_begin_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record a reallocation end occurrence.
+ */
+void ITTAPI __itt_heap_reallocate_end(__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_reallocate_end, (__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized))
+#define __itt_heap_reallocate_end     ITTNOTIFY_VOID(heap_reallocate_end)
+#define __itt_heap_reallocate_end_ptr ITTNOTIFY_NAME(heap_reallocate_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_reallocate_end(h, addr, new_addr, new_size, initialized)
+#define __itt_heap_reallocate_end_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_reallocate_end_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
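+
+/* Example (illustrative sketch, not part of the original header): wrapping
+ * a custom allocator so heap analysis can track it; my_malloc() and the
+ * "mylib" domain string are hypothetical.
+ *
+ *     static __itt_heap_function alloc_fn;
+ *
+ *     void* my_malloc(size_t size)
+ *     {
+ *         void* p;
+ *         if (alloc_fn == NULL)
+ *             alloc_fn = __itt_heap_function_create("my_malloc", "mylib");
+ *         __itt_heap_allocate_begin(alloc_fn, size, 0);
+ *         p = malloc(size);
+ *         __itt_heap_allocate_end(alloc_fn, &p, size, 0);
+ *         return p;
+ *     }
+ */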
+
+/** @brief internal access begin */
+void ITTAPI __itt_heap_internal_access_begin(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_internal_access_begin,  (void))
+#define __itt_heap_internal_access_begin      ITTNOTIFY_VOID(heap_internal_access_begin)
+#define __itt_heap_internal_access_begin_ptr  ITTNOTIFY_NAME(heap_internal_access_begin)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_internal_access_begin()
+#define __itt_heap_internal_access_begin_ptr  0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_internal_access_begin_ptr  0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief internal access end */
+void ITTAPI __itt_heap_internal_access_end(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_internal_access_end, (void))
+#define __itt_heap_internal_access_end     ITTNOTIFY_VOID(heap_internal_access_end)
+#define __itt_heap_internal_access_end_ptr ITTNOTIFY_NAME(heap_internal_access_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_internal_access_end()
+#define __itt_heap_internal_access_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_internal_access_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief record memory growth begin */
+void ITTAPI __itt_heap_record_memory_growth_begin(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin,  (void))
+#define __itt_heap_record_memory_growth_begin      ITTNOTIFY_VOID(heap_record_memory_growth_begin)
+#define __itt_heap_record_memory_growth_begin_ptr  ITTNOTIFY_NAME(heap_record_memory_growth_begin)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_record_memory_growth_begin()
+#define __itt_heap_record_memory_growth_begin_ptr  0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_record_memory_growth_begin_ptr  0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief record memory growth end */
+void ITTAPI __itt_heap_record_memory_growth_end(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void))
+#define __itt_heap_record_memory_growth_end     ITTNOTIFY_VOID(heap_record_memory_growth_end)
+#define __itt_heap_record_memory_growth_end_ptr ITTNOTIFY_NAME(heap_record_memory_growth_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_record_memory_growth_end()
+#define __itt_heap_record_memory_growth_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_record_memory_growth_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Specify the type of heap detection/reporting to modify.
+ */
+/**
+ * @hideinitializer 
+ * @brief Report on memory leaks.
+ */
+#define __itt_heap_leaks 0x00000001
+
+/**
+ * @hideinitializer 
+ * @brief Report on memory growth.
+ */
+#define __itt_heap_growth 0x00000002
+
+
+/** @brief Reset heap detection for the types selected by reset_mask */
+void ITTAPI __itt_heap_reset_detection(unsigned int reset_mask);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_reset_detection,  (unsigned int reset_mask))
+#define __itt_heap_reset_detection      ITTNOTIFY_VOID(heap_reset_detection)
+#define __itt_heap_reset_detection_ptr  ITTNOTIFY_NAME(heap_reset_detection)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_reset_detection(reset_mask)
+#define __itt_heap_reset_detection_ptr  0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_reset_detection_ptr  0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief Report heap detection results for the types selected by record_mask */
+void ITTAPI __itt_heap_record(unsigned int record_mask);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask))
+#define __itt_heap_record     ITTNOTIFY_VOID(heap_record)
+#define __itt_heap_record_ptr ITTNOTIFY_NAME(heap_record)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_record(record_mask)
+#define __itt_heap_record_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_record_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
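+
+/* Example (illustrative, not part of the original header): measuring memory
+ * growth across one phase of the program; phase() is a hypothetical user
+ * function.
+ *
+ *     __itt_heap_record_memory_growth_begin();
+ *     phase();
+ *     __itt_heap_record_memory_growth_end();
+ *     __itt_heap_record(__itt_heap_growth);
+ */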
+
+/** @} heap group */
+/** @endcond */
+/* ========================================================================== */
+
+/**
+ * @defgroup domains Domains
+ * @ingroup public
+ * Domains group
+ * @{
+ */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_domain
+{
+    volatile int flags; /*!< Zero if disabled, non-zero if enabled. The meaning of different non-zero values is reserved to the runtime */
+    const char* nameA;  /*!< Copy of original name in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+    const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
+#else  /* UNICODE || _UNICODE */
+    void* nameW;
+#endif /* UNICODE || _UNICODE */
+    int   extra1; /*!< Reserved to the runtime */
+    void* extra2; /*!< Reserved to the runtime */
+    struct ___itt_domain* next;
+} __itt_domain;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @ingroup domains
+ * @brief Create a domain.
+ * Create a domain using the given name; the URI naming style is recommended.
+ * Because the set of domains is expected to be static over the application's 
+ * execution time, there is no mechanism to destroy a domain.
+ * Any domain can be accessed by any thread in the process, regardless of
+ * which thread created the domain. This call is thread-safe.
+ * @param[in] name name of domain
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_domain* ITTAPI __itt_domain_createA(const char    *name);
+__itt_domain* ITTAPI __itt_domain_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_domain_create     __itt_domain_createW
+#  define __itt_domain_create_ptr __itt_domain_createW_ptr
+#else /* UNICODE */
+#  define __itt_domain_create     __itt_domain_createA
+#  define __itt_domain_create_ptr __itt_domain_createA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_domain* ITTAPI __itt_domain_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char    *name))
+ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_domain*, domain_create,  (const char    *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_domain_createA     ITTNOTIFY_DATA(domain_createA)
+#define __itt_domain_createA_ptr ITTNOTIFY_NAME(domain_createA)
+#define __itt_domain_createW     ITTNOTIFY_DATA(domain_createW)
+#define __itt_domain_createW_ptr ITTNOTIFY_NAME(domain_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_domain_create     ITTNOTIFY_DATA(domain_create)
+#define __itt_domain_create_ptr ITTNOTIFY_NAME(domain_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_domain_createA(name) (__itt_domain*)0
+#define __itt_domain_createA_ptr 0
+#define __itt_domain_createW(name) (__itt_domain*)0
+#define __itt_domain_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_domain_create(name)  (__itt_domain*)0
+#define __itt_domain_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_domain_createA_ptr 0
+#define __itt_domain_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_domain_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} domains group */
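+
+/*
+ * Illustrative sketch (the domain name is hypothetical): create the domain
+ * once at startup and reuse the pointer everywhere; domains are never
+ * destroyed.
+ *
+ *     static __itt_domain* g_domain;
+ *     g_domain = __itt_domain_create("com.example.renderer");
+ */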
+
+/**
+ * @defgroup ids IDs
+ * @ingroup public
+ * IDs group
+ * @{
+ */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_id
+{
+    unsigned long long d1, d2, d3;
+} __itt_id;
+
+#pragma pack(pop)
+/** @endcond */
+
+static const __itt_id __itt_null = { 0, 0, 0 };
+
+/**
+ * @ingroup ids
+ * @brief Convenience function to initialize an __itt_id structure without domain control.
+ * This function does not affect the trace collector runtime in any way. After you make
+ * the ID with this function, you still must create it with the __itt_id_create function
+ * before using the ID to identify a named entity.
+ * @param[in] addr The address of the object; high QWORD of the ID value.
+ * @param[in] extra The extra data to uniquely identify the object; low QWORD of the ID value.
+ */
+
+ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra)
+{
+    __itt_id id = __itt_null;
+    id.d1 = (unsigned long long)((uintptr_t)addr);
+    id.d2 = (unsigned long long)extra;
+    id.d3 = (unsigned long long)0; /* Reserved. Must be zero */
+    return id;
+}
+
+/**
+ * @ingroup ids
+ * @brief Create an instance of an identifier.
+ * This establishes the beginning of the lifetime of an instance of
+ * the given ID in the trace. Once this lifetime starts, the ID
+ * can be used to tag named entity instances in calls such as
+ * __itt_task_begin, and to specify relationships among
+ * identified named entity instances, using the \ref relations APIs.
+ * Instance IDs are not domain specific!
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] id The ID to create.
+ */
+void ITTAPI __itt_id_create(const __itt_domain *domain, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id))
+#define __itt_id_create(d,x) ITTNOTIFY_VOID_D1(id_create,d,x)
+#define __itt_id_create_ptr  ITTNOTIFY_NAME(id_create)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_id_create(domain,id)
+#define __itt_id_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_id_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup ids
+ * @brief Destroy an instance of an identifier.
+ * This ends the lifetime of the current instance of the given ID value in the trace.
+ * Any relationships that are established after this lifetime ends are invalid.
+ * This call must be performed before the given ID value can be reused for a different 
+ * named entity instance.
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] id The ID to destroy.
+ */
+void ITTAPI __itt_id_destroy(const __itt_domain *domain, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id))
+#define __itt_id_destroy(d,x) ITTNOTIFY_VOID_D1(id_destroy,d,x)
+#define __itt_id_destroy_ptr  ITTNOTIFY_NAME(id_destroy)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_id_destroy(domain,id)
+#define __itt_id_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_id_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} ids group */
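+
+/*
+ * Illustrative ID lifecycle sketch (my_object and g_domain are hypothetical):
+ * __itt_id_make only fills the structure; __itt_id_create starts the instance
+ * lifetime in the trace and __itt_id_destroy ends it.
+ *
+ *     __itt_id id = __itt_id_make(&my_object, 0);
+ *     __itt_id_create(g_domain, id);
+ *     ... tag named entity instances with 'id' ...
+ *     __itt_id_destroy(g_domain, id);
+ */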
+
+/**
+ * @defgroup handles String Handles
+ * @ingroup public
+ * String Handles group
+ * @{
+ */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_string_handle
+{
+    const char* strA; /*!< Copy of original string in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+    const wchar_t* strW; /*!< Copy of original string in UNICODE. */
+#else  /* UNICODE || _UNICODE */
+    void* strW;
+#endif /* UNICODE || _UNICODE */
+    int   extra1; /*!< Reserved. Must be zero   */
+    void* extra2; /*!< Reserved. Must be zero   */
+    struct ___itt_string_handle* next;
+} __itt_string_handle;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @ingroup handles
+ * @brief Create a string handle.
+ * Create and return a handle value that can be associated with a string.
+ * Consecutive calls to __itt_string_handle_create with the same name
+ * return the same value. Because the set of string handles is expected to remain
+ * static during the application's execution time, there is no mechanism to destroy a string handle.
+ * Any string handle can be accessed by any thread in the process, regardless of which thread created
+ * the string handle. This call is thread-safe.
+ * @param[in] name The input string
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_string_handle* ITTAPI __itt_string_handle_createA(const char    *name);
+__itt_string_handle* ITTAPI __itt_string_handle_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_string_handle_create     __itt_string_handle_createW
+#  define __itt_string_handle_create_ptr __itt_string_handle_createW_ptr
+#else /* UNICODE */
+#  define __itt_string_handle_create     __itt_string_handle_createA
+#  define __itt_string_handle_create_ptr __itt_string_handle_createA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_string_handle* ITTAPI __itt_string_handle_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char    *name))
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create,  (const char    *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_string_handle_createA     ITTNOTIFY_DATA(string_handle_createA)
+#define __itt_string_handle_createA_ptr ITTNOTIFY_NAME(string_handle_createA)
+#define __itt_string_handle_createW     ITTNOTIFY_DATA(string_handle_createW)
+#define __itt_string_handle_createW_ptr ITTNOTIFY_NAME(string_handle_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_string_handle_create     ITTNOTIFY_DATA(string_handle_create)
+#define __itt_string_handle_create_ptr ITTNOTIFY_NAME(string_handle_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_string_handle_createA(name) (__itt_string_handle*)0
+#define __itt_string_handle_createA_ptr 0
+#define __itt_string_handle_createW(name) (__itt_string_handle*)0
+#define __itt_string_handle_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_string_handle_create(name)  (__itt_string_handle*)0
+#define __itt_string_handle_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_string_handle_createA_ptr 0
+#define __itt_string_handle_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_string_handle_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} handles group */
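+
+/*
+ * Illustrative sketch (the name is hypothetical): create the handle once and
+ * cache it; repeated calls with the same name return the same handle.
+ *
+ *     static __itt_string_handle* sh;
+ *     sh = __itt_string_handle_create("compute_step");
+ */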
+
+/** @cond exclude_from_documentation */
+typedef unsigned long long __itt_timestamp;
+/** @endcond */
+
+static const __itt_timestamp __itt_timestamp_none = (__itt_timestamp)-1LL;
+
+/** @cond exclude_from_gpa_documentation */
+
+/**
+ * @ingroup timestamps
+ * @brief Return timestamp corresponding to the current moment.
+ * This returns the timestamp in the format that is the most relevant for the current
+ * host or platform (RDTSC, QPC, and others). You can use the "<" operator to
+ * compare __itt_timestamp values.
+ */
+__itt_timestamp ITTAPI __itt_get_timestamp(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void))
+#define __itt_get_timestamp      ITTNOTIFY_DATA(get_timestamp)
+#define __itt_get_timestamp_ptr  ITTNOTIFY_NAME(get_timestamp)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_get_timestamp() ((__itt_timestamp)0)
+#define __itt_get_timestamp_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_get_timestamp_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} timestamps */
+/** @endcond */
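+
+/*
+ * Illustrative sketch (do_work() is a hypothetical placeholder): capture two
+ * timestamps and compare them with "<" as described above.
+ *
+ *     __itt_timestamp t0 = __itt_get_timestamp();
+ *     do_work();
+ *     __itt_timestamp t1 = __itt_get_timestamp();
+ *     if (t0 < t1) { ... }
+ */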
+
+/** @cond exclude_from_gpa_documentation */
+
+/**
+ * @defgroup regions Regions
+ * @ingroup public
+ * Regions group
+ * @{
+ */
+/**
+ * @ingroup regions
+ * @brief Begin a region instance.
+ * Successive calls to __itt_region_begin with the same ID are ignored
+ * until a call to __itt_region_end with the same ID
+ * @param[in] domain The domain for this region instance
+ * @param[in] id The instance ID for this region instance. Must not be __itt_null
+ * @param[in] parentid The instance ID for the parent of this region instance, or __itt_null
+ * @param[in] name The name of this region
+ */
+void ITTAPI __itt_region_begin(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);
+
+/**
+ * @ingroup regions
+ * @brief End a region instance.
+ * The first call to __itt_region_end with a given ID ends the
+ * region. Successive calls with the same ID are ignored, as are
+ * calls that do not have a matching __itt_region_begin call.
+ * @param[in] domain The domain for this region instance
+ * @param[in] id The instance ID for this region instance
+ */
+void ITTAPI __itt_region_end(const __itt_domain *domain, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, region_end,   (const __itt_domain *domain, __itt_id id))
+#define __itt_region_begin(d,x,y,z) ITTNOTIFY_VOID_D3(region_begin,d,x,y,z)
+#define __itt_region_begin_ptr      ITTNOTIFY_NAME(region_begin)
+#define __itt_region_end(d,x)       ITTNOTIFY_VOID_D1(region_end,d,x)
+#define __itt_region_end_ptr        ITTNOTIFY_NAME(region_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_region_begin(d,x,y,z)
+#define __itt_region_begin_ptr 0
+#define __itt_region_end(d,x)
+#define __itt_region_end_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_region_begin_ptr 0
+#define __itt_region_end_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} regions group */
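+
+/*
+ * Illustrative sketch (g_domain and region_name are hypothetical handles
+ * created as shown earlier): a region instance needs a non-null ID so its
+ * begin and end calls can be matched.
+ *
+ *     static int region_tag;
+ *     __itt_id rid = __itt_id_make(&region_tag, 0);
+ *     __itt_region_begin(g_domain, rid, __itt_null, region_name);
+ *     ...
+ *     __itt_region_end(g_domain, rid);
+ */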
+
+/**
+ * @defgroup frames Frames
+ * @ingroup public
+ * Frames are similar to regions, but are intended to be easier to use and to implement.
+ * In particular:
+ * - Frames always represent periods of elapsed time
+ * - By default, frames have no nesting relationships
+ * @{
+ */
+
+/**
+ * @ingroup frames
+ * @brief Begin a frame instance.
+ * Successive calls to __itt_frame_begin with the
+ * same ID are ignored until a call to __itt_frame_end with the same ID.
+ * @param[in] domain The domain for this frame instance
+ * @param[in] id The instance ID for this frame instance or NULL
+ */
+void ITTAPI __itt_frame_begin_v3(const __itt_domain *domain, __itt_id *id);
+
+/**
+ * @ingroup frames
+ * @brief End a frame instance.
+ * The first call to __itt_frame_end with a given ID
+ * ends the frame. Successive calls with the same ID are ignored, as are
+ * calls that do not have a matching __itt_frame_begin call.
+ * @param[in] domain The domain for this frame instance
+ * @param[in] id The instance ID for this frame instance or NULL for current
+ */
+void ITTAPI __itt_frame_end_v3(const __itt_domain *domain, __itt_id *id);
+
+/**
+ * @ingroup frames
+ * @brief Submits a frame instance.
+ * Successive calls to __itt_frame_begin or __itt_frame_submit with the
+ * same ID are ignored until a call to __itt_frame_end or __itt_frame_submit
+ * with the same ID.
+ * Passing the special __itt_timestamp_none value as the "end" argument means
+ * the current timestamp is taken as the end timestamp.
+ * @param[in] domain The domain for this frame instance
+ * @param[in] id The instance ID for this frame instance or NULL
+ * @param[in] begin Timestamp of the beginning of the frame
+ * @param[in] end Timestamp of the end of the frame
+ */
+void ITTAPI __itt_frame_submit_v3(const __itt_domain *domain, __itt_id *id,
+    __itt_timestamp begin, __itt_timestamp end);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, frame_begin_v3,  (const __itt_domain *domain, __itt_id *id))
+ITT_STUBV(ITTAPI, void, frame_end_v3,    (const __itt_domain *domain, __itt_id *id))
+ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end))
+#define __itt_frame_begin_v3(d,x)      ITTNOTIFY_VOID_D1(frame_begin_v3,d,x)
+#define __itt_frame_begin_v3_ptr       ITTNOTIFY_NAME(frame_begin_v3)
+#define __itt_frame_end_v3(d,x)        ITTNOTIFY_VOID_D1(frame_end_v3,d,x)
+#define __itt_frame_end_v3_ptr         ITTNOTIFY_NAME(frame_end_v3)
+#define __itt_frame_submit_v3(d,x,b,e) ITTNOTIFY_VOID_D3(frame_submit_v3,d,x,b,e)
+#define __itt_frame_submit_v3_ptr      ITTNOTIFY_NAME(frame_submit_v3)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_frame_begin_v3(domain,id)
+#define __itt_frame_begin_v3_ptr 0
+#define __itt_frame_end_v3(domain,id)
+#define __itt_frame_end_v3_ptr   0
+#define __itt_frame_submit_v3(domain,id,begin,end)
+#define __itt_frame_submit_v3_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_frame_begin_v3_ptr 0
+#define __itt_frame_end_v3_ptr   0
+#define __itt_frame_submit_v3_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} frames group */
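+
+/*
+ * Illustrative sketch (render_frame() is a hypothetical placeholder): submit
+ * a frame after the fact, using a captured begin timestamp and
+ * __itt_timestamp_none as the end, which means "use the current timestamp".
+ *
+ *     __itt_timestamp begin = __itt_get_timestamp();
+ *     render_frame();
+ *     __itt_frame_submit_v3(g_domain, NULL, begin, __itt_timestamp_none);
+ */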
+/** @endcond */
+
+/**
+ * @defgroup taskgroup Task Group
+ * @ingroup public
+ * Task Group
+ * @{
+ */
+/**
+ * @ingroup taskgroup
+ * @brief Denotes a task_group instance.
+ * Successive calls to __itt_task_group with the same ID are ignored.
+ * @param[in] domain The domain for this task_group instance
+ * @param[in] id The instance ID for this task_group instance. Must not be __itt_null.
+ * @param[in] parentid The instance ID for the parent of this task_group instance, or __itt_null.
+ * @param[in] name The name of this task_group
+ */
+void ITTAPI __itt_task_group(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_group, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+#define __itt_task_group(d,x,y,z) ITTNOTIFY_VOID_D3(task_group,d,x,y,z)
+#define __itt_task_group_ptr      ITTNOTIFY_NAME(task_group)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_group(d,x,y,z)
+#define __itt_task_group_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_task_group_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} taskgroup group */
+
+/**
+ * @defgroup tasks Tasks
+ * @ingroup public
+ * A task instance represents a piece of work performed by a particular
+ * thread for a period of time. A call to __itt_task_begin creates a
+ * task instance. This becomes the current instance for that task on that
+ * thread. A following call to __itt_task_end on the same thread ends the
+ * instance. There may be multiple simultaneous instances of tasks with the
+ * same name on different threads. If an ID is specified, the task instance
+ * receives that ID. Nested tasks are allowed.
+ *
+ * Note: The task is defined by the bracketing of __itt_task_begin and
+ * __itt_task_end on the same thread. If some scheduling mechanism causes
+ * task switching (the thread executes a different user task) or thread
+ * switching (the user task switches to a different thread), then this breaks
+ * the notion of a current instance. Additional API calls are required to
+ * deal with that possibility.
+ * @{
+ */
+
+/**
+ * @ingroup tasks
+ * @brief Begin a task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] taskid The instance ID for this task instance, or __itt_null
+ * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null
+ * @param[in] name The name of this task
+ */
+void ITTAPI __itt_task_begin(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name);
+
+/**
+ * @ingroup tasks
+ * @brief Begin a task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] taskid The identifier for this task instance (may be 0)
+ * @param[in] parentid The parent of this task (may be 0)
+ * @param[in] fn The pointer to the function you are tracing
+ */
+void ITTAPI __itt_task_begin_fn(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, void* fn);
+
+/**
+ * @ingroup tasks
+ * @brief End the current task instance.
+ * @param[in] domain The domain for this task
+ */
+void ITTAPI __itt_task_end(const __itt_domain *domain);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_begin,    (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parentid, void* fn))
+ITT_STUBV(ITTAPI, void, task_end,      (const __itt_domain *domain))
+#define __itt_task_begin(d,x,y,z)    ITTNOTIFY_VOID_D3(task_begin,d,x,y,z)
+#define __itt_task_begin_ptr         ITTNOTIFY_NAME(task_begin)
+#define __itt_task_begin_fn(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_fn,d,x,y,z)
+#define __itt_task_begin_fn_ptr      ITTNOTIFY_NAME(task_begin_fn)
+#define __itt_task_end(d)            ITTNOTIFY_VOID_D0(task_end,d)
+#define __itt_task_end_ptr           ITTNOTIFY_NAME(task_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin(domain,id,parentid,name)
+#define __itt_task_begin_ptr    0
+#define __itt_task_begin_fn(domain,id,parentid,fn)
+#define __itt_task_begin_fn_ptr 0
+#define __itt_task_end(domain)
+#define __itt_task_end_ptr      0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_ptr    0
+#define __itt_task_begin_fn_ptr 0
+#define __itt_task_end_ptr      0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} tasks group */
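+
+/*
+ * Illustrative sketch (task_name is a hypothetical string handle): begin and
+ * end must bracket the work on the same thread; passing __itt_null IDs is
+ * fine when no relations or metadata are attached by ID.
+ *
+ *     __itt_task_begin(g_domain, __itt_null, __itt_null, task_name);
+ *     ...
+ *     __itt_task_end(g_domain);
+ */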
+
+/**
+ * @defgroup counters Counters
+ * @ingroup public
+ * Counters are user-defined objects with a monotonically increasing
+ * value. Counter values are 64-bit unsigned integers. Counter values
+ * are tracked per-thread. Counters have names that can be displayed in
+ * the tools.
+ * @{
+ */
+
+/**
+ * @ingroup counters
+ * @brief Increment a counter by one.
+ * The first call with a given name creates a counter by that name and sets its
+ * value to zero on every thread. Successive calls increment the counter value
+ * on the thread on which the call is issued.
+ * @param[in] domain The domain controlling the call. Counter names are not domain specific.
+ *            The domain argument is used only to enable or disable the API calls.
+ * @param[in] name The name of the counter
+ */
+void ITTAPI __itt_counter_inc_v3(const __itt_domain *domain, __itt_string_handle *name);
+
+/**
+ * @ingroup counters
+ * @brief Increment a counter by the value specified in delta.
+ * @param[in] domain The domain controlling the call. Counter names are not domain specific.
+ *            The domain argument is used only to enable or disable the API calls.
+ * @param[in] name The name of the counter
+ * @param[in] delta The amount by which to increment the counter
+ */
+void ITTAPI __itt_counter_inc_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_inc_v3,       (const __itt_domain *domain, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta))
+#define __itt_counter_inc_v3(d,x)         ITTNOTIFY_VOID_D1(counter_inc_v3,d,x)
+#define __itt_counter_inc_v3_ptr          ITTNOTIFY_NAME(counter_inc_v3)
+#define __itt_counter_inc_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_inc_delta_v3,d,x,y)
+#define __itt_counter_inc_delta_v3_ptr    ITTNOTIFY_NAME(counter_inc_delta_v3)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_inc_v3(domain,name)
+#define __itt_counter_inc_v3_ptr       0
+#define __itt_counter_inc_delta_v3(domain,name,delta)
+#define __itt_counter_inc_delta_v3_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_inc_v3_ptr       0
+#define __itt_counter_inc_delta_v3_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} counters group */
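+
+/*
+ * Illustrative sketch (counter_name is a hypothetical string handle):
+ *
+ *     __itt_counter_inc_v3(g_domain, counter_name);            // +1
+ *     __itt_counter_inc_delta_v3(g_domain, counter_name, 16);  // +16
+ */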
+
+/**
+ * @defgroup markers Markers
+ * A marker represents a single discrete event in time. Markers have a scope,
+ * described by an enumerated type __itt_scope. Markers are created by
+ * the API call __itt_marker. A marker instance can be given an ID for use in
+ * adding metadata.
+ * @{
+ */
+
+/**
+ * @brief Describes the scope of an event object in the trace.
+ */
+typedef enum
+{
+    __itt_scope_unknown = 0,
+    __itt_scope_global,
+    __itt_scope_track_group,
+    __itt_scope_track,
+    __itt_scope_task,
+    __itt_scope_marker
+} __itt_scope;
+
+/** @cond exclude_from_documentation */
+#define __itt_marker_scope_unknown  __itt_scope_unknown
+#define __itt_marker_scope_global   __itt_scope_global
+#define __itt_marker_scope_process  __itt_scope_track_group
+#define __itt_marker_scope_thread   __itt_scope_track
+#define __itt_marker_scope_task     __itt_scope_task
+/** @endcond */
+
+/**
+ * @ingroup markers
+ * @brief Create a marker instance
+ * @param[in] domain The domain for this marker
+ * @param[in] id The instance ID for this marker or __itt_null
+ * @param[in] name The name for this marker
+ * @param[in] scope The scope for this marker
+ */
+void ITTAPI __itt_marker(const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope))
+#define __itt_marker(d,x,y,z) ITTNOTIFY_VOID_D3(marker,d,x,y,z)
+#define __itt_marker_ptr      ITTNOTIFY_NAME(marker)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_marker(domain,id,name,scope)
+#define __itt_marker_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_marker_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} markers group */
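+
+/*
+ * Illustrative sketch (marker_name is a hypothetical string handle): emit a
+ * global-scope marker with no instance ID.
+ *
+ *     __itt_marker(g_domain, __itt_null, marker_name, __itt_scope_global);
+ */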
+
+/**
+ * @defgroup metadata Metadata
+ * The metadata API is used to attach extra information to named
+ * entities. Metadata can be attached to an identified named entity by ID,
+ * or to the current entity (which is always a task).
+ *
+ * Conceptually metadata has a type (what kind of metadata), a key (the
+ * name of the metadata), and a value (the actual data). The encoding of
+ * the value depends on the type of the metadata.
+ *
+ * The type of metadata is specified by an enumerated type __itt_metadata_type.
+ * @{
+ */
+
+/**
+ * @ingroup parameters
+ * @brief describes the type of metadata
+ */
+typedef enum {
+    __itt_metadata_unknown = 0,
+    __itt_metadata_u64,     /**< Unsigned 64-bit integer */
+    __itt_metadata_s64,     /**< Signed 64-bit integer */
+    __itt_metadata_u32,     /**< Unsigned 32-bit integer */
+    __itt_metadata_s32,     /**< Signed 32-bit integer */
+    __itt_metadata_u16,     /**< Unsigned 16-bit integer */
+    __itt_metadata_s16,     /**< Signed 16-bit integer */
+    __itt_metadata_float,   /**< Signed 32-bit floating-point */
+    __itt_metadata_double   /**< Signed 64-bit floating-point */
+} __itt_metadata_type;
+
+/**
+ * @ingroup parameters
+ * @brief Add metadata to an instance of a named entity.
+ * @param[in] domain The domain controlling the call
+ * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
+ * @param[in] key The name of the metadata
+ * @param[in] type The type of the metadata
+ * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
+ * @param[in] data The metadata itself
+*/
+void ITTAPI __itt_metadata_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data))
+#define __itt_metadata_add(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add,d,x,y,z,a,b)
+#define __itt_metadata_add_ptr          ITTNOTIFY_NAME(metadata_add)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_metadata_add(d,x,y,z,a,b)
+#define __itt_metadata_add_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_metadata_add_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
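+
+/*
+ * Illustrative sketch (key is a hypothetical string handle): attach one
+ * unsigned 64-bit value to the current task by passing __itt_null as the ID.
+ *
+ *     unsigned long long bytes = 4096;
+ *     __itt_metadata_add(g_domain, __itt_null, key, __itt_metadata_u64, 1, &bytes);
+ */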
+
+/**
+ * @ingroup parameters
+ * @brief Add string metadata to an instance of a named entity.
+ * @param[in] domain The domain controlling the call
+ * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
+ * @param[in] key The name of the metadata
+ * @param[in] data The metadata itself
+ * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated 
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_metadata_str_addA(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length);
+void ITTAPI __itt_metadata_str_addW(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_metadata_str_add     __itt_metadata_str_addW
+#  define __itt_metadata_str_add_ptr __itt_metadata_str_addW_ptr
+#else /* UNICODE */
+#  define __itt_metadata_str_add     __itt_metadata_str_addA
+#  define __itt_metadata_str_add_ptr __itt_metadata_str_addA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_metadata_str_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length);
+#endif
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length))
+ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_addA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addA,d,x,y,z,a)
+#define __itt_metadata_str_addA_ptr        ITTNOTIFY_NAME(metadata_str_addA)
+#define __itt_metadata_str_addW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addW,d,x,y,z,a)
+#define __itt_metadata_str_addW_ptr        ITTNOTIFY_NAME(metadata_str_addW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add(d,x,y,z,a)  ITTNOTIFY_VOID_D4(metadata_str_add,d,x,y,z,a)
+#define __itt_metadata_str_add_ptr         ITTNOTIFY_NAME(metadata_str_add)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_addA(d,x,y,z,a) 
+#define __itt_metadata_str_addA_ptr 0
+#define __itt_metadata_str_addW(d,x,y,z,a) 
+#define __itt_metadata_str_addW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add(d,x,y,z,a)
+#define __itt_metadata_str_add_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_addA_ptr 0
+#define __itt_metadata_str_addW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup parameters
+ * @brief Add metadata to an instance of a named entity.
+ * @param[in] domain The domain controlling the call
+ * @param[in] scope The scope of the instance to which the metadata is to be added
+ * @param[in] key The name of the metadata
+ * @param[in] type The type of the metadata
+ * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
+ * @param[in] data The metadata itself
+*/
+void ITTAPI __itt_metadata_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data))
+#define __itt_metadata_add_with_scope(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add_with_scope,d,x,y,z,a,b)
+#define __itt_metadata_add_with_scope_ptr          ITTNOTIFY_NAME(metadata_add_with_scope)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_metadata_add_with_scope(d,x,y,z,a,b)
+#define __itt_metadata_add_with_scope_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_metadata_add_with_scope_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup parameters
+ * @brief Add string metadata to an instance of a named entity.
+ * @param[in] domain The domain controlling the call
+ * @param[in] scope The scope of the instance to which the metadata is to be added
+ * @param[in] key The name of the metadata
+ * @param[in] data The metadata itself
+ * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated 
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_metadata_str_add_with_scopeA(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
+void ITTAPI __itt_metadata_str_add_with_scopeW(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_metadata_str_add_with_scope     __itt_metadata_str_add_with_scopeW
+#  define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeW_ptr
+#else /* UNICODE */
+#  define __itt_metadata_str_add_with_scope     __itt_metadata_str_add_with_scopeA
+#  define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_metadata_str_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
+#endif
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeA,d,x,y,z,a)
+#define __itt_metadata_str_add_with_scopeA_ptr        ITTNOTIFY_NAME(metadata_str_add_with_scopeA)
+#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeW,d,x,y,z,a)
+#define __itt_metadata_str_add_with_scopeW_ptr        ITTNOTIFY_NAME(metadata_str_add_with_scopeW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_with_scope(d,x,y,z,a)  ITTNOTIFY_VOID_D4(metadata_str_add_with_scope,d,x,y,z,a)
+#define __itt_metadata_str_add_with_scope_ptr         ITTNOTIFY_NAME(metadata_str_add_with_scope)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a) 
+#define __itt_metadata_str_add_with_scopeA_ptr  0
+#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a) 
+#define __itt_metadata_str_add_with_scopeW_ptr  0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_with_scope(d,x,y,z,a)
+#define __itt_metadata_str_add_with_scope_ptr   0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA_ptr  0
+#define __itt_metadata_str_add_with_scopeW_ptr  0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_with_scope_ptr   0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
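+
+/*
+ * Illustrative sketch (key is a hypothetical string handle): attach a value
+ * to the nearest enclosing task on the current thread via the scope argument
+ * rather than by instance ID.
+ *
+ *     double ratio = 0.75;
+ *     __itt_metadata_add_with_scope(g_domain, __itt_scope_task, key,
+ *                                   __itt_metadata_double, 1, &ratio);
+ */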
+
+/** @} metadata group */
+
+/**
+ * @defgroup relations Relations
+ * Instances of named entities can be explicitly associated with other
+ * instances using instance IDs and the relationship API calls.
+ *
+ * @{
+ */
+
+/**
+ * @ingroup relations
+ * @brief The kind of relation between two instances is specified by the enumerated type __itt_relation.
+ * Relations between instances can be added with an API call. The relation
+ * API uses instance IDs. Relations can be added before or after the actual
+ * instances are created and persist independently of the instances. This
+ * is the motivation for having different lifetimes for instance IDs and
+ * the actual instances.
+ */
+typedef enum
+{
+    __itt_relation_is_unknown = 0,
+    __itt_relation_is_dependent_on,         /**< "A is dependent on B" means that A cannot start until B completes */
+    __itt_relation_is_sibling_of,           /**< "A is sibling of B" means that A and B were created as a group */
+    __itt_relation_is_parent_of,            /**< "A is parent of B" means that A created B */
+    __itt_relation_is_continuation_of,      /**< "A is continuation of B" means that A assumes the dependencies of B */
+    __itt_relation_is_child_of,             /**< "A is child of B" means that A was created by B (inverse of is_parent_of) */
+    __itt_relation_is_continued_by,         /**< "A is continued by B" means that B assumes the dependencies of A (inverse of is_continuation_of) */
+    __itt_relation_is_predecessor_to        /**< "A is predecessor to B" means that B cannot start until A completes (inverse of is_dependent_on) */
+} __itt_relation;
+
+/**
+ * @ingroup relations
+ * @brief Add a relation to the current task instance.
+ * The current task instance is the head of the relation.
+ * @param[in] domain The domain controlling this call
+ * @param[in] relation The kind of relation
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add_to_current(const __itt_domain *domain, __itt_relation relation, __itt_id tail);
+
+/**
+ * @ingroup relations
+ * @brief Add a relation between two instance identifiers.
+ * @param[in] domain The domain controlling this call
+ * @param[in] head The ID for the head of the relation
+ * @param[in] relation The kind of relation
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add(const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail))
+ITT_STUBV(ITTAPI, void, relation_add,            (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail))
+#define __itt_relation_add_to_current(d,x,y) ITTNOTIFY_VOID_D2(relation_add_to_current,d,x,y)
+#define __itt_relation_add_to_current_ptr    ITTNOTIFY_NAME(relation_add_to_current)
+#define __itt_relation_add(d,x,y,z)          ITTNOTIFY_VOID_D3(relation_add,d,x,y,z)
+#define __itt_relation_add_ptr               ITTNOTIFY_NAME(relation_add)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_relation_add_to_current(d,x,y)
+#define __itt_relation_add_to_current_ptr 0
+#define __itt_relation_add(d,x,y,z)
+#define __itt_relation_add_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_relation_add_to_current_ptr 0
+#define __itt_relation_add_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} relations group */
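+
+/*
+ * Illustrative sketch (id_a and id_b are hypothetical instance IDs created
+ * with __itt_id_create): record that A cannot start until B completes.
+ *
+ *     __itt_relation_add(g_domain, id_a, __itt_relation_is_dependent_on, id_b);
+ */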
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_clock_info
+{
+    unsigned long long clock_freq; /*!< Clock domain frequency */
+    unsigned long long clock_base; /*!< Clock domain base timestamp */
+} __itt_clock_info;
+
+#pragma pack(pop)
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+typedef void (ITTAPI *__itt_get_clock_info_fn)(__itt_clock_info* clock_info, void* data);
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_clock_domain
+{
+    __itt_clock_info info;      /*!< Most recent clock domain info */
+    __itt_get_clock_info_fn fn; /*!< Callback function pointer */
+    void* fn_data;              /*!< Input argument for the callback function */
+    int   extra1;               /*!< Reserved. Must be zero */
+    void* extra2;               /*!< Reserved. Must be zero */
+    struct ___itt_clock_domain* next;
+} __itt_clock_domain;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @ingroup clockdomains
+ * @brief Create a clock domain.
+ * Certain applications require the capability to trace their application using
+ * a clock domain different from the CPU's, for instance to instrument events
+ * that occur on a GPU.
+ * Because the set of domains is expected to be static over the application's execution time,
+ * there is no mechanism to destroy a domain.
+ * Any domain can be accessed by any thread in the process, regardless of which thread created
+ * the domain. This call is thread-safe.
+ * @param[in] fn A pointer to a callback function which retrieves alternative CPU timestamps
+ * @param[in] fn_data Argument for a callback function; may be NULL
+ */
+__itt_clock_domain* ITTAPI __itt_clock_domain_create(__itt_get_clock_info_fn fn, void* fn_data);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data))
+#define __itt_clock_domain_create     ITTNOTIFY_DATA(clock_domain_create)
+#define __itt_clock_domain_create_ptr ITTNOTIFY_NAME(clock_domain_create)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_clock_domain_create(fn,fn_data) (__itt_clock_domain*)0
+#define __itt_clock_domain_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_clock_domain_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
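+
+/*
+ * Illustrative sketch (read_device_clock() is a hypothetical placeholder for
+ * however the alternative clock is sampled): the callback refreshes the clock
+ * info each time the collector asks for it.
+ *
+ *     static void ITTAPI get_device_clock(__itt_clock_info* info, void* data)
+ *     {
+ *         (void)data;
+ *         info->clock_freq = 1000000000ULL;       // assumed 1 GHz device clock
+ *         info->clock_base = read_device_clock();
+ *     }
+ *
+ *     __itt_clock_domain* cd = __itt_clock_domain_create(get_device_clock, NULL);
+ */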
+
+/**
+ * @ingroup clockdomains
+ * @brief Recalculate clock domain frequencies and clock base timestamps.
+ */
+void ITTAPI __itt_clock_domain_reset(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, clock_domain_reset, (void))
+#define __itt_clock_domain_reset     ITTNOTIFY_VOID(clock_domain_reset)
+#define __itt_clock_domain_reset_ptr ITTNOTIFY_NAME(clock_domain_reset)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_clock_domain_reset()
+#define __itt_clock_domain_reset_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_clock_domain_reset_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomain
+ * @brief Create an instance of an identifier. This establishes the beginning of the lifetime of
+ * an instance of the given ID in the trace. Once this lifetime starts, the ID can be used to
+ * tag named entity instances in calls such as __itt_task_begin, and to specify relationships among
+ * identified named entity instances, using the \ref relations APIs.
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] id The ID to create.
+ */
+void ITTAPI __itt_id_create_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);
+
+/**
+ * @ingroup clockdomain
+ * @brief Destroy an instance of an identifier. This ends the lifetime of the current instance of the
+ * given ID value in the trace. Any relationships that are established after this lifetime ends are
+ * invalid. This call must be performed before the given ID value can be reused for a different
+ * named entity instance.
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] id The ID to destroy.
+ */
+void ITTAPI __itt_id_destroy_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, id_create_ex,  (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id))
+ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id))
+#define __itt_id_create_ex(d,x,y,z)  ITTNOTIFY_VOID_D3(id_create_ex,d,x,y,z)
+#define __itt_id_create_ex_ptr       ITTNOTIFY_NAME(id_create_ex)
+#define __itt_id_destroy_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_destroy_ex,d,x,y,z)
+#define __itt_id_destroy_ex_ptr      ITTNOTIFY_NAME(id_destroy_ex)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_id_create_ex(domain,clock_domain,timestamp,id)
+#define __itt_id_create_ex_ptr    0
+#define __itt_id_destroy_ex(domain,clock_domain,timestamp,id)
+#define __itt_id_destroy_ex_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_id_create_ex_ptr    0
+#define __itt_id_destroy_ex_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomain
+ * @brief Begin a task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid The instance ID for this task instance, or __itt_null
+ * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null
+ * @param[in] name The name of this task
+ */
+void ITTAPI __itt_task_begin_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+
+/**
+ * @ingroup clockdomain
+ * @brief Begin a task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid The identifier for this task instance, or __itt_null
+ * @param[in] parentid The parent of this task, or __itt_null
+ * @param[in] fn The pointer to the function you are tracing
+ */
+void ITTAPI __itt_task_begin_fn_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, void* fn);
+
+/**
+ * @ingroup clockdomain
+ * @brief End the current task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ */
+void ITTAPI __itt_task_end_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_begin_ex,        (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_begin_fn_ex,     (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn))
+ITT_STUBV(ITTAPI, void, task_end_ex,          (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp))
+#define __itt_task_begin_ex(d,x,y,z,a,b)      ITTNOTIFY_VOID_D5(task_begin_ex,d,x,y,z,a,b)
+#define __itt_task_begin_ex_ptr               ITTNOTIFY_NAME(task_begin_ex)
+#define __itt_task_begin_fn_ex(d,x,y,z,a,b)   ITTNOTIFY_VOID_D5(task_begin_fn_ex,d,x,y,z,a,b)
+#define __itt_task_begin_fn_ex_ptr            ITTNOTIFY_NAME(task_begin_fn_ex)
+#define __itt_task_end_ex(d,x,y)              ITTNOTIFY_VOID_D2(task_end_ex,d,x,y)
+#define __itt_task_end_ex_ptr                 ITTNOTIFY_NAME(task_end_ex)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin_ex(domain,clock_domain,timestamp,id,parentid,name)
+#define __itt_task_begin_ex_ptr          0
+#define __itt_task_begin_fn_ex(domain,clock_domain,timestamp,id,parentid,fn)
+#define __itt_task_begin_fn_ex_ptr       0
+#define __itt_task_end_ex(domain,clock_domain,timestamp)
+#define __itt_task_end_ex_ptr            0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_ex_ptr          0
+#define __itt_task_begin_fn_ex_ptr       0
+#define __itt_task_end_ex_ptr            0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
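+
+/*
+ * Illustrative sketch (cd is the clock domain created earlier; device_ts_begin
+ * and device_ts_end are hypothetical timestamps in that domain): replay a task
+ * that was measured on the device clock.
+ *
+ *     __itt_task_begin_ex(g_domain, cd, device_ts_begin, __itt_null, __itt_null, task_name);
+ *     __itt_task_end_ex(g_domain, cd, device_ts_end);
+ */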
+
+/**
+ * @ingroup markers
+ * @brief Create a marker instance.
+ * @param[in] domain The domain for this marker
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] id The instance ID for this marker, or __itt_null
+ * @param[in] name The name for this marker
+ * @param[in] scope The scope for this marker
+ */
+void ITTAPI __itt_marker_ex(const __itt_domain *domain,  __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, marker_ex,    (const __itt_domain *domain,  __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope))
+#define __itt_marker_ex(d,x,y,z,a,b)    ITTNOTIFY_VOID_D5(marker_ex,d,x,y,z,a,b)
+#define __itt_marker_ex_ptr             ITTNOTIFY_NAME(marker_ex)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_marker_ex(domain,clock_domain,timestamp,id,name,scope)
+#define __itt_marker_ex_ptr    0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_marker_ex_ptr    0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomain
+ * @brief Add a relation to the current task instance.
+ * The current task instance is the head of the relation.
+ * @param[in] domain The domain controlling this call
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] relation The kind of relation
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add_to_current_ex(const __itt_domain *domain,  __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail);
+
+/**
+ * @ingroup clockdomain
+ * @brief Add a relation between two instance identifiers.
+ * @param[in] domain The domain controlling this call
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] head The ID for the head of the relation
+ * @param[in] relation The kind of relation
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add_ex(const __itt_domain *domain,  __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail))
+ITT_STUBV(ITTAPI, void, relation_add_ex,            (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail))
+#define __itt_relation_add_to_current_ex(d,x,y,z,a) ITTNOTIFY_VOID_D4(relation_add_to_current_ex,d,x,y,z,a)
+#define __itt_relation_add_to_current_ex_ptr        ITTNOTIFY_NAME(relation_add_to_current_ex)
+#define __itt_relation_add_ex(d,x,y,z,a,b)          ITTNOTIFY_VOID_D5(relation_add_ex,d,x,y,z,a,b)
+#define __itt_relation_add_ex_ptr                   ITTNOTIFY_NAME(relation_add_ex)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_relation_add_to_current_ex(domain,clock_domain,timestamp,relation,tail)
+#define __itt_relation_add_to_current_ex_ptr 0
+#define __itt_relation_add_ex(domain,clock_domain,timestamp,head,relation,tail)
+#define __itt_relation_add_ex_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_relation_add_to_current_ex_ptr 0
+#define __itt_relation_add_ex_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+typedef enum ___itt_track_group_type
+{
+    __itt_track_group_type_normal = 0
+} __itt_track_group_type;
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_track_group
+{
+    __itt_string_handle* name;     /*!< Name of the track group */
+    struct ___itt_track* track;    /*!< List of child tracks    */
+    __itt_track_group_type tgtype; /*!< Type of the track group */
+    int   extra1;                  /*!< Reserved. Must be zero  */
+    void* extra2;                  /*!< Reserved. Must be zero  */
+    struct ___itt_track_group* next;
+} __itt_track_group;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @brief Placeholder for custom track types. Currently, the "normal" custom
+ * track is the only available track type.
+ */
+typedef enum ___itt_track_type
+{
+    __itt_track_type_normal = 0
+#ifdef INTEL_ITTNOTIFY_API_PRIVATE
+    , __itt_track_type_queue
+#endif /* INTEL_ITTNOTIFY_API_PRIVATE */
+} __itt_track_type;
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_track
+{
+    __itt_string_handle* name; /*!< Name of the track group */
+    __itt_track_group* group;  /*!< Parent group to a track */
+    __itt_track_type ttype;    /*!< Type of the track       */
+    int   extra1;              /*!< Reserved. Must be zero  */
+    void* extra2;              /*!< Reserved. Must be zero  */
+    struct ___itt_track* next;
+} __itt_track;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @brief Create logical track group.
+ */
+__itt_track_group* ITTAPI __itt_track_group_create(__itt_string_handle* name, __itt_track_group_type track_group_type);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type))
+#define __itt_track_group_create     ITTNOTIFY_DATA(track_group_create)
+#define __itt_track_group_create_ptr ITTNOTIFY_NAME(track_group_create)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_track_group_create(name)  (__itt_track_group*)0
+#define __itt_track_group_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_track_group_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Create logical track.
+ */
+__itt_track* ITTAPI __itt_track_create(__itt_track_group* track_group, __itt_string_handle* name, __itt_track_type track_type);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_track*, track_create, (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type))
+#define __itt_track_create     ITTNOTIFY_DATA(track_create)
+#define __itt_track_create_ptr ITTNOTIFY_NAME(track_create)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_track_create(track_group,name,track_type)  (__itt_track*)0
+#define __itt_track_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_track_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Set the logical track.
+ */
+void ITTAPI __itt_set_track(__itt_track* track);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, set_track, (__itt_track *track))
+#define __itt_set_track     ITTNOTIFY_VOID(set_track)
+#define __itt_set_track_ptr ITTNOTIFY_NAME(set_track)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_set_track(track)
+#define __itt_set_track_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_set_track_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
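+
+/*
+ * Usage sketch (illustrative): create a track group and a child track, then
+ * route subsequent trace data to that track. __itt_string_handle_create is
+ * declared earlier in this header.
+ *
+ *   __itt_track_group* tg = __itt_track_group_create(
+ *       __itt_string_handle_create("My Group"), __itt_track_group_type_normal);
+ *   __itt_track* t = __itt_track_create(
+ *       tg, __itt_string_handle_create("My Track"), __itt_track_type_normal);
+ *   __itt_set_track(t);    // data recorded after this call lands on "My Track"
+ *   __itt_set_track(NULL); // presumably restores the default track
+ */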
+
+/* ========================================================================== */
+/** @cond exclude_from_gpa_documentation */
+/**
+ * @defgroup events Events
+ * @ingroup public
+ * Events group
+ * @{
+ */
+/** @brief user event type */
+typedef int __itt_event;
+
+/**
+ * @brief Create an event notification
+ * @note Fails if name or namelen is null, if namelen does not match the length of name, or if the user event feature is not enabled
+ * @return non-zero event identifier upon success and __itt_err otherwise
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_event LIBITTAPI __itt_event_createA(const char    *name, int namelen);
+__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_event_create     __itt_event_createW
+#  define __itt_event_create_ptr __itt_event_createW_ptr
+#else
+#  define __itt_event_create     __itt_event_createA
+#  define __itt_event_create_ptr __itt_event_createA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char    *name, int namelen))
+ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create,  (const char    *name, int namelen))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA     ITTNOTIFY_DATA(event_createA)
+#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA)
+#define __itt_event_createW     ITTNOTIFY_DATA(event_createW)
+#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create      ITTNOTIFY_DATA(event_create)
+#define __itt_event_create_ptr  ITTNOTIFY_NAME(event_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA(name, namelen) (__itt_event)0
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW(name, namelen) (__itt_event)0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create(name, namelen)  (__itt_event)0
+#define __itt_event_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an event occurrence.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ */
+int LIBITTAPI __itt_event_start(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event))
+#define __itt_event_start     ITTNOTIFY_DATA(event_start)
+#define __itt_event_start_ptr ITTNOTIFY_NAME(event_start)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_start(event) (int)0
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an event end occurrence.
+ * @note Calling this is optional if the event does not have a duration.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ */
+int LIBITTAPI __itt_event_end(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event))
+#define __itt_event_end     ITTNOTIFY_DATA(event_end)
+#define __itt_event_end_ptr ITTNOTIFY_NAME(event_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_end(event) (int)0
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
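+
+/*
+ * Usage sketch (illustrative): a user event bracketing one unit of work. The
+ * namelen argument is the length of the name string.
+ *
+ *   __itt_event frame = __itt_event_create("frame", 5);
+ *   if (frame != __itt_err) {
+ *       __itt_event_start(frame);
+ *       // ... do one frame of work ...
+ *       __itt_event_end(frame); // optional for events without a duration
+ *   }
+ */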
+/** @} events group */
+
+
+/**
+ * @defgroup arrays Arrays Visualizer
+ * @ingroup public
+ * Visualize arrays
+ * @{
+ */
+
+/**
+ * @enum __itt_av_data_type
+ * @brief Defines the data types of array elements (C/C++ intrinsic types)
+ */
+typedef enum 
+{
+    __itt_e_first = 0,
+    __itt_e_char = 0,  /* 1-byte integer */
+    __itt_e_uchar,     /* 1-byte unsigned integer */
+    __itt_e_int16,     /* 2-byte integer */
+    __itt_e_uint16,    /* 2-byte unsigned integer  */
+    __itt_e_int32,     /* 4-byte integer */
+    __itt_e_uint32,    /* 4-byte unsigned integer */
+    __itt_e_int64,     /* 8-byte integer */
+    __itt_e_uint64,    /* 8-byte unsigned integer */
+    __itt_e_float,     /* 4-byte floating */
+    __itt_e_double,    /* 8-byte floating */
+    __itt_e_last = __itt_e_double
+} __itt_av_data_type;
+
+/**
+ * @brief Save array data to a file.
+ * The output format is defined by the file extension. The csv and bmp formats are supported (bmp for 2-dimensional arrays only).
+ * @param[in] data - pointer to the array data
+ * @param[in] rank - the rank of the array 
+ * @param[in] dimensions - pointer to an array of integers, which specifies the array dimensions. 
+ * The size of dimensions must be equal to the rank
+ * @param[in] type - the type of the array, specified as one of the __itt_av_data_type values (for intrinsic types)
+ * @param[in] filePath - the file path; the output format is defined by the file extension
+ * @param[in] columnOrder - defines how the array is stored in linear memory.
+ * It should be 1 for column-major order (e.g. in FORTRAN) or 0 for row-major order (e.g. in C).
+ */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int ITTAPI __itt_av_saveA(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder);
+int ITTAPI __itt_av_saveW(void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_av_save     __itt_av_saveW
+#  define __itt_av_save_ptr __itt_av_saveW_ptr
+#else /* UNICODE */
+#  define __itt_av_save     __itt_av_saveA
+#  define __itt_av_save_ptr __itt_av_saveA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int ITTAPI __itt_av_save(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder))
+ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, av_save,  (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_av_saveA     ITTNOTIFY_DATA(av_saveA)
+#define __itt_av_saveA_ptr ITTNOTIFY_NAME(av_saveA)
+#define __itt_av_saveW     ITTNOTIFY_DATA(av_saveW)
+#define __itt_av_saveW_ptr ITTNOTIFY_NAME(av_saveW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_av_save     ITTNOTIFY_DATA(av_save)
+#define __itt_av_save_ptr ITTNOTIFY_NAME(av_save)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_av_saveA(data, rank, dimensions, type, filePath, columnOrder) (int)0
+#define __itt_av_saveA_ptr 0
+#define __itt_av_saveW(data, rank, dimensions, type, filePath, columnOrder) (int)0
+#define __itt_av_saveW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_av_save(data, rank, dimensions, type, filePath, columnOrder)  (int)0
+#define __itt_av_save_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_av_saveA_ptr 0
+#define __itt_av_saveW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_av_save_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
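+
+/*
+ * Usage sketch (illustrative): save a 2-dimensional C array of floats to a
+ * bitmap. columnOrder is 0 because C arrays are row-major; the ordering of
+ * the entries in dims ({rows, cols} here) is an assumption.
+ *
+ *   float grid[480][640];
+ *   int dims[2] = { 480, 640 };
+ *   __itt_av_save(grid, 2, dims, __itt_e_float, "grid.bmp", 0);
+ */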
+
+void ITTAPI __itt_enable_attach(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, enable_attach, (void))
+#define __itt_enable_attach     ITTNOTIFY_VOID(enable_attach)
+#define __itt_enable_attach_ptr ITTNOTIFY_NAME(enable_attach)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_enable_attach()
+#define __itt_enable_attach_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_enable_attach_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @cond exclude_from_gpa_documentation */
+
+/** @} arrays group */
+
+/** @endcond */
+
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _ITTNOTIFY_H_ */
+
+#ifdef INTEL_ITTNOTIFY_API_PRIVATE
+
+#ifndef _ITTNOTIFY_PRIVATE_
+#define _ITTNOTIFY_PRIVATE_
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * @ingroup tasks
+ * @brief Begin an overlapped task instance.
+ * @param[in] domain The domain for this task.
+ * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null.
+ * @param[in] parentid The parent of this task, or __itt_null.
+ * @param[in] name The name of this task.
+ */
+void ITTAPI __itt_task_begin_overlapped(const __itt_domain* domain, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+
+/**
+ * @ingroup clockdomain
+ * @brief Begin an overlapped task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null.
+ * @param[in] parentid The parent of this task, or __itt_null.
+ * @param[in] name The name of this task.
+ */
+void ITTAPI __itt_task_begin_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+
+/**
+ * @ingroup tasks
+ * @brief End an overlapped task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] taskid Explicit ID of finished task
+ */
+void ITTAPI __itt_task_end_overlapped(const __itt_domain *domain, __itt_id taskid);
+
+/**
+ * @ingroup clockdomain
+ * @brief End an overlapped task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid Explicit ID of finished task
+ */
+void ITTAPI __itt_task_end_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_begin_overlapped,          (const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex,       (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name))
+ITT_STUBV(ITTAPI, void, task_end_overlapped,            (const __itt_domain *domain, __itt_id taskid))
+ITT_STUBV(ITTAPI, void, task_end_overlapped_ex,         (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid))
+#define __itt_task_begin_overlapped(d,x,y,z)            ITTNOTIFY_VOID_D3(task_begin_overlapped,d,x,y,z)
+#define __itt_task_begin_overlapped_ptr                 ITTNOTIFY_NAME(task_begin_overlapped)
+#define __itt_task_begin_overlapped_ex(d,x,y,z,a,b)     ITTNOTIFY_VOID_D5(task_begin_overlapped_ex,d,x,y,z,a,b)
+#define __itt_task_begin_overlapped_ex_ptr              ITTNOTIFY_NAME(task_begin_overlapped_ex)
+#define __itt_task_end_overlapped(d,x)                  ITTNOTIFY_VOID_D1(task_end_overlapped,d,x)
+#define __itt_task_end_overlapped_ptr                   ITTNOTIFY_NAME(task_end_overlapped)
+#define __itt_task_end_overlapped_ex(d,x,y,z)           ITTNOTIFY_VOID_D3(task_end_overlapped_ex,d,x,y,z)
+#define __itt_task_end_overlapped_ex_ptr                ITTNOTIFY_NAME(task_end_overlapped_ex)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin_overlapped(domain,taskid,parentid,name)
+#define __itt_task_begin_overlapped_ptr         0
+#define __itt_task_begin_overlapped_ex(domain,clock_domain,timestamp,taskid,parentid,name)
+#define __itt_task_begin_overlapped_ex_ptr      0
+#define __itt_task_end_overlapped(domain,taskid)
+#define __itt_task_end_overlapped_ptr           0
+#define __itt_task_end_overlapped_ex(domain,clock_domain,timestamp,taskid)
+#define __itt_task_end_overlapped_ex_ptr        0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_overlapped_ptr         0
+#define __itt_task_begin_overlapped_ex_ptr      0
+#define __itt_task_end_overlapped_ptr           0
+#define __itt_task_end_overlapped_ex_ptr        0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
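+
+/*
+ * Usage sketch (illustrative): overlapped tasks may finish in a different
+ * order than they started, so each instance carries an explicit ID.
+ * __itt_id_make and __itt_null are declared earlier in this header; domain
+ * and the string handles name_a/name_b are assumed to exist.
+ *
+ *   __itt_id a = __itt_id_make(&a, 1);
+ *   __itt_id b = __itt_id_make(&b, 2);
+ *   __itt_task_begin_overlapped(domain, a, __itt_null, name_a);
+ *   __itt_task_begin_overlapped(domain, b, __itt_null, name_b);
+ *   __itt_task_end_overlapped(domain, a); // 'a' ends while 'b' is still open
+ *   __itt_task_end_overlapped(domain, b);
+ */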
+
+/**
+ * @defgroup marks_internal Marks
+ * @ingroup internal
+ * Marks group
+ * @warning Internal API:
+ *   - It is not shipped outside of Intel
+ *   - It is delivered to internal Intel teams using e-mail or SVN access only
+ * @{
+ */
+/** @brief user mark type */
+typedef int __itt_mark_type;
+
+/**
+ * @brief Creates a user mark type with the specified name using char or Unicode string.
+ * @param[in] name - name of mark to create
+ * @return Returns a handle to the mark type
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_mark_type ITTAPI __itt_mark_createA(const char    *name);
+__itt_mark_type ITTAPI __itt_mark_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_mark_create     __itt_mark_createW
+#  define __itt_mark_create_ptr __itt_mark_createW_ptr
+#else /* UNICODE */
+#  define __itt_mark_create     __itt_mark_createA
+#  define __itt_mark_create_ptr __itt_mark_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_mark_type ITTAPI __itt_mark_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char    *name))
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_mark_type, mark_create,  (const char *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA     ITTNOTIFY_DATA(mark_createA)
+#define __itt_mark_createA_ptr ITTNOTIFY_NAME(mark_createA)
+#define __itt_mark_createW     ITTNOTIFY_DATA(mark_createW)
+#define __itt_mark_createW_ptr ITTNOTIFY_NAME(mark_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create      ITTNOTIFY_DATA(mark_create)
+#define __itt_mark_create_ptr  ITTNOTIFY_NAME(mark_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA(name) (__itt_mark_type)0
+#define __itt_mark_createA_ptr 0
+#define __itt_mark_createW(name) (__itt_mark_type)0
+#define __itt_mark_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create(name)  (__itt_mark_type)0
+#define __itt_mark_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA_ptr 0
+#define __itt_mark_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Creates a "discrete" user mark type of the specified type and an optional parameter using char or Unicode string.
+ *
+ * - The mark of "discrete" type is placed to collection results in case of success. It appears in overtime view(s) as a special tick sign.
+ * - The call is "synchronous" - function returns after mark is actually added to results.
+ * - This function is useful, for example, to mark different phases of application
+ *   (beginning of the next mark automatically meand end of current region).
+ * - Can be used together with "continuous" marks (see below) at the same collection session
+ * @param[in] mt - mark, created by __itt_mark_create(const char* name) function
+ * @param[in] parameter - string parameter of mark
+ * @return Returns zero value in case of success, non-zero value otherwise.
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int ITTAPI __itt_markA(__itt_mark_type mt, const char    *parameter);
+int ITTAPI __itt_markW(__itt_mark_type mt, const wchar_t *parameter);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_mark     __itt_markW
+#  define __itt_mark_ptr __itt_markW_ptr
+#else /* UNICODE  */
+#  define __itt_mark     __itt_markA
+#  define __itt_mark_ptr __itt_markA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int ITTAPI __itt_mark(__itt_mark_type mt, const char *parameter);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char    *parameter))
+ITT_STUB(ITTAPI, int, markW, (__itt_mark_type mt, const wchar_t *parameter))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark,  (__itt_mark_type mt, const char *parameter))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_markA     ITTNOTIFY_DATA(markA)
+#define __itt_markA_ptr ITTNOTIFY_NAME(markA)
+#define __itt_markW     ITTNOTIFY_DATA(markW)
+#define __itt_markW_ptr ITTNOTIFY_NAME(markW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark      ITTNOTIFY_DATA(mark)
+#define __itt_mark_ptr  ITTNOTIFY_NAME(mark)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_markA(mt, parameter) (int)0
+#define __itt_markA_ptr 0
+#define __itt_markW(mt, parameter) (int)0
+#define __itt_markW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark(mt, parameter)  (int)0
+#define __itt_mark_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_markA_ptr 0
+#define __itt_markW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
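+
+/*
+ * Usage sketch (illustrative): discrete marks labelling successive phases of
+ * an application; each new mark implicitly ends the previous phase.
+ *
+ *   __itt_mark_type phase = __itt_mark_create("phase");
+ *   __itt_mark(phase, "initialization");
+ *   // ... initialize ...
+ *   __itt_mark(phase, "computation");
+ *   // ... compute ...
+ */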
+
+/**
+ * @brief Use this if necessary to create a "discrete" user event type (mark) for process
+ * rather then for one thread
+ * @see int __itt_mark(__itt_mark_type mt, const char* parameter);
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int ITTAPI __itt_mark_globalA(__itt_mark_type mt, const char    *parameter);
+int ITTAPI __itt_mark_globalW(__itt_mark_type mt, const wchar_t *parameter);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_mark_global     __itt_mark_globalW
+#  define __itt_mark_global_ptr __itt_mark_globalW_ptr
+#else /* UNICODE  */
+#  define __itt_mark_global     __itt_mark_globalA
+#  define __itt_mark_global_ptr __itt_mark_globalA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int ITTAPI __itt_mark_global(__itt_mark_type mt, const char *parameter);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char    *parameter))
+ITT_STUB(ITTAPI, int, mark_globalW, (__itt_mark_type mt, const wchar_t *parameter))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark_global,  (__itt_mark_type mt, const char *parameter))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_globalA     ITTNOTIFY_DATA(mark_globalA)
+#define __itt_mark_globalA_ptr ITTNOTIFY_NAME(mark_globalA)
+#define __itt_mark_globalW     ITTNOTIFY_DATA(mark_globalW)
+#define __itt_mark_globalW_ptr ITTNOTIFY_NAME(mark_globalW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_global      ITTNOTIFY_DATA(mark_global)
+#define __itt_mark_global_ptr  ITTNOTIFY_NAME(mark_global)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_globalA(mt, parameter) (int)0
+#define __itt_mark_globalA_ptr 0
+#define __itt_mark_globalW(mt, parameter) (int)0
+#define __itt_mark_globalW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_global(mt, parameter)  (int)0
+#define __itt_mark_global_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_globalA_ptr 0
+#define __itt_mark_globalW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_global_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Creates an "end" point for "continuous" mark with specified name.
+ *
+ * - Returns zero value in case of success, non-zero value otherwise.
+ *   Also returns non-zero value when preceding "begin" point for the
+ *   mark with the same name failed to be created or not created.
+ * - The mark of "continuous" type is placed to collection results in
+ *   case of success. It appears in overtime view(s) as a special tick
+ *   sign (different from "discrete" mark) together with line from
+ *   corresponding "begin" mark to "end" mark.
+ * @note Continuous marks can overlap and be nested inside each other.
+ * Discrete mark can be nested inside marked region
+ * @param[in] mt - mark, created by __itt_mark_create(const char* name) function
+ * @return Returns zero value in case of success, non-zero value otherwise.
+ */
+int ITTAPI __itt_mark_off(__itt_mark_type mt);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, int, mark_off, (__itt_mark_type mt))
+#define __itt_mark_off     ITTNOTIFY_DATA(mark_off)
+#define __itt_mark_off_ptr ITTNOTIFY_NAME(mark_off)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_mark_off(mt) (int)0
+#define __itt_mark_off_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_mark_off_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
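+
+/*
+ * Usage sketch (illustrative): bracketing a region with a "continuous" mark.
+ * The assumption here is that the "begin" point is produced by __itt_mark()
+ * and the matching "end" point by __itt_mark_off() on the same mark type.
+ *
+ *   __itt_mark_type region = __itt_mark_create("io-phase");
+ *   __itt_mark(region, "begin");  // "begin" point
+ *   // ... perform I/O ...
+ *   __itt_mark_off(region);       // "end" point
+ */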
+
+/**
+ * @brief Use this if necessary to create an "end" point for mark of process
+ * @see int __itt_mark_off(__itt_mark_type mt);
+ */
+int ITTAPI __itt_mark_global_off(__itt_mark_type mt);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt))
+#define __itt_mark_global_off     ITTNOTIFY_DATA(mark_global_off)
+#define __itt_mark_global_off_ptr ITTNOTIFY_NAME(mark_global_off)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_mark_global_off(mt) (int)0
+#define __itt_mark_global_off_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_mark_global_off_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} marks group */
+
+/**
+ * @defgroup counters_internal Counters
+ * @ingroup internal
+ * Counters group
+ * @{
+ */
+/**
+ * @brief opaque structure for counter identification
+ */
+typedef struct ___itt_counter *__itt_counter;
+
+/**
+ * @brief Create a counter with the given name and domain for the calling thread
+ *
+ * After __itt_counter_create() is called, __itt_counter_inc() / __itt_counter_inc_delta() can be used
+ * to increment the counter on any thread
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_counter ITTAPI __itt_counter_createA(const char    *name, const char    *domain);
+__itt_counter ITTAPI __itt_counter_createW(const wchar_t *name, const wchar_t *domain);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_counter_create     __itt_counter_createW
+#  define __itt_counter_create_ptr __itt_counter_createW_ptr
+#else /* UNICODE */
+#  define __itt_counter_create     __itt_counter_createA
+#  define __itt_counter_create_ptr __itt_counter_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_counter ITTAPI __itt_counter_create(const char *name, const char *domain);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char    *name, const char    *domain))
+ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create,  (const char *name, const char *domain))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA     ITTNOTIFY_DATA(counter_createA)
+#define __itt_counter_createA_ptr ITTNOTIFY_NAME(counter_createA)
+#define __itt_counter_createW     ITTNOTIFY_DATA(counter_createW)
+#define __itt_counter_createW_ptr ITTNOTIFY_NAME(counter_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create     ITTNOTIFY_DATA(counter_create)
+#define __itt_counter_create_ptr ITTNOTIFY_NAME(counter_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA(name, domain) (__itt_counter)0
+#define __itt_counter_createA_ptr 0
+#define __itt_counter_createW(name, domain) (__itt_counter)0
+#define __itt_counter_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create(name, domain)  (__itt_counter)0
+#define __itt_counter_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA_ptr 0
+#define __itt_counter_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Destroy the counter identified by the pointer previously returned by __itt_counter_create()
+ */
+void ITTAPI __itt_counter_destroy(__itt_counter id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id))
+#define __itt_counter_destroy     ITTNOTIFY_VOID(counter_destroy)
+#define __itt_counter_destroy_ptr ITTNOTIFY_NAME(counter_destroy)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_destroy(id)
+#define __itt_counter_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Increment the counter value
+ */
+void ITTAPI __itt_counter_inc(__itt_counter id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_inc, (__itt_counter id))
+#define __itt_counter_inc     ITTNOTIFY_VOID(counter_inc)
+#define __itt_counter_inc_ptr ITTNOTIFY_NAME(counter_inc)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_inc(id)
+#define __itt_counter_inc_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_inc_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Increment the counter value by the specified delta
+ */
+void ITTAPI __itt_counter_inc_delta(__itt_counter id, unsigned long long value);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_inc_delta, (__itt_counter id, unsigned long long value))
+#define __itt_counter_inc_delta     ITTNOTIFY_VOID(counter_inc_delta)
+#define __itt_counter_inc_delta_ptr ITTNOTIFY_NAME(counter_inc_delta)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_inc_delta(id, value)
+#define __itt_counter_inc_delta_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_inc_delta_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
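+
+/*
+ * Usage sketch (illustrative): a named counter created once and incremented
+ * from any thread.
+ *
+ *   __itt_counter bytes = __itt_counter_create("bytes_sent", "network");
+ *   __itt_counter_inc(bytes);              // +1
+ *   __itt_counter_inc_delta(bytes, 4096);  // +4096
+ *   __itt_counter_destroy(bytes);
+ */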
+/** @} counters group */
+
+/**
+ * @defgroup stitch Stack Stitching
+ * @ingroup internal
+ * Stack Stitching group
+ * @{
+ */
+/**
+ * @brief opaque structure for stitch point identification
+ */
+typedef struct ___itt_caller *__itt_caller;
+
+/**
+ * @brief Create a stitch point, i.e. a point in the call stack where other stacks should be stitched to.
+ * The function returns a unique identifier which is used to match the cut points with corresponding stitch points.
+ */
+__itt_caller ITTAPI __itt_stack_caller_create(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void))
+#define __itt_stack_caller_create     ITTNOTIFY_DATA(stack_caller_create)
+#define __itt_stack_caller_create_ptr ITTNOTIFY_NAME(stack_caller_create)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_caller_create() (__itt_caller)0
+#define __itt_stack_caller_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_caller_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Destroy the information about the stitch point identified by the pointer previously returned by __itt_stack_caller_create()
+ */
+void ITTAPI __itt_stack_caller_destroy(__itt_caller id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id))
+#define __itt_stack_caller_destroy     ITTNOTIFY_VOID(stack_caller_destroy)
+#define __itt_stack_caller_destroy_ptr ITTNOTIFY_NAME(stack_caller_destroy)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_caller_destroy(id)
+#define __itt_stack_caller_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_caller_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Sets the cut point. The stack of each event that occurs after this call will be cut
+ * at the stack level at which this function was called and stitched to the corresponding stitch point.
+ */
+void ITTAPI __itt_stack_callee_enter(__itt_caller id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, stack_callee_enter, (__itt_caller id))
+#define __itt_stack_callee_enter     ITTNOTIFY_VOID(stack_callee_enter)
+#define __itt_stack_callee_enter_ptr ITTNOTIFY_NAME(stack_callee_enter)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_callee_enter(id)
+#define __itt_stack_callee_enter_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_callee_enter_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief This function eliminates the cut point that was set by the latest __itt_stack_callee_enter().
+ */
+void ITTAPI __itt_stack_callee_leave(__itt_caller id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, stack_callee_leave, (__itt_caller id))
+#define __itt_stack_callee_leave     ITTNOTIFY_VOID(stack_callee_leave)
+#define __itt_stack_callee_leave_ptr ITTNOTIFY_NAME(stack_callee_leave)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_callee_leave(id)
+#define __itt_stack_callee_leave_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_callee_leave_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
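+
+/*
+ * Usage sketch (illustrative): stitching a worker thread's stacks back to the
+ * point where the work was submitted. The submitting side creates the stitch
+ * point; the worker brackets execution of each task with enter/leave.
+ *
+ *   // submitting thread
+ *   __itt_caller cp = __itt_stack_caller_create();
+ *   enqueue(task, cp);                  // enqueue() is hypothetical
+ *
+ *   // worker thread
+ *   __itt_stack_callee_enter(cp);
+ *   run(task);                          // run() is hypothetical
+ *   __itt_stack_callee_leave(cp);
+ *
+ *   // once the task is retired
+ *   __itt_stack_caller_destroy(cp);
+ */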
+
+/** @} stitch group */
+
+/* ***************************************************************************************************************************** */
+
+#include <stdarg.h>
+
+/** @cond exclude_from_documentation */
+typedef enum __itt_error_code
+{
+    __itt_error_success       = 0, /*!< no error */
+    __itt_error_no_module     = 1, /*!< module can't be loaded */
+    /* %1$s -- library name; win: %2$d -- system error code; unx: %2$s -- system error message. */
+    __itt_error_no_symbol     = 2, /*!< symbol not found */
+    /* %1$s -- library name, %2$s -- symbol name. */
+    __itt_error_unknown_group = 3, /*!< unknown group specified */
+    /* %1$s -- env var name, %2$s -- group name. */
+    __itt_error_cant_read_env = 4, /*!< GetEnvironmentVariable() failed */
+    /* %1$s -- env var name, %2$d -- system error. */
+    __itt_error_env_too_long  = 5, /*!< variable value too long */
+    /* %1$s -- env var name, %2$d -- actual length of the var, %3$d -- max allowed length. */
+    __itt_error_system        = 6  /*!< pthread_mutexattr_init or pthread_mutex_init failed */
+    /* %1$s -- function name, %2$d -- errno. */
+} __itt_error_code;
+
+typedef void (__itt_error_handler_t)(__itt_error_code code, va_list);
+__itt_error_handler_t* __itt_set_error_handler(__itt_error_handler_t*);
+
+const char* ITTAPI __itt_api_version(void);
+/** @endcond */
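+
+/*
+ * Usage sketch (illustrative): installing a custom error handler. The
+ * va_list arguments per error code are described in the enum above; this
+ * handler only reports the numeric code.
+ *
+ *   static void my_handler(__itt_error_code code, va_list args)
+ *   {
+ *       (void)args;
+ *       fprintf(stderr, "ittnotify error %d\n", (int)code);
+ *   }
+ *
+ *   __itt_error_handler_t* prev = __itt_set_error_handler(&my_handler);
+ */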
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#define __itt_error_handler ITT_JOIN(INTEL_ITTNOTIFY_PREFIX, error_handler)
+void __itt_error_handler(__itt_error_code code, va_list args);
+extern const int ITTNOTIFY_NAME(err);
+#define __itt_err ITTNOTIFY_NAME(err)
+ITT_STUB(ITTAPI, const char*, api_version, (void))
+#define __itt_api_version     ITTNOTIFY_DATA(api_version)
+#define __itt_api_version_ptr ITTNOTIFY_NAME(api_version)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_api_version()   (const char*)0
+#define __itt_api_version_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_api_version_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _ITTNOTIFY_PRIVATE_ */
+
+#endif /* INTEL_ITTNOTIFY_API_PRIVATE */
diff --git a/final/runtime/src/thirdparty/ittnotify/ittnotify_config.h b/final/runtime/src/thirdparty/ittnotify/ittnotify_config.h
new file mode 100644
index 0000000..c1bfbcc
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/ittnotify_config.h
@@ -0,0 +1,477 @@
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _ITTNOTIFY_CONFIG_H_
+#define _ITTNOTIFY_CONFIG_H_
+
+/** @cond exclude_from_documentation */
+#ifndef ITT_OS_WIN
+#  define ITT_OS_WIN   1
+#endif /* ITT_OS_WIN */
+
+#ifndef ITT_OS_LINUX
+#  define ITT_OS_LINUX 2
+#endif /* ITT_OS_LINUX */
+
+#ifndef ITT_OS_MAC
+#  define ITT_OS_MAC   3
+#endif /* ITT_OS_MAC */
+
+#ifndef ITT_OS
+#  if defined WIN32 || defined _WIN32
+#    define ITT_OS ITT_OS_WIN
+#  elif defined( __APPLE__ ) && defined( __MACH__ )
+#    define ITT_OS ITT_OS_MAC
+#  else
+#    define ITT_OS ITT_OS_LINUX
+#  endif
+#endif /* ITT_OS */
+
+#ifndef ITT_PLATFORM_WIN
+#  define ITT_PLATFORM_WIN 1
+#endif /* ITT_PLATFORM_WIN */
+
+#ifndef ITT_PLATFORM_POSIX
+#  define ITT_PLATFORM_POSIX 2
+#endif /* ITT_PLATFORM_POSIX */
+
+#ifndef ITT_PLATFORM_MAC
+#  define ITT_PLATFORM_MAC 3
+#endif /* ITT_PLATFORM_MAC */
+
+#ifndef ITT_PLATFORM
+#  if ITT_OS==ITT_OS_WIN
+#    define ITT_PLATFORM ITT_PLATFORM_WIN
+#  elif ITT_OS==ITT_OS_MAC
+#    define ITT_PLATFORM ITT_PLATFORM_MAC
+#  else
+#    define ITT_PLATFORM ITT_PLATFORM_POSIX
+#  endif
+#endif /* ITT_PLATFORM */
+
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+
+#include <stddef.h>
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <tchar.h>
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdint.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE || _UNICODE */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef CDECL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define CDECL __cdecl
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__ 
+#      define CDECL __attribute__ ((cdecl))
+#    else  /* _M_IX86 || __i386__ */
+#      define CDECL /* supported only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* CDECL */
+
+#ifndef STDCALL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define STDCALL __stdcall
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__
+#      define STDCALL __attribute__ ((stdcall)) 
+#    else  /* _M_IX86 || __i386__ */
+#      define STDCALL /* supported only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* STDCALL */
+
+#define ITTAPI    CDECL
+#define LIBITTAPI CDECL
+
+/* TODO: Temporary for compatibility! */
+#define ITTAPI_CALL    CDECL
+#define LIBITTAPI_CALL CDECL
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* use __forceinline (VC++ specific) */
+#define ITT_INLINE           __forceinline
+#define ITT_INLINE_ATTRIBUTE /* nothing */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/*
+ * Generally, functions are not inlined unless optimization is specified.
+ * For functions declared inline, this attribute inlines the function even
+ * if no optimization level was specified.
+ */
+#ifdef __STRICT_ANSI__
+#define ITT_INLINE           static
+#else  /* __STRICT_ANSI__ */
+#define ITT_INLINE           static inline
+#endif /* __STRICT_ANSI__ */
+#define ITT_INLINE_ATTRIBUTE __attribute__ ((always_inline, unused))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @endcond */
+
+#ifndef ITT_ARCH_IA32
+#  define ITT_ARCH_IA32  1
+#endif /* ITT_ARCH_IA32 */
+
+#ifndef ITT_ARCH_IA32E
+#  define ITT_ARCH_IA32E 2
+#endif /* ITT_ARCH_IA32E */
+
+/* Was there a magical reason we didn't have 3 here before? */
+#ifndef ITT_ARCH_AARCH64
+#  define ITT_ARCH_AARCH64  3
+#endif /* ITT_ARCH_AARCH64 */
+
+#ifndef ITT_ARCH_ARM
+#  define ITT_ARCH_ARM  4
+#endif /* ITT_ARCH_ARM */
+
+#ifndef ITT_ARCH_PPC64
+#  define ITT_ARCH_PPC64  5
+#endif /* ITT_ARCH_PPC64 */
+
+
+#ifndef ITT_ARCH
+#  if defined _M_IX86 || defined __i386__
+#    define ITT_ARCH ITT_ARCH_IA32
+#  elif defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+#    define ITT_ARCH ITT_ARCH_IA32E
+#  elif defined _M_IA64 || defined __ia64__
+#    define ITT_ARCH ITT_ARCH_IA64
+#  elif defined _M_ARM || defined __arm__
+#    define ITT_ARCH ITT_ARCH_ARM
+#  elif defined __powerpc64__
+#    define ITT_ARCH ITT_ARCH_PPC64
+#  elif defined __aarch64__
+#    define ITT_ARCH ITT_ARCH_AARCH64
+#  endif
+#endif
+
+#ifdef __cplusplus
+#  define ITT_EXTERN_C extern "C"
+#  define ITT_EXTERN_C_BEGIN extern "C" {
+#  define ITT_EXTERN_C_END }
+#else
+#  define ITT_EXTERN_C /* nothing */
+#  define ITT_EXTERN_C_BEGIN /* nothing */
+#  define ITT_EXTERN_C_END /* nothing */
+#endif /* __cplusplus */
+
+#define ITT_TO_STR_AUX(x) #x
+#define ITT_TO_STR(x)     ITT_TO_STR_AUX(x)
+
+#define __ITT_BUILD_ASSERT(expr, suffix) do { \
+    static char __itt_build_check_##suffix[(expr) ? 1 : -1]; \
+    __itt_build_check_##suffix[0] = 0; \
+} while(0)
+#define _ITT_BUILD_ASSERT(expr, suffix)  __ITT_BUILD_ASSERT((expr), suffix)
+#define ITT_BUILD_ASSERT(expr)           _ITT_BUILD_ASSERT((expr), __LINE__)
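+
+/*
+ * Note: ITT_BUILD_ASSERT is a compile-time check; the array size becomes
+ * negative and the build fails when the expression is false, e.g.:
+ *
+ *   ITT_BUILD_ASSERT(sizeof(long) >= 4);
+ */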
+
+#define ITT_MAGIC { 0xED, 0xAB, 0xAB, 0xEC, 0x0D, 0xEE, 0xDA, 0x30 }
+
+/* Replace with snapshot date YYYYMMDD for promotion build. */
+#define API_VERSION_BUILD    20111111
+
+#ifndef API_VERSION_NUM
+#define API_VERSION_NUM 0.0.0
+#endif /* API_VERSION_NUM */
+
+#define API_VERSION "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) \
+                                " (" ITT_TO_STR(API_VERSION_BUILD) ")"
+
+/* OS communication functions */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <windows.h>
+typedef HMODULE           lib_t;
+typedef DWORD             TIDT;
+typedef CRITICAL_SECTION  mutex_t;
+#define MUTEX_INITIALIZER { 0 }
+#define strong_alias(name, aliasname) /* empty for Windows */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <dlfcn.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1 /* needed for PTHREAD_MUTEX_RECURSIVE */
+#endif /* _GNU_SOURCE */
+#ifndef __USE_UNIX98
+#define __USE_UNIX98 1 /* needed for PTHREAD_MUTEX_RECURSIVE; on SLES 11.1 with gcc 4.3.4, pthread.h is missing a dependency on __USE_XOPEN2K8 */
+#endif /*__USE_UNIX98*/
+#include <pthread.h>
+typedef void*             lib_t;
+typedef pthread_t         TIDT;
+typedef pthread_mutex_t   mutex_t;
+#define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+#define _strong_alias(name, aliasname) \
+            extern __typeof (name) aliasname __attribute__ ((alias (#name)));
+#define strong_alias(name, aliasname) _strong_alias(name, aliasname)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_get_proc(lib, name) GetProcAddress(lib, name)
+#define __itt_mutex_init(mutex)   InitializeCriticalSection(mutex)
+#define __itt_mutex_lock(mutex)   EnterCriticalSection(mutex)
+#define __itt_mutex_unlock(mutex) LeaveCriticalSection(mutex)
+#define __itt_load_lib(name)      LoadLibraryA(name)
+#define __itt_unload_lib(handle)  FreeLibrary(handle)
+#define __itt_system_error()      (int)GetLastError()
+#define __itt_fstrcmp(s1, s2)     lstrcmpA(s1, s2)
+#define __itt_fstrlen(s)          lstrlenA(s)
+#define __itt_fstrcpyn(s1, s2, l) lstrcpynA(s1, s2, l)
+#define __itt_fstrdup(s)          _strdup(s)
+#define __itt_thread_id()         GetCurrentThreadId()
+#define __itt_thread_yield()      SwitchToThread()
+#ifndef ITT_SIMPLE_INIT
+ITT_INLINE long
+__itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
+{
+    return InterlockedIncrement(ptr);
+}
+#endif /* ITT_SIMPLE_INIT */
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+#define __itt_get_proc(lib, name) dlsym(lib, name)
+#define __itt_mutex_init(mutex)   {\
+    pthread_mutexattr_t mutex_attr;                                         \
+    int error_code = pthread_mutexattr_init(&mutex_attr);                   \
+    if (error_code)                                                         \
+        __itt_report_error(__itt_error_system, "pthread_mutexattr_init",    \
+                           error_code);                                     \
+    error_code = pthread_mutexattr_settype(&mutex_attr,                     \
+                                           PTHREAD_MUTEX_RECURSIVE);        \
+    if (error_code)                                                         \
+        __itt_report_error(__itt_error_system, "pthread_mutexattr_settype", \
+                           error_code);                                     \
+    error_code = pthread_mutex_init(mutex, &mutex_attr);                    \
+    if (error_code)                                                         \
+        __itt_report_error(__itt_error_system, "pthread_mutex_init",        \
+                           error_code);                                     \
+    error_code = pthread_mutexattr_destroy(&mutex_attr);                    \
+    if (error_code)                                                         \
+        __itt_report_error(__itt_error_system, "pthread_mutexattr_destroy", \
+                           error_code);                                     \
+}
+#define __itt_mutex_lock(mutex)   pthread_mutex_lock(mutex)
+#define __itt_mutex_unlock(mutex) pthread_mutex_unlock(mutex)
+#define __itt_load_lib(name)      dlopen(name, RTLD_LAZY)
+#define __itt_unload_lib(handle)  dlclose(handle)
+#define __itt_system_error()      errno
+#define __itt_fstrcmp(s1, s2)     strcmp(s1, s2)
+#define __itt_fstrlen(s)          strlen(s)
+#define __itt_fstrcpyn(s1, s2, l) strncpy(s1, s2, l)
+#define __itt_fstrdup(s)          strdup(s)
+#define __itt_thread_id()         pthread_self()
+#define __itt_thread_yield()      sched_yield()
+#if ITT_ARCH==ITT_ARCH_IA64
+#ifdef __INTEL_COMPILER
+#define __TBB_machine_fetchadd4(addr, val) __fetchadd4_acq((void *)addr, val)
+#else  /* __INTEL_COMPILER */
+/* TODO: Add support for non-Intel compilers for IA-64 */
+#endif /* __INTEL_COMPILER */
+#elif ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_IA32E /* ITT_ARCH!=ITT_ARCH_IA64 */
+ITT_INLINE long
+__TBB_machine_fetchadd4(volatile void* ptr, long addend) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend)
+{
+    long result;
+    __asm__ __volatile__("lock\nxadd %0,%1"
+                          : "=r"(result),"=m"(*(int*)ptr)
+                          : "0"(addend), "m"(*(int*)ptr)
+                          : "memory");
+    return result;
+}
+#elif ITT_ARCH==ITT_ARCH_ARM || ITT_ARCH==ITT_ARCH_PPC64 || ITT_ARCH==ITT_ARCH_AARCH64
+#define __TBB_machine_fetchadd4(addr, val) __sync_fetch_and_add(addr, val)
+#endif /* ITT_ARCH==ITT_ARCH_IA64 */
+#ifndef ITT_SIMPLE_INIT
+ITT_INLINE long
+__itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
+{
+    return __TBB_machine_fetchadd4(ptr, 1) + 1L;
+}
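+
+/* Note: on IA-32/Intel 64 the fetch-and-add above returns the old value, so
+   the +1 converts it into the new value, matching the semantics of
+   InterlockedIncrement on Windows. */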
+#endif /* ITT_SIMPLE_INIT */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+typedef enum {
+    __itt_collection_normal = 0,
+    __itt_collection_paused = 1
+} __itt_collection_state;
+
+typedef enum {
+    __itt_thread_normal  = 0,
+    __itt_thread_ignored = 1
+} __itt_thread_state;
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_thread_info
+{
+    const char* nameA; /*!< Copy of original name in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+    const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
+#else  /* UNICODE || _UNICODE */
+    void* nameW;
+#endif /* UNICODE || _UNICODE */
+    TIDT               tid;
+    __itt_thread_state state;   /*!< Thread state (paused or normal) */
+    int                extra1;  /*!< Reserved to the runtime */
+    void*              extra2;  /*!< Reserved to the runtime */
+    struct ___itt_thread_info* next;
+} __itt_thread_info;
+
+#include "ittnotify_types.h" /* For __itt_group_id definition */
+
+typedef struct ___itt_api_info_20101001
+{
+    const char*    name;
+    void**         func_ptr;
+    void*          init_func;
+    __itt_group_id group;
+}  __itt_api_info_20101001;
+
+typedef struct ___itt_api_info
+{
+    const char*    name;
+    void**         func_ptr;
+    void*          init_func;
+    void*          null_func;
+    __itt_group_id group;
+}  __itt_api_info;
+
+struct ___itt_domain;
+struct ___itt_string_handle;
+
+typedef struct ___itt_global
+{
+    unsigned char          magic[8];
+    unsigned long          version_major;
+    unsigned long          version_minor;
+    unsigned long          version_build;
+    volatile long          api_initialized;
+    volatile long          mutex_initialized;
+    volatile long          atomic_counter;
+    mutex_t                mutex;
+    lib_t                  lib;
+    void*                  error_handler;
+    const char**           dll_path_ptr;
+    __itt_api_info*        api_list_ptr;
+    struct ___itt_global*  next;
+    /* Joinable structures below */
+    __itt_thread_info*     thread_list;
+    struct ___itt_domain*  domain_list;
+    struct ___itt_string_handle* string_list;
+    __itt_collection_state state;
+} __itt_global;
+
+#pragma pack(pop)
+
+#define NEW_THREAD_INFO_W(gptr,h,h_tail,t,s,n) { \
+    h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
+    if (h != NULL) { \
+        h->tid    = t; \
+        h->nameA  = NULL; \
+        h->nameW  = n ? _wcsdup(n) : NULL; \
+        h->state  = s; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->thread_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_THREAD_INFO_A(gptr,h,h_tail,t,s,n) { \
+    h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
+    if (h != NULL) { \
+        h->tid    = t; \
+        h->nameA  = n ? __itt_fstrdup(n) : NULL; \
+        h->nameW  = NULL; \
+        h->state  = s; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->thread_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_DOMAIN_W(gptr,h,h_tail,name) { \
+    h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
+    if (h != NULL) { \
+        h->flags  = 0;    /* domain is disabled by default */ \
+        h->nameA  = NULL; \
+        h->nameW  = name ? _wcsdup(name) : NULL; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->domain_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_DOMAIN_A(gptr,h,h_tail,name) { \
+    h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
+    if (h != NULL) { \
+        h->flags  = 0;    /* domain is disabled by default */ \
+        h->nameA  = name ? __itt_fstrdup(name) : NULL; \
+        h->nameW  = NULL; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->domain_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_STRING_HANDLE_W(gptr,h,h_tail,name) { \
+    h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
+    if (h != NULL) { \
+        h->strA   = NULL; \
+        h->strW   = name ? _wcsdup(name) : NULL; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->string_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_STRING_HANDLE_A(gptr,h,h_tail,name) { \
+    h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
+    if (h != NULL) { \
+        h->strA   = name ? __itt_fstrdup(name) : NULL; \
+        h->strW   = NULL; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->string_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#endif /* _ITTNOTIFY_CONFIG_H_ */
diff --git a/final/runtime/src/thirdparty/ittnotify/ittnotify_static.c b/final/runtime/src/thirdparty/ittnotify/ittnotify_static.c
new file mode 100644
index 0000000..0044e75
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/ittnotify_static.c
@@ -0,0 +1,1050 @@
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ittnotify_config.h"
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define PATH_MAX 512
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+#include <limits.h>
+#include <dlfcn.h>
+#include <errno.h>
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+
+#define INTEL_NO_MACRO_BODY 
+#define INTEL_ITTNOTIFY_API_PRIVATE
+#include "ittnotify.h"
+#include "legacy/ittnotify.h"
+
+#include "disable_warnings.h"
+
+static const char api_version[] = API_VERSION "\0\n@(#) $Revision: 43375 $\n";
+
+#define _N_(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n)
+
+#if ITT_OS==ITT_OS_WIN
+static const char* ittnotify_lib_name = "libittnotify.dll";
+#elif ITT_OS==ITT_OS_LINUX
+static const char* ittnotify_lib_name = "libittnotify.so";
+#elif ITT_OS==ITT_OS_MAC
+static const char* ittnotify_lib_name = "libittnotify.dylib";
+#else
+#error Unsupported or unknown OS.
+#endif
+
+#ifdef __ANDROID__
+#include <android/log.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <linux/limits.h>
+
+#ifdef ITT_ANDROID_LOG
+    #define ITT_ANDROID_LOG_TAG   "INTEL_VTUNE_USERAPI"
+    #define ITT_ANDROID_LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO, ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+    #define ITT_ANDROID_LOGW(...) ((void)__android_log_print(ANDROID_LOG_WARN, ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+    #define ITT_ANDROID_LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR,ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+    #define ITT_ANDROID_LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG,ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+#else
+    #define ITT_ANDROID_LOGI(...)
+    #define ITT_ANDROID_LOGW(...)
+    #define ITT_ANDROID_LOGE(...)
+    #define ITT_ANDROID_LOGD(...)
+#endif
+
+/* default location of userapi collector on Android */
+#define ANDROID_ITTNOTIFY_DEFAULT_PATH  "/data/data/com.intel.vtune/intel/libittnotify.so"
+#endif
+
+
+#ifndef LIB_VAR_NAME
+#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM
+#define LIB_VAR_NAME INTEL_LIBITTNOTIFY32
+#else
+#define LIB_VAR_NAME INTEL_LIBITTNOTIFY64
+#endif
+#endif /* LIB_VAR_NAME */
+
+#define ITT_MUTEX_INIT_AND_LOCK(p) {                                 \
+        if (!p.mutex_initialized)                                    \
+        {                                                            \
+            if (__itt_interlocked_increment(&p.atomic_counter) == 1) \
+            {                                                        \
+                __itt_mutex_init(&p.mutex);                          \
+                p.mutex_initialized = 1;                             \
+            }                                                        \
+            else                                                     \
+                while (!p.mutex_initialized)                         \
+                    __itt_thread_yield();                            \
+        }                                                            \
+        __itt_mutex_lock(&p.mutex);                                  \
+}
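+
+/* Usage sketch (hypothetical caller): the macro performs a one-time,
+ * race-free creation of p.mutex before locking it.  The first thread to
+ * raise p.atomic_counter from 0 to 1 creates the mutex; any other thread
+ * arriving concurrently spins on p.mutex_initialized until it is set:
+ *
+ *     ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+ *     ... update the global descriptor ...
+ *     __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+ */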
+
+const int _N_(err) = 0;
+
+typedef int (__itt_init_ittlib_t)(const char*, __itt_group_id);
+
+/* This define is used to control the initialization function name. */
+#ifndef __itt_init_ittlib_name
+ITT_EXTERN_C int _N_(init_ittlib)(const char*, __itt_group_id);
+static __itt_init_ittlib_t* __itt_init_ittlib_ptr = _N_(init_ittlib);
+#define __itt_init_ittlib_name __itt_init_ittlib_ptr
+#endif /* __itt_init_ittlib_name */
+
+typedef void (__itt_fini_ittlib_t)(void);
+
+/* This define is used to control the finalization function name. */
+#ifndef __itt_fini_ittlib_name
+ITT_EXTERN_C void _N_(fini_ittlib)(void);
+static __itt_fini_ittlib_t* __itt_fini_ittlib_ptr = _N_(fini_ittlib);
+#define __itt_fini_ittlib_name __itt_fini_ittlib_ptr
+#endif /* __itt_fini_ittlib_name */
+
+/* building pointers to imported funcs */
+#undef ITT_STUBV
+#undef ITT_STUB
+#define ITT_STUB(api,type,name,args,params,ptr,group,format)   \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
+typedef type api ITT_JOIN(_N_(name),_t) args;                  \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \
+{                                                              \
+    __itt_init_ittlib_name(NULL, __itt_group_all);             \
+    if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \
+        return ITTNOTIFY_NAME(name) params;                    \
+    else                                                       \
+        return (type)0;                                        \
+}
+
+#define ITT_STUBV(api,type,name,args,params,ptr,group,format)  \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
+typedef type api ITT_JOIN(_N_(name),_t) args;                  \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \
+{                                                              \
+    __itt_init_ittlib_name(NULL, __itt_group_all);             \
+    if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \
+        ITTNOTIFY_NAME(name) params;                           \
+    else                                                       \
+        return;                                                \
+}
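+
+/* Illustrative (not verbatim) expansion of ITT_STUBV for, e.g., "pause":
+ * the macro declares a static _init stub, a function-pointer typedef, and a
+ * global pointer initially aimed at the stub; the stub lazily initializes
+ * the library, then forwards through the pointer if it was re-targeted:
+ *
+ *     static void ITTAPI __itt_pause_init(void);
+ *     typedef void ITTAPI __itt_pause_t(void);
+ *     __itt_pause_t* __itt_pause_ptr = __itt_pause_init;
+ *     static void ITTAPI __itt_pause_init(void)
+ *     {
+ *         __itt_init_ittlib_name(NULL, __itt_group_all);
+ *         if (__itt_pause_ptr && __itt_pause_ptr != __itt_pause_init)
+ *             __itt_pause_ptr();
+ *     }
+ *
+ * The exact identifiers depend on INTEL_ITTNOTIFY_PREFIX and ITT_VERSIONIZE. */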
+
+#undef __ITT_INTERNAL_INIT
+#include "ittnotify_static.h"
+
+#undef ITT_STUB
+#undef ITT_STUBV
+#define ITT_STUB(api,type,name,args,params,ptr,group,format)   \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
+typedef type api ITT_JOIN(_N_(name),_t) args;                  \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END
+
+#define ITT_STUBV(api,type,name,args,params,ptr,group,format)  \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
+typedef type api ITT_JOIN(_N_(name),_t) args;                  \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END
+
+#define __ITT_INTERNAL_INIT
+#include "ittnotify_static.h"
+#undef __ITT_INTERNAL_INIT
+
+ITT_GROUP_LIST(group_list);
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_group_alias
+{
+    const char*    env_var;
+    __itt_group_id groups;
+} __itt_group_alias;
+
+static __itt_group_alias group_alias[] = {
+    { "KMP_FOR_TPROFILE", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync  | __itt_group_mark) },
+    { "KMP_FOR_TCHECK",   (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync  | __itt_group_fsync | __itt_group_mark | __itt_group_suppress) },
+    { NULL,               (__itt_group_none) },
+    { api_version,        (__itt_group_none) } /* !!! Just to avoid unused code elimination !!! */
+};
+
+#pragma pack(pop)
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(push)
+#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+static __itt_api_info api_list[] = {
+/* Define functions with static implementation */
+#undef ITT_STUB
+#undef ITT_STUBV
+#define ITT_STUB(api,type,name,args,params,nameindll,group,format) { ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (__itt_group_id)(group)},
+#define ITT_STUBV ITT_STUB
+#define __ITT_INTERNAL_INIT
+#include "ittnotify_static.h"
+#undef __ITT_INTERNAL_INIT
+/* Define functions without static implementation */
+#undef ITT_STUB
+#undef ITT_STUBV
+#define ITT_STUB(api,type,name,args,params,nameindll,group,format) {ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), NULL, (__itt_group_id)(group)},
+#define ITT_STUBV ITT_STUB
+#include "ittnotify_static.h"
+    {NULL, NULL, NULL, NULL, __itt_group_none}
+};
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(pop)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/* private, init thread info item. used for internal purposes */
+static __itt_thread_info init_thread_info = {
+    (const char*)NULL,                        /* nameA */
+#if defined(UNICODE) || defined(_UNICODE)
+    (const wchar_t*)NULL,                     /* nameW */
+#else
+    (void*)NULL,                              /* nameW */
+#endif
+    0,                                        /* tid */
+    __itt_thread_normal,                      /* state */
+    0,                                        /* extra1 */
+    (void*)NULL,                              /* extra2 */
+    (__itt_thread_info*)NULL                  /* next */
+};
+
+/* private, NULL domain item. used for internal purposes */
+static __itt_domain null_domain = {
+    0,                                        /* flags:  disabled by default */
+    (const char*)NULL,                        /* nameA */
+#if defined(UNICODE) || defined(_UNICODE)
+    (const wchar_t*)NULL,                     /* nameW */
+#else
+    (void*)NULL,                              /* nameW */
+#endif
+    0,                                        /* extra1 */
+    (void*)NULL,                              /* extra2 */
+    (__itt_domain*)NULL                       /* next */
+};
+
+/* private, NULL string handle item. used for internal purposes */
+static __itt_string_handle null_string_handle = {
+    (const char*)NULL,                        /* strA */
+#if defined(UNICODE) || defined(_UNICODE)
+    (const wchar_t*)NULL,                     /* strW */
+#else
+    (void*)NULL,                              /* strW */
+#endif
+    0,                                        /* extra1 */
+    (void*)NULL,                              /* extra2 */
+    (__itt_string_handle*)NULL                /* next */
+};
+
+static const char dll_path[PATH_MAX] = { 0 };
+
+/* static part of the descriptor; holds all notification API attributes */
+__itt_global _N_(_ittapi_global) = {
+    ITT_MAGIC,                                     /* identification info */
+    ITT_MAJOR, ITT_MINOR, API_VERSION_BUILD,       /* version info */
+    0,                                             /* api_initialized */
+    0,                                             /* mutex_initialized */
+    0,                                             /* atomic_counter */
+    MUTEX_INITIALIZER,                             /* mutex */
+    NULL,                                          /* dynamic library handle */
+    NULL,                                          /* error_handler */
+    (const char**)&dll_path,                       /* dll_path_ptr */
+    (__itt_api_info*)&api_list,                    /* api_list_ptr */
+    NULL,                                          /* next __itt_global */
+    (__itt_thread_info*)&init_thread_info,         /* thread_list */
+    (__itt_domain*)&null_domain,                   /* domain_list */
+    (__itt_string_handle*)&null_string_handle,     /* string_list */
+    __itt_collection_normal                        /* collection state */
+};
+
+typedef void (__itt_api_init_t)(__itt_global*, __itt_group_id);
+typedef void (__itt_api_fini_t)(__itt_global*);
+
+/* ========================================================================= */
+
+#ifdef ITT_NOTIFY_EXT_REPORT
+ITT_EXTERN_C void _N_(error_handler)(__itt_error_code, va_list args);
+#endif /* ITT_NOTIFY_EXT_REPORT */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(push)
+#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+static void __itt_report_error(__itt_error_code code, ...)
+{
+    va_list args;
+    va_start(args, code);
+    if (_N_(_ittapi_global).error_handler != NULL)
+    {
+        __itt_error_handler_t* handler = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler;
+        handler(code, args);
+    }
+#ifdef ITT_NOTIFY_EXT_REPORT
+    _N_(error_handler)(code, args);
+#endif /* ITT_NOTIFY_EXT_REPORT */
+    va_end(args);
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(pop)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init))(const wchar_t* name)
+{
+    __itt_domain *h_tail, *h;
+
+    if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list->tid == 0)
+    {
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+        if (ITTNOTIFY_NAME(domain_createW) && ITTNOTIFY_NAME(domain_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init)))
+            return ITTNOTIFY_NAME(domain_createW)(name);
+    }
+
+    if (name == NULL)
+        return _N_(_ittapi_global).domain_list;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next)
+        if (h->nameW != NULL && !wcscmp(h->nameW, name))
+            break;
+    if (h == NULL) {
+        NEW_DOMAIN_W(&_N_(_ittapi_global),h,h_tail,name);
+    }
+    __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return h;
+}
+
+static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init))(const char* name)
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init))(const char* name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+    __itt_domain *h_tail, *h;
+
+    if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list->tid == 0)
+    {
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        if (ITTNOTIFY_NAME(domain_createA) && ITTNOTIFY_NAME(domain_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init)))
+            return ITTNOTIFY_NAME(domain_createA)(name);
+#else
+        if (ITTNOTIFY_NAME(domain_create) && ITTNOTIFY_NAME(domain_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init)))
+            return ITTNOTIFY_NAME(domain_create)(name);
+#endif
+    }
+
+    if (name == NULL)
+        return _N_(_ittapi_global).domain_list;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next)
+        if (h->nameA != NULL && !__itt_fstrcmp(h->nameA, name))
+            break;
+    if (h == NULL) {
+        NEW_DOMAIN_A(&_N_(_ittapi_global),h,h_tail,name);
+    }
+    __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return h;
+}
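+
+/* Behavior sketch (assuming no collector library has re-targeted the
+ * pointer): repeated calls with the same name return the cached list node:
+ *
+ *     __itt_domain* a = __itt_domain_create("my.domain");
+ *     __itt_domain* b = __itt_domain_create("my.domain");
+ *     // a == b: the second call finds the node already in domain_list
+ */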
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init))(const wchar_t* name)
+{
+    __itt_string_handle *h_tail, *h;
+
+    if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list->tid == 0)
+    {
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+        if (ITTNOTIFY_NAME(string_handle_createW) && ITTNOTIFY_NAME(string_handle_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init)))
+            return ITTNOTIFY_NAME(string_handle_createW)(name);
+    }
+
+    if (name == NULL)
+        return _N_(_ittapi_global).string_list;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next)
+        if (h->strW != NULL && !wcscmp(h->strW, name))
+            break;
+    if (h == NULL) {
+        NEW_STRING_HANDLE_W(&_N_(_ittapi_global),h,h_tail,name);
+    }
+    __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return h;
+}
+
+static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init))(const char* name)
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init))(const char* name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+    __itt_string_handle *h_tail, *h;
+
+    if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list->tid == 0)
+    {
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        if (ITTNOTIFY_NAME(string_handle_createA) && ITTNOTIFY_NAME(string_handle_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init)))
+            return ITTNOTIFY_NAME(string_handle_createA)(name);
+#else
+        if (ITTNOTIFY_NAME(string_handle_create) && ITTNOTIFY_NAME(string_handle_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init)))
+            return ITTNOTIFY_NAME(string_handle_create)(name);
+#endif
+    }
+
+    if (name == NULL)
+        return _N_(_ittapi_global).string_list;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next)
+        if (h->strA != NULL && !__itt_fstrcmp(h->strA, name))
+            break;
+    if (h == NULL) {
+        NEW_STRING_HANDLE_A(&_N_(_ittapi_global),h,h_tail,name);
+    }
+    __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return h;
+}
+
+/* -------------------------------------------------------------------------- */
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init))(void)
+{
+    if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list->tid == 0)
+    {
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+        if (ITTNOTIFY_NAME(pause) && ITTNOTIFY_NAME(pause) != ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init)))
+        {
+            ITTNOTIFY_NAME(pause)();
+            return;
+        }
+    }
+    _N_(_ittapi_global).state = __itt_collection_paused;
+}
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init))(void)
+{
+    if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list->tid == 0)
+    {
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+        if (ITTNOTIFY_NAME(resume) && ITTNOTIFY_NAME(resume) != ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init)))
+        {
+            ITTNOTIFY_NAME(resume)();
+            return;
+        }
+    }
+    _N_(_ittapi_global).state = __itt_collection_normal;
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(const wchar_t* name)
+{
+    TIDT tid = __itt_thread_id();
+    __itt_thread_info *h_tail, *h;
+
+    if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list->tid == 0)
+    {
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+        if (ITTNOTIFY_NAME(thread_set_nameW) && ITTNOTIFY_NAME(thread_set_nameW) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init)))
+        {
+            ITTNOTIFY_NAME(thread_set_nameW)(name);
+            return;
+        }
+    }
+
+    __itt_mutex_lock(&_N_(_ittapi_global).mutex);
+    for (h_tail = NULL, h = _N_(_ittapi_global).thread_list; h != NULL; h_tail = h, h = h->next)
+        if (h->tid == tid)
+            break;
+    if (h == NULL) {
+        NEW_THREAD_INFO_W(&_N_(_ittapi_global), h, h_tail, tid, __itt_thread_normal, name);
+    }
+    else
+    {
+        h->nameW = name ? _wcsdup(name) : NULL;
+    }
+    __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+}
+
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setW),_init))(const wchar_t* name, int namelen)
+{
+    namelen = namelen; /* suppress the "unused parameter" warning */
+    ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(name);
+    return 0;
+}
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(const char* name)
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(const char* name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+    TIDT tid = __itt_thread_id();
+    __itt_thread_info *h_tail, *h;
+
+    if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list->tid == 0)
+    {
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        if (ITTNOTIFY_NAME(thread_set_nameA) && ITTNOTIFY_NAME(thread_set_nameA) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init)))
+        {
+            ITTNOTIFY_NAME(thread_set_nameA)(name);
+            return;
+        }
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        if (ITTNOTIFY_NAME(thread_set_name) && ITTNOTIFY_NAME(thread_set_name) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init)))
+        {
+            ITTNOTIFY_NAME(thread_set_name)(name);
+            return;
+        }
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    }
+
+    __itt_mutex_lock(&_N_(_ittapi_global).mutex);
+    for (h_tail = NULL, h = _N_(_ittapi_global).thread_list; h != NULL; h_tail = h, h = h->next)
+        if (h->tid == tid)
+            break;
+    if (h == NULL) {
+        NEW_THREAD_INFO_A(&_N_(_ittapi_global), h, h_tail, tid, __itt_thread_normal, name);
+    }
+    else
+    {
+        h->nameA = name ? __itt_fstrdup(name) : NULL;
+    }
+    __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setA),_init))(const char* name, int namelen)
+{
+    namelen = namelen; /* suppress the "unused parameter" warning */
+    ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(name);
+    return 0;
+}
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_set),_init))(const char* name, int namelen)
+{
+    namelen = namelen; /* suppress the "unused parameter" warning */
+    ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(name);
+    return 0;
+}
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))(void)
+{
+    TIDT tid = __itt_thread_id();
+    __itt_thread_info *h_tail, *h;
+
+    if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list->tid == 0)
+    {
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+        if (ITTNOTIFY_NAME(thread_ignore) && ITTNOTIFY_NAME(thread_ignore) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init)))
+        {
+            ITTNOTIFY_NAME(thread_ignore)();
+            return;
+        }
+    }
+
+    __itt_mutex_lock(&_N_(_ittapi_global).mutex);
+    for (h_tail = NULL, h = _N_(_ittapi_global).thread_list; h != NULL; h_tail = h, h = h->next)
+        if (h->tid == tid)
+            break;
+    if (h == NULL) {
+        static const char* name = "unknown";
+        NEW_THREAD_INFO_A(&_N_(_ittapi_global), h, h_tail, tid, __itt_thread_ignored, name);
+    }
+    else
+    {
+        h->state = __itt_thread_ignored;
+    }
+    __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+}
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_ignore),_init))(void)
+{
+    ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))();
+}
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(enable_attach),_init))(void)
+{
+#ifdef __ANDROID__
+    /*
+     * If the LIB_VAR_NAME environment variable was set before, keep its
+     * previous value; otherwise set the default path.
+     */
+    setenv(ITT_TO_STR(LIB_VAR_NAME), ANDROID_ITTNOTIFY_DEFAULT_PATH, 0);
+#endif
+}
+
+/* -------------------------------------------------------------------------- */
+
+static const char* __itt_fsplit(const char* s, const char* sep, const char** out, int* len)
+{
+    int i;
+    int j;
+
+    if (!s || !sep || !out || !len)
+        return NULL;
+
+    for (i = 0; s[i]; i++)
+    {
+        int b = 0;
+        for (j = 0; sep[j]; j++)
+            if (s[i] == sep[j])
+            {
+                b = 1;
+                break;
+            }
+        if (!b)
+            break;
+    }
+
+    if (!s[i])
+        return NULL;
+
+    *len = 0;
+    *out = &s[i];
+
+    for (; s[i]; i++, (*len)++)
+    {
+        int b = 0;
+        for (j = 0; sep[j]; j++)
+            if (s[i] == sep[j])
+            {
+                b = 1;
+                break;
+            }
+        if (b)
+            break;
+    }
+
+    for (; s[i]; i++)
+    {
+        int b = 0;
+        for (j = 0; sep[j]; j++)
+            if (s[i] == sep[j])
+            {
+                b = 1;
+                break;
+            }
+        if (!b)
+            break;
+    }
+
+    return &s[i];
+}
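+
+/* Example: with s = "a,b;;c" and sep = ",; " the first call sets *out to
+ * the "a" inside s and *len to 1, and returns a pointer to "b;;c"; feeding
+ * the return value back in yields "b", then "c", and finally NULL. */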
+
+/* This function returns the value of an environment variable, copied into a
+ * static buffer.
+ * !!! The same static buffer is used for subsequent calls. !!!
+ * This was done to avoid dynamic allocation for the few calls that need it.
+ * In fact this function is needed only four times.
+ */
+static const char* __itt_get_env_var(const char* name)
+{
+#define MAX_ENV_VALUE_SIZE 4086
+    static char  env_buff[MAX_ENV_VALUE_SIZE];
+    static char* env_value = (char*)env_buff;
+
+    if (name != NULL)
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff);
+        DWORD rc = GetEnvironmentVariableA(name, env_value, (DWORD)max_len);
+        if (rc >= max_len)
+            __itt_report_error(__itt_error_env_too_long, name, (size_t)rc - 1, (size_t)(max_len - 1));
+        else if (rc > 0)
+        {
+            const char* ret = (const char*)env_value;
+            env_value += rc + 1;
+            return ret;
+        }
+        else
+        {
+            /* If the environment variable is empty, GetEnvironmentVariableA()
+             * returns zero (the number of characters, not including the
+             * terminating NUL) and GetLastError() returns ERROR_SUCCESS. */
+            DWORD err = GetLastError();
+            if (err == ERROR_SUCCESS)
+                return env_value;
+
+            if (err != ERROR_ENVVAR_NOT_FOUND)
+                __itt_report_error(__itt_error_cant_read_env, name, (int)err);
+        }
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+        char* env = getenv(name);
+        if (env != NULL)
+        {
+            size_t len = strlen(env);
+            size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff);
+            if (len < max_len)
+            {
+                const char* ret = (const char*)env_value;
+                strncpy(env_value, env, len + 1);
+                env_value += len + 1;
+                return ret;
+            } else
+                __itt_report_error(__itt_error_env_too_long, name, (size_t)len, (size_t)(max_len - 1));
+        }
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    }
+    return NULL;
+}
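+
+/* Note: each successful lookup bumps env_value past the value it stored, so
+ * successive calls hand out disjoint slices of the one static buffer until
+ * MAX_ENV_VALUE_SIZE is exhausted. */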
+
+static const char* __itt_get_lib_name(void)
+{
+    const char* lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME));
+
+#ifdef __ANDROID__
+    if (lib_name == NULL)
+    {
+        const char* const system_wide_marker_filename = "/data/local/tmp/com.intel.itt.collector_lib";
+        int itt_marker_file_fd = open(system_wide_marker_filename, O_RDONLY);
+        ssize_t res = 0;
+
+        if (itt_marker_file_fd == -1)
+        {
+            const pid_t my_pid = getpid();
+            char cmdline_path[PATH_MAX] = {0};
+            char package_name[PATH_MAX] = {0};
+            char app_sandbox_file[PATH_MAX] = {0};
+            int cmdline_fd = 0;
+
+            ITT_ANDROID_LOGI("Unable to open system-wide marker file.");
+            snprintf(cmdline_path, PATH_MAX - 1, "/proc/%d/cmdline", my_pid);
+            ITT_ANDROID_LOGI("CMD file: %s\n", cmdline_path);
+            cmdline_fd = open(cmdline_path, O_RDONLY);
+            if (cmdline_fd == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to open %s file!", cmdline_path);
+                return lib_name;
+            }
+            res = read(cmdline_fd, package_name, PATH_MAX - 1);
+            if (res == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to read %s file!", cmdline_path);
+                res = close(cmdline_fd);
+                if (res == -1)
+                {
+                    ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path);
+                }
+                return lib_name;
+            }
+            res = close(cmdline_fd);
+            if (res == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path);
+                return lib_name;
+            }
+            ITT_ANDROID_LOGI("Package name: %s\n", package_name);
+            snprintf(app_sandbox_file, PATH_MAX - 1, "/data/data/%s/com.intel.itt.collector_lib", package_name);
+            ITT_ANDROID_LOGI("Lib marker file name: %s\n", app_sandbox_file);
+            itt_marker_file_fd = open(app_sandbox_file, O_RDONLY);
+            if (itt_marker_file_fd == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to open app marker file!");
+                return lib_name;
+            }
+        }
+
+        {
+            char itt_lib_name[PATH_MAX] = {0};
+
+            res = read(itt_marker_file_fd, itt_lib_name, PATH_MAX - 1);
+            if (res == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to read %s file!", itt_marker_file_fd);
+                res = close(itt_marker_file_fd);
+                if (res == -1)
+                {
+                    ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd);
+                }
+                return lib_name;
+            }
+            ITT_ANDROID_LOGI("ITT Lib path: %s", itt_lib_name);
+            res = close(itt_marker_file_fd);
+            if (res == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd);
+                return lib_name;
+            }
+            ITT_ANDROID_LOGI("Set env");
+            res = setenv(ITT_TO_STR(LIB_VAR_NAME), itt_lib_name, 0);
+            if (res == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to set env var!");
+                return lib_name;
+            }
+            lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME));
+            ITT_ANDROID_LOGI("ITT Lib path from env: %s", itt_lib_name);
+        }
+    }
+#endif
+
+    return lib_name;
+}
+
+#ifndef min
+#define min(a,b) (a) < (b) ? (a) : (b)
+#endif /* min */
+
+static __itt_group_id __itt_get_groups(void)
+{
+    register int i;
+    __itt_group_id res = __itt_group_none;
+    const char* var_name  = "INTEL_ITTNOTIFY_GROUPS";
+    const char* group_str = __itt_get_env_var(var_name);
+
+    if (group_str != NULL)
+    {
+        int len;
+        char gr[255];
+        const char* chunk;
+        while ((group_str = __itt_fsplit(group_str, ",; ", &chunk, &len)) != NULL)
+        {
+            __itt_fstrcpyn(gr, chunk, sizeof(gr) - 1);
+            gr[min(len, (int)(sizeof(gr) - 1))] = 0;
+
+            for (i = 0; group_list[i].name != NULL; i++)
+            {
+                if (!__itt_fstrcmp(gr, group_list[i].name))
+                {
+                    res = (__itt_group_id)(res | group_list[i].id);
+                    break;
+                }
+            }
+        }
+        /* TODO: !!! Workaround for the bug with the warning on an unknown group !!!
+         * Should be fixed in the new initialization scheme.
+         * For now the following groups must always be set. */
+        for (i = 0; group_list[i].id != __itt_group_none; i++)
+            if (group_list[i].id != __itt_group_all &&
+                group_list[i].id > __itt_group_splitter_min &&
+                group_list[i].id < __itt_group_splitter_max)
+                res = (__itt_group_id)(res | group_list[i].id);
+        return res;
+    }
+    else
+    {
+        for (i = 0; group_alias[i].env_var != NULL; i++)
+            if (__itt_get_env_var(group_alias[i].env_var) != NULL)
+                return group_alias[i].groups;
+    }
+
+    return res;
+}
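+
+/* Example (assuming the default group names from ITT_GROUP_LIST): setting
+ * INTEL_ITTNOTIFY_GROUPS="sync,mark" yields __itt_group_sync |
+ * __itt_group_mark, plus the always-enabled splitter-range groups added by
+ * the workaround above. */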
+
+static int __itt_lib_version(lib_t lib)
+{
+    if (lib == NULL)
+        return 0;
+    if (__itt_get_proc(lib, "__itt_api_init"))
+        return 2;
+    if (__itt_get_proc(lib, "__itt_api_version"))
+        return 1;
+    return 0;
+}
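+
+/* A version-2 collector exports __itt_api_init and is handed the whole
+ * descriptor; a version-1 collector exports only __itt_api_version and is
+ * resolved symbol-by-symbol; anything else is treated as legacy (0). */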
+
+/* Not used right now! Commented out to avoid unused-function warnings.
+static void __itt_reinit_all_pointers(void)
+{
+    register int i;
+    // Fill all pointers with initial stubs
+    for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+        *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].init_func;
+}
+*/
+
+static void __itt_nullify_all_pointers(void)
+{
+    register int i;
+    /* Nullify all pointers except domain_create and string_handle_create */
+    for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+        *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(push)
+#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */
+#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ITT_EXTERN_C void _N_(fini_ittlib)(void)
+{
+    __itt_api_fini_t* __itt_api_fini_ptr;
+    static volatile TIDT current_thread = 0;
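+    /* current_thread stays non-zero while a thread is inside finalization;
+     * this appears to guard against recursive re-entry, e.g. from callbacks
+     * fired during __itt_api_fini. */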
+
+    if (_N_(_ittapi_global).api_initialized)
+    {
+        __itt_mutex_lock(&_N_(_ittapi_global).mutex);
+        if (_N_(_ittapi_global).api_initialized)
+        {
+            if (current_thread == 0)
+            {
+                current_thread = __itt_thread_id();
+                __itt_api_fini_ptr = (__itt_api_fini_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_fini");
+                if (__itt_api_fini_ptr)
+                    __itt_api_fini_ptr(&_N_(_ittapi_global));
+
+                __itt_nullify_all_pointers();
+
+                /* TODO: !!! not safe !!! unload is not supported so far.
+                 *     if (_N_(_ittapi_global).lib != NULL)
+                 *         __itt_unload_lib(_N_(_ittapi_global).lib);
+                 *     _N_(_ittapi_global).lib = NULL;
+                 */
+                _N_(_ittapi_global).api_initialized = 0;
+                current_thread = 0;
+            }
+        }
+        __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    }
+}
+
+ITT_EXTERN_C int _N_(init_ittlib)(const char* lib_name, __itt_group_id init_groups)
+{
+    register int i;
+    __itt_group_id groups;
+#ifdef ITT_COMPLETE_GROUP
+    __itt_group_id zero_group = __itt_group_none;
+#endif /* ITT_COMPLETE_GROUP */
+    static volatile TIDT current_thread = 0;
+
+    if (!_N_(_ittapi_global).api_initialized)
+    {
+#ifndef ITT_SIMPLE_INIT
+        ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+#endif /* ITT_SIMPLE_INIT */
+
+        if (!_N_(_ittapi_global).api_initialized)
+        {
+            if (current_thread == 0)
+            {
+                current_thread = __itt_thread_id();
+                _N_(_ittapi_global).thread_list->tid = current_thread;
+                if (lib_name == NULL)
+                    lib_name = __itt_get_lib_name();
+                groups = __itt_get_groups();
+                if (groups != __itt_group_none || lib_name != NULL)
+                {
+                    _N_(_ittapi_global).lib = __itt_load_lib((lib_name == NULL) ? ittnotify_lib_name : lib_name);
+
+                    if (_N_(_ittapi_global).lib != NULL)
+                    {
+                        __itt_api_init_t* __itt_api_init_ptr;
+                        int lib_version = __itt_lib_version(_N_(_ittapi_global).lib);
+
+                        switch (lib_version) {
+                        case 0:
+                            groups = __itt_group_legacy;
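+                            /* no break: a version-0 (legacy) library is
+                             * resolved symbol-by-symbol exactly like a
+                             * version-1 library below. */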
+                        case 1:
+                            /* Fill all pointers from dynamic library */
+                            for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+                            {
+                                if (_N_(_ittapi_global).api_list_ptr[i].group & groups & init_groups)
+                                {
+                                    *_N_(_ittapi_global).api_list_ptr[i].func_ptr = (void*)__itt_get_proc(_N_(_ittapi_global).lib, _N_(_ittapi_global).api_list_ptr[i].name);
+                                    if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr == NULL)
+                                    {
+                                        /* Restore pointers for functions with a static implementation */
+                                        *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+                                        __itt_report_error(__itt_error_no_symbol, lib_name, _N_(_ittapi_global).api_list_ptr[i].name);
+#ifdef ITT_COMPLETE_GROUP
+                                        zero_group = (__itt_group_id)(zero_group | _N_(_ittapi_global).api_list_ptr[i].group);
+#endif /* ITT_COMPLETE_GROUP */
+                                    }
+                                }
+                                else
+                                    *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+                            }
+
+                            if (groups == __itt_group_legacy)
+                            {
+                                /* Compatibility with legacy tools */
+                                ITTNOTIFY_NAME(thread_ignore)  = ITTNOTIFY_NAME(thr_ignore);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+                                ITTNOTIFY_NAME(sync_createA)   = ITTNOTIFY_NAME(sync_set_nameA);
+                                ITTNOTIFY_NAME(sync_createW)   = ITTNOTIFY_NAME(sync_set_nameW);
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+                                ITTNOTIFY_NAME(sync_create)    = ITTNOTIFY_NAME(sync_set_name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+                                ITTNOTIFY_NAME(sync_prepare)   = ITTNOTIFY_NAME(notify_sync_prepare);
+                                ITTNOTIFY_NAME(sync_cancel)    = ITTNOTIFY_NAME(notify_sync_cancel);
+                                ITTNOTIFY_NAME(sync_acquired)  = ITTNOTIFY_NAME(notify_sync_acquired);
+                                ITTNOTIFY_NAME(sync_releasing) = ITTNOTIFY_NAME(notify_sync_releasing);
+                            }
+
+#ifdef ITT_COMPLETE_GROUP
+                            for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+                                if (_N_(_ittapi_global).api_list_ptr[i].group & zero_group)
+                                    *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+#endif /* ITT_COMPLETE_GROUP */
+                            break;
+                        case 2:
+                            __itt_api_init_ptr = (__itt_api_init_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_init");
+                            if (__itt_api_init_ptr)
+                                __itt_api_init_ptr(&_N_(_ittapi_global), init_groups);
+                            break;
+                        }
+                    }
+                    else
+                    {
+                        __itt_nullify_all_pointers();
+
+                        __itt_report_error(__itt_error_no_module, lib_name,
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+                            __itt_system_error()
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+                            dlerror()
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+                        );
+                    }
+                }
+                else
+                {
+                    __itt_nullify_all_pointers();
+                }
+                _N_(_ittapi_global).api_initialized = 1;
+                current_thread = 0;
+                /* !!! Just to avoid unused code elimination !!! */
+                if (__itt_fini_ittlib_ptr == _N_(fini_ittlib)) current_thread = 0;
+            }
+        }
+
+#ifndef ITT_SIMPLE_INIT
+        __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+#endif /* ITT_SIMPLE_INIT */
+    }
+
+    /* Evaluate whether any function pointer is non-empty and belongs to init_groups */
+    for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+        if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr != _N_(_ittapi_global).api_list_ptr[i].null_func &&
+            _N_(_ittapi_global).api_list_ptr[i].group & init_groups)
+            return 1;
+    return 0;
+}
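+
+/* Return value sketch: 1 if at least one API pointer in init_groups was
+ * re-targeted away from its no-op stub (i.e. a collector entry point was
+ * resolved), 0 if everything still points at the stubs. */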
+
+ITT_EXTERN_C __itt_error_handler_t* _N_(set_error_handler)(__itt_error_handler_t* handler)
+{
+    __itt_error_handler_t* prev = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler;
+    _N_(_ittapi_global).error_handler = (void*)(size_t)handler;
+    return prev;
+}
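+
+/* Usage sketch (hypothetical handler; the public name below assumes the
+ * default INTEL_ITTNOTIFY_PREFIX of "__itt_"):
+ *
+ *     static void my_handler(__itt_error_code code, va_list args)
+ *     { (void)code; (void)args; }
+ *
+ *     __itt_error_handler_t* prev = __itt_set_error_handler(my_handler);
+ */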
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#pragma warning(pop)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
diff --git a/final/runtime/src/thirdparty/ittnotify/ittnotify_static.h b/final/runtime/src/thirdparty/ittnotify/ittnotify_static.h
new file mode 100644
index 0000000..a218cc8
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/ittnotify_static.h
@@ -0,0 +1,316 @@
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ittnotify_config.h"
+
+#ifndef ITT_FORMAT_DEFINED
+#  ifndef ITT_FORMAT
+#    define ITT_FORMAT
+#  endif /* ITT_FORMAT */
+#  ifndef ITT_NO_PARAMS
+#    define ITT_NO_PARAMS
+#  endif /* ITT_NO_PARAMS */
+#endif /* ITT_FORMAT_DEFINED */
+
+/*
+ * parameters for macro expected:
+ * ITT_STUB(api, type, func_name, arguments, params, func_name_in_dll, group, printf_fmt)
+ */
+#ifdef __ITT_INTERNAL_INIT
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char    *name), (ITT_FORMAT name), domain_createA, __itt_group_structure, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name), (ITT_FORMAT name), domain_createW, __itt_group_structure, "\"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_domain*, domain_create,  (const char    *name), (ITT_FORMAT name), domain_create,  __itt_group_structure, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char    *name), (ITT_FORMAT name), string_handle_createA, __itt_group_structure, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name), (ITT_FORMAT name), string_handle_createW, __itt_group_structure, "\"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create,  (const char    *name), (ITT_FORMAT name), string_handle_create,  __itt_group_structure, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ITT_STUBV(ITTAPI, void, pause,  (void), (ITT_NO_PARAMS), pause,  __itt_group_control | __itt_group_legacy, "no args")
+ITT_STUBV(ITTAPI, void, resume, (void), (ITT_NO_PARAMS), resume, __itt_group_control | __itt_group_legacy, "no args")
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char    *name), (ITT_FORMAT name), thread_set_nameA, __itt_group_thread, "\"%s\"")
+ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name), (ITT_FORMAT name), thread_set_nameW, __itt_group_thread, "\"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, thread_set_name,  (const char    *name), (ITT_FORMAT name), thread_set_name,  __itt_group_thread, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, thread_ignore, (void), (ITT_NO_PARAMS), thread_ignore, __itt_group_thread, "no args")
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int,  thr_name_setA, (const char    *name, int namelen), (ITT_FORMAT name, namelen), thr_name_setA, __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
+ITT_STUB(LIBITTAPI, int,  thr_name_setW, (const wchar_t *name, int namelen), (ITT_FORMAT name, namelen), thr_name_setW, __itt_group_thread | __itt_group_legacy, "\"%S\", %d")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int,  thr_name_set,  (const char    *name, int namelen), (ITT_FORMAT name, namelen), thr_name_set,  __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(LIBITTAPI, void, thr_ignore,   (void),                             (ITT_NO_PARAMS),            thr_ignore,    __itt_group_thread | __itt_group_legacy, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+
+ITT_STUBV(ITTAPI, void, enable_attach, (void), (ITT_NO_PARAMS), enable_attach, __itt_group_all, "no args")
+
+#else  /* __ITT_INTERNAL_INIT */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char    *objtype, const char    *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_createA, __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_createW, __itt_group_sync | __itt_group_fsync, "%p, \"%S\", \"%S\", %x")
+ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char    *name), (ITT_FORMAT addr, name), sync_renameA, __itt_group_sync | __itt_group_fsync, "%p, \"%s\"")
+ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name), (ITT_FORMAT addr, name), sync_renameW, __itt_group_sync | __itt_group_fsync, "%p, \"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_create,  (void *addr, const char    *objtype, const char    *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_create,  __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_rename,  (void *addr, const char    *name), (ITT_FORMAT addr, name), sync_rename,  __itt_group_sync | __itt_group_fsync, "%p, \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_destroy,    (void *addr), (ITT_FORMAT addr), sync_destroy,   __itt_group_sync | __itt_group_fsync, "%p")
+
+ITT_STUBV(ITTAPI, void, sync_prepare,    (void* addr), (ITT_FORMAT addr), sync_prepare,   __itt_group_sync,  "%p")
+ITT_STUBV(ITTAPI, void, sync_cancel,     (void *addr), (ITT_FORMAT addr), sync_cancel,    __itt_group_sync,  "%p")
+ITT_STUBV(ITTAPI, void, sync_acquired,   (void *addr), (ITT_FORMAT addr), sync_acquired,  __itt_group_sync,  "%p")
+ITT_STUBV(ITTAPI, void, sync_releasing,  (void* addr), (ITT_FORMAT addr), sync_releasing, __itt_group_sync,  "%p")
+
+ITT_STUBV(ITTAPI, void, suppress_push,       (unsigned int mask),                             (ITT_FORMAT mask), suppress_push,  __itt_group_suppress,  "%p")
+ITT_STUBV(ITTAPI, void, suppress_pop,        (void),                                          (ITT_NO_PARAMS),   suppress_pop,   __itt_group_suppress,  "no args")
+ITT_STUBV(ITTAPI, void, suppress_mark_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size),(ITT_FORMAT mode, mask, address, size), suppress_mark_range, __itt_group_suppress, "%d, %p, %p, %d")
+ITT_STUBV(ITTAPI, void, suppress_clear_range,(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size),(ITT_FORMAT mode, mask, address, size), suppress_clear_range,__itt_group_suppress, "%d, %p, %p, %d")
+
+ITT_STUBV(ITTAPI, void, fsync_prepare,   (void* addr), (ITT_FORMAT addr), sync_prepare,   __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_cancel,    (void *addr), (ITT_FORMAT addr), sync_cancel,    __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_acquired,  (void *addr), (ITT_FORMAT addr), sync_acquired,  __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_releasing, (void* addr), (ITT_FORMAT addr), sync_releasing, __itt_group_fsync, "%p")
+
+ITT_STUBV(ITTAPI, void, model_site_begin,          (__itt_model_site *site, __itt_model_site_instance *instance, const char *name), (ITT_FORMAT site, instance, name), model_site_begin, __itt_group_model, "%p, %p, \"%s\"")
+ITT_STUBV(ITTAPI, void, model_site_end,            (__itt_model_site *site, __itt_model_site_instance *instance),                   (ITT_FORMAT site, instance),       model_site_end,   __itt_group_model, "%p, %p")
+ITT_STUBV(ITTAPI, void, model_task_begin,          (__itt_model_task *task, __itt_model_task_instance *instance, const char *name), (ITT_FORMAT task, instance, name), model_task_begin, __itt_group_model, "%p, %p, \"%s\"")
+ITT_STUBV(ITTAPI, void, model_task_end,            (__itt_model_task *task, __itt_model_task_instance *instance),                   (ITT_FORMAT task, instance),       model_task_end,   __itt_group_model, "%p, %p")
+ITT_STUBV(ITTAPI, void, model_lock_acquire,        (void *lock), (ITT_FORMAT lock), model_lock_acquire, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_lock_release,        (void *lock), (ITT_FORMAT lock), model_lock_release, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_record_allocation,   (void *addr, size_t size), (ITT_FORMAT addr, size), model_record_allocation,   __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr),              (ITT_FORMAT addr),       model_record_deallocation, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_induction_uses,      (void* addr, size_t size), (ITT_FORMAT addr, size), model_induction_uses,      __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_reduction_uses,      (void* addr, size_t size), (ITT_FORMAT addr, size), model_reduction_uses,      __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_observe_uses,        (void* addr, size_t size), (ITT_FORMAT addr, size), model_observe_uses,        __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_clear_uses,          (void* addr),              (ITT_FORMAT addr),       model_clear_uses,          __itt_group_model, "%p")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_site_beginW,         (const wchar_t *name),     (ITT_FORMAT name),       model_site_beginW,         __itt_group_model, "\"%S\"")
+ITT_STUBV(ITTAPI, void, model_task_beginW,         (const wchar_t *name),     (ITT_FORMAT name),       model_task_beginW,         __itt_group_model, "\"%S\"")
+ITT_STUBV(ITTAPI, void, model_iteration_taskW,     (const wchar_t *name),     (ITT_FORMAT name),       model_iteration_taskW,     __itt_group_model, "\"%S\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, model_site_beginA,         (const char *name),        (ITT_FORMAT name),       model_site_beginA,         __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_site_beginAL,        (const char *name, size_t len), (ITT_FORMAT name, len), model_site_beginAL,    __itt_group_model, "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_task_beginA,         (const char *name),        (ITT_FORMAT name),       model_task_beginA,         __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_task_beginAL,        (const char *name, size_t len), (ITT_FORMAT name, len), model_task_beginAL,    __itt_group_model, "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_iteration_taskA,     (const char *name),        (ITT_FORMAT name),       model_iteration_taskA,     __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_iteration_taskAL,    (const char *name, size_t len), (ITT_FORMAT name, len), model_iteration_taskAL, __itt_group_model, "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_site_end_2,          (void),                    (ITT_NO_PARAMS),         model_site_end_2,          __itt_group_model, "no args")
+ITT_STUBV(ITTAPI, void, model_task_end_2,          (void),                    (ITT_NO_PARAMS),         model_task_end_2,          __itt_group_model, "no args")
+ITT_STUBV(ITTAPI, void, model_lock_acquire_2,      (void *lock),              (ITT_FORMAT lock),       model_lock_acquire_2,      __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_lock_release_2,      (void *lock),              (ITT_FORMAT lock),       model_lock_release_2,      __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_aggregate_task,      (size_t count),            (ITT_FORMAT count),      model_aggregate_task,      __itt_group_model, "%d")
+ITT_STUBV(ITTAPI, void, model_disable_push,        (__itt_model_disable x),   (ITT_FORMAT x),          model_disable_push,        __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_disable_pop,         (void),                    (ITT_NO_PARAMS),         model_disable_pop,         __itt_group_model, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char    *name, const char    *domain), (ITT_FORMAT name, domain), heap_function_createA, __itt_group_heap, "\"%s\", \"%s\"")
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t *name, const wchar_t *domain), (ITT_FORMAT name, domain), heap_function_createW, __itt_group_heap, "\"%S\", \"%S\"")
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create,  (const char    *name, const char    *domain), (ITT_FORMAT name, domain), heap_function_create,  __itt_group_heap, "\"%s\", \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+ITT_STUBV(ITTAPI, void, heap_allocate_begin,   (__itt_heap_function h, size_t size, int initialized),             (ITT_FORMAT h, size, initialized),       heap_allocate_begin, __itt_group_heap, "%p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_allocate_end,     (__itt_heap_function h, void** addr, size_t size, int initialized), (ITT_FORMAT h, addr, size, initialized), heap_allocate_end,   __itt_group_heap, "%p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_free_begin,       (__itt_heap_function h, void*  addr), (ITT_FORMAT h, addr), heap_free_begin, __itt_group_heap, "%p, %p")
+ITT_STUBV(ITTAPI, void, heap_free_end,         (__itt_heap_function h, void*  addr), (ITT_FORMAT h, addr), heap_free_end,   __itt_group_heap, "%p, %p")
+ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void*  addr, size_t new_size, int initialized),                  (ITT_FORMAT h, addr, new_size, initialized),           heap_reallocate_begin, __itt_group_heap, "%p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_reallocate_end,   (__itt_heap_function h, void*  addr, void** new_addr, size_t new_size, int initialized), (ITT_FORMAT h, addr, new_addr, new_size, initialized), heap_reallocate_end,   __itt_group_heap, "%p, %p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void), (ITT_NO_PARAMS), heap_internal_access_begin, __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_internal_access_end,   (void), (ITT_NO_PARAMS), heap_internal_access_end,   __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void), (ITT_NO_PARAMS), heap_record_memory_growth_begin, __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end,   (void), (ITT_NO_PARAMS), heap_record_memory_growth_end,   __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask),  (ITT_FORMAT reset_mask), heap_reset_detection, __itt_group_heap, "%u")
+ITT_STUBV(ITTAPI, void, heap_record,          (unsigned int record_mask), (ITT_FORMAT record_mask),  heap_record,        __itt_group_heap, "%u")
+
+ITT_STUBV(ITTAPI, void, id_create,  (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), id_create,  __itt_group_structure, "%p, %lu")
+ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), id_destroy, __itt_group_structure, "%p, %lu")
+
+ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void), (ITT_NO_PARAMS), get_timestamp,  __itt_group_structure, "no args")
+
+ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), region_begin, __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, region_end,   (const __itt_domain *domain, __itt_id id),                                             (ITT_FORMAT domain, id),               region_end,   __itt_group_structure, "%p, %lu")
+
+#ifndef __ITT_INTERNAL_BODY
+ITT_STUBV(ITTAPI, void, frame_begin_v3,  (const __itt_domain *domain, __itt_id *id),                                             (ITT_FORMAT domain, id),             frame_begin_v3,  __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, frame_end_v3,    (const __itt_domain *domain, __itt_id *id),                                             (ITT_FORMAT domain, id),             frame_end_v3,    __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end), (ITT_FORMAT domain, id, begin, end), frame_submit_v3, __itt_group_structure, "%p, %p, %lu, %lu")
+#endif /* __ITT_INTERNAL_BODY */
+
+ITT_STUBV(ITTAPI, void, task_group,   (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_group,  __itt_group_structure, "%p, %lu, %lu, %p")
+
+ITT_STUBV(ITTAPI, void, task_begin,    (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_begin,    __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parent, void* fn),                  (ITT_FORMAT domain, id, parent, fn),   task_begin_fn, __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end,      (const __itt_domain *domain),                                                          (ITT_FORMAT domain),                   task_end,      __itt_group_structure, "%p")
+
+ITT_STUBV(ITTAPI, void, counter_inc_v3,       (const __itt_domain *domain, __itt_string_handle *name),                           (ITT_FORMAT domain, name),        counter_inc_v3,       __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long value), (ITT_FORMAT domain, name, value), counter_inc_delta_v3, __itt_group_structure, "%p, %p, %lu")
+
+ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope), (ITT_FORMAT domain, id, name, scope), marker, __itt_group_structure, "%p, %lu, %p, %d")
+
+ITT_STUBV(ITTAPI, void, metadata_add,      (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data), (ITT_FORMAT domain, id, key, type, count, data), metadata_add, __itt_group_structure, "%p, %lu, %p, %d, %lu, %p")
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char* data, size_t length),    (ITT_FORMAT domain, id, key, data, length), metadata_str_addA, __itt_group_structure, "%p, %lu, %p, %p, %lu")
+ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t* data, size_t length), (ITT_FORMAT domain, id, key, data, length), metadata_str_addW, __itt_group_structure, "%p, %lu, %p, %p, %lu")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add,  (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char* data, size_t length),    (ITT_FORMAT domain, id, key, data, length), metadata_str_add,  __itt_group_structure, "%p, %lu, %p, %p, %lu")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail),                (ITT_FORMAT domain, relation, tail),       relation_add_to_current, __itt_group_structure, "%p, %lu, %p")
+ITT_STUBV(ITTAPI, void, relation_add,            (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, head, relation, tail), relation_add,            __itt_group_structure, "%p, %p, %lu, %p")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char    *name, int namelen), (ITT_FORMAT name, namelen), event_createA, __itt_group_mark | __itt_group_legacy, "\"%s\", %d")
+ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen), (ITT_FORMAT name, namelen), event_createW, __itt_group_mark | __itt_group_legacy, "\"%S\", %d")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create,  (const char    *name, int namelen), (ITT_FORMAT name, namelen), event_create,  __itt_group_mark | __itt_group_legacy, "\"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int,  event_start,          (__itt_event event),                (ITT_FORMAT event),         event_start,   __itt_group_mark | __itt_group_legacy, "%d")
+ITT_STUB(LIBITTAPI, int,  event_end,            (__itt_event event),                (ITT_FORMAT event),         event_end,     __itt_group_mark | __itt_group_legacy, "%d")
+#endif /* __ITT_INTERNAL_BODY */
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_set_nameA, (void *addr, const char    *objtype, const char    *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameA, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_set_nameW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameW, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%S\", \"%S\", %x")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_set_name,  (void *addr, const char    *objtype, const char    *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_name,  __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", \"%s\", %x")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, notify_sync_nameA, (void *p, const char    *objtype, int typelen, const char    *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_nameA, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", %d, \"%s\", %d, %x")
+ITT_STUB(LIBITTAPI, int, notify_sync_nameW, (void *p, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_nameW, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%S\", %d, \"%S\", %d, %x")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, notify_sync_name,  (void *p, const char    *objtype, int typelen, const char    *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_name,  __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", %d, \"%s\", %d, %x")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ITT_STUBV(LIBITTAPI, void, notify_sync_prepare,   (void *p), (ITT_FORMAT p), notify_sync_prepare,   __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_cancel,    (void *p), (ITT_FORMAT p), notify_sync_cancel,    __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_acquired,  (void *p), (ITT_FORMAT p), notify_sync_acquired,  __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_releasing, (void *p), (ITT_FORMAT p), notify_sync_releasing, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+#endif /* __ITT_INTERNAL_BODY */
+
+ITT_STUBV(LIBITTAPI, void, memory_read,   (void *addr, size_t size), (ITT_FORMAT addr, size), memory_read,   __itt_group_legacy, "%p, %lu")
+ITT_STUBV(LIBITTAPI, void, memory_write,  (void *addr, size_t size), (ITT_FORMAT addr, size), memory_write,  __itt_group_legacy, "%p, %lu")
+ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size), (ITT_FORMAT addr, size), memory_update, __itt_group_legacy, "%p, %lu")
+
+ITT_STUB(LIBITTAPI, __itt_state_t,     state_get,    (void),                                    (ITT_NO_PARAMS),   state_get,    __itt_group_legacy, "no args")
+ITT_STUB(LIBITTAPI, __itt_state_t,     state_set,    (__itt_state_t s),                         (ITT_FORMAT s),    state_set,    __itt_group_legacy, "%d")
+ITT_STUB(LIBITTAPI, __itt_obj_state_t, obj_mode_set, (__itt_obj_prop_t p, __itt_obj_state_t s), (ITT_FORMAT p, s), obj_mode_set, __itt_group_legacy, "%d, %d")
+ITT_STUB(LIBITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr_state_t s), (ITT_FORMAT p, s), thr_mode_set, __itt_group_legacy, "%d, %d")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char    *domain), (ITT_FORMAT domain), frame_createA, __itt_group_frame, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain), (ITT_FORMAT domain), frame_createW, __itt_group_frame, "\"%S\"")
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_frame, frame_create,  (const char    *domain), (ITT_FORMAT domain), frame_create,  __itt_group_frame, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+ITT_STUBV(ITTAPI, void, frame_begin,         (__itt_frame frame),     (ITT_FORMAT frame),  frame_begin,   __itt_group_frame, "%p")
+ITT_STUBV(ITTAPI, void, frame_end,           (__itt_frame frame),     (ITT_FORMAT frame),  frame_end,     __itt_group_frame, "%p")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char    *name, const char    *domain), (ITT_FORMAT name, domain), counter_createA, __itt_group_counter, "\"%s\", \"%s\"")
+ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain), (ITT_FORMAT name, domain), counter_createW, __itt_group_counter, "\"%S\", \"%S\"")
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create,  (const char    *name, const char    *domain), (ITT_FORMAT name, domain), counter_create,  __itt_group_counter, "\"%s\", \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+ITT_STUBV(ITTAPI, void, counter_destroy,   (__itt_counter id),                           (ITT_FORMAT id),        counter_destroy,   __itt_group_counter, "%p")
+ITT_STUBV(ITTAPI, void, counter_inc,       (__itt_counter id),                           (ITT_FORMAT id),        counter_inc,       __itt_group_counter, "%p")
+ITT_STUBV(ITTAPI, void, counter_inc_delta, (__itt_counter id, unsigned long long value), (ITT_FORMAT id, value), counter_inc_delta, __itt_group_counter, "%p, %lu")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char    *name), (ITT_FORMAT name), mark_createA, __itt_group_mark, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name), (ITT_FORMAT name), mark_createW, __itt_group_mark, "\"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_mark_type, mark_create,  (const char    *name), (ITT_FORMAT name), mark_create,  __itt_group_mark, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int,  markA,        (__itt_mark_type mt, const char    *parameter), (ITT_FORMAT mt, parameter), markA, __itt_group_mark, "%d, \"%s\"")
+ITT_STUB(ITTAPI, int,  markW,        (__itt_mark_type mt, const wchar_t *parameter), (ITT_FORMAT mt, parameter), markW, __itt_group_mark, "%d, \"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int,  mark,         (__itt_mark_type mt, const char    *parameter), (ITT_FORMAT mt, parameter), mark,  __itt_group_mark, "%d, \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int,  mark_off, (__itt_mark_type mt), (ITT_FORMAT mt), mark_off, __itt_group_mark, "%d")
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int,  mark_globalA, (__itt_mark_type mt, const char    *parameter), (ITT_FORMAT mt, parameter), mark_globalA, __itt_group_mark, "%d, \"%s\"")
+ITT_STUB(ITTAPI, int,  mark_globalW, (__itt_mark_type mt, const wchar_t *parameter), (ITT_FORMAT mt, parameter), mark_globalW, __itt_group_mark, "%d, \"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int,  mark_global,  (__itt_mark_type mt, const char    *parameter), (ITT_FORMAT mt, parameter), mark_global,  __itt_group_mark, "%d, \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int,  mark_global_off, (__itt_mark_type mt),                        (ITT_FORMAT mt),            mark_global_off, __itt_group_mark, "%d")
+
+#ifndef __ITT_INTERNAL_BODY
+ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void), (ITT_NO_PARAMS), stack_caller_create,  __itt_group_stitch, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id), (ITT_FORMAT id), stack_caller_destroy, __itt_group_stitch, "%p")
+ITT_STUBV(ITTAPI, void, stack_callee_enter,   (__itt_caller id), (ITT_FORMAT id), stack_callee_enter,   __itt_group_stitch, "%p")
+ITT_STUBV(ITTAPI, void, stack_callee_leave,   (__itt_caller id), (ITT_FORMAT id), stack_callee_leave,   __itt_group_stitch, "%p")
+
+ITT_STUB(ITTAPI,  __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data), (ITT_FORMAT fn, fn_data), clock_domain_create, __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void,                clock_domain_reset,  (void),                                      (ITT_NO_PARAMS),          clock_domain_reset,  __itt_group_structure, "no args")
+ITT_STUBV(ITTAPI, void, id_create_ex,  (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), id_create_ex,  __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), id_destroy_ex, __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, task_begin_ex,    (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name), task_begin_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_fn_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn),                  (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, fn), task_begin_fn_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end_ex,      (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp),                                                            (ITT_FORMAT domain, clock_domain, timestamp), task_end_ex, __itt_group_structure, "%p, %p, %lu")
+ITT_STUBV(ITTAPI, void, task_begin_overlapped,       (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name),                                                                   (ITT_FORMAT domain, id, parent, name), task_begin_overlapped, __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex,    (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name), task_begin_overlapped_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __itt_id id),                                                                                                                       (ITT_FORMAT domain, id), task_end_overlapped, __itt_group_structure, "%p, %lu")
+ITT_STUBV(ITTAPI, void, task_end_overlapped_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id),                                                    (ITT_FORMAT domain, clock_domain, timestamp, id), task_end_overlapped_ex, __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, marker_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope), (ITT_FORMAT domain, clock_domain, timestamp, id, name, scope), marker_ex, __itt_group_structure, "%p, %p, %lu, %lu, %p, %d")
+ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data), (ITT_FORMAT domain, scope, key, type, count, data), metadata_add_with_scope, __itt_group_structure, "%p, %d, %p, %d, %lu, %p")
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length),    (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scopeA, __itt_group_structure, "%p, %d, %p, %p, %lu")
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length), (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scopeW, __itt_group_structure, "%p, %d, %p, %p, %lu")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope,  (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length),    (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scope,  __itt_group_structure, "%p, %d, %p, %p, %lu")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail),                (ITT_FORMAT domain, clock_domain, timestamp, relation, tail),       relation_add_to_current_ex, __itt_group_structure, "%p, %p, %lu, %d, %lu")
+ITT_STUBV(ITTAPI, void, relation_add_ex,            (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, clock_domain, timestamp, head, relation, tail), relation_add_ex,            __itt_group_structure, "%p, %p, %lu, %lu, %d, %lu")
+ITT_STUB(ITTAPI,  __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type),                    (ITT_FORMAT name, track_group_type),        track_group_create, __itt_group_structure, "%p, %d")
+ITT_STUB(ITTAPI,  __itt_track*,       track_create,       (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type), (ITT_FORMAT track_group, name, track_type), track_create,       __itt_group_structure, "%p, %p, %d")
+ITT_STUBV(ITTAPI, void,               set_track,          (__itt_track *track),                                                                    (ITT_FORMAT track),                         set_track,          __itt_group_structure, "%p")
+
+#ifndef __ITT_INTERNAL_BODY
+ITT_STUB(ITTAPI, const char*, api_version, (void), (ITT_NO_PARAMS), api_version, __itt_group_all & ~__itt_group_legacy, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_saveA, __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d")
+ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_saveW, __itt_group_arrays, "%p, %d, %p, %d, \"%S\", %d")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, av_save,  (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_save,  __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+
+#endif /* __ITT_INTERNAL_INIT */
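
Each ITT_STUB/ITT_STUBV entry in this table is expanded several times under different macro definitions: into a versioned function-pointer slot, into the name/group/format tables consumed by the dynamic loader, and into the user-facing guard macros. As a rough orientation only (a sketch, not the literal preprocessor output), with the default ITT_MAJOR=3, ITT_MINOR=0 naming, the frame_begin entry boils down to:

    /* Sketch: approximate result of the frame_begin stub entry. */
    typedef void (ITTAPI *__itt_frame_begin_ptr__3_0_t)(__itt_frame frame);
    extern __itt_frame_begin_ptr__3_0_t __itt_frame_begin_ptr__3_0;

    /* User-facing macro: call through only if a tool filled the slot. */
    #define __itt_frame_begin(f) \
        ((!__itt_frame_begin_ptr__3_0) ? (void)0 : __itt_frame_begin_ptr__3_0(f))
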
diff --git a/final/runtime/src/thirdparty/ittnotify/ittnotify_types.h b/final/runtime/src/thirdparty/ittnotify/ittnotify_types.h
new file mode 100644
index 0000000..3695a67
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/ittnotify_types.h
@@ -0,0 +1,67 @@
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _ITTNOTIFY_TYPES_H_
+#define _ITTNOTIFY_TYPES_H_
+
+typedef enum ___itt_group_id
+{
+    __itt_group_none      = 0,
+    __itt_group_legacy    = 1<<0,
+    __itt_group_control   = 1<<1,
+    __itt_group_thread    = 1<<2,
+    __itt_group_mark      = 1<<3,
+    __itt_group_sync      = 1<<4,
+    __itt_group_fsync     = 1<<5,
+    __itt_group_jit       = 1<<6,
+    __itt_group_model     = 1<<7,
+    __itt_group_splitter_min = 1<<7,
+    __itt_group_counter   = 1<<8,
+    __itt_group_frame     = 1<<9,
+    __itt_group_stitch    = 1<<10,
+    __itt_group_heap      = 1<<11,
+    __itt_group_splitter_max = 1<<12,
+    __itt_group_structure = 1<<12,
+    __itt_group_suppress  = 1<<13,
+    __itt_group_arrays    = 1<<14,
+    __itt_group_all       = -1
+} __itt_group_id;
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_group_list
+{
+    __itt_group_id id;
+    const char*    name;
+} __itt_group_list;
+
+#pragma pack(pop)
+
+#define ITT_GROUP_LIST(varname) \
+    static __itt_group_list varname[] = {       \
+        { __itt_group_all,       "all"       }, \
+        { __itt_group_control,   "control"   }, \
+        { __itt_group_thread,    "thread"    }, \
+        { __itt_group_mark,      "mark"      }, \
+        { __itt_group_sync,      "sync"      }, \
+        { __itt_group_fsync,     "fsync"     }, \
+        { __itt_group_jit,       "jit"       }, \
+        { __itt_group_model,     "model"     }, \
+        { __itt_group_counter,   "counter"   }, \
+        { __itt_group_frame,     "frame"     }, \
+        { __itt_group_stitch,    "stitch"    }, \
+        { __itt_group_heap,      "heap"      }, \
+        { __itt_group_structure, "structure" }, \
+        { __itt_group_suppress,  "suppress"  }, \
+        { __itt_group_arrays,    "arrays"    }, \
+        { __itt_group_none,      NULL        }  \
+    }
+
+#endif /* _ITTNOTIFY_TYPES_H_ */
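
The group ids are a plain bit mask, and ITT_GROUP_LIST pairs each bit with a printable name. A hedged sketch of how a loader might turn a comma-separated spec such as "sync,frame" into a mask (the helper name and the substring matching rule are illustrative, not part of this header):

    #include <string.h>

    static __itt_group_id parse_groups(const char* spec)
    {
        ITT_GROUP_LIST(group_list);   /* expands to a static name/id table */
        __itt_group_id mask = __itt_group_none;
        int i;
        for (i = 0; group_list[i].name != NULL; i++)
            if (strstr(spec, group_list[i].name) != NULL)
                mask = (__itt_group_id)(mask | group_list[i].id);
        return mask;                  /* test bits with (mask & __itt_group_frame) etc. */
    }
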
diff --git a/final/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h b/final/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h
new file mode 100644
index 0000000..4d87bd3
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h
@@ -0,0 +1,971 @@
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LEGACY_ITTNOTIFY_H_
+#define _LEGACY_ITTNOTIFY_H_
+
+/**
+ * @file
+ * @brief Legacy User API functions and types
+ */
+
+/** @cond exclude_from_documentation */
+#ifndef ITT_OS_WIN
+#  define ITT_OS_WIN   1
+#endif /* ITT_OS_WIN */
+
+#ifndef ITT_OS_LINUX
+#  define ITT_OS_LINUX 2
+#endif /* ITT_OS_LINUX */
+
+#ifndef ITT_OS_MAC
+#  define ITT_OS_MAC   3
+#endif /* ITT_OS_MAC */
+
+#ifndef ITT_OS
+#  if defined WIN32 || defined _WIN32
+#    define ITT_OS ITT_OS_WIN
+#  elif defined( __APPLE__ ) && defined( __MACH__ )
+#    define ITT_OS ITT_OS_MAC
+#  else
+#    define ITT_OS ITT_OS_LINUX
+#  endif
+#endif /* ITT_OS */
+
+#ifndef ITT_PLATFORM_WIN
+#  define ITT_PLATFORM_WIN 1
+#endif /* ITT_PLATFORM_WIN */
+
+#ifndef ITT_PLATFORM_POSIX
+#  define ITT_PLATFORM_POSIX 2
+#endif /* ITT_PLATFORM_POSIX */
+
+#ifndef ITT_PLATFORM_MAC
+#  define ITT_PLATFORM_MAC 3
+#endif /* ITT_PLATFORM_MAC */
+
+#ifndef ITT_PLATFORM
+#  if ITT_OS==ITT_OS_WIN
+#    define ITT_PLATFORM ITT_PLATFORM_WIN
+#  elif ITT_OS==ITT_OS_MAC
+#    define ITT_PLATFORM ITT_PLATFORM_MAC
+#  else
+#    define ITT_PLATFORM ITT_PLATFORM_POSIX
+#  endif
+#endif /* ITT_PLATFORM */
+
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+
+#include <stddef.h>
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <tchar.h>
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdint.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE || _UNICODE */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef CDECL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define CDECL __cdecl
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__ 
+#      define CDECL __attribute__ ((cdecl))
+#    else  /* _M_IX86 || __i386__ */
+#      define CDECL /* supported only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* CDECL */
+
+#ifndef STDCALL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define STDCALL __stdcall
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__
+#      define STDCALL __attribute__ ((stdcall)) 
+#    else  /* _M_IX86 || __i386__ */
+#      define STDCALL /* supported only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* STDCALL */
+
+#define ITTAPI    CDECL
+#define LIBITTAPI CDECL
+
+/* TODO: Temporary for compatibility! */
+#define ITTAPI_CALL    CDECL
+#define LIBITTAPI_CALL CDECL
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* use __forceinline (VC++ specific) */
+#define ITT_INLINE           __forceinline
+#define ITT_INLINE_ATTRIBUTE /* nothing */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/*
+ * Generally, functions are not inlined unless optimization is specified.
+ * For functions declared inline, this attribute inlines the function even
+ * if no optimization level was specified.
+ */
+#ifdef __STRICT_ANSI__
+#define ITT_INLINE           static
+#else  /* __STRICT_ANSI__ */
+#define ITT_INLINE           static inline
+#endif /* __STRICT_ANSI__ */
+#define ITT_INLINE_ATTRIBUTE __attribute__ ((always_inline, unused))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+/* Helper macro for joining tokens */
+#define ITT_JOIN_AUX(p,n) p##n
+#define ITT_JOIN(p,n)     ITT_JOIN_AUX(p,n)
+
+#ifdef ITT_MAJOR
+#undef ITT_MAJOR
+#endif
+#ifdef ITT_MINOR
+#undef ITT_MINOR
+#endif
+#define ITT_MAJOR     3
+#define ITT_MINOR     0
+
+/* Standard versioning of a token with major and minor version numbers */
+#define ITT_VERSIONIZE(x)    \
+    ITT_JOIN(x,              \
+    ITT_JOIN(_,              \
+    ITT_JOIN(ITT_MAJOR,      \
+    ITT_JOIN(_, ITT_MINOR))))
+
+#ifndef INTEL_ITTNOTIFY_PREFIX
+#  define INTEL_ITTNOTIFY_PREFIX __itt_
+#endif /* INTEL_ITTNOTIFY_PREFIX */
+#ifndef INTEL_ITTNOTIFY_POSTFIX
+#  define INTEL_ITTNOTIFY_POSTFIX _ptr_
+#endif /* INTEL_ITTNOTIFY_POSTFIX */
+
+#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n)
+#define ITTNOTIFY_NAME(n)     ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX)))
+
+#define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)
+#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)
+
+#define ITTNOTIFY_VOID_D0(n,d)       (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_VOID_D1(n,d,x)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_VOID_D2(n,d,x,y)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_DATA_D0(n,d)       (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_DATA_D1(n,d,x)     (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_DATA_D2(n,d,x,y)   (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a)     (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+
+#ifdef ITT_STUB
+#undef ITT_STUB
+#endif
+#ifdef ITT_STUBV
+#undef ITT_STUBV
+#endif
+#define ITT_STUBV(api,type,name,args)                             \
+    typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args;   \
+    extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name);
+#define ITT_STUB ITT_STUBV
+/** @endcond */
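
The cost model behind these guards is easiest to see by expanding one call by hand; a sketch under the default 3_0 versioning (region_begin is just an example name, not declared in this legacy header):

    /* ITTNOTIFY_VOID_D3(region_begin, d, id, parent, name) becomes:
     *
     *   (!(d)->flags) ? (void)0
     *   : (!__itt_region_begin_ptr__3_0) ? (void)0
     *   : __itt_region_begin_ptr__3_0(d, id, parent, name);
     *
     * A disabled domain costs one flag test; an enabled domain with no
     * tool attached costs one additional null-pointer test.           */
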
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * @defgroup legacy Legacy API
+ * @{
+ * @}
+ */
+
+/**
+ * @defgroup legacy_control Collection Control
+ * @ingroup legacy
+ * General behavior: the application continues to run, but no profiling information is collected.
+ *
+ * Pausing occurs not only for the current thread but for the whole process, as well as any spawned processes.
+ * - Intel(R) Parallel Inspector and Intel(R) Inspector XE:
+ *   - Do not analyze or report errors that involve memory access.
+ *   - Other errors are reported as usual. Pausing data collection in
+ *     Intel(R) Parallel Inspector and Intel(R) Inspector XE
+ *     only pauses tracing and analyzing memory access.
+ *     It does not pause tracing or analyzing threading APIs.
+ *   .
+ * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE:
+ *   - Continue to record when new threads are started.
+ *   .
+ * - Other effects:
+ *   - Possible reduction of runtime overhead.
+ *   .
+ * @{
+ */
+#ifndef _ITTNOTIFY_H_
+/** @brief Pause collection */
+void ITTAPI __itt_pause(void);
+/** @brief Resume collection */
+void ITTAPI __itt_resume(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, pause,   (void))
+ITT_STUBV(ITTAPI, void, resume,  (void))
+#define __itt_pause      ITTNOTIFY_VOID(pause)
+#define __itt_pause_ptr  ITTNOTIFY_NAME(pause)
+#define __itt_resume     ITTNOTIFY_VOID(resume)
+#define __itt_resume_ptr ITTNOTIFY_NAME(resume)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_pause()
+#define __itt_pause_ptr  0
+#define __itt_resume()
+#define __itt_resume_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_pause_ptr  0
+#define __itt_resume_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+#endif /* _ITTNOTIFY_H_ */
+/** @} legacy_control group */
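
A minimal usage sketch (the two helper functions are hypothetical, and the include path assumes this header's directory is on the include path; both calls are no-ops when no tool is attached):

    #include "legacy/ittnotify.h"

    void load_input_files(void);   /* hypothetical setup work */
    void compute(void);            /* hypothetical hot phase  */

    void run(void)
    {
        __itt_pause();             /* stop collecting during setup      */
        load_input_files();
        __itt_resume();            /* collect only the interesting part */
        compute();
    }
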
+
+/**
+ * @defgroup legacy_threads Threads
+ * @ingroup legacy
+ * Threads group
+ * @warning Legacy API
+ * @{
+ */
+/**
+ * @deprecated Legacy API
+ * @brief Set name to be associated with thread in analysis GUI.
+ * @return __itt_err upon failure (name or namelen being null, or name and namelen mismatched)
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int LIBITTAPI __itt_thr_name_setA(const char    *name, int namelen);
+int LIBITTAPI __itt_thr_name_setW(const wchar_t *name, int namelen);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_thr_name_set     __itt_thr_name_setW
+#  define __itt_thr_name_set_ptr __itt_thr_name_setW_ptr
+#else
+#  define __itt_thr_name_set     __itt_thr_name_setA
+#  define __itt_thr_name_set_ptr __itt_thr_name_setA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int LIBITTAPI __itt_thr_name_set(const char *name, int namelen);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, thr_name_setA, (const char    *name, int namelen))
+ITT_STUB(LIBITTAPI, int, thr_name_setW, (const wchar_t *name, int namelen))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, thr_name_set,  (const char    *name, int namelen))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thr_name_setA     ITTNOTIFY_DATA(thr_name_setA)
+#define __itt_thr_name_setA_ptr ITTNOTIFY_NAME(thr_name_setA)
+#define __itt_thr_name_setW     ITTNOTIFY_DATA(thr_name_setW)
+#define __itt_thr_name_setW_ptr ITTNOTIFY_NAME(thr_name_setW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thr_name_set     ITTNOTIFY_DATA(thr_name_set)
+#define __itt_thr_name_set_ptr ITTNOTIFY_NAME(thr_name_set)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thr_name_setA(name, namelen)
+#define __itt_thr_name_setA_ptr 0
+#define __itt_thr_name_setW(name, namelen)
+#define __itt_thr_name_setW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thr_name_set(name, namelen)
+#define __itt_thr_name_set_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thr_name_setA_ptr 0
+#define __itt_thr_name_setW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thr_name_set_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Mark current thread as ignored from this point on, for the duration of its existence.
+ */
+void LIBITTAPI __itt_thr_ignore(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, thr_ignore, (void))
+#define __itt_thr_ignore     ITTNOTIFY_VOID(thr_ignore)
+#define __itt_thr_ignore_ptr ITTNOTIFY_NAME(thr_ignore)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_thr_ignore()
+#define __itt_thr_ignore_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_thr_ignore_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_threads group */
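
For example (a sketch; the thread entry points are hypothetical, and strlen() supplies the required namelen):

    #include <string.h>

    static void worker_entry(void)
    {
        const char *name = "worker-0";
        (void)__itt_thr_name_set(name, (int)strlen(name));
    }

    static void housekeeping_entry(void)
    {
        __itt_thr_ignore();   /* exclude this thread from analysis */
    }
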
+
+/**
+ * @defgroup legacy_sync Synchronization
+ * @ingroup legacy
+ * Synchronization group
+ * @warning Legacy API
+ * @{
+ */
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_barrier 1
+
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_mutex   2
+
+/**
+ * @deprecated Legacy API
+ * @brief Assign a name to a sync object using char or Unicode string
+ * @param[in] addr    - pointer to the sync object. You should use a real pointer to your object
+ *                      to make sure that the values don't clash with other object addresses
+ * @param[in] objtype - null-terminated object type string. If NULL is passed, the object will
+ *                      be assumed to be of generic "User Synchronization" type
+ * @param[in] objname - null-terminated object name string. If NULL, no name will be assigned
+ *                      to the object -- you can use the __itt_sync_rename call later to assign
+ *                      the name
+ * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the
+ *                      exact semantics of how prepare/acquired/releasing calls work.
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_set_nameA(void *addr, const char    *objtype, const char    *objname, int attribute);
+void ITTAPI __itt_sync_set_nameW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_sync_set_name     __itt_sync_set_nameW
+#  define __itt_sync_set_name_ptr __itt_sync_set_nameW_ptr
+#else /* UNICODE */
+#  define __itt_sync_set_name     __itt_sync_set_nameA
+#  define __itt_sync_set_name_ptr __itt_sync_set_nameA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_sync_set_name(void *addr, const char* objtype, const char* objname, int attribute);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_set_nameA, (void *addr, const char    *objtype, const char    *objname, int attribute))
+ITT_STUBV(ITTAPI, void, sync_set_nameW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_set_name,  (void *addr, const char    *objtype, const char    *objname, int attribute))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_set_nameA     ITTNOTIFY_VOID(sync_set_nameA)
+#define __itt_sync_set_nameA_ptr ITTNOTIFY_NAME(sync_set_nameA)
+#define __itt_sync_set_nameW     ITTNOTIFY_VOID(sync_set_nameW)
+#define __itt_sync_set_nameW_ptr ITTNOTIFY_NAME(sync_set_nameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_set_name     ITTNOTIFY_VOID(sync_set_name)
+#define __itt_sync_set_name_ptr ITTNOTIFY_NAME(sync_set_name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_set_nameA(addr, objtype, objname, attribute)
+#define __itt_sync_set_nameA_ptr 0
+#define __itt_sync_set_nameW(addr, objtype, objname, attribute)
+#define __itt_sync_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_set_name(addr, objtype, objname, attribute)
+#define __itt_sync_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_set_nameA_ptr 0
+#define __itt_sync_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Assign a name and type to a sync object using char or Unicode string
+ * @param[in] addr -      pointer to the sync object. You should use a real pointer to your object
+ *                        to make sure that the values don't clash with other object addresses
+ * @param[in] objtype -   null-terminated object type string. If NULL is passed, the object will
+ *                        be assumed to be of generic "User Synchronization" type
+ * @param[in] objname -   null-terminated object name string. If NULL, no name will be assigned
+ *                        to the object -- you can use the __itt_sync_rename call later to assign
+ *                        the name
+ * @param[in] typelen, namelen - the lengths of the objtype and objname strings, respectively
+ * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the
+ *                        exact semantics of how prepare/acquired/releasing calls work.
+ * @return __itt_err upon failure (name or namelen being null, or name and namelen mismatched)
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int LIBITTAPI __itt_notify_sync_nameA(void *addr, const char    *objtype, int typelen, const char    *objname, int namelen, int attribute);
+int LIBITTAPI __itt_notify_sync_nameW(void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_notify_sync_name __itt_notify_sync_nameW
+#else
+#  define __itt_notify_sync_name __itt_notify_sync_nameA
+#endif
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int LIBITTAPI __itt_notify_sync_name(void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, notify_sync_nameA, (void *addr, const char    *objtype, int typelen, const char    *objname, int namelen, int attribute))
+ITT_STUB(LIBITTAPI, int, notify_sync_nameW, (void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, notify_sync_name,  (void *addr, const char    *objtype, int typelen, const char    *objname, int namelen, int attribute))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_notify_sync_nameA     ITTNOTIFY_DATA(notify_sync_nameA)
+#define __itt_notify_sync_nameA_ptr ITTNOTIFY_NAME(notify_sync_nameA)
+#define __itt_notify_sync_nameW     ITTNOTIFY_DATA(notify_sync_nameW)
+#define __itt_notify_sync_nameW_ptr ITTNOTIFY_NAME(notify_sync_nameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_notify_sync_name     ITTNOTIFY_DATA(notify_sync_name)
+#define __itt_notify_sync_name_ptr ITTNOTIFY_NAME(notify_sync_name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_notify_sync_nameA(addr, objtype, typelen, objname, namelen, attribute)
+#define __itt_notify_sync_nameA_ptr 0
+#define __itt_notify_sync_nameW(addr, objtype, typelen, objname, namelen, attribute)
+#define __itt_notify_sync_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_notify_sync_name(addr, objtype, typelen, objname, namelen, attribute)
+#define __itt_notify_sync_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_notify_sync_nameA_ptr 0
+#define __itt_notify_sync_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_notify_sync_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Enter spin loop on user-defined sync object
+ */
+void LIBITTAPI __itt_notify_sync_prepare(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_prepare, (void *addr))
+#define __itt_notify_sync_prepare     ITTNOTIFY_VOID(notify_sync_prepare)
+#define __itt_notify_sync_prepare_ptr ITTNOTIFY_NAME(notify_sync_prepare)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_prepare(addr)
+#define __itt_notify_sync_prepare_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_prepare_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Quit spin loop without acquiring spin object
+ */
+void LIBITTAPI __itt_notify_sync_cancel(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_cancel, (void *addr))
+#define __itt_notify_sync_cancel     ITTNOTIFY_VOID(notify_sync_cancel)
+#define __itt_notify_sync_cancel_ptr ITTNOTIFY_NAME(notify_sync_cancel)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_cancel(addr)
+#define __itt_notify_sync_cancel_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_cancel_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Successful spin loop completion (sync object acquired)
+ */
+void LIBITTAPI __itt_notify_sync_acquired(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_acquired, (void *addr))
+#define __itt_notify_sync_acquired     ITTNOTIFY_VOID(notify_sync_acquired)
+#define __itt_notify_sync_acquired_ptr ITTNOTIFY_NAME(notify_sync_acquired)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_acquired(addr)
+#define __itt_notify_sync_acquired_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_acquired_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Start sync object releasing code. Is called before the lock release call.
+ */
+void LIBITTAPI __itt_notify_sync_releasing(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_releasing, (void *addr))
+#define __itt_notify_sync_releasing     ITTNOTIFY_VOID(notify_sync_releasing)
+#define __itt_notify_sync_releasing_ptr ITTNOTIFY_NAME(notify_sync_releasing)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_releasing(addr)
+#define __itt_notify_sync_releasing_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_releasing_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_sync group */
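
The intended call order around a user-level lock, as a sketch (the spin lock itself is illustrative and uses GCC atomic builtins; a narrow-character build is assumed for the set_name call; only the __itt_* calls belong to this API):

    typedef struct { volatile int held; } spin_t;   /* illustrative lock */

    static void spin_init(spin_t *lk)
    {
        /* one-time: tell the tool what this address represents */
        __itt_sync_set_name(lk, "spin_t", "work queue lock", __itt_attr_mutex);
    }

    static void spin_lock(spin_t *lk)
    {
        __itt_notify_sync_prepare(lk);          /* entering the spin loop */
        while (__sync_lock_test_and_set(&lk->held, 1))
            ;                                   /* spin */
        __itt_notify_sync_acquired(lk);         /* lock obtained */
    }

    static void spin_unlock(spin_t *lk)
    {
        __itt_notify_sync_releasing(lk);        /* about to release */
        __sync_lock_release(&lk->held);
    }
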
+
+#ifndef _ITTNOTIFY_H_
+/**
+ * @defgroup legacy_events Events
+ * @ingroup legacy
+ * Events group
+ * @{
+ */
+
+/** @brief user event type */
+typedef int __itt_event;
+
+/**
+ * @brief Create an event notification
+ * @note Failure cases: name or namelen being null, name and namelen mismatched, or the user event feature not enabled
+ * @return non-zero event identifier upon success and __itt_err otherwise
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_event LIBITTAPI __itt_event_createA(const char    *name, int namelen);
+__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_event_create     __itt_event_createW
+#  define __itt_event_create_ptr __itt_event_createW_ptr
+#else
+#  define __itt_event_create     __itt_event_createA
+#  define __itt_event_create_ptr __itt_event_createA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char    *name, int namelen))
+ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create,  (const char *name, int namelen))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA     ITTNOTIFY_DATA(event_createA)
+#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA)
+#define __itt_event_createW     ITTNOTIFY_DATA(event_createW)
+#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create      ITTNOTIFY_DATA(event_create)
+#define __itt_event_create_ptr  ITTNOTIFY_NAME(event_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA(name, namelen) (__itt_event)0
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW(name, namelen) (__itt_event)0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create(name, namelen)  (__itt_event)0
+#define __itt_event_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an event occurrence.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ */
+int LIBITTAPI __itt_event_start(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event))
+#define __itt_event_start     ITTNOTIFY_DATA(event_start)
+#define __itt_event_start_ptr ITTNOTIFY_NAME(event_start)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_start(event) (int)0
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an event end occurrence.
+ * @note Calling this is optional if events do not have durations.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ */
+int LIBITTAPI __itt_event_end(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event))
+#define __itt_event_end     ITTNOTIFY_DATA(event_end)
+#define __itt_event_end_ptr ITTNOTIFY_NAME(event_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_end(event) (int)0
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_events group */
+#endif /* _ITTNOTIFY_H_ */
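
A usage sketch (error handling simplified: 0 is used as a "not yet created" sentinel on the success path, while __itt_err signals failure per the comments above):

    #include <string.h>

    static void report_checkpoint(void)
    {
        static __itt_event ev = 0;
        if (ev == 0)   /* create the named event once */
            ev = __itt_event_create("checkpoint", (int)strlen("checkpoint"));
        (void)__itt_event_start(ev);
        /* ... the interval being marked ... */
        (void)__itt_event_end(ev);   /* optional for point-like events */
    }
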
+
+/**
+ * @defgroup legacy_memory Memory Accesses
+ * @ingroup legacy
+ */
+
+/**
+ * @deprecated Legacy API
+ * @brief Inform the tool of memory accesses on reading
+ */
+void LIBITTAPI __itt_memory_read(void *addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, memory_read, (void *addr, size_t size))
+#define __itt_memory_read     ITTNOTIFY_VOID(memory_read)
+#define __itt_memory_read_ptr ITTNOTIFY_NAME(memory_read)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_memory_read(addr, size)
+#define __itt_memory_read_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_memory_read_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Inform the tool of memory accesses on writing
+ */
+void LIBITTAPI __itt_memory_write(void *addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, memory_write, (void *addr, size_t size))
+#define __itt_memory_write     ITTNOTIFY_VOID(memory_write)
+#define __itt_memory_write_ptr ITTNOTIFY_NAME(memory_write)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_memory_write(addr, size)
+#define __itt_memory_write_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_memory_write_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Inform the tool of memory accesses on updating
+ */
+void LIBITTAPI __itt_memory_update(void *address, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size))
+#define __itt_memory_update     ITTNOTIFY_VOID(memory_update)
+#define __itt_memory_update_ptr ITTNOTIFY_NAME(memory_update)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_memory_update(addr, size)
+#define __itt_memory_update_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_memory_update_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_memory group */
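
A sketch of bracketing hand-tracked accesses (the copy routine is illustrative):

    #include <stddef.h>

    static void copy_block(char *dst, char *src, size_t n)
    {
        size_t i;
        __itt_memory_read(src, n);    /* n bytes are about to be read    */
        __itt_memory_write(dst, n);   /* n bytes are about to be written */
        for (i = 0; i < n; i++)
            dst[i] = src[i];
    }
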
+
+/**
+ * @defgroup legacy_state Thread and Object States
+ * @ingroup legacy
+ */
+
+/** @brief state type */
+typedef int __itt_state_t;
+
+/** @cond exclude_from_documentation */
+typedef enum __itt_obj_state {
+    __itt_obj_state_err = 0,
+    __itt_obj_state_clr = 1,
+    __itt_obj_state_set = 2,
+    __itt_obj_state_use = 3
+} __itt_obj_state_t;
+
+typedef enum __itt_thr_state {
+    __itt_thr_state_err = 0,
+    __itt_thr_state_clr = 1,
+    __itt_thr_state_set = 2
+} __itt_thr_state_t;
+
+typedef enum __itt_obj_prop {
+    __itt_obj_prop_watch    = 1,
+    __itt_obj_prop_ignore   = 2,
+    __itt_obj_prop_sharable = 3
+} __itt_obj_prop_t;
+
+typedef enum __itt_thr_prop {
+    __itt_thr_prop_quiet = 1
+} __itt_thr_prop_t;
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object states
+ */
+__itt_state_t LIBITTAPI __itt_state_get(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_state_t, state_get, (void))
+#define __itt_state_get     ITTNOTIFY_DATA(state_get)
+#define __itt_state_get_ptr ITTNOTIFY_NAME(state_get)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_state_get(void) (__itt_state_t)0
+#define __itt_state_get_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_state_get_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object states
+ */
+__itt_state_t LIBITTAPI __itt_state_set(__itt_state_t s);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_state_t, state_set, (__itt_state_t s))
+#define __itt_state_set     ITTNOTIFY_DATA(state_set)
+#define __itt_state_set_ptr ITTNOTIFY_NAME(state_set)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_state_set(s) (__itt_state_t)0
+#define __itt_state_set_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_state_set_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object modes
+ */
+__itt_thr_state_t LIBITTAPI __itt_thr_mode_set(__itt_thr_prop_t p, __itt_thr_state_t s);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr_state_t s))
+#define __itt_thr_mode_set     ITTNOTIFY_DATA(thr_mode_set)
+#define __itt_thr_mode_set_ptr ITTNOTIFY_NAME(thr_mode_set)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_thr_mode_set(p, s) (__itt_thr_state_t)0
+#define __itt_thr_mode_set_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_thr_mode_set_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object modes
+ */
+__itt_obj_state_t LIBITTAPI __itt_obj_mode_set(__itt_obj_prop_t p, __itt_obj_state_t s);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_obj_state_t, obj_mode_set, (__itt_obj_prop_t p, __itt_obj_state_t s))
+#define __itt_obj_mode_set     ITTNOTIFY_DATA(obj_mode_set)
+#define __itt_obj_mode_set_ptr ITTNOTIFY_NAME(obj_mode_set)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_obj_mode_set(p, s) (__itt_obj_state_t)0
+#define __itt_obj_mode_set_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_obj_mode_set_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_state group */
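
A sketch combining the two mode setters, with values taken from the enums above (when this is useful depends on the attached tool):

    static void configure_analysis(void)
    {
        /* quiet the current thread; put objects into the "watch" mode */
        (void)__itt_thr_mode_set(__itt_thr_prop_quiet, __itt_thr_state_set);
        (void)__itt_obj_mode_set(__itt_obj_prop_watch, __itt_obj_state_set);
    }
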
+
+/**
+ * @defgroup frames Frames
+ * @ingroup legacy
+ * Frames group
+ * @{
+ */
+/**
+ * @brief opaque structure for frame identification
+ */
+typedef struct __itt_frame_t *__itt_frame;
+
+/**
+ * @brief Create a global frame with given domain
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_frame ITTAPI __itt_frame_createA(const char    *domain);
+__itt_frame ITTAPI __itt_frame_createW(const wchar_t *domain);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_frame_create     __itt_frame_createW
+#  define __itt_frame_create_ptr __itt_frame_createW_ptr
+#else /* UNICODE */
+#  define __itt_frame_create     __itt_frame_createA
+#  define __itt_frame_create_ptr __itt_frame_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_frame ITTAPI __itt_frame_create(const char *domain);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char    *domain))
+ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_frame, frame_create,  (const char *domain))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_frame_createA     ITTNOTIFY_DATA(frame_createA)
+#define __itt_frame_createA_ptr ITTNOTIFY_NAME(frame_createA)
+#define __itt_frame_createW     ITTNOTIFY_DATA(frame_createW)
+#define __itt_frame_createW_ptr ITTNOTIFY_NAME(frame_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_frame_create     ITTNOTIFY_DATA(frame_create)
+#define __itt_frame_create_ptr ITTNOTIFY_NAME(frame_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_frame_createA(domain)
+#define __itt_frame_createA_ptr 0
+#define __itt_frame_createW(domain)
+#define __itt_frame_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_frame_create(domain)
+#define __itt_frame_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_frame_createA_ptr 0
+#define __itt_frame_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_frame_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief Record a frame begin occurrence. */
+void ITTAPI __itt_frame_begin(__itt_frame frame);
+/** @brief Record a frame end occurrence. */
+void ITTAPI __itt_frame_end  (__itt_frame frame);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, frame_begin, (__itt_frame frame))
+ITT_STUBV(ITTAPI, void, frame_end,   (__itt_frame frame))
+#define __itt_frame_begin     ITTNOTIFY_VOID(frame_begin)
+#define __itt_frame_begin_ptr ITTNOTIFY_NAME(frame_begin)
+#define __itt_frame_end       ITTNOTIFY_VOID(frame_end)
+#define __itt_frame_end_ptr   ITTNOTIFY_NAME(frame_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_frame_begin(frame)
+#define __itt_frame_begin_ptr 0
+#define __itt_frame_end(frame)
+#define __itt_frame_end_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_frame_begin_ptr 0
+#define __itt_frame_end_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} frames group */
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _LEGACY_ITTNOTIFY_H_ */
diff --git a/final/runtime/src/z_Linux_asm.s b/final/runtime/src/z_Linux_asm.s
new file mode 100644
index 0000000..840065e
--- /dev/null
+++ b/final/runtime/src/z_Linux_asm.s
@@ -0,0 +1,1445 @@
+//  z_Linux_asm.s:  microtasking routines specifically written
+//                  for Intel platforms running Linux* OS
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+// -----------------------------------------------------------------------
+// macros
+// -----------------------------------------------------------------------
+
+#include "kmp_platform.h"
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+# if __MIC__ || __MIC2__
+//
+// The 'delay r16/r32/r64' instruction should be used instead of 'pause'.
+// The delay operation has the effect of removing the current thread from
+// the round-robin HT mechanism, and therefore speeds up the issue rate of
+// the other threads on the same core.
+//
+// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
+// barrier time to increase greatly for 3 or more threads per core.
+//
+// A value of 100 works pretty well for up to 4 threads per core, but isn't
+// quite as fast as 0 for 2 threads per core.
+//
+// We need to check what happens for oversubscription / > 4 threads per core.
+// It is possible that we need to pass the delay value in as a parameter
+// that the caller determines based on the total # threads / # cores.
+//
+//.macro pause_op
+//	mov    $100, %rax
+//	delay  %rax
+//.endm
+# else
+#  define pause_op   .byte 0xf3,0x90
+# endif // __MIC__ || __MIC2__
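+// Note: 0xf3,0x90 is the byte encoding of the 'pause' instruction (an
+// F3-prefixed nop). It is emitted as raw bytes above, presumably so that
+// assemblers which predate the mnemonic can still build this file.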
+
+# if defined __APPLE__ && defined __MACH__
+#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
+#  define KMP_LABEL(x) L_##x             // form the name of label
+.macro KMP_CFI_DEF_OFFSET
+.endmacro
+.macro KMP_CFI_OFFSET
+.endmacro
+.macro KMP_CFI_REGISTER
+.endmacro
+.macro KMP_CFI_DEF
+.endmacro
+.macro ALIGN
+	.align $0
+.endmacro
+.macro DEBUG_INFO
+/* Not sure what .size does in icc, not sure if we need to do something
+   similar for OS X*.
+*/
+.endmacro
+.macro PROC
+	ALIGN  4
+	.globl KMP_PREFIX_UNDERSCORE($0)
+KMP_PREFIX_UNDERSCORE($0):
+.endmacro
+# else // defined __APPLE__ && defined __MACH__
+#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
+// Format labels so that they don't override function names in gdb's backtraces.
+// The MIC assembler doesn't accept the .L syntax, but a plain L prefix works
+// fine there (as well as on OS X*).
+# if __MIC__ || __MIC2__
+#  define KMP_LABEL(x) L_##x          // local label
+# else
+#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
+# endif // __MIC__ || __MIC2__
+.macro ALIGN size
+	.align 1<<(\size)
+.endm
+.macro DEBUG_INFO proc
+	.cfi_endproc
+// Not sure why we need .type and .size for the functions
+	.align 16
+	.type  \proc,@function
+        .size  \proc,.-\proc
+.endm
+.macro PROC proc
+	ALIGN  4
+        .globl KMP_PREFIX_UNDERSCORE(\proc)
+KMP_PREFIX_UNDERSCORE(\proc):
+	.cfi_startproc
+.endm
+.macro KMP_CFI_DEF_OFFSET sz
+	.cfi_def_cfa_offset	\sz
+.endm
+.macro KMP_CFI_OFFSET reg, sz
+	.cfi_offset	\reg,\sz
+.endm
+.macro KMP_CFI_REGISTER reg
+	.cfi_def_cfa_register	\reg
+.endm
+.macro KMP_CFI_DEF reg, sz
+	.cfi_def_cfa	\reg,\sz
+.endm
+# endif // defined __APPLE__ && defined __MACH__
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+
+// -----------------------------------------------------------------------
+// data
+// -----------------------------------------------------------------------
+
+#ifdef KMP_GOMP_COMPAT
+
+//
+// Support for unnamed common blocks.
+//
+// Because the symbol ".gomp_critical_user_" contains a ".", we have to
+// put this stuff in assembly.
+//
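+//
+// For illustration only (a sketch, not part of this file): C code can reach
+// the dotted symbol only through the indirection defined below, e.g.
+//
+//     extern void *__kmp_unnamed_critical_addr;  // hypothetical C-side typing
+//     void *crit = __kmp_unnamed_critical_addr;  // == &.gomp_critical_user_
+//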
+
+# if KMP_ARCH_X86
+#  if defined __APPLE__ && defined __MACH__
+        .data
+        .comm .gomp_critical_user_,32
+        .data
+        .globl ___kmp_unnamed_critical_addr
+___kmp_unnamed_critical_addr:
+        .long .gomp_critical_user_
+#  else /* Linux* OS */
+        .data
+        .comm .gomp_critical_user_,32,8
+        .data
+	ALIGN 4
+        .global __kmp_unnamed_critical_addr
+__kmp_unnamed_critical_addr:
+        .4byte .gomp_critical_user_
+        .type __kmp_unnamed_critical_addr,@object
+        .size __kmp_unnamed_critical_addr,4
+#  endif /* defined __APPLE__ && defined __MACH__ */
+# endif /* KMP_ARCH_X86 */
+
+# if KMP_ARCH_X86_64
+#  if defined __APPLE__ && defined __MACH__
+        .data
+        .comm .gomp_critical_user_,32
+        .data
+        .globl ___kmp_unnamed_critical_addr
+___kmp_unnamed_critical_addr:
+        .quad .gomp_critical_user_
+#  else /* Linux* OS */
+        .data
+        .comm .gomp_critical_user_,32,8
+        .data
+	ALIGN 8
+        .global __kmp_unnamed_critical_addr
+__kmp_unnamed_critical_addr:
+        .8byte .gomp_critical_user_
+        .type __kmp_unnamed_critical_addr,@object
+        .size __kmp_unnamed_critical_addr,8
+#  endif /* defined __APPLE__ && defined __MACH__ */
+# endif /* KMP_ARCH_X86_64 */
+
+#endif /* KMP_GOMP_COMPAT */
+
+
+#if KMP_ARCH_X86 && !KMP_ARCH_PPC64
+
+// -----------------------------------------------------------------------
+// microtasking routines specifically written for IA-32 architecture
+// running Linux* OS
+// -----------------------------------------------------------------------
+//
+
+	.ident "Intel Corporation"
+	.data
+	ALIGN 4
+// void
+// __kmp_x86_pause( void );
+//
+
+        .text
+	PROC  __kmp_x86_pause
+
+        pause_op
+        ret
+
+	DEBUG_INFO __kmp_x86_pause
+
+//
+// void
+// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
+//
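+// Illustrative use (a sketch): "mode" selects the CPUID leaf ("mode2" the
+// subleaf), and the buffer receives %eax,%ebx,%ecx,%edx in that order:
+//
+//     int buf[4];
+//     __kmp_x86_cpuid( 0, 0, buf );   // buf[0] = highest supported leaf
+//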
+	PROC  __kmp_x86_cpuid
+
+	pushl %ebp
+	movl  %esp,%ebp
+        pushl %edi
+        pushl %ebx
+        pushl %ecx
+        pushl %edx
+
+	movl  8(%ebp), %eax
+	movl  12(%ebp), %ecx
+	cpuid				// Query the CPUID for the current processor
+
+	movl  16(%ebp), %edi
+	movl  %eax, 0(%edi)
+	movl  %ebx, 4(%edi)
+	movl  %ecx, 8(%edi)
+	movl  %edx, 12(%edi)
+
+        popl  %edx
+        popl  %ecx
+        popl  %ebx
+        popl  %edi
+        movl  %ebp, %esp
+        popl  %ebp
+	ret
+
+	DEBUG_INFO __kmp_x86_cpuid
+
+
+# if !KMP_ASM_INTRINS
+
+//------------------------------------------------------------------------
+//
+// kmp_int32
+// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
+//
+
+        PROC      __kmp_test_then_add32
+
+        movl      4(%esp), %ecx
+        movl      8(%esp), %eax
+        lock
+        xaddl     %eax,(%ecx)  // exchange-and-add: %eax = old *p, *p += d
+        ret
+
+	DEBUG_INFO __kmp_test_then_add32
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_xchg_fixed8
+//
+// kmp_int32
+// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
+//
+// parameters:
+// 	p:	4(%esp)
+// 	d:	8(%esp)
+//
+// return:	%al
+
+        PROC  __kmp_xchg_fixed8
+
+        movl      4(%esp), %ecx    // "p"
+        movb      8(%esp), %al	// "d"
+
+        lock
+        xchgb     %al,(%ecx)
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed8
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_xchg_fixed16
+//
+// kmp_int16
+// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
+//
+// parameters:
+// 	p:	4(%esp)
+// 	d:	8(%esp)
+// return:     %ax
+
+        PROC  __kmp_xchg_fixed16
+
+        movl      4(%esp), %ecx    // "p"
+        movw      8(%esp), %ax	// "d"
+
+        lock
+        xchgw     %ax,(%ecx)
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed16
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_xchg_fixed32
+//
+// kmp_int32
+// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
+//
+// parameters:
+// 	p:	4(%esp)
+// 	d:	8(%esp)
+//
+// return:	%eax
+
+        PROC  __kmp_xchg_fixed32
+
+        movl      4(%esp), %ecx    // "p"
+        movl      8(%esp), %eax	// "d"
+
+        lock
+        xchgl     %eax,(%ecx)
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed32
+
+
+//
+// kmp_int8
+// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+//
+
+        PROC  __kmp_compare_and_store8
+
+        movl      4(%esp), %ecx
+        movb      8(%esp), %al
+        movb      12(%esp), %dl
+        lock
+        cmpxchgb  %dl,(%ecx)
+        sete      %al           // if %al == (%ecx) set %al = 1 else set %al = 0
+        and       $1, %eax      // zero-extend %al to a 0/1 result in %eax
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store8
+
+//
+// kmp_int16
+// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+//
+
+        PROC  __kmp_compare_and_store16
+
+        movl      4(%esp), %ecx
+        movw      8(%esp), %ax
+        movw      12(%esp), %dx
+        lock
+        cmpxchgw  %dx,(%ecx)
+        sete      %al           // if %ax == (%ecx) set %al = 1 else set %al = 0
+        and       $1, %eax      // zero-extend %al to a 0/1 result in %eax
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store16
+
+//
+// kmp_int32
+// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+//
+
+        PROC  __kmp_compare_and_store32
+
+        movl      4(%esp), %ecx
+        movl      8(%esp), %eax
+        movl      12(%esp), %edx
+        lock
+        cmpxchgl  %edx,(%ecx)
+        sete      %al           // if %eax == (%ecx) set %al = 1 else set %al = 0
+        and       $1, %eax      // zero-extend %al to a 0/1 result in %eax
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store32
+
+//
+// kmp_int32
+// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+//
+        PROC  __kmp_compare_and_store64
+
+        pushl     %ebp
+        movl      %esp, %ebp
+        pushl     %ebx
+        pushl     %edi
+        movl      8(%ebp), %edi
+        movl      12(%ebp), %eax        // "cv" low order word
+        movl      16(%ebp), %edx        // "cv" high order word
+        movl      20(%ebp), %ebx        // "sv" low order word
+        movl      24(%ebp), %ecx        // "sv" high order word
+        lock
+        cmpxchg8b (%edi)
+        sete      %al           // if %edx:eax == (%edi) set %al = 1 else set %al = 0
+        and       $1, %eax      // zero-extend %al to a 0/1 result in %eax
+        popl      %edi
+        popl      %ebx
+        movl      %ebp, %esp
+        popl      %ebp
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store64
+
+//
+// kmp_int8
+// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+//
+
+        PROC  __kmp_compare_and_store_ret8
+
+        movl      4(%esp), %ecx
+        movb      8(%esp), %al
+        movb      12(%esp), %dl
+        lock
+        cmpxchgb  %dl,(%ecx)
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret8
+
+//
+// kmp_int16
+// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+//
+
+        PROC  __kmp_compare_and_store_ret16
+
+        movl      4(%esp), %ecx
+        movw      8(%esp), %ax
+        movw      12(%esp), %dx
+        lock
+        cmpxchgw  %dx,(%ecx)
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret16
+
+//
+// kmp_int32
+// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+//
+
+        PROC  __kmp_compare_and_store_ret32
+
+        movl      4(%esp), %ecx
+        movl      8(%esp), %eax
+        movl      12(%esp), %edx
+        lock
+        cmpxchgl  %edx,(%ecx)
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret32
+
+//
+// kmp_int64
+// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+//
+        PROC  __kmp_compare_and_store_ret64
+
+        pushl     %ebp
+        movl      %esp, %ebp
+        pushl     %ebx
+        pushl     %edi
+        movl      8(%ebp), %edi
+        movl      12(%ebp), %eax        // "cv" low order word
+        movl      16(%ebp), %edx        // "cv" high order word
+        movl      20(%ebp), %ebx        // "sv" low order word
+        movl      24(%ebp), %ecx        // "sv" high order word
+        lock
+        cmpxchg8b (%edi)
+        popl      %edi
+        popl      %ebx
+        movl      %ebp, %esp
+        popl      %ebp
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret64
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_xchg_real32
+//
+// kmp_real32
+// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
+//
+// parameters:
+// 	addr:	4(%esp)
+// 	data:	8(%esp)
+//
+// return:	old value of *addr (in st(0))
+
+
+        PROC  __kmp_xchg_real32
+
+        pushl   %ebp
+        movl    %esp, %ebp
+        subl    $4, %esp
+        pushl   %esi
+
+        movl    8(%ebp), %esi   // "addr" (first arg; 4(%ebp) holds the return address)
+        movl    12(%ebp), %eax  // "data"
+
+        lock
+        xchgl   %eax, (%esi)    // %eax now holds the value xchg swapped out
+
+        movl    %eax, -4(%ebp)  // spill old value to the local slot
+        flds    -4(%ebp)        // return old value in st(0)
+
+        popl    %esi
+        movl    %ebp, %esp
+        popl    %ebp
+        ret
+
+        DEBUG_INFO __kmp_xchg_real32
+
+# endif /* !KMP_ASM_INTRINS */
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_load_x87_fpu_control_word
+//
+// void
+// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
+//
+// parameters:
+// 	p:	4(%esp)
+//
+
+        PROC  __kmp_load_x87_fpu_control_word
+
+        movl  4(%esp), %eax
+        fldcw (%eax)
+        ret
+
+        DEBUG_INFO __kmp_load_x87_fpu_control_word
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_store_x87_fpu_control_word
+//
+// void
+// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
+//
+// parameters:
+// 	p:	4(%esp)
+//
+
+        PROC  __kmp_store_x87_fpu_control_word
+
+        movl  4(%esp), %eax
+        fstcw (%eax)
+        ret
+
+        DEBUG_INFO __kmp_store_x87_fpu_control_word
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_clear_x87_fpu_status_word
+//
+// void
+// __kmp_clear_x87_fpu_status_word();
+//
+//
+
+        PROC  __kmp_clear_x87_fpu_status_word
+
+        fnclex
+        ret
+
+        DEBUG_INFO __kmp_clear_x87_fpu_status_word
+
+
+//------------------------------------------------------------------------
+//
+// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
+//
+// int
+// __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid,
+//                         int argc, void *p_argv[] ) {
+//    (*pkfn)( & gtid, & gtid, argv[0], ... );
+//    return 1;
+// }
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+	PROC  __kmp_invoke_microtask
+
+	pushl %ebp
+	KMP_CFI_DEF_OFFSET 8
+	KMP_CFI_OFFSET ebp,-8
+	movl %esp,%ebp		// establish the base pointer for this routine.
+	KMP_CFI_REGISTER ebp
+	subl $8,%esp		// allocate space for two local variables.
+				// These variables are:
+				//	argv: -4(%ebp)
+				//	temp: -8(%ebp)
+				//
+	pushl %ebx		// save %ebx to use during this routine
+				//
+#if OMPT_SUPPORT
+	movl 28(%ebp),%ebx	// get exit_frame address
+	movl %ebp,(%ebx)	// save exit_frame
+#endif
+
+	movl 20(%ebp),%ebx	// Stack alignment - # args
+	addl $2,%ebx		// #args +2  Always pass at least 2 args (gtid and tid)
+	shll $2,%ebx		// Number of bytes used on stack: (#args+2)*4
+	movl %esp,%eax		//
+	subl %ebx,%eax		// %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
+	movl %eax,%ebx		// Save to %ebx
+	andl $0xFFFFFF80,%eax	// mask off the lower 7 bits (128-byte alignment)
+	subl %eax,%ebx		// Amount to subtract from %esp
+	subl %ebx,%esp		// Prepare the stack ptr --
+				//   now it will be aligned on 128-byte boundary at the call
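+				// Worked example (illustrative): with #args = 3,
+				// %ebx = (3+2)*4 = 20; if %esp = 0xbf800010 then
+				// %eax = 0xbf7ffffc, masked to 0xbf7fff80, so 0x7c
+				// is subtracted; the 20 bytes of pushes below leave
+				// %esp at exactly 0xbf7fff80 at the call.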
+
+	movl 24(%ebp),%eax	// copy from p_argv[]
+	movl %eax,-4(%ebp)	// into the local variable *argv.
+
+	movl 20(%ebp),%ebx	// argc is 20(%ebp)
+	shll $2,%ebx
+
+KMP_LABEL(invoke_2):
+	cmpl $0,%ebx
+	jg  KMP_LABEL(invoke_4)
+	jmp KMP_LABEL(invoke_3)
+	ALIGN 2
+KMP_LABEL(invoke_4):
+	movl -4(%ebp),%eax
+	subl $4,%ebx			// decrement argc.
+	addl %ebx,%eax			// index into argv.
+	movl (%eax),%edx
+	pushl %edx
+
+	jmp KMP_LABEL(invoke_2)
+	ALIGN 2
+KMP_LABEL(invoke_3):
+	leal 16(%ebp),%eax		// push & tid
+	pushl %eax
+
+	leal 12(%ebp),%eax		// push & gtid
+	pushl %eax
+
+	movl 8(%ebp),%ebx
+	call *%ebx			// call (*pkfn)();
+
+	movl $1,%eax			// return 1;
+
+	movl -12(%ebp),%ebx		// restore %ebx
+	leave
+	KMP_CFI_DEF esp,4
+	ret
+
+	DEBUG_INFO __kmp_invoke_microtask
+// -- End  __kmp_invoke_microtask
+
+
+// kmp_uint64
+// __kmp_hardware_timestamp(void)
+	PROC  __kmp_hardware_timestamp
+	rdtsc			// result in %edx:%eax, the IA-32 convention
+				// for returning a 64-bit value
+	ret
+
+	DEBUG_INFO __kmp_hardware_timestamp
+// -- End  __kmp_hardware_timestamp
+
+// -----------------------------------------------------------------------
+#endif /* KMP_ARCH_X86 */
+
+
+#if KMP_ARCH_X86_64
+
+// -----------------------------------------------------------------------
+// microtasking routines specifically written for IA-32 architecture and
+// Intel(R) 64 running Linux* OS
+// -----------------------------------------------------------------------
+
+// -- Machine type P
+// mark_description "Intel Corporation";
+	.ident "Intel Corporation"
+// --	.file "z_Linux_asm.s"
+	.data
+	ALIGN 4
+
+// To prevent our code from landing in the .data section, .text is added to
+// every routine definition for x86_64.
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_x86_cpuid
+//
+// void
+// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
+//
+// parameters:
+// 	mode:		%edi
+// 	mode2:		%esi
+// 	cpuid_buffer:	%rdx
+
+        .text
+	PROC  __kmp_x86_cpuid
+
+	pushq  %rbp
+	movq   %rsp,%rbp
+        pushq  %rbx			// callee-save register
+
+	movl   %esi, %ecx		// "mode2"
+	movl   %edi, %eax		// "mode"
+        movq   %rdx, %rsi               // cpuid_buffer
+	cpuid				// Query the CPUID for the current processor
+
+	movl   %eax, 0(%rsi)		// store results into buffer
+	movl   %ebx, 4(%rsi)
+	movl   %ecx, 8(%rsi)
+	movl   %edx, 12(%rsi)
+
+        popq   %rbx			// callee-save register
+        movq   %rbp, %rsp
+        popq   %rbp
+	ret
+
+        DEBUG_INFO __kmp_x86_cpuid
+
+
+
+# if !KMP_ASM_INTRINS
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_test_then_add32
+//
+// kmp_int32
+// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%esi
+//
+// return:	%eax
+
+        .text
+        PROC  __kmp_test_then_add32
+
+        movl      %esi, %eax	// "d"
+        lock
+        xaddl     %eax,(%rdi)  // exchange-and-add: %eax = old *p, *p += d
+        ret
+
+        DEBUG_INFO __kmp_test_then_add32
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_test_then_add64
+//
+// kmp_int64
+// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%rsi
+//	return:	%rax
+
+        .text
+        PROC  __kmp_test_then_add64
+
+        movq      %rsi, %rax	// "d"
+        lock
+        xaddq     %rax,(%rdi)  // exchange-and-add: %rax = old *p, *p += d
+        ret
+
+        DEBUG_INFO __kmp_test_then_add64
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_xchg_fixed8
+//
+// kmp_int32
+// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%sil
+//
+// return:	%al
+
+        .text
+        PROC  __kmp_xchg_fixed8
+
+        movb      %sil, %al	// "d"
+
+        lock
+        xchgb     %al,(%rdi)
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed8
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_xchg_fixed16
+//
+// kmp_int16
+// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%si
+// return:     %ax
+
+        .text
+        PROC  __kmp_xchg_fixed16
+
+        movw      %si, %ax	// "d"
+
+        lock
+        xchgw     %ax,(%rdi)
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed16
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_xchg_fixed32
+//
+// kmp_int32
+// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%esi
+//
+// return:	%eax
+
+        .text
+        PROC  __kmp_xchg_fixed32
+
+        movl      %esi, %eax	// "d"
+
+        lock
+        xchgl     %eax,(%rdi)
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed32
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_xchg_fixed64
+//
+// kmp_int64
+// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%rsi
+// return:	%rax
+
+        .text
+        PROC  __kmp_xchg_fixed64
+
+        movq      %rsi, %rax	// "d"
+
+        lock
+        xchgq     %rax,(%rdi)
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed64
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_compare_and_store8
+//
+// kmp_int8
+// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%esi
+//	sv:	%edx
+//
+// return:	%eax
+
+        .text
+        PROC  __kmp_compare_and_store8
+
+        movb      %sil, %al	// "cv"
+        lock
+        cmpxchgb  %dl,(%rdi)
+        sete      %al           // if %al == (%rdi) set %al = 1 else set %al = 0
+        andq      $1, %rax      // zero-extend %al to a 0/1 return value in %rax
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store8
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_compare_and_store16
+//
+// kmp_int16
+// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%si
+//	sv:	%dx
+//
+// return:	%eax
+
+        .text
+        PROC  __kmp_compare_and_store16
+
+        movw      %si, %ax	// "cv"
+        lock
+        cmpxchgw  %dx,(%rdi)
+        sete      %al           // if %ax == (%rdi) set %al = 1 else set %al = 0
+        andq      $1, %rax      // zero-extend %al to a 0/1 return value in %rax
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store16
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_compare_and_store32
+//
+// kmp_int32
+// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%esi
+//	sv:	%edx
+//
+// return:	%eax
+
+        .text
+        PROC  __kmp_compare_and_store32
+
+        movl      %esi, %eax	// "cv"
+        lock
+        cmpxchgl  %edx,(%rdi)
+        sete      %al           // if %eax == (%rdi) set %al = 1 else set %al = 0
+        andq      $1, %rax      // zero-extend %al to a 0/1 return value in %rax
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store32
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_compare_and_store64
+//
+// kmp_int32
+// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%rsi
+//	sv:	%rdx
+//	return:	%eax
+
+        .text
+        PROC  __kmp_compare_and_store64
+
+        movq      %rsi, %rax    // "cv"
+        lock
+        cmpxchgq  %rdx,(%rdi)
+        sete      %al           // if %rax == (%rdi) set %al = 1 else set %al = 0
+        andq      $1, %rax      // zero-extend %al to a 0/1 return value in %rax
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store64
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_compare_and_store_ret8
+//
+// kmp_int8
+// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%esi
+//	sv:	%edx
+//
+// return:	%eax
+
+        .text
+        PROC  __kmp_compare_and_store_ret8
+
+        movb      %sil, %al	// "cv"
+        lock
+        cmpxchgb  %dl,(%rdi)
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret8
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_compare_and_store_ret16
+//
+// kmp_int16
+// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%si
+//	sv:	%dx
+//
+// return:	%eax
+
+        .text
+        PROC  __kmp_compare_and_store_ret16
+
+        movw      %si, %ax	// "cv"
+        lock
+        cmpxchgw  %dx,(%rdi)
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret16
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_compare_and_store_ret32
+//
+// kmp_int32
+// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%esi
+//	sv:	%edx
+//
+// return:	%eax
+
+        .text
+        PROC  __kmp_compare_and_store_ret32
+
+        movl      %esi, %eax	// "cv"
+        lock
+        cmpxchgl  %edx,(%rdi)
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret32
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_compare_and_store_ret64
+//
+// kmp_int64
+// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%rsi
+//	sv:	%rdx
+//	return:	%eax
+
+        .text
+        PROC  __kmp_compare_and_store_ret64
+
+        movq      %rsi, %rax    // "cv"
+        lock
+        cmpxchgq  %rdx,(%rdi)
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret64
+
+# endif /* !KMP_ASM_INTRINS */
+
+
+# if ! (__MIC__ || __MIC2__)
+
+# if !KMP_ASM_INTRINS
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_xchg_real32
+//
+// kmp_real32
+// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
+//
+// parameters:
+// 	addr:	%rdi
+// 	data:	%xmm0 (lower 4 bytes)
+//
+// return:	%xmm0 (lower 4 bytes)
+
+        .text
+        PROC  __kmp_xchg_real32
+
+	movd	%xmm0, %eax	// load "data" to eax
+
+         lock
+         xchgl %eax, (%rdi)
+
+	movd	%eax, %xmm0	// load old value into return register
+
+        ret
+
+        DEBUG_INFO __kmp_xchg_real32
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_xchg_real64
+//
+// kmp_real64
+// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
+//
+// parameters:
+//      addr:   %rdi
+//      data:   %xmm0 (lower 8 bytes)
+//      return: %xmm0 (lower 8 bytes)
+//
+
+        .text
+        PROC  __kmp_xchg_real64
+
+	movd	%xmm0, %rax	// load "data" to rax
+
+         lock
+	xchgq  %rax, (%rdi)
+
+	movd	%rax, %xmm0	// load old value into return register
+        ret
+
+        DEBUG_INFO __kmp_xchg_real64
+
+
+# endif /* !KMP_ASM_INTRINS */
+
+# endif /* !(__MIC__ || __MIC2__) */
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_load_x87_fpu_control_word
+//
+// void
+// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
+//
+// parameters:
+// 	p:	%rdi
+//
+
+        .text
+        PROC  __kmp_load_x87_fpu_control_word
+
+        fldcw (%rdi)
+        ret
+
+        DEBUG_INFO __kmp_load_x87_fpu_control_word
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_store_x87_fpu_control_word
+//
+// void
+// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
+//
+// parameters:
+// 	p:	%rdi
+//
+
+        .text
+        PROC  __kmp_store_x87_fpu_control_word
+
+        fstcw (%rdi)
+        ret
+
+        DEBUG_INFO __kmp_store_x87_fpu_control_word
+
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_clear_x87_fpu_status_word
+//
+// void
+// __kmp_clear_x87_fpu_status_word();
+//
+//
+
+        .text
+        PROC  __kmp_clear_x87_fpu_status_word
+
+#if __MIC__ || __MIC2__
+// TODO: remove the workaround for problem with fnclex instruction (no CQ known)
+        fstenv  -32(%rsp)              // store FP env
+        andw    $~0x80ff, 4-32(%rsp)   // clear 0-7,15 bits of FP SW
+        fldenv  -32(%rsp)              // load FP env back
+        ret
+#else
+        fnclex
+        ret
+#endif
+
+        DEBUG_INFO __kmp_clear_x87_fpu_status_word
+
+
+//------------------------------------------------------------------------
+//
+// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
+//
+// int
+// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
+//		           int gtid, int tid,
+//                         int argc, void *p_argv[] ) {
+//    (*pkfn)( & gtid, & tid, argv[0], ... );
+//    return 1;
+// }
+//
+// note:
+//	%rsp must be 128-byte aligned at the call to pkfn, as the compiler expects
+//
+// parameters:
+//      %rdi:  	pkfn
+//	%esi:	gtid
+//	%edx:	tid
+//	%ecx:	argc
+//	%r8:	p_argv
+//	%r9:	&exit_frame
+//
+// locals:
+//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
+//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
+//
+// reg temps:
+//	%rax:	used all over the place
+//	%rdx:	used in stack pointer alignment calculation
+//	%r11:	used to traverse p_argv array
+//	%rsi:	used as temporary for stack parameters
+//		used as temporary for number of pkfn parms to push
+//	%rbx:	used to hold pkfn address, and zero constant, callee-save
+//
+// return:	%eax 	(always 1/TRUE)
+//
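+// Illustrative call from C (a sketch; 'task' and 'data' are hypothetical):
+//
+//     void task( int *gtid, int *tid, void *a0 ) { /* microtask body */ }
+//     void *argv[1] = { data };
+//     __kmp_invoke_microtask( (microtask_t)task, gtid, tid, 1, argv );
+//     // invokes task( &gtid, &tid, data ) and returns 1
+//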
+
+__gtid = -16
+__tid = -24
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+        .text
+	PROC  __kmp_invoke_microtask
+
+	pushq 	%rbp		// save base pointer
+	KMP_CFI_DEF_OFFSET 16
+	KMP_CFI_OFFSET rbp,-16
+	movq 	%rsp,%rbp	// establish the base pointer for this routine.
+	KMP_CFI_REGISTER rbp
+
+#if OMPT_SUPPORT
+	movq	%rbp, (%r9)	// save exit_frame
+#endif
+
+	pushq 	%rbx		// %rbx is callee-saved register
+	pushq	%rsi		// Put gtid on stack so we can pass &gtid to pkfn
+	pushq	%rdx		// Put tid on stack so we can pass &tid to pkfn
+
+	movq	%rcx, %rax	// Stack alignment calculation begins; argc -> %rax
+	movq	$0, %rbx	// constant for cmovs later
+	subq	$4, %rax	// subtract four args passed in registers to pkfn
+#if __MIC__ || __MIC2__
+	js	KMP_LABEL(kmp_0)	// jump to movq
+	jmp	KMP_LABEL(kmp_0_exit)	// jump ahead
+KMP_LABEL(kmp_0):
+	movq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
+KMP_LABEL(kmp_0_exit):
+#else
+	cmovsq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
+#endif // __MIC__ || __MIC2__
+
+	movq	%rax, %rsi	// save max(0, argc-4) -> %rsi for later
+	shlq 	$3, %rax	// Number of bytes used on stack: max(0, argc-4)*8
+
+	movq 	%rsp, %rdx	//
+	subq 	%rax, %rdx	// %rsp-(max(0,argc-4)*8) -> %rdx --
+				// without align, stack ptr would be this
+	movq 	%rdx, %rax	// Save to %rax
+
+	andq 	$0xFFFFFFFFFFFFFF80, %rax  // mask off lower 7 bits (128 bytes align)
+	subq 	%rax, %rdx	// Amount to subtract from %rsp
+	subq 	%rdx, %rsp	// Prepare the stack ptr --
+				// now %rsp will align to 128-byte boundary at call site
+
+				// setup pkfn parameter reg and stack
+	movq	%rcx, %rax	// argc -> %rax
+	cmpq	$0, %rsi
+	je	KMP_LABEL(kmp_invoke_pass_parms)	// jump ahead if no parms to push
+	shlq	$3, %rcx	// argc*8 -> %rcx
+	movq 	%r8, %rdx	// p_argv -> %rdx
+	addq	%rcx, %rdx	// &p_argv[argc] -> %rdx
+
+	movq	%rsi, %rcx	// max (0, argc-4) -> %rcx
+
+KMP_LABEL(kmp_invoke_push_parms):
+	// push nth - 7th parms to pkfn on stack
+	subq	$8, %rdx	// decrement p_argv pointer to previous parm
+	movq	(%rdx), %rsi	// p_argv[%rcx-1] -> %rsi
+	pushq	%rsi		// push p_argv[%rcx-1] onto stack (reverse order)
+	subl	$1, %ecx
+
+// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
+//	occurs if the name of the label that is an operand of this jecxz
+//	starts with a dot ("."); Apple's linker does not support 1-byte length
+//	relocations. Resolution: replace all .labelX entries with L_labelX.
+
+	jecxz   KMP_LABEL(kmp_invoke_pass_parms)  // stop when four p_argv[] parms left
+	jmp	KMP_LABEL(kmp_invoke_push_parms)
+	ALIGN 3
+KMP_LABEL(kmp_invoke_pass_parms):	// put 1st - 6th parms to pkfn in registers.
+				// order here is important to avoid trashing
+				// registers used for both input and output parms!
+	movq	%rdi, %rbx	// pkfn -> %rbx
+	leaq	__gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
+	leaq	__tid(%rbp), %rsi  // &tid -> %rsi (store 2nd parm to pkfn)
+
+	movq	%r8, %r11	// p_argv -> %r11
+
+#if __MIC__ || __MIC2__
+	cmpq	$4, %rax	// argc >= 4?
+	jns	KMP_LABEL(kmp_4)	// jump to movq
+	jmp	KMP_LABEL(kmp_4_exit)	// jump ahead
+KMP_LABEL(kmp_4):
+	movq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
+KMP_LABEL(kmp_4_exit):
+
+	cmpq	$3, %rax	// argc >= 3?
+	jns	KMP_LABEL(kmp_3)	// jump to movq
+	jmp	KMP_LABEL(kmp_3_exit)	// jump ahead
+KMP_LABEL(kmp_3):
+	movq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
+KMP_LABEL(kmp_3_exit):
+
+	cmpq	$2, %rax	// argc >= 2?
+	jns	KMP_LABEL(kmp_2)	// jump to movq
+	jmp	KMP_LABEL(kmp_2_exit)	// jump ahead
+KMP_LABEL(kmp_2):
+	movq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
+KMP_LABEL(kmp_2_exit):
+
+	cmpq	$1, %rax	// argc >= 1?
+	jns	KMP_LABEL(kmp_1)	// jump to movq
+	jmp	KMP_LABEL(kmp_1_exit)	// jump ahead
+KMP_LABEL(kmp_1):
+	movq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
+KMP_LABEL(kmp_1_exit):
+#else
+	cmpq	$4, %rax	// argc >= 4?
+	cmovnsq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
+
+	cmpq	$3, %rax	// argc >= 3?
+	cmovnsq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
+
+	cmpq	$2, %rax	// argc >= 2?
+	cmovnsq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
+
+	cmpq	$1, %rax	// argc >= 1?
+	cmovnsq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
+#endif // __MIC__ || __MIC2__
+
+	call	*%rbx		// call (*pkfn)();
+	movq	$1, %rax	// move 1 into return register;
+
+	movq	-8(%rbp), %rbx	// restore %rbx	using %rbp since %rsp was modified
+	movq 	%rbp, %rsp	// restore stack pointer
+	popq 	%rbp		// restore frame pointer
+	KMP_CFI_DEF rsp,8
+	ret
+
+	DEBUG_INFO __kmp_invoke_microtask
+// -- End  __kmp_invoke_microtask
+
+// kmp_uint64
+// __kmp_hardware_timestamp(void)
+        .text
+	PROC  __kmp_hardware_timestamp
+	rdtsc			// time-stamp counter -> %edx:%eax
+	shlq    $32, %rdx	// shift the high 32 bits into position
+	orq     %rdx, %rax	// combine into the 64-bit result in %rax
+	ret
+
+	DEBUG_INFO __kmp_hardware_timestamp
+// -- End  __kmp_hardware_timestamp
+
+//------------------------------------------------------------------------
+//
+// FUNCTION __kmp_bsr32
+//
+// int
+// __kmp_bsr32( int );
+//
+
+        .text
+        PROC  __kmp_bsr32
+
+        bsr    %edi,%eax
+        ret
+
+        DEBUG_INFO __kmp_bsr32
+
+
+// -----------------------------------------------------------------------
+#endif /* KMP_ARCH_X86_64 */
+
+#if KMP_ARCH_ARM
+    .data
+    .comm .gomp_critical_user_,32,8
+    .data
+    .align 4
+    .global __kmp_unnamed_critical_addr
+__kmp_unnamed_critical_addr:
+    .4byte .gomp_critical_user_
+    .size __kmp_unnamed_critical_addr,4
+#endif /* KMP_ARCH_ARM */
+
+#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64
+    .data
+    .comm .gomp_critical_user_,32,8
+    .data
+    .align 8
+    .global __kmp_unnamed_critical_addr
+__kmp_unnamed_critical_addr:
+    .8byte .gomp_critical_user_
+    .size __kmp_unnamed_critical_addr,8
+#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 */
+
+#if defined(__linux__)
+# if KMP_ARCH_ARM
+.section .note.GNU-stack,"",%progbits
+# else
+.section .note.GNU-stack,"",@progbits
+# endif
+#endif
diff --git a/final/runtime/src/z_Linux_util.c b/final/runtime/src/z_Linux_util.c
new file mode 100644
index 0000000..9f34d34
--- /dev/null
+++ b/final/runtime/src/z_Linux_util.c
@@ -0,0 +1,2762 @@
+/*
+ * z_Linux_util.c -- platform specific routines.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_wrapper_getpid.h"
+#include "kmp_itt.h"
+#include "kmp_str.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_stats.h"
+#include "kmp_wait_release.h"
+
+#if !KMP_OS_FREEBSD
+# include <alloca.h>
+#endif
+#include <unistd.h>
+#include <math.h>               // HUGE_VAL.
+#include <sys/time.h>
+#include <sys/times.h>
+#include <sys/resource.h>
+#include <sys/syscall.h>
+
+#if KMP_OS_LINUX && !KMP_OS_CNK
+# include <sys/sysinfo.h>
+# if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+// We should really include <futex.h>, but that causes compatibility problems on different
+// Linux* OS distributions that either require that you include (or break when you try to include)
+// <pci/types.h>.
+// Since all we need is the two macros below (which are part of the kernel ABI, so can't change)
+// we just define the constants here and don't include <futex.h>
+#  ifndef FUTEX_WAIT
+#   define FUTEX_WAIT    0
+#  endif
+#  ifndef FUTEX_WAKE
+#   define FUTEX_WAKE    1
+#  endif
+# endif
+#elif KMP_OS_DARWIN
+# include <sys/sysctl.h>
+# include <mach/mach.h>
+#elif KMP_OS_FREEBSD
+# include <sys/sysctl.h>
+# include <pthread_np.h>
+#endif
+
+
+#include <dirent.h>
+#include <ctype.h>
+#include <fcntl.h>
+
+// For non-x86 architecture
+#if KMP_COMPILER_GCC && !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64)
+# include <stdbool.h>
+# include <ffi.h>
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+struct kmp_sys_timer {
+    struct timespec     start;
+};
+
+// Convert timespec to nanoseconds.
+#define TS2NS(timespec) (((timespec).tv_sec * 1e9) + (timespec).tv_nsec)
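+// e.g. (illustrative):
+//     struct timespec ts;
+//     clock_gettime( CLOCK_MONOTONIC, &ts );
+//     double ns = TS2NS( ts );    // 1e9 keeps the arithmetic in double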
+
+static struct kmp_sys_timer __kmp_sys_timer_data;
+
+#if KMP_HANDLE_SIGNALS
+    typedef void                            (* sig_func_t )( int );
+    STATIC_EFI2_WORKAROUND struct sigaction    __kmp_sighldrs[ NSIG ];
+    static sigset_t                            __kmp_sigset;
+#endif
+
+static int __kmp_init_runtime   = FALSE;
+
+static int __kmp_fork_count = 0;
+
+static pthread_condattr_t  __kmp_suspend_cond_attr;
+static pthread_mutexattr_t __kmp_suspend_mutex_attr;
+
+static kmp_cond_align_t    __kmp_wait_cv;
+static kmp_mutex_align_t   __kmp_wait_mx;
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#ifdef DEBUG_SUSPEND
+static void
+__kmp_print_cond( char *buffer, kmp_cond_align_t *cond )
+{
+    KMP_SNPRINTF( buffer, 128, "(cond (lock (%ld, %d)), (descr (%p)))",
+                      cond->c_cond.__c_lock.__status, cond->c_cond.__c_lock.__spinlock,
+                      cond->c_cond.__c_waiting );
+}
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#if ( KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED)
+
+/*
+ * Affinity support
+ */
+
+/*
+ * On some of the older OS's that we build on, these constants aren't present
+ * in <asm/unistd.h> #included from <sys/syscall.h>.  They must be the same on
+ * all systems of the same arch where they are defined, and they cannot
+ * change: they are set in stone forever.
+ */
+
+#  if KMP_ARCH_X86 || KMP_ARCH_ARM
+#   ifndef __NR_sched_setaffinity
+#    define __NR_sched_setaffinity  241
+#   elif __NR_sched_setaffinity != 241
+#    error Wrong code for setaffinity system call.
+#   endif /* __NR_sched_setaffinity */
+#   ifndef __NR_sched_getaffinity
+#    define __NR_sched_getaffinity  242
+#   elif __NR_sched_getaffinity != 242
+#    error Wrong code for getaffinity system call.
+#   endif /* __NR_sched_getaffinity */
+
+#  elif KMP_ARCH_AARCH64
+#   ifndef __NR_sched_setaffinity
+#    define __NR_sched_setaffinity  122
+#   elif __NR_sched_setaffinity != 122
+#    error Wrong code for setaffinity system call.
+#   endif /* __NR_sched_setaffinity */
+#   ifndef __NR_sched_getaffinity
+#    define __NR_sched_getaffinity  123
+#   elif __NR_sched_getaffinity != 123
+#    error Wrong code for getaffinity system call.
+#   endif /* __NR_sched_getaffinity */
+
+#  elif KMP_ARCH_X86_64
+#   ifndef __NR_sched_setaffinity
+#    define __NR_sched_setaffinity  203
+#   elif __NR_sched_setaffinity != 203
+#    error Wrong code for setaffinity system call.
+#   endif /* __NR_sched_setaffinity */
+#   ifndef __NR_sched_getaffinity
+#    define __NR_sched_getaffinity  204
+#   elif __NR_sched_getaffinity != 204
+#    error Wrong code for getaffinity system call.
+#   endif /* __NR_sched_getaffinity */
+
+#  elif KMP_ARCH_PPC64
+#   ifndef __NR_sched_setaffinity
+#    define __NR_sched_setaffinity  222
+#   elif __NR_sched_setaffinity != 222
+#    error Wrong code for setaffinity system call.
+#   endif /* __NR_sched_setaffinity */
+#   ifndef __NR_sched_getaffinity
+#    define __NR_sched_getaffinity  223
+#   elif __NR_sched_getaffinity != 223
+#    error Wrong code for getaffinity system call.
+#   endif /* __NR_sched_getaffinity */
+
+
+#  else
+#   error Unknown or unsupported architecture
+
+#  endif /* KMP_ARCH_* */
+
+int
+__kmp_set_system_affinity( kmp_affin_mask_t const *mask, int abort_on_error )
+{
+    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+      "Illegal set affinity operation when not capable");
+
+    int retval = syscall( __NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask );
+    if (retval >= 0) {
+        return 0;
+    }
+    int error = errno;
+    if (abort_on_error) {
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( FatalSysError ),
+            KMP_ERR( error ),
+            __kmp_msg_null
+        );
+    }
+    return error;
+}
+
+int
+__kmp_get_system_affinity( kmp_affin_mask_t *mask, int abort_on_error )
+{
+    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+      "Illegal get affinity operation when not capable");
+
+    int retval = syscall( __NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask );
+    if (retval >= 0) {
+        return 0;
+    }
+    int error = errno;
+    if (abort_on_error) {
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( FatalSysError ),
+            KMP_ERR( error ),
+            __kmp_msg_null
+        );
+    }
+    return error;
+}
+
+void
+__kmp_affinity_bind_thread( int which )
+{
+    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+      "Illegal set affinity operation when not capable");
+
+    kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
+    KMP_CPU_ZERO(mask);
+    KMP_CPU_SET(which, mask);
+    __kmp_set_system_affinity(mask, TRUE);
+}
+
+/*
+ * Determine if we can access affinity functionality on this version of
+ * Linux* OS by checking __NR_sched_{get,set}affinity system calls, and set
+ * __kmp_affin_mask_size to the appropriate value (0 means not capable).
+ */
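+/*
+ * The probe pattern, condensed (an illustrative sketch of the logic below;
+ * 'big_size' is a hypothetical name):
+ *
+ *     n = syscall( __NR_sched_getaffinity, 0, big_size, buf );
+ *     if ( n > 0 && syscall( __NR_sched_setaffinity, 0, n, NULL ) < 0
+ *          && errno == EFAULT )
+ *         mask_size = n;   // a NULL buffer of a valid size faults with EFAULT
+ */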
+void
+__kmp_affinity_determine_capable(const char *env_var)
+{
+    //
+    // Check and see if the OS supports thread affinity.
+    //
+
+# define KMP_CPU_SET_SIZE_LIMIT          (1024*1024)
+
+    int gCode;
+    int sCode;
+    kmp_affin_mask_t *buf;
+    buf = ( kmp_affin_mask_t * ) KMP_INTERNAL_MALLOC( KMP_CPU_SET_SIZE_LIMIT );
+
+    // On Linux* OS, if the syscall fails or returns a suggested mask size,
+    // we don't have to search for an appropriate size.
+    gCode = syscall( __NR_sched_getaffinity, 0, KMP_CPU_SET_SIZE_LIMIT, buf );
+    KA_TRACE(30, ( "__kmp_affinity_determine_capable: "
+       "initial getaffinity call returned %d errno = %d\n",
+       gCode, errno));
+
+    //if ((gCode < 0) && (errno == ENOSYS))
+    if (gCode < 0) {
+        //
+        // System call not supported
+        //
+        if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+          && (__kmp_affinity_type != affinity_none)
+          && (__kmp_affinity_type != affinity_default)
+          && (__kmp_affinity_type != affinity_disabled))) {
+            int error = errno;
+            __kmp_msg(
+                kmp_ms_warning,
+                KMP_MSG( GetAffSysCallNotSupported, env_var ),
+                KMP_ERR( error ),
+                __kmp_msg_null
+            );
+        }
+        KMP_AFFINITY_DISABLE();
+        KMP_INTERNAL_FREE(buf);
+        return;
+    }
+    if (gCode > 0) { // Linux* OS only
+        // The optimal situation: the OS returns the size of the buffer
+        // it expects.
+        //
+        // A verification of correct behavior is that setaffinity on a NULL
+        // buffer with the same size fails with errno set to EFAULT.
+        sCode = syscall( __NR_sched_setaffinity, 0, gCode, NULL );
+        KA_TRACE(30, ( "__kmp_affinity_determine_capable: "
+           "setaffinity for mask size %d returned %d errno = %d\n",
+           gCode, sCode, errno));
+        if (sCode < 0) {
+            if (errno == ENOSYS) {
+                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+                  && (__kmp_affinity_type != affinity_none)
+                  && (__kmp_affinity_type != affinity_default)
+                  && (__kmp_affinity_type != affinity_disabled))) {
+                    int error = errno;
+                    __kmp_msg(
+                        kmp_ms_warning,
+                        KMP_MSG( SetAffSysCallNotSupported, env_var ),
+                        KMP_ERR( error ),
+                        __kmp_msg_null
+                    );
+                }
+                KMP_AFFINITY_DISABLE();
+                KMP_INTERNAL_FREE(buf);
+            }
+            if (errno == EFAULT) {
+                KMP_AFFINITY_ENABLE(gCode);
+                KA_TRACE(10, ( "__kmp_affinity_determine_capable: "
+                  "affinity supported (mask size %d)\n",
+                  (int)__kmp_affin_mask_size));
+                KMP_INTERNAL_FREE(buf);
+                return;
+            }
+        }
+    }
+
+    //
+    // Call the getaffinity system call repeatedly with increasing set sizes
+    // until we succeed, or reach an upper bound on the search.
+    //
+    KA_TRACE(30, ( "__kmp_affinity_determine_capable: "
+      "searching for proper set size\n"));
+    int size;
+    for (size = 1; size <= KMP_CPU_SET_SIZE_LIMIT; size *= 2) {
+        gCode = syscall( __NR_sched_getaffinity, 0,  size, buf );
+        KA_TRACE(30, ( "__kmp_affinity_determine_capable: "
+          "getaffinity for mask size %d returned %d errno = %d\n", size,
+            gCode, errno));
+
+        if (gCode < 0) {
+            if ( errno == ENOSYS )
+            {
+                //
+                // We shouldn't get here
+                //
+                KA_TRACE(30, ( "__kmp_affinity_determine_capable: "
+                  "inconsistent OS call behavior: errno == ENOSYS for mask size %d\n",
+                   size));
+                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+                  && (__kmp_affinity_type != affinity_none)
+                  && (__kmp_affinity_type != affinity_default)
+                  && (__kmp_affinity_type != affinity_disabled))) {
+                    int error = errno;
+                    __kmp_msg(
+                        kmp_ms_warning,
+                        KMP_MSG( GetAffSysCallNotSupported, env_var ),
+                        KMP_ERR( error ),
+                        __kmp_msg_null
+                    );
+                }
+                KMP_AFFINITY_DISABLE();
+                KMP_INTERNAL_FREE(buf);
+                return;
+            }
+            continue;
+        }
+
+        sCode = syscall( __NR_sched_setaffinity, 0, gCode, NULL );
+        KA_TRACE(30, ( "__kmp_affinity_determine_capable: "
+           "setaffinity for mask size %d returned %d errno = %d\n",
+           gCode, sCode, errno));
+        if (sCode < 0) {
+            if (errno == ENOSYS) { // Linux* OS only
+                //
+                // We shouldn't get here
+                //
+                KA_TRACE(30, ( "__kmp_affinity_determine_capable: "
+                  "inconsistent OS call behavior: errno == ENOSYS for mask size %d\n",
+                   size));
+                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+                  && (__kmp_affinity_type != affinity_none)
+                  && (__kmp_affinity_type != affinity_default)
+                  && (__kmp_affinity_type != affinity_disabled))) {
+                    int error = errno;
+                    __kmp_msg(
+                        kmp_ms_warning,
+                        KMP_MSG( SetAffSysCallNotSupported, env_var ),
+                        KMP_ERR( error ),
+                        __kmp_msg_null
+                    );
+                }
+                KMP_AFFINITY_DISABLE();
+                KMP_INTERNAL_FREE(buf);
+                return;
+            }
+            if (errno == EFAULT) {
+                KMP_AFFINITY_ENABLE(gCode);
+                KA_TRACE(10, ( "__kmp_affinity_determine_capable: "
+                  "affinity supported (mask size %d)\n",
+                   (int)__kmp_affin_mask_size));
+                KMP_INTERNAL_FREE(buf);
+                return;
+            }
+        }
+    }
+    //int error = errno;  // save uncaught error code
+    KMP_INTERNAL_FREE(buf);
+    // errno = error;  // restore uncaught error code, will be printed at the next KMP_WARNING below
+
+    //
+    // Affinity is not supported
+    //
+    KMP_AFFINITY_DISABLE();
+    KA_TRACE(10, ( "__kmp_affinity_determine_capable: "
+      "cannot determine mask size - affinity not supported\n"));
+    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
+      && (__kmp_affinity_type != affinity_none)
+      && (__kmp_affinity_type != affinity_default)
+      && (__kmp_affinity_type != affinity_disabled))) {
+        KMP_WARNING( AffCantGetMaskSize, env_var );
+    }
+}
+
+#endif // KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && !KMP_OS_CNK
+
+int
+__kmp_futex_determine_capable()
+{
+    int loc = 0;
+    int rc = syscall( __NR_futex, &loc, FUTEX_WAKE, 1, NULL, NULL, 0 );
+    int retval = ( rc == 0 ) || ( errno != ENOSYS );
+
+    KA_TRACE(10, ( "__kmp_futex_determine_capable: rc = %d errno = %d\n", rc,
+      errno ) );
+    KA_TRACE(10, ( "__kmp_futex_determine_capable: futex syscall%s supported\n",
+        retval ? "" : " not" ) );
+
+    return retval;
+}
+
+#endif // KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && !KMP_OS_CNK
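+
+/*
+ * For context (an illustrative sketch only, not this file's code): the
+ * futex-based locks gated on this probe use the wait/wake pair, roughly
+ *
+ *     syscall( __NR_futex, &word, FUTEX_WAIT, expected_val, NULL, NULL, 0 );
+ *     syscall( __NR_futex, &word, FUTEX_WAKE, n_to_wake,    NULL, NULL, 0 );
+ */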
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) && (! KMP_ASM_INTRINS)
+/*
+ * The IA-32 architecture only provides a 32-bit "add-exchange" (xadd)
+ * instruction, so these routines are implemented with compare_and_store
+ * retry loops instead.
+ */
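+/*
+ * All of the routines below follow the same retry outline (sketch only):
+ *
+ *     do { old = *p; } while ( !CAS( p, old, old OP d ) );
+ *     return old;   // the value observed before the update
+ */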
+
+kmp_int8
+__kmp_test_then_or8( volatile kmp_int8 *p, kmp_int8 d )
+{
+    kmp_int8 old_value, new_value;
+
+    old_value = TCR_1( *p );
+    new_value = old_value | d;
+
+    while ( ! KMP_COMPARE_AND_STORE_REL8 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_1( *p );
+        new_value = old_value | d;
+    }
+    return old_value;
+}
+
+kmp_int8
+__kmp_test_then_and8( volatile kmp_int8 *p, kmp_int8 d )
+{
+    kmp_int8 old_value, new_value;
+
+    old_value = TCR_1( *p );
+    new_value = old_value & d;
+
+    while ( ! KMP_COMPARE_AND_STORE_REL8 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_1( *p );
+        new_value = old_value & d;
+    }
+    return old_value;
+}
+
+kmp_int32
+__kmp_test_then_or32( volatile kmp_int32 *p, kmp_int32 d )
+{
+    kmp_int32 old_value, new_value;
+
+    old_value = TCR_4( *p );
+    new_value = old_value | d;
+
+    while ( ! KMP_COMPARE_AND_STORE_REL32 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_4( *p );
+        new_value = old_value | d;
+    }
+    return old_value;
+}
+
+kmp_int32
+__kmp_test_then_and32( volatile kmp_int32 *p, kmp_int32 d )
+{
+    kmp_int32 old_value, new_value;
+
+    old_value = TCR_4( *p );
+    new_value = old_value & d;
+
+    while ( ! KMP_COMPARE_AND_STORE_REL32 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_4( *p );
+        new_value = old_value & d;
+    }
+    return old_value;
+}
+
+# if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64
+kmp_int8
+__kmp_test_then_add8( volatile kmp_int8 *p, kmp_int8 d )
+{
+    kmp_int8 old_value, new_value;
+
+    old_value = TCR_1( *p );
+    new_value = old_value + d;
+
+    while ( ! KMP_COMPARE_AND_STORE_REL8 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_1( *p );
+        new_value = old_value + d;
+    }
+    return old_value;
+}
+
+kmp_int64
+__kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d )
+{
+    kmp_int64 old_value, new_value;
+
+    old_value = TCR_8( *p );
+    new_value = old_value + d;
+
+    while ( ! KMP_COMPARE_AND_STORE_REL64 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_8( *p );
+        new_value = old_value + d;
+    }
+    return old_value;
+}
+# endif /* KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 */
+
+kmp_int64
+__kmp_test_then_or64( volatile kmp_int64 *p, kmp_int64 d )
+{
+    kmp_int64 old_value, new_value;
+
+    old_value = TCR_8( *p );
+    new_value = old_value | d;
+    while ( ! KMP_COMPARE_AND_STORE_REL64 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_8( *p );
+        new_value = old_value | d;
+    }
+    return old_value;
+}
+
+kmp_int64
+__kmp_test_then_and64( volatile kmp_int64 *p, kmp_int64 d )
+{
+    kmp_int64 old_value, new_value;
+
+    old_value = TCR_8( *p );
+    new_value = old_value & d;
+    while ( ! KMP_COMPARE_AND_STORE_REL64 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_8( *p );
+        new_value = old_value & d;
+    }
+    return old_value;
+}
+
+#endif /* (KMP_ARCH_X86 || KMP_ARCH_X86_64) && (! KMP_ASM_INTRINS) */
+
+void
+__kmp_terminate_thread( int gtid )
+{
+    int status;
+    kmp_info_t  *th = __kmp_threads[ gtid ];
+
+    if ( !th ) return;
+
+    #ifdef KMP_CANCEL_THREADS
+        KA_TRACE( 10, ("__kmp_terminate_thread: kill (%d)\n", gtid ) );
+        status = pthread_cancel( th->th.th_info.ds.ds_thread );
+        if ( status != 0 && status != ESRCH ) {
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( CantTerminateWorkerThread ),
+                KMP_ERR( status ),
+                __kmp_msg_null
+            );
+        }; // if
+    #endif
+    __kmp_yield( TRUE );
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/*
+ * Set thread stack info according to values returned by
+ * pthread_getattr_np().
+ * If the values are unreasonable, assume the call failed and use
+ * the incremental stack refinement method instead.
+ * Returns TRUE if the stack parameters could be determined exactly,
+ * FALSE if incremental refinement is necessary.
+ */
+static kmp_int32
+__kmp_set_stack_info( int gtid, kmp_info_t *th )
+{
+    int            stack_data;
+#if KMP_OS_LINUX || KMP_OS_FREEBSD
+    /* Linux* OS and FreeBSD* only -- no pthread_getattr_np support on OS X* */
+    pthread_attr_t attr;
+    int            status;
+    size_t         size = 0;
+    void *         addr = 0;
+
+    /* Always do incremental stack refinement for ubermaster threads, since the initial
+       thread stack range can be reduced by sibling thread creation, so pthread_attr_getstack
+       may cause thread gtid aliasing. */
+    if ( ! KMP_UBER_GTID(gtid) ) {
+
+        /* Fetch the real thread attributes */
+        status = pthread_attr_init( &attr );
+        KMP_CHECK_SYSFAIL( "pthread_attr_init", status );
+#if KMP_OS_FREEBSD
+        status = pthread_attr_get_np( pthread_self(), &attr );
+        KMP_CHECK_SYSFAIL( "pthread_attr_get_np", status );
+#else
+        status = pthread_getattr_np( pthread_self(), &attr );
+        KMP_CHECK_SYSFAIL( "pthread_getattr_np", status );
+#endif
+        status = pthread_attr_getstack( &attr, &addr, &size );
+        KMP_CHECK_SYSFAIL( "pthread_attr_getstack", status );
+        KA_TRACE( 60, ( "__kmp_set_stack_info: T#%d pthread_attr_getstack returned size: %lu, "
+                        "low addr: %p\n",
+                        gtid, size, addr ));
+
+        status = pthread_attr_destroy( &attr );
+        KMP_CHECK_SYSFAIL( "pthread_attr_destroy", status );
+    }
+
+    if ( size != 0 && addr != 0 ) {     /* was stack parameter determination successful? */
+        /* Store the correct base and size */
+        TCW_PTR(th->th.th_info.ds.ds_stackbase, (((char *)addr) + size));
+        TCW_PTR(th->th.th_info.ds.ds_stacksize, size);
+        TCW_4(th->th.th_info.ds.ds_stackgrow, FALSE);
+        return TRUE;
+    }
+#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
+    /* Use incremental refinement starting from initial conservative estimate */
+    TCW_PTR(th->th.th_info.ds.ds_stacksize, 0);
+    TCW_PTR(th -> th.th_info.ds.ds_stackbase, &stack_data);
+    TCW_4(th->th.th_info.ds.ds_stackgrow, TRUE);
+    return FALSE;
+}
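+
+/* Editorial note: a minimal standalone sketch of the stack query used above,
+   assuming glibc's pthread_getattr_np() (illustration only, not part of the
+   runtime): */
+#if 0
+#define _GNU_SOURCE            /* pthread_getattr_np is a GNU extension */
+#include <pthread.h>
+#include <stdio.h>
+static void
+example_print_stack_bounds( void )
+{
+    pthread_attr_t attr;
+    void  *addr = NULL;
+    size_t size = 0;
+    if ( pthread_getattr_np( pthread_self(), &attr ) == 0 ) {
+        pthread_attr_getstack( &attr, &addr, &size );  /* low address + size */
+        pthread_attr_destroy( &attr );
+        printf( "stack: low=%p high=%p size=%zu\n",
+                addr, (void *)((char *)addr + size), size );
+    }
+}
+#endif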
+
+static void*
+__kmp_launch_worker( void *thr )
+{
+    int status, old_type, old_state;
+#ifdef KMP_BLOCK_SIGNALS
+    sigset_t    new_set, old_set;
+#endif /* KMP_BLOCK_SIGNALS */
+    void *exit_val;
+#if KMP_OS_LINUX || KMP_OS_FREEBSD
+    void *padding = 0;
+#endif
+    int gtid;
+
+    gtid = ((kmp_info_t*)thr) -> th.th_info.ds.ds_gtid;
+    __kmp_gtid_set_specific( gtid );
+#ifdef KMP_TDATA_GTID
+    __kmp_gtid = gtid;
+#endif
+#if KMP_STATS_ENABLED
+    // set __thread local index to point to thread-specific stats
+    __kmp_stats_thread_ptr = ((kmp_info_t*)thr)->th.th_stats;
+#endif
+
+#if USE_ITT_BUILD
+    __kmp_itt_thread_name( gtid );
+#endif /* USE_ITT_BUILD */
+
+#if KMP_AFFINITY_SUPPORTED
+    __kmp_affinity_set_init_mask( gtid, FALSE );
+#endif
+
+#ifdef KMP_CANCEL_THREADS
+    status = pthread_setcanceltype( PTHREAD_CANCEL_ASYNCHRONOUS, & old_type );
+    KMP_CHECK_SYSFAIL( "pthread_setcanceltype", status );
+    /* josh todo: isn't PTHREAD_CANCEL_ENABLE default for newly-created threads? */
+    status = pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, & old_state );
+    KMP_CHECK_SYSFAIL( "pthread_setcancelstate", status );
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    //
+    // Set the FP control regs to be a copy of
+    // the parallel initialization thread's.
+    //
+    __kmp_clear_x87_fpu_status_word();
+    __kmp_load_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
+    __kmp_load_mxcsr( &__kmp_init_mxcsr );
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#ifdef KMP_BLOCK_SIGNALS
+    status = sigfillset( & new_set );
+    KMP_CHECK_SYSFAIL_ERRNO( "sigfillset", status );
+    status = pthread_sigmask( SIG_BLOCK, & new_set, & old_set );
+    KMP_CHECK_SYSFAIL( "pthread_sigmask", status );
+#endif /* KMP_BLOCK_SIGNALS */
+
+#if KMP_OS_LINUX || KMP_OS_FREEBSD
+    if ( __kmp_stkoffset > 0 && gtid > 0 ) {
+        padding = KMP_ALLOCA( gtid * __kmp_stkoffset );
+    }
+#endif
+
+    KMP_MB();
+    __kmp_set_stack_info( gtid, (kmp_info_t*)thr );
+
+    __kmp_check_stack_overlap( (kmp_info_t*)thr );
+
+    exit_val = __kmp_launch_thread( (kmp_info_t *) thr );
+
+#ifdef KMP_BLOCK_SIGNALS
+    status = pthread_sigmask( SIG_SETMASK, & old_set, NULL );
+    KMP_CHECK_SYSFAIL( "pthread_sigmask", status );
+#endif /* KMP_BLOCK_SIGNALS */
+
+    return exit_val;
+}
+
+
+/* The monitor thread controls all of the threads in the complex */
+
+static void*
+__kmp_launch_monitor( void *thr )
+{
+    int         status, old_type, old_state;
+#ifdef KMP_BLOCK_SIGNALS
+    sigset_t    new_set;
+#endif /* KMP_BLOCK_SIGNALS */
+    struct timespec  interval;
+    int yield_count;
+    int yield_cycles = 0;
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KA_TRACE( 10, ("__kmp_launch_monitor: #1 launched\n" ) );
+
+    /* register us as the monitor thread */
+    __kmp_gtid_set_specific( KMP_GTID_MONITOR );
+#ifdef KMP_TDATA_GTID
+    __kmp_gtid = KMP_GTID_MONITOR;
+#endif
+
+    KMP_MB();
+
+#if USE_ITT_BUILD
+    __kmp_itt_thread_ignore();    // Instruct Intel(R) Threading Tools to ignore monitor thread.
+#endif /* USE_ITT_BUILD */
+
+    __kmp_set_stack_info( ((kmp_info_t*)thr)->th.th_info.ds.ds_gtid, (kmp_info_t*)thr );
+
+    __kmp_check_stack_overlap( (kmp_info_t*)thr );
+
+#ifdef KMP_CANCEL_THREADS
+    status = pthread_setcanceltype( PTHREAD_CANCEL_ASYNCHRONOUS, & old_type );
+    KMP_CHECK_SYSFAIL( "pthread_setcanceltype", status );
+    /* josh todo: isn't PTHREAD_CANCEL_ENABLE default for newly-created threads? */
+    status = pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, & old_state );
+    KMP_CHECK_SYSFAIL( "pthread_setcancelstate", status );
+#endif
+
+    #if KMP_REAL_TIME_FIX
+    // This is a potential fix which allows applications that use a real-time scheduling
+    // policy to work. However, the decision about the fix has not been made yet, so it is
+    // disabled by default.
+    { // Was the program started with a real-time scheduling policy?
+        int sched = sched_getscheduler( 0 );
+        if ( sched == SCHED_FIFO || sched == SCHED_RR ) {
+            // Yes, we are part of a real-time application. Try to increase the priority of
+            // the monitor thread.
+            struct sched_param param;
+            int    max_priority = sched_get_priority_max( sched );
+            int    rc;
+            KMP_WARNING( RealTimeSchedNotSupported );
+            sched_getparam( 0, & param );
+            if ( param.sched_priority < max_priority ) {
+                param.sched_priority += 1;
+                rc = sched_setscheduler( 0, sched, & param );
+                if ( rc != 0 ) {
+                    int error = errno;
+                  __kmp_msg(
+                      kmp_ms_warning,
+                      KMP_MSG( CantChangeMonitorPriority ),
+                      KMP_ERR( error ),
+                      KMP_MSG( MonitorWillStarve ),
+                      __kmp_msg_null
+                  );
+                }; // if
+            } else {
+                // We cannot abort here, because the number of CPUs may be enough for all
+                // the threads, including the monitor thread, so the application could potentially work...
+                __kmp_msg(
+                    kmp_ms_warning,
+                    KMP_MSG( RunningAtMaxPriority ),
+                    KMP_MSG( MonitorWillStarve ),
+                    KMP_HNT( RunningAtMaxPriority ),
+                    __kmp_msg_null
+                );
+            }; // if
+        }; // if
+        TCW_4( __kmp_global.g.g_time.dt.t_value, 0 );  // AC: free the thread that waits for the monitor to start
+    }
+    #endif // KMP_REAL_TIME_FIX
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    if ( __kmp_monitor_wakeups == 1 ) {
+        interval.tv_sec  = 1;
+        interval.tv_nsec = 0;
+    } else {
+        interval.tv_sec  = 0;
+        interval.tv_nsec = (KMP_NSEC_PER_SEC / __kmp_monitor_wakeups);
+    }
+
+    KA_TRACE( 10, ("__kmp_launch_monitor: #2 monitor\n" ) );
+
+    if (__kmp_yield_cycle) {
+        __kmp_yielding_on = 0;  /* Start out with yielding shut off */
+        yield_count = __kmp_yield_off_count;
+    } else {
+        __kmp_yielding_on = 1;  /* Yielding is on permanently */
+    }
+
+    while( ! TCR_4( __kmp_global.g.g_done ) ) {
+        struct timespec  now;
+        struct timeval   tval;
+
+        /*  This thread monitors the state of the system */
+
+        KA_TRACE( 15, ( "__kmp_launch_monitor: update\n" ) );
+
+        status = gettimeofday( &tval, NULL );
+        KMP_CHECK_SYSFAIL_ERRNO( "gettimeofday", status );
+        TIMEVAL_TO_TIMESPEC( &tval, &now );
+
+        now.tv_sec  += interval.tv_sec;
+        now.tv_nsec += interval.tv_nsec;
+
+        if (now.tv_nsec >= KMP_NSEC_PER_SEC) {
+            now.tv_sec  += 1;
+            now.tv_nsec -= KMP_NSEC_PER_SEC;
+        }
+
+        status = pthread_mutex_lock( & __kmp_wait_mx.m_mutex );
+        KMP_CHECK_SYSFAIL( "pthread_mutex_lock", status );
+        // AC: the monitor should not fall asleep if g_done has been set
+        if ( !TCR_4(__kmp_global.g.g_done) ) {  // check once more under mutex
+            status = pthread_cond_timedwait( &__kmp_wait_cv.c_cond, &__kmp_wait_mx.m_mutex, &now );
+            if ( status != 0 ) {
+                if ( status != ETIMEDOUT && status != EINTR ) {
+                    KMP_SYSFAIL( "pthread_cond_timedwait", status );
+                };
+            };
+        };
+        status = pthread_mutex_unlock( & __kmp_wait_mx.m_mutex );
+        KMP_CHECK_SYSFAIL( "pthread_mutex_unlock", status );
+
+        if (__kmp_yield_cycle) {
+            yield_cycles++;
+            if ( (yield_cycles % yield_count) == 0 ) {
+                if (__kmp_yielding_on) {
+                    __kmp_yielding_on = 0;   /* Turn it off now */
+                    yield_count = __kmp_yield_off_count;
+                } else {
+                    __kmp_yielding_on = 1;   /* Turn it on now */
+                    yield_count = __kmp_yield_on_count;
+                }
+                yield_cycles = 0;
+            }
+        } else {
+            __kmp_yielding_on = 1;
+        }
+
+        TCW_4( __kmp_global.g.g_time.dt.t_value,
+          TCR_4( __kmp_global.g.g_time.dt.t_value ) + 1 );
+
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+    }
+
+    KA_TRACE( 10, ("__kmp_launch_monitor: #3 cleanup\n" ) );
+
+#ifdef KMP_BLOCK_SIGNALS
+    status = sigfillset( & new_set );
+    KMP_CHECK_SYSFAIL_ERRNO( "sigfillset", status );
+    status = pthread_sigmask( SIG_UNBLOCK, & new_set, NULL );
+    KMP_CHECK_SYSFAIL( "pthread_sigmask", status );
+#endif /* KMP_BLOCK_SIGNALS */
+
+    KA_TRACE( 10, ("__kmp_launch_monitor: #4 finished\n" ) );
+
+    if( __kmp_global.g.g_abort != 0 ) {
+        /* now we need to terminate the worker threads  */
+        /* the value of t_abort is the signal we caught */
+
+        int gtid;
+
+        KA_TRACE( 10, ("__kmp_launch_monitor: #5 terminate sig=%d\n", __kmp_global.g.g_abort ) );
+
+        /* terminate the OpenMP worker threads */
+        /* TODO: this is not valid for sibling threads!!
+         * the uber master might not be 0 anymore... */
+        for (gtid = 1; gtid < __kmp_threads_capacity; ++gtid)
+            __kmp_terminate_thread( gtid );
+
+        __kmp_cleanup();
+
+        KA_TRACE( 10, ("__kmp_launch_monitor: #6 raise sig=%d\n", __kmp_global.g.g_abort ) );
+
+        if (__kmp_global.g.g_abort > 0)
+            raise( __kmp_global.g.g_abort );
+
+    }
+
+    KA_TRACE( 10, ("__kmp_launch_monitor: #7 exit\n" ) );
+
+    return thr;
+}
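+
+/* Editorial note: the monitor loop above sleeps on pthread_cond_timedwait(),
+   which takes an *absolute* deadline rather than a relative interval.  A
+   minimal sketch of the deadline arithmetic, mirroring the gettimeofday()
+   math in the loop (illustration only; assumes interval_nsec < 1 second): */
+#if 0
+#include <sys/time.h>
+#include <time.h>
+static struct timespec
+example_deadline_from_now( long interval_nsec )
+{
+    struct timeval  tval;
+    struct timespec now;
+    gettimeofday( &tval, NULL );
+    now.tv_sec  = tval.tv_sec;
+    now.tv_nsec = tval.tv_usec * 1000;  /* usec -> nsec */
+    now.tv_nsec += interval_nsec;
+    if ( now.tv_nsec >= 1000000000L ) { /* carry the overflow into seconds */
+        now.tv_sec  += 1;
+        now.tv_nsec -= 1000000000L;
+    }
+    return now;
+}
+#endif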
+
+void
+__kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size )
+{
+    pthread_t      handle;
+    pthread_attr_t thread_attr;
+    int            status;
+
+
+    th->th.th_info.ds.ds_gtid = gtid;
+
+#if KMP_STATS_ENABLED
+    // sets up worker thread stats
+    __kmp_acquire_tas_lock(&__kmp_stats_lock, gtid);
+
+    // th->th.th_stats is used to transfer the thread-specific stats pointer to
+    // __kmp_launch_worker, so when the thread is created (enters __kmp_launch_worker)
+    // it will set its __thread local pointer to th->th.th_stats.
+    th->th.th_stats = __kmp_stats_list.push_back(gtid);
+    if(KMP_UBER_GTID(gtid)) {
+        __kmp_stats_start_time = tsc_tick_count::now();
+        __kmp_stats_thread_ptr = th->th.th_stats;
+        __kmp_stats_init();
+        KMP_START_EXPLICIT_TIMER(OMP_serial);
+        KMP_START_EXPLICIT_TIMER(OMP_start_end);
+    }
+    __kmp_release_tas_lock(&__kmp_stats_lock, gtid);
+
+#endif // KMP_STATS_ENABLED
+
+    if ( KMP_UBER_GTID(gtid) ) {
+        KA_TRACE( 10, ("__kmp_create_worker: uber thread (%d)\n", gtid ) );
+        th -> th.th_info.ds.ds_thread = pthread_self();
+        __kmp_set_stack_info( gtid, th );
+        __kmp_check_stack_overlap( th );
+        return;
+    }; // if
+
+    KA_TRACE( 10, ("__kmp_create_worker: try to create thread (%d)\n", gtid ) );
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+#ifdef KMP_THREAD_ATTR
+        {
+            status = pthread_attr_init( &thread_attr );
+            if ( status != 0 ) {
+                __kmp_msg(
+                          kmp_ms_fatal,
+                          KMP_MSG( CantInitThreadAttrs ),
+                          KMP_ERR( status ),
+                          __kmp_msg_null
+                          );
+            }; // if
+            status = pthread_attr_setdetachstate( & thread_attr, PTHREAD_CREATE_JOINABLE );
+            if ( status != 0 ) {
+                __kmp_msg(
+                          kmp_ms_fatal,
+                          KMP_MSG( CantSetWorkerState ),
+                          KMP_ERR( status ),
+                          __kmp_msg_null
+                          );
+            }; // if
+
+            /* Set stack size for this thread now. */
+            stack_size += gtid * __kmp_stkoffset;
+
+            KA_TRACE( 10, ( "__kmp_create_worker: T#%d, default stacksize = %lu bytes, "
+                            "__kmp_stksize = %lu bytes, final stacksize = %lu bytes\n",
+                            gtid, KMP_DEFAULT_STKSIZE, __kmp_stksize, stack_size ) );
+
+# ifdef _POSIX_THREAD_ATTR_STACKSIZE
+                status = pthread_attr_setstacksize( & thread_attr, stack_size );
+#  ifdef KMP_BACKUP_STKSIZE
+            if ( status != 0 ) {
+                if ( ! __kmp_env_stksize ) {
+                    stack_size = KMP_BACKUP_STKSIZE + gtid * __kmp_stkoffset;
+                    __kmp_stksize = KMP_BACKUP_STKSIZE;
+                    KA_TRACE( 10, ("__kmp_create_worker: T#%d, default stacksize = %lu bytes, "
+                                   "__kmp_stksize = %lu bytes, (backup) final stacksize = %lu "
+                                   "bytes\n",
+                                   gtid, KMP_DEFAULT_STKSIZE, __kmp_stksize, stack_size )
+                              );
+                    status = pthread_attr_setstacksize( &thread_attr, stack_size );
+                }; // if
+            }; // if
+#  endif /* KMP_BACKUP_STKSIZE */
+            if ( status != 0 ) {
+                __kmp_msg(
+                          kmp_ms_fatal,
+                          KMP_MSG( CantSetWorkerStackSize, stack_size ),
+                          KMP_ERR( status ),
+                          KMP_HNT( ChangeWorkerStackSize  ),
+                          __kmp_msg_null
+                          );
+            }; // if
+# endif /* _POSIX_THREAD_ATTR_STACKSIZE */
+        }
+#endif /* KMP_THREAD_ATTR */
+
+        {
+            status = pthread_create( & handle, & thread_attr, __kmp_launch_worker, (void *) th );
+            if ( status != 0 || ! handle ) { // ??? Why do we check handle??
+#ifdef _POSIX_THREAD_ATTR_STACKSIZE
+                if ( status == EINVAL ) {
+                    __kmp_msg(
+                              kmp_ms_fatal,
+                              KMP_MSG( CantSetWorkerStackSize, stack_size ),
+                              KMP_ERR( status ),
+                              KMP_HNT( IncreaseWorkerStackSize ),
+                              __kmp_msg_null
+                              );
+                };
+                if ( status == ENOMEM ) {
+                    __kmp_msg(
+                              kmp_ms_fatal,
+                              KMP_MSG( CantSetWorkerStackSize, stack_size ),
+                              KMP_ERR( status ),
+                              KMP_HNT( DecreaseWorkerStackSize ),
+                              __kmp_msg_null
+                              );
+                };
+#endif /* _POSIX_THREAD_ATTR_STACKSIZE */
+                if ( status == EAGAIN ) {
+                    __kmp_msg(
+                              kmp_ms_fatal,
+                              KMP_MSG( NoResourcesForWorkerThread ),
+                              KMP_ERR( status ),
+                              KMP_HNT( Decrease_NUM_THREADS ),
+                              __kmp_msg_null
+                              );
+                }; // if
+                KMP_SYSFAIL( "pthread_create", status );
+            }; // if
+
+            th->th.th_info.ds.ds_thread = handle;
+        }
+
+#ifdef KMP_THREAD_ATTR
+        {
+            status = pthread_attr_destroy( & thread_attr );
+            if ( status ) {
+                __kmp_msg(
+                          kmp_ms_warning,
+                          KMP_MSG( CantDestroyThreadAttrs ),
+                          KMP_ERR( status ),
+                          __kmp_msg_null
+                          );
+            }; // if
+        }
+#endif /* KMP_THREAD_ATTR */
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KA_TRACE( 10, ("__kmp_create_worker: done creating thread (%d)\n", gtid ) );
+
+} // __kmp_create_worker
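+
+/* Editorial note: worker creation above sizes the stack via
+   pthread_attr_setstacksize() and, if KMP_BACKUP_STKSIZE is defined, retries
+   with that smaller size when the first request is rejected.  A minimal
+   sketch of the fallback idiom (illustration only): */
+#if 0
+#include <pthread.h>
+static int
+example_set_stack_size( pthread_attr_t *attr, size_t want, size_t fallback )
+{
+    if ( pthread_attr_setstacksize( attr, want ) == 0 )
+        return 0;                                       /* request accepted  */
+    return pthread_attr_setstacksize( attr, fallback ); /* else try smaller  */
+}
+#endif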
+
+
+void
+__kmp_create_monitor( kmp_info_t *th )
+{
+    pthread_t           handle;
+    pthread_attr_t      thread_attr;
+    size_t              size;
+    int                 status;
+    int                 caller_gtid;
+    int                 auto_adj_size = FALSE;
+
+    caller_gtid = __kmp_get_gtid();
+
+    KA_TRACE( 10, ("__kmp_create_monitor: try to create monitor\n" ) );
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    th->th.th_info.ds.ds_tid  = KMP_GTID_MONITOR;
+    th->th.th_info.ds.ds_gtid = KMP_GTID_MONITOR;
+    #if KMP_REAL_TIME_FIX
+        TCW_4( __kmp_global.g.g_time.dt.t_value, -1 ); // Will use it for synchronization a bit later.
+    #else
+        TCW_4( __kmp_global.g.g_time.dt.t_value, 0 );
+    #endif // KMP_REAL_TIME_FIX
+
+    #ifdef KMP_THREAD_ATTR
+        if ( __kmp_monitor_stksize == 0 ) {
+            __kmp_monitor_stksize = KMP_DEFAULT_MONITOR_STKSIZE;
+            auto_adj_size = TRUE;
+        }
+        status = pthread_attr_init( &thread_attr );
+        if ( status != 0 ) {
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( CantInitThreadAttrs ),
+                KMP_ERR( status ),
+                __kmp_msg_null
+            );
+        }; // if
+        status = pthread_attr_setdetachstate( & thread_attr, PTHREAD_CREATE_JOINABLE );
+        if ( status != 0 ) {
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( CantSetMonitorState ),
+                KMP_ERR( status ),
+                __kmp_msg_null
+            );
+        }; // if
+
+        #ifdef _POSIX_THREAD_ATTR_STACKSIZE
+            status = pthread_attr_getstacksize( & thread_attr, & size );
+            KMP_CHECK_SYSFAIL( "pthread_attr_getstacksize", status );
+        #else
+            size = __kmp_sys_min_stksize;
+        #endif /* _POSIX_THREAD_ATTR_STACKSIZE */
+    #endif /* KMP_THREAD_ATTR */
+
+    if ( __kmp_monitor_stksize == 0 ) {
+        __kmp_monitor_stksize = KMP_DEFAULT_MONITOR_STKSIZE;
+    }
+    if ( __kmp_monitor_stksize < __kmp_sys_min_stksize ) {
+        __kmp_monitor_stksize = __kmp_sys_min_stksize;
+    }
+
+    KA_TRACE( 10, ( "__kmp_create_monitor: default stacksize = %lu bytes,"
+                    "requested stacksize = %lu bytes\n",
+                    size, __kmp_monitor_stksize ) );
+
+    retry:
+
+    /* Set stack size for this thread now. */
+
+    #ifdef _POSIX_THREAD_ATTR_STACKSIZE
+        KA_TRACE( 10, ( "__kmp_create_monitor: setting stacksize = %lu bytes,",
+                        __kmp_monitor_stksize ) );
+        status = pthread_attr_setstacksize( & thread_attr, __kmp_monitor_stksize );
+        if ( status != 0 ) {
+            if ( auto_adj_size ) {
+                __kmp_monitor_stksize *= 2;
+                goto retry;
+            }
+            __kmp_msg(
+                kmp_ms_warning,  // should this be fatal?  BB
+                KMP_MSG( CantSetMonitorStackSize, (long int) __kmp_monitor_stksize ),
+                KMP_ERR( status ),
+                KMP_HNT( ChangeMonitorStackSize ),
+                __kmp_msg_null
+            );
+        }; // if
+    #endif /* _POSIX_THREAD_ATTR_STACKSIZE */
+
+    status = pthread_create( &handle, & thread_attr, __kmp_launch_monitor, (void *) th );
+
+    if ( status != 0 ) {
+        #ifdef _POSIX_THREAD_ATTR_STACKSIZE
+            if ( status == EINVAL ) {
+                if ( auto_adj_size  && ( __kmp_monitor_stksize < (size_t)0x40000000 ) ) {
+                    __kmp_monitor_stksize *= 2;
+                    goto retry;
+                }
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG( CantSetMonitorStackSize, __kmp_monitor_stksize ),
+                    KMP_ERR( status ),
+                    KMP_HNT( IncreaseMonitorStackSize ),
+                    __kmp_msg_null
+                );
+            }; // if
+            if ( status == ENOMEM ) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG( CantSetMonitorStackSize, __kmp_monitor_stksize ),
+                    KMP_ERR( status ),
+                    KMP_HNT( DecreaseMonitorStackSize ),
+                    __kmp_msg_null
+                );
+            }; // if
+        #endif /* _POSIX_THREAD_ATTR_STACKSIZE */
+        if ( status == EAGAIN ) {
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( NoResourcesForMonitorThread ),
+                KMP_ERR( status ),
+                KMP_HNT( DecreaseNumberOfThreadsInUse ),
+                __kmp_msg_null
+            );
+        }; // if
+        KMP_SYSFAIL( "pthread_create", status );
+    }; // if
+
+    th->th.th_info.ds.ds_thread = handle;
+
+    #if KMP_REAL_TIME_FIX
+        // Wait until the monitor thread has really started and has set its *priority*.
+        KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == sizeof( __kmp_global.g.g_time.dt.t_value ) );
+        __kmp_wait_yield_4(
+            (kmp_uint32 volatile *) & __kmp_global.g.g_time.dt.t_value, -1, & __kmp_neq_4, NULL
+        );
+    #endif // KMP_REAL_TIME_FIX
+
+    #ifdef KMP_THREAD_ATTR
+        status = pthread_attr_destroy( & thread_attr );
+        if ( status != 0 ) {
+            __kmp_msg(    //
+                kmp_ms_warning,
+                KMP_MSG( CantDestroyThreadAttrs ),
+                KMP_ERR( status ),
+                __kmp_msg_null
+            );
+        }; // if
+    #endif
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KA_TRACE( 10, ( "__kmp_create_monitor: monitor created %#.8lx\n", th->th.th_info.ds.ds_thread ) );
+
+} // __kmp_create_monitor
+
+void
+__kmp_exit_thread(
+    int exit_status
+) {
+    pthread_exit( (void *)(intptr_t) exit_status );
+} // __kmp_exit_thread
+
+void __kmp_resume_monitor();
+
+void
+__kmp_reap_monitor( kmp_info_t *th )
+{
+    int          status;
+    void        *exit_val;
+
+    KA_TRACE( 10, ("__kmp_reap_monitor: try to reap monitor thread with handle %#.8lx\n",
+                   th->th.th_info.ds.ds_thread ) );
+
+    // If monitor has been created, its tid and gtid should be KMP_GTID_MONITOR.
+    // If both tid and gtid are 0, it means the monitor did not ever start.
+    // If both tid and gtid are KMP_GTID_DNE, the monitor has been shut down.
+    KMP_DEBUG_ASSERT( th->th.th_info.ds.ds_tid == th->th.th_info.ds.ds_gtid );
+    if ( th->th.th_info.ds.ds_gtid != KMP_GTID_MONITOR ) {
+        return;
+    }; // if
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+
+    /* First, check to see whether the monitor thread exists.  This could prevent a hang,
+       but if the monitor dies after the pthread_kill call and before the pthread_join
+       call, it will still hang. */
+
+    status = pthread_kill( th->th.th_info.ds.ds_thread, 0 );
+    if (status == ESRCH) {
+
+        KA_TRACE( 10, ("__kmp_reap_monitor: monitor does not exist, returning\n") );
+
+    } else
+    {
+        __kmp_resume_monitor();   // Wake up the monitor thread
+        status = pthread_join( th->th.th_info.ds.ds_thread, & exit_val);
+        if (exit_val != th) {
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( ReapMonitorError ),
+                KMP_ERR( status ),
+                __kmp_msg_null
+            );
+        }
+    }
+
+    th->th.th_info.ds.ds_tid  = KMP_GTID_DNE;
+    th->th.th_info.ds.ds_gtid = KMP_GTID_DNE;
+
+    KA_TRACE( 10, ("__kmp_reap_monitor: done reaping monitor thread with handle %#.8lx\n",
+                   th->th.th_info.ds.ds_thread ) );
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+}
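+
+/* Editorial note: both reap routines probe for thread existence with
+   pthread_kill( tid, 0 ), which delivers no signal but returns ESRCH if the
+   thread is gone.  A minimal sketch of the probe-then-join idiom
+   (illustration only; as the comment above notes, it narrows but does not
+   close the race): */
+#if 0
+#include <pthread.h>
+#include <signal.h>
+#include <errno.h>
+static int
+example_join_if_alive( pthread_t tid, void **exit_val )
+{
+    if ( pthread_kill( tid, 0 ) == ESRCH )
+        return ESRCH;                      /* thread already gone           */
+    return pthread_join( tid, exit_val );  /* otherwise wait for it to exit */
+}
+#endif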
+
+void
+__kmp_reap_worker( kmp_info_t *th )
+{
+    int          status;
+    void        *exit_val;
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KA_TRACE( 10, ("__kmp_reap_worker: try to reap T#%d\n", th->th.th_info.ds.ds_gtid ) );
+
+    /* First, check to see whether the worker thread exists.  This could prevent a hang,
+       but if the worker dies after the pthread_kill call and before the pthread_join
+       call, it will still hang. */
+
+    status = pthread_kill( th->th.th_info.ds.ds_thread, 0 );
+    if (status == ESRCH) {
+        KA_TRACE( 10, ("__kmp_reap_worker: worker T#%d does not exist, returning\n",
+                       th->th.th_info.ds.ds_gtid ) );
+    }
+    else {
+        KA_TRACE( 10, ("__kmp_reap_worker: try to join with worker T#%d\n",
+                       th->th.th_info.ds.ds_gtid ) );
+
+        status = pthread_join( th->th.th_info.ds.ds_thread, & exit_val);
+#ifdef KMP_DEBUG
+        /* Don't expose these to the user until we understand when they trigger */
+        if ( status != 0 ) {
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( ReapWorkerError ),
+                KMP_ERR( status ),
+                __kmp_msg_null
+            );
+        }
+        if ( exit_val != th ) {
+            KA_TRACE( 10, ( "__kmp_reap_worker: worker T#%d did not reap properly, "
+                            "exit_val = %p\n",
+                            th->th.th_info.ds.ds_gtid, exit_val ) );
+        }
+#endif /* KMP_DEBUG */
+    }
+
+    KA_TRACE( 10, ("__kmp_reap_worker: done reaping T#%d\n", th->th.th_info.ds.ds_gtid ) );
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+}
+
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#if KMP_HANDLE_SIGNALS
+
+
+static void
+__kmp_null_handler( int signo )
+{
+    //  Do nothing; used to implement SIG_IGN-type actions.
+} // __kmp_null_handler
+
+
+static void
+__kmp_team_handler( int signo )
+{
+    if ( __kmp_global.g.g_abort == 0 ) {
+        /* Stage 1 signal handler, let's shut down all of the threads */
+        #ifdef KMP_DEBUG
+            __kmp_debug_printf( "__kmp_team_handler: caught signal = %d\n", signo );
+        #endif
+        switch ( signo ) {
+            case SIGHUP  :
+            case SIGINT  :
+            case SIGQUIT :
+            case SIGILL  :
+            case SIGABRT :
+            case SIGFPE  :
+            case SIGBUS  :
+            case SIGSEGV :
+            #ifdef SIGSYS
+                case SIGSYS :
+            #endif
+            case SIGTERM :
+                if ( __kmp_debug_buf ) {
+                    __kmp_dump_debug_buffer( );
+                }; // if
+                KMP_MB();       // Flush all pending memory write invalidates.
+                TCW_4( __kmp_global.g.g_abort, signo );
+                KMP_MB();       // Flush all pending memory write invalidates.
+                TCW_4( __kmp_global.g.g_done, TRUE );
+                KMP_MB();       // Flush all pending memory write invalidates.
+                break;
+            default:
+                #ifdef KMP_DEBUG
+                    __kmp_debug_printf( "__kmp_team_handler: unknown signal type" );
+                #endif
+                break;
+        }; // switch
+    }; // if
+} // __kmp_team_handler
+
+
+static
+void __kmp_sigaction( int signum, const struct sigaction * act, struct sigaction * oldact ) {
+    int rc = sigaction( signum, act, oldact );
+    KMP_CHECK_SYSFAIL_ERRNO( "sigaction", rc );
+}
+
+
+static void
+__kmp_install_one_handler( int sig, sig_func_t handler_func, int parallel_init )
+{
+    KMP_MB();       // Flush all pending memory write invalidates.
+    KB_TRACE( 60, ( "__kmp_install_one_handler( %d, ..., %d )\n", sig, parallel_init ) );
+    if ( parallel_init ) {
+        struct sigaction new_action;
+        struct sigaction old_action;
+        new_action.sa_handler = handler_func;
+        new_action.sa_flags   = 0;
+        sigfillset( & new_action.sa_mask );
+        __kmp_sigaction( sig, & new_action, & old_action );
+        if ( old_action.sa_handler == __kmp_sighldrs[ sig ].sa_handler ) {
+            sigaddset( & __kmp_sigset, sig );
+        } else {
+            // Restore/keep the user's handler if one was previously installed.
+            __kmp_sigaction( sig, & old_action, NULL );
+        }; // if
+    } else {
+        // Save the initial/system signal handlers to see if user handlers were installed.
+        __kmp_sigaction( sig, NULL, & __kmp_sighldrs[ sig ] );
+    }; // if
+    KMP_MB();       // Flush all pending memory write invalidates.
+} // __kmp_install_one_handler
+
+
+static void
+__kmp_remove_one_handler( int sig )
+{
+    KB_TRACE( 60, ( "__kmp_remove_one_handler( %d )\n", sig ) );
+    if ( sigismember( & __kmp_sigset, sig ) ) {
+        struct sigaction old;
+        KMP_MB();       // Flush all pending memory write invalidates.
+        __kmp_sigaction( sig, & __kmp_sighldrs[ sig ], & old );
+        if ( ( old.sa_handler != __kmp_team_handler ) && ( old.sa_handler != __kmp_null_handler ) ) {
+            // Restore the user's signal handler.
+            KB_TRACE( 10, ( "__kmp_remove_one_handler: oops, not our handler, restoring: sig=%d\n", sig ) );
+            __kmp_sigaction( sig, & old, NULL );
+        }; // if
+        sigdelset( & __kmp_sigset, sig );
+        KMP_MB();       // Flush all pending memory write invalidates.
+    }; // if
+} // __kmp_remove_one_handler
+
+
+void
+__kmp_install_signals( int parallel_init )
+{
+    KB_TRACE( 10, ( "__kmp_install_signals( %d )\n", parallel_init ) );
+    if ( __kmp_handle_signals || ! parallel_init ) {
+        // If ! parallel_init, we do not install handlers, just save the original handlers.
+        // Let us do it even if __kmp_handle_signals is 0.
+        sigemptyset( & __kmp_sigset );
+        __kmp_install_one_handler( SIGHUP,  __kmp_team_handler, parallel_init );
+        __kmp_install_one_handler( SIGINT,  __kmp_team_handler, parallel_init );
+        __kmp_install_one_handler( SIGQUIT, __kmp_team_handler, parallel_init );
+        __kmp_install_one_handler( SIGILL,  __kmp_team_handler, parallel_init );
+        __kmp_install_one_handler( SIGABRT, __kmp_team_handler, parallel_init );
+        __kmp_install_one_handler( SIGFPE,  __kmp_team_handler, parallel_init );
+        __kmp_install_one_handler( SIGBUS,  __kmp_team_handler, parallel_init );
+        __kmp_install_one_handler( SIGSEGV, __kmp_team_handler, parallel_init );
+        #ifdef SIGSYS
+            __kmp_install_one_handler( SIGSYS,  __kmp_team_handler, parallel_init );
+        #endif // SIGSYS
+        __kmp_install_one_handler( SIGTERM, __kmp_team_handler, parallel_init );
+        #ifdef SIGPIPE
+            __kmp_install_one_handler( SIGPIPE, __kmp_team_handler, parallel_init );
+        #endif // SIGPIPE
+    }; // if
+} // __kmp_install_signals
+
+
+void
+__kmp_remove_signals( void )
+{
+    int    sig;
+    KB_TRACE( 10, ( "__kmp_remove_signals()\n" ) );
+    for ( sig = 1; sig < NSIG; ++ sig ) {
+        __kmp_remove_one_handler( sig );
+    }; // for sig
+} // __kmp_remove_signals
+
+
+#endif // KMP_HANDLE_SIGNALS
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_enable( int new_state )
+{
+    #ifdef KMP_CANCEL_THREADS
+        int status, old_state;
+        status = pthread_setcancelstate( new_state, & old_state );
+        KMP_CHECK_SYSFAIL( "pthread_setcancelstate", status );
+        KMP_DEBUG_ASSERT( old_state == PTHREAD_CANCEL_DISABLE );
+    #endif
+}
+
+void
+__kmp_disable( int * old_state )
+{
+    #ifdef KMP_CANCEL_THREADS
+        int status;
+        status = pthread_setcancelstate( PTHREAD_CANCEL_DISABLE, old_state );
+        KMP_CHECK_SYSFAIL( "pthread_setcancelstate", status );
+    #endif
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+static void
+__kmp_atfork_prepare (void)
+{
+    /*  nothing to do  */
+}
+
+static void
+__kmp_atfork_parent (void)
+{
+    /*  nothing to do  */
+}
+
+/*
+    Reset the library so execution in the child starts "all over again" with
+    clean data structures in initial states.  Don't worry about freeing memory
+    allocated by parent, just abandon it to be safe.
+*/
+static void
+__kmp_atfork_child (void)
+{
+    /* TODO make sure this is done right for nested/sibling */
+    // ATT: Are there memory leaks here? TODO: Check and fix.
+    /* KMP_ASSERT( 0 ); */
+
+    ++__kmp_fork_count;
+
+    __kmp_init_runtime = FALSE;
+    __kmp_init_monitor = 0;
+    __kmp_init_parallel = FALSE;
+    __kmp_init_middle = FALSE;
+    __kmp_init_serial = FALSE;
+    TCW_4(__kmp_init_gtid, FALSE);
+    __kmp_init_common = FALSE;
+
+    TCW_4(__kmp_init_user_locks, FALSE);
+#if ! KMP_USE_DYNAMIC_LOCK
+    __kmp_user_lock_table.used = 1;
+    __kmp_user_lock_table.allocated = 0;
+    __kmp_user_lock_table.table = NULL;
+    __kmp_lock_blocks = NULL;
+#endif
+
+    __kmp_all_nth = 0;
+    TCW_4(__kmp_nth, 0);
+
+    /* Must actually zero all the *cache arguments passed to __kmpc_threadprivate here
+       so threadprivate doesn't use stale data */
+    KA_TRACE( 10, ( "__kmp_atfork_child: checking cache address list %p\n",
+                 __kmp_threadpriv_cache_list ) );
+
+    while ( __kmp_threadpriv_cache_list != NULL ) {
+
+        if ( *__kmp_threadpriv_cache_list -> addr != NULL ) {
+            KC_TRACE( 50, ( "__kmp_atfork_child: zeroing cache at address %p\n",
+                        &(*__kmp_threadpriv_cache_list -> addr) ) );
+
+            *__kmp_threadpriv_cache_list -> addr = NULL;
+        }
+        __kmp_threadpriv_cache_list = __kmp_threadpriv_cache_list -> next;
+    }
+
+    __kmp_init_runtime = FALSE;
+
+    /* reset statically initialized locks */
+    __kmp_init_bootstrap_lock( &__kmp_initz_lock );
+    __kmp_init_bootstrap_lock( &__kmp_stdio_lock );
+    __kmp_init_bootstrap_lock( &__kmp_console_lock );
+
+    /* This is necessary to make sure no stale data is left around */
+    /* AC: customers complain that we use unsafe routines in the atfork
+       handler. Mathworks: dlsym() is unsafe. We call dlsym and dlopen
+       in dynamic_link when checking for the presence of the shared tbbmalloc
+       library. The suggestion is to make the library initialization lazier,
+       similar to what is done for __kmpc_begin(). */
+    // TODO: synchronize all static initializations with regular library
+    //       startup; look at kmp_global.c and etc.
+    //__kmp_internal_begin ();
+
+}
+
+void
+__kmp_register_atfork(void) {
+    if ( __kmp_need_register_atfork ) {
+        int status = pthread_atfork( __kmp_atfork_prepare, __kmp_atfork_parent, __kmp_atfork_child );
+        KMP_CHECK_SYSFAIL( "pthread_atfork", status );
+        __kmp_need_register_atfork = FALSE;
+    }
+}
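+
+/* Editorial note: pthread_atfork() registers three callbacks -- run in the
+   parent before fork(), in the parent after fork(), and in the child after
+   fork() -- which is how __kmp_atfork_child above gets its chance to reset
+   the library.  A minimal usage sketch (illustration only): */
+#if 0
+#include <pthread.h>
+static void before( void ) { /* e.g. acquire locks so the child sees a consistent state */ }
+static void parent( void ) { /* e.g. release those locks in the parent */ }
+static void child ( void ) { /* e.g. reinitialize locks and state in the child */ }
+static void
+example_register( void )
+{
+    pthread_atfork( before, parent, child );  /* registering once per process suffices */
+}
+#endif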
+
+void
+__kmp_suspend_initialize( void )
+{
+    int status;
+    status = pthread_mutexattr_init( &__kmp_suspend_mutex_attr );
+    KMP_CHECK_SYSFAIL( "pthread_mutexattr_init", status );
+    status = pthread_condattr_init( &__kmp_suspend_cond_attr );
+    KMP_CHECK_SYSFAIL( "pthread_condattr_init", status );
+}
+
+static void
+__kmp_suspend_initialize_thread( kmp_info_t *th )
+{
+    if ( th->th.th_suspend_init_count <= __kmp_fork_count ) {
+        /* this means we haven't initialized the suspension pthread objects for this thread
+           in this instance of the process */
+        int     status;
+        status = pthread_cond_init( &th->th.th_suspend_cv.c_cond, &__kmp_suspend_cond_attr );
+        KMP_CHECK_SYSFAIL( "pthread_cond_init", status );
+        status = pthread_mutex_init( &th->th.th_suspend_mx.m_mutex, & __kmp_suspend_mutex_attr );
+        KMP_CHECK_SYSFAIL( "pthread_mutex_init", status );
+        *(volatile int*)&th->th.th_suspend_init_count = __kmp_fork_count + 1;
+    };
+}
+
+void
+__kmp_suspend_uninitialize_thread( kmp_info_t *th )
+{
+    if(th->th.th_suspend_init_count > __kmp_fork_count) {
+        /* this means we have initialized the suspension pthread objects for this thread
+           in this instance of the process */
+        int status;
+
+        status = pthread_cond_destroy( &th->th.th_suspend_cv.c_cond );
+        if ( status != 0 && status != EBUSY ) {
+            KMP_SYSFAIL( "pthread_cond_destroy", status );
+        };
+        status = pthread_mutex_destroy( &th->th.th_suspend_mx.m_mutex );
+        if ( status != 0 && status != EBUSY ) {
+            KMP_SYSFAIL( "pthread_mutex_destroy", status );
+        };
+        --th->th.th_suspend_init_count;
+        KMP_DEBUG_ASSERT(th->th.th_suspend_init_count == __kmp_fork_count);
+    }
+}
+
+/* This routine puts the calling thread to sleep after setting the
+ * sleep bit for the indicated flag variable to true.
+ */
+template <class C>
+static inline void __kmp_suspend_template( int th_gtid, C *flag )
+{
+    KMP_TIME_BLOCK(USER_suspend);
+    kmp_info_t *th = __kmp_threads[th_gtid];
+    int status;
+    typename C::flag_t old_spin;
+
+    KF_TRACE( 30, ("__kmp_suspend_template: T#%d enter for flag = %p\n", th_gtid, flag->get() ) );
+
+    __kmp_suspend_initialize_thread( th );
+
+    status = pthread_mutex_lock( &th->th.th_suspend_mx.m_mutex );
+    KMP_CHECK_SYSFAIL( "pthread_mutex_lock", status );
+
+    KF_TRACE( 10, ( "__kmp_suspend_template: T#%d setting sleep bit for spin(%p)\n",
+                    th_gtid, flag->get() ) );
+
+    /* TODO: shouldn't this use release semantics to ensure that __kmp_suspend_initialize_thread
+       gets called first?
+    */
+    old_spin = flag->set_sleeping();
+
+    KF_TRACE( 5, ( "__kmp_suspend_template: T#%d set sleep bit for spin(%p)==%d\n",
+                   th_gtid, flag->get(), *(flag->get()) ) );
+
+    if ( flag->done_check_val(old_spin) ) {
+        old_spin = flag->unset_sleeping();
+        KF_TRACE( 5, ( "__kmp_suspend_template: T#%d false alarm, reset sleep bit for spin(%p)\n",
+                       th_gtid, flag->get()) );
+    } else {
+        /* Encapsulate the wait in a loop, as the documentation states that
+         * pthread_cond_wait() may "with low probability" return when the
+         * condition variable has not been signaled or broadcast.
+         */
+        int deactivated = FALSE;
+        TCW_PTR(th->th.th_sleep_loc, (void *)flag);
+        while ( flag->is_sleeping() ) {
+#ifdef DEBUG_SUSPEND
+            char buffer[128];
+            __kmp_suspend_count++;
+            __kmp_print_cond( buffer, &th->th.th_suspend_cv );
+            __kmp_printf( "__kmp_suspend_template: suspending T#%d: %s\n", th_gtid, buffer );
+#endif
+            // Mark the thread as no longer active (only in the first iteration of the loop).
+            if ( ! deactivated ) {
+                th->th.th_active = FALSE;
+                if ( th->th.th_active_in_pool ) {
+                    th->th.th_active_in_pool = FALSE;
+                    KMP_TEST_THEN_DEC32(
+                      (kmp_int32 *) &__kmp_thread_pool_active_nth );
+                    KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
+                }
+                deactivated = TRUE;
+
+
+            }
+
+#if USE_SUSPEND_TIMEOUT
+            struct timespec  now;
+            struct timeval   tval;
+            int msecs;
+
+            status = gettimeofday( &tval, NULL );
+            KMP_CHECK_SYSFAIL_ERRNO( "gettimeofday", status );
+            TIMEVAL_TO_TIMESPEC( &tval, &now );
+
+            msecs = (4*__kmp_dflt_blocktime) + 200;
+            now.tv_sec  += msecs / 1000;
+            now.tv_nsec += (msecs % 1000)*1000;
+
+            KF_TRACE( 15, ( "__kmp_suspend_template: T#%d about to perform pthread_cond_timedwait\n",
+                            th_gtid ) );
+            status = pthread_cond_timedwait( &th->th.th_suspend_cv.c_cond, &th->th.th_suspend_mx.m_mutex, & now );
+#else
+            KF_TRACE( 15, ( "__kmp_suspend_template: T#%d about to perform pthread_cond_wait\n",
+                               th_gtid ) );
+
+            status = pthread_cond_wait( &th->th.th_suspend_cv.c_cond, &th->th.th_suspend_mx.m_mutex );
+#endif
+
+            if ( (status != 0) && (status != EINTR) && (status != ETIMEDOUT) ) {
+                KMP_SYSFAIL( "pthread_cond_wait", status );
+            }
+#ifdef KMP_DEBUG
+            if (status == ETIMEDOUT) {
+                if ( flag->is_sleeping() ) {
+                    KF_TRACE( 100, ( "__kmp_suspend_template: T#%d timeout wakeup\n", th_gtid ) );
+                } else {
+                    KF_TRACE( 2, ( "__kmp_suspend_template: T#%d timeout wakeup, sleep bit not set!\n",
+                                   th_gtid ) );
+                }
+            } else if ( flag->is_sleeping() ) {
+                KF_TRACE( 100, ( "__kmp_suspend_template: T#%d spurious wakeup\n", th_gtid ) );
+            }
+#endif
+        } // while
+
+        // Mark the thread as active again (if it was previously marked as inactive)
+        if ( deactivated ) {
+            th->th.th_active = TRUE;
+            if ( TCR_4(th->th.th_in_pool) ) {
+                KMP_TEST_THEN_INC32( (kmp_int32 *) &__kmp_thread_pool_active_nth );
+                th->th.th_active_in_pool = TRUE;
+            }
+        }
+    }
+
+#ifdef DEBUG_SUSPEND
+    {
+        char buffer[128];
+        __kmp_print_cond( buffer, &th->th.th_suspend_cv);
+        __kmp_printf( "__kmp_suspend_template: T#%d has awakened: %s\n", th_gtid, buffer );
+    }
+#endif
+
+
+    status = pthread_mutex_unlock( &th->th.th_suspend_mx.m_mutex );
+    KMP_CHECK_SYSFAIL( "pthread_mutex_unlock", status );
+
+    KF_TRACE( 30, ("__kmp_suspend_template: T#%d exit\n", th_gtid ) );
+}
+
+void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) {
+    __kmp_suspend_template(th_gtid, flag);
+}
+void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) {
+    __kmp_suspend_template(th_gtid, flag);
+}
+void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) {
+    __kmp_suspend_template(th_gtid, flag);
+}
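+
+/* Editorial note: __kmp_suspend_template above is the classic condition-variable
+   sleep: set a "sleeping" flag under the mutex, then wait in a loop because
+   pthread_cond_wait() may wake spuriously; the resume side clears the flag and
+   signals under the same mutex.  A minimal sketch of the handshake
+   (illustration only): */
+#if 0
+#include <pthread.h>
+static pthread_mutex_t mx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
+static int sleeping = 0;
+static void
+example_suspend( void )
+{
+    pthread_mutex_lock( &mx );
+    sleeping = 1;
+    while ( sleeping )                  /* re-check: wakeups may be spurious   */
+        pthread_cond_wait( &cv, &mx );  /* atomically releases mx while asleep */
+    pthread_mutex_unlock( &mx );
+}
+static void
+example_resume( void )
+{
+    pthread_mutex_lock( &mx );
+    sleeping = 0;                       /* clear the flag under the same mutex */
+    pthread_cond_signal( &cv );
+    pthread_mutex_unlock( &mx );
+}
+#endif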
+
+
+/* This routine signals the thread specified by target_gtid to wake up
+ * after setting the sleep bit indicated by the flag argument to FALSE.
+ * The target thread must already have called __kmp_suspend_template()
+ */
+template <class C>
+static inline void __kmp_resume_template( int target_gtid, C *flag )
+{
+    kmp_info_t *th = __kmp_threads[target_gtid];
+    int status;
+
+#ifdef KMP_DEBUG
+    int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
+#endif
+
+    KF_TRACE( 30, ( "__kmp_resume_template: T#%d wants to wakeup T#%d enter\n", gtid, target_gtid ) );
+    KMP_DEBUG_ASSERT( gtid != target_gtid );
+
+    __kmp_suspend_initialize_thread( th );
+
+    status = pthread_mutex_lock( &th->th.th_suspend_mx.m_mutex );
+    KMP_CHECK_SYSFAIL( "pthread_mutex_lock", status );
+
+    if (!flag) {
+        flag = (C *)th->th.th_sleep_loc;
+    }
+
+    if (!flag) {
+        KF_TRACE( 5, ( "__kmp_resume_template: T#%d exiting, thread T#%d already awake: flag(%p)\n",
+                       gtid, target_gtid, NULL ) );
+        status = pthread_mutex_unlock( &th->th.th_suspend_mx.m_mutex );
+        KMP_CHECK_SYSFAIL( "pthread_mutex_unlock", status );
+        return;
+    }
+    else {
+        typename C::flag_t old_spin = flag->unset_sleeping();
+        if ( ! flag->is_sleeping_val(old_spin) ) {
+            KF_TRACE( 5, ( "__kmp_resume_template: T#%d exiting, thread T#%d already awake: flag(%p): "
+                           "%u => %u\n",
+                           gtid, target_gtid, flag->get(), old_spin, *flag->get() ) );
+
+            status = pthread_mutex_unlock( &th->th.th_suspend_mx.m_mutex );
+            KMP_CHECK_SYSFAIL( "pthread_mutex_unlock", status );
+            return;
+        }
+        KF_TRACE( 5, ( "__kmp_resume_template: T#%d about to wakeup T#%d, reset sleep bit for flag's loc(%p): "
+                       "%u => %u\n",
+                       gtid, target_gtid, flag->get(), old_spin, *flag->get() ) );
+    }
+    TCW_PTR(th->th.th_sleep_loc, NULL);
+
+
+#ifdef DEBUG_SUSPEND
+    {
+        char buffer[128];
+        __kmp_print_cond( buffer, &th->th.th_suspend_cv );
+        __kmp_printf( "__kmp_resume_template: T#%d resuming T#%d: %s\n", gtid, target_gtid, buffer );
+    }
+#endif
+
+
+    status = pthread_cond_signal( &th->th.th_suspend_cv.c_cond );
+    KMP_CHECK_SYSFAIL( "pthread_cond_signal", status );
+    status = pthread_mutex_unlock( &th->th.th_suspend_mx.m_mutex );
+    KMP_CHECK_SYSFAIL( "pthread_mutex_unlock", status );
+    KF_TRACE( 30, ( "__kmp_resume_template: T#%d exiting after signaling wake up for T#%d\n",
+                    gtid, target_gtid ) );
+}
+
+void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) {
+    __kmp_resume_template(target_gtid, flag);
+}
+void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) {
+    __kmp_resume_template(target_gtid, flag);
+}
+void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) {
+    __kmp_resume_template(target_gtid, flag);
+}
+
+void
+__kmp_resume_monitor()
+{
+    KMP_TIME_BLOCK(USER_resume);
+    int status;
+#ifdef KMP_DEBUG
+    int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
+    KF_TRACE( 30, ( "__kmp_resume_monitor: T#%d wants to wakeup T#%d enter\n",
+                    gtid, KMP_GTID_MONITOR ) );
+    KMP_DEBUG_ASSERT( gtid != KMP_GTID_MONITOR );
+#endif
+    status = pthread_mutex_lock( &__kmp_wait_mx.m_mutex );
+    KMP_CHECK_SYSFAIL( "pthread_mutex_lock", status );
+#ifdef DEBUG_SUSPEND
+    {
+        char buffer[128];
+        __kmp_print_cond( buffer, &__kmp_wait_cv.c_cond );
+        __kmp_printf( "__kmp_resume_monitor: T#%d resuming T#%d: %s\n", gtid, KMP_GTID_MONITOR, buffer );
+    }
+#endif
+    status = pthread_cond_signal( &__kmp_wait_cv.c_cond );
+    KMP_CHECK_SYSFAIL( "pthread_cond_signal", status );
+    status = pthread_mutex_unlock( &__kmp_wait_mx.m_mutex );
+    KMP_CHECK_SYSFAIL( "pthread_mutex_unlock", status );
+    KF_TRACE( 30, ( "__kmp_resume_monitor: T#%d exiting after signaling wake up for T#%d\n",
+                    gtid, KMP_GTID_MONITOR ) );
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_yield( int cond )
+{
+    if (cond && __kmp_yielding_on) {
+        sched_yield();
+    }
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_gtid_set_specific( int gtid )
+{
+    int status;
+    KMP_ASSERT( __kmp_init_runtime );
+    status = pthread_setspecific( __kmp_gtid_threadprivate_key, (void*)(intptr_t)(gtid+1) );
+    KMP_CHECK_SYSFAIL( "pthread_setspecific", status );
+}
+
+int
+__kmp_gtid_get_specific()
+{
+    int gtid;
+    if ( !__kmp_init_runtime ) {
+        KA_TRACE( 50, ("__kmp_get_specific: runtime shutdown, returning KMP_GTID_SHUTDOWN\n" ) );
+        return KMP_GTID_SHUTDOWN;
+    }
+    gtid = (int)(size_t)pthread_getspecific( __kmp_gtid_threadprivate_key );
+    if ( gtid == 0 ) {
+        gtid = KMP_GTID_DNE;
+    }
+    else {
+        gtid--;
+    }
+    KA_TRACE( 50, ("__kmp_gtid_get_specific: key:%d gtid:%d\n",
+               __kmp_gtid_threadprivate_key, gtid ));
+    return gtid;
+}
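+
+/* Editorial note: the gtid is stored in the TLS key as gtid+1 because
+   pthread_getspecific() returns NULL (0) for a key that was never set; the
+   offset lets 0 mean "not set" and is undone on the way out, as above.  A
+   minimal sketch (illustration only; the key would be created elsewhere with
+   pthread_key_create): */
+#if 0
+#include <pthread.h>
+#include <stdint.h>
+static pthread_key_t example_key;
+static void
+example_store_gtid( int gtid )
+{
+    pthread_setspecific( example_key, (void *)(intptr_t)(gtid + 1) );
+}
+static int
+example_load_gtid( void )
+{
+    int g = (int)(intptr_t)pthread_getspecific( example_key );
+    return ( g == 0 ) ? -1 /* never set */ : g - 1;
+}
+#endif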
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+double
+__kmp_read_cpu_time( void )
+{
+    struct tms  buffer;
+
+    times( & buffer );
+
+    /* times() reports CPU time in clock ticks; convert with the per-process
+       tick rate (_SC_CLK_TCK) rather than CLOCKS_PER_SEC, which applies to
+       clock(). */
+    return (buffer.tms_utime + buffer.tms_cutime) / (double) sysconf( _SC_CLK_TCK );
+}
+
+int
+__kmp_read_system_info( struct kmp_sys_info *info )
+{
+    int status;
+    struct rusage r_usage;
+
+    memset( info, 0, sizeof( *info ) );
+
+    status = getrusage( RUSAGE_SELF, &r_usage);
+    KMP_CHECK_SYSFAIL_ERRNO( "getrusage", status );
+
+    info->maxrss  = r_usage.ru_maxrss;  /* the maximum resident set size utilized (in kilobytes)     */
+    info->minflt  = r_usage.ru_minflt;  /* the number of page faults serviced without any I/O        */
+    info->majflt  = r_usage.ru_majflt;  /* the number of page faults serviced that required I/O      */
+    info->nswap   = r_usage.ru_nswap;   /* the number of times a process was "swapped" out of memory */
+    info->inblock = r_usage.ru_inblock; /* the number of times the file system had to perform input  */
+    info->oublock = r_usage.ru_oublock; /* the number of times the file system had to perform output */
+    info->nvcsw   = r_usage.ru_nvcsw;   /* the number of times a context switch was performed voluntarily */
+    info->nivcsw  = r_usage.ru_nivcsw;  /* the number of times a context switch was forced                */
+
+    return (status != 0);
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+
+void
+__kmp_read_system_time( double *delta )
+{
+    double              t_ns;
+    struct timeval      tval;
+    struct timespec     stop;
+    int status;
+
+    status = gettimeofday( &tval, NULL );
+    KMP_CHECK_SYSFAIL_ERRNO( "gettimeofday", status );
+    TIMEVAL_TO_TIMESPEC( &tval, &stop );
+    t_ns = TS2NS(stop) - TS2NS(__kmp_sys_timer_data.start);
+    *delta = (t_ns * 1e-9);
+}
+
+void
+__kmp_clear_system_time( void )
+{
+    struct timeval tval;
+    int status;
+    status = gettimeofday( &tval, NULL );
+    KMP_CHECK_SYSFAIL_ERRNO( "gettimeofday", status );
+    TIMEVAL_TO_TIMESPEC( &tval, &__kmp_sys_timer_data.start );
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#ifdef BUILD_TV
+
+void
+__kmp_tv_threadprivate_store( kmp_info_t *th, void *global_addr, void *thread_addr )
+{
+    struct tv_data *p;
+
+    p = (struct tv_data *) __kmp_allocate( sizeof( *p ) );
+
+    p->u.tp.global_addr = global_addr;
+    p->u.tp.thread_addr = thread_addr;
+
+    p->type = (void *) 1;
+
+    p->next =  th->th.th_local.tv_data;
+    th->th.th_local.tv_data = p;
+
+    if ( p->next == 0 ) {
+        int rc = pthread_setspecific( __kmp_tv_key, p );
+        KMP_CHECK_SYSFAIL( "pthread_setspecific", rc );
+    }
+}
+
+#endif /* BUILD_TV */
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+static int
+__kmp_get_xproc( void ) {
+
+    int r = 0;
+
+    #if KMP_OS_LINUX
+
+        r = sysconf( _SC_NPROCESSORS_ONLN );
+
+    #elif KMP_OS_DARWIN
+
+        // Bug C77011 High "OpenMP Threads and number of active cores".
+
+        // Find the number of available CPUs.
+        kern_return_t          rc;
+        host_basic_info_data_t info;
+        mach_msg_type_number_t num = HOST_BASIC_INFO_COUNT;
+        rc = host_info( mach_host_self(), HOST_BASIC_INFO, (host_info_t) & info, & num );
+        if ( rc == 0 && num == HOST_BASIC_INFO_COUNT ) {
+            // Cannot use KA_TRACE() here because this code works before trace support is
+            // initialized.
+            r = info.avail_cpus;
+        } else {
+            KMP_WARNING( CantGetNumAvailCPU );
+            KMP_INFORM( AssumedNumCPU );
+        }; // if
+
+    #elif KMP_OS_FREEBSD
+
+        int mib[] = { CTL_HW, HW_NCPU };
+        size_t len = sizeof( r );
+        if ( sysctl( mib, 2, &r, &len, NULL, 0 ) < 0 ) {
+             r = 0;
+             KMP_WARNING( CantGetNumAvailCPU );
+             KMP_INFORM( AssumedNumCPU );
+        }
+
+    #else
+
+        #error "Unknown or unsupported OS."
+
+    #endif
+
+    return r > 0 ? r : 2; /* guess value of 2 if OS told us 0 */
+
+} // __kmp_get_xproc
+
+int
+__kmp_read_from_file( char const *path, char const *format, ... )
+{
+    int result;
+    va_list args;
+
+    va_start(args, format);
+    FILE *f = fopen(path, "rb");
+    if ( f == NULL ) {
+        va_end(args);
+        return 0;
+    }
+    result = vfscanf(f, format, args);
+    va_end(args);
+    fclose(f);
+
+    return result;
+}
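+
+/* Editorial note: a hypothetical use of the helper above, pulling a single
+   integer out of a /proc pseudo-file; vfscanf() returns the number of fields
+   converted (illustration only): */
+#if 0
+int pid_max = 0;
+if ( __kmp_read_from_file( "/proc/sys/kernel/pid_max", "%d", &pid_max ) == 1 ) {
+    /* exactly one field was successfully converted */
+}
+#endif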
+
+void
+__kmp_runtime_initialize( void )
+{
+    int status;
+    pthread_mutexattr_t mutex_attr;
+    pthread_condattr_t  cond_attr;
+
+    if ( __kmp_init_runtime ) {
+        return;
+    }; // if
+
+    #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
+        if ( ! __kmp_cpuinfo.initialized ) {
+            __kmp_query_cpuid( &__kmp_cpuinfo );
+        }; // if
+    #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+    __kmp_xproc = __kmp_get_xproc();
+
+    if ( sysconf( _SC_THREADS ) ) {
+
+        /* Query the maximum number of threads */
+        __kmp_sys_max_nth = sysconf( _SC_THREAD_THREADS_MAX );
+        if ( __kmp_sys_max_nth == -1 ) {
+            /* Unlimited threads for NPTL */
+            __kmp_sys_max_nth = INT_MAX;
+        }
+        else if ( __kmp_sys_max_nth <= 1 ) {
+            /* Can't tell, just use PTHREAD_THREADS_MAX */
+            __kmp_sys_max_nth = KMP_MAX_NTH;
+        }
+
+        /* Query the minimum stack size */
+        __kmp_sys_min_stksize = sysconf( _SC_THREAD_STACK_MIN );
+        if ( __kmp_sys_min_stksize <= 1 ) {
+            __kmp_sys_min_stksize = KMP_MIN_STKSIZE;
+        }
+    }
+
+    /* Set up minimum number of threads to switch to TLS gtid */
+    __kmp_tls_gtid_min = KMP_TLS_GTID_MIN;
+
+
+    #ifdef BUILD_TV
+        {
+            int rc = pthread_key_create( & __kmp_tv_key, 0 );
+            KMP_CHECK_SYSFAIL( "pthread_key_create", rc );
+        }
+    #endif
+
+    status = pthread_key_create( &__kmp_gtid_threadprivate_key, __kmp_internal_end_dest );
+    KMP_CHECK_SYSFAIL( "pthread_key_create", status );
+    status = pthread_mutexattr_init( & mutex_attr );
+    KMP_CHECK_SYSFAIL( "pthread_mutexattr_init", status );
+    status = pthread_mutex_init( & __kmp_wait_mx.m_mutex, & mutex_attr );
+    KMP_CHECK_SYSFAIL( "pthread_mutex_init", status );
+    status = pthread_condattr_init( & cond_attr );
+    KMP_CHECK_SYSFAIL( "pthread_condattr_init", status );
+    status = pthread_cond_init( & __kmp_wait_cv.c_cond, & cond_attr );
+    KMP_CHECK_SYSFAIL( "pthread_cond_init", status );
+#if USE_ITT_BUILD
+    __kmp_itt_initialize();
+#endif /* USE_ITT_BUILD */
+
+    __kmp_init_runtime = TRUE;
+}
+
+void
+__kmp_runtime_destroy( void )
+{
+    int status;
+
+    if ( ! __kmp_init_runtime ) {
+        return; // Nothing to do.
+    };
+
+#if USE_ITT_BUILD
+    __kmp_itt_destroy();
+#endif /* USE_ITT_BUILD */
+
+    status = pthread_key_delete( __kmp_gtid_threadprivate_key );
+    KMP_CHECK_SYSFAIL( "pthread_key_delete", status );
+    #ifdef BUILD_TV
+        status = pthread_key_delete( __kmp_tv_key );
+        KMP_CHECK_SYSFAIL( "pthread_key_delete", status );
+    #endif
+
+    status = pthread_mutex_destroy( & __kmp_wait_mx.m_mutex );
+    if ( status != 0 && status != EBUSY ) {
+        KMP_SYSFAIL( "pthread_mutex_destroy", status );
+    }
+    status = pthread_cond_destroy( & __kmp_wait_cv.c_cond );
+    if ( status != 0 && status != EBUSY ) {
+        KMP_SYSFAIL( "pthread_cond_destroy", status );
+    }
+    #if KMP_AFFINITY_SUPPORTED
+        __kmp_affinity_uninitialize();
+    #endif
+
+    __kmp_init_runtime = FALSE;
+}
+
+
+/* Put the thread to sleep for a time period */
+/* NOTE: not currently used anywhere */
+void
+__kmp_thread_sleep( int millis )
+{
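+    /* Round to the nearest second: e.g. 1600 ms sleeps 2 s, 400 ms sleeps 0 s. */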
+    sleep(  ( millis + 500 ) / 1000 );
+}
+
+/* Calculate the elapsed wall clock time for the user */
+void
+__kmp_elapsed( double *t )
+{
+    int status;
+# ifdef FIX_SGI_CLOCK
+    struct timespec ts;
+
+    status = clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts );
+    KMP_CHECK_SYSFAIL_ERRNO( "clock_gettime", status );
+    *t = (double) ts.tv_nsec * (1.0 / (double) KMP_NSEC_PER_SEC) +
+        (double) ts.tv_sec;
+# else
+    struct timeval tv;
+
+    status = gettimeofday( & tv, NULL );
+    KMP_CHECK_SYSFAIL_ERRNO( "gettimeofday", status );
+    *t = (double) tv.tv_usec * (1.0 / (double) KMP_USEC_PER_SEC) +
+        (double) tv.tv_sec;
+# endif
+}
+
+/* Calculate the elapsed wall clock tick for the user */
+void
+__kmp_elapsed_tick( double *t )
+{
+    *t = 1 / (double) CLOCKS_PER_SEC;
+}
+
+/*
+    Determine whether the given address is mapped into the current address space.
+*/
+
+int
+__kmp_is_address_mapped( void * addr ) {
+
+    int found = 0;
+    int rc;
+
+    #if KMP_OS_LINUX
+
+        /*
+            On Linux* OS, read the /proc/<pid>/maps pseudo-file to get all the address ranges mapped
+            into the address space.
+        */
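+        /*
+            A typical maps line looks like (illustrative values only):
+
+                00400000-0040b000 r-xp 00000000 08:01 131  /bin/cat
+
+            The loop below extracts the two boundary addresses and the 4-character
+            permission string from each such line.
+        */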
+
+        char * name = __kmp_str_format( "/proc/%d/maps", getpid() );
+        FILE * file  = NULL;
+
+        file = fopen( name, "r" );
+        KMP_ASSERT( file != NULL );
+
+        for ( ; ; ) {
+
+            void * beginning = NULL;
+            void * ending    = NULL;
+            char   perms[ 5 ];
+
+            rc = fscanf( file, "%p-%p %4s %*[^\n]\n", & beginning, & ending, perms );
+            if ( rc == EOF ) {
+                break;
+            }; // if
+            KMP_ASSERT( rc == 3 && KMP_STRLEN( perms ) == 4 ); // Make sure all fields are read.
+
+            // Ending address is not included in the region, but beginning is.
+            if ( ( addr >= beginning ) && ( addr < ending ) ) {
+                perms[ 2 ] = 0;    // 3rd and 4th characters do not matter.
+                if ( strcmp( perms, "rw" ) == 0 ) {
+                    // Memory we are looking for should be readable and writable.
+                    found = 1;
+                }; // if
+                break;
+            }; // if
+
+        }; // forever
+
+        // Free resources.
+        fclose( file );
+        KMP_INTERNAL_FREE( name );
+
+    #elif KMP_OS_DARWIN
+
+        /*
+            On OS X*, the /proc pseudo filesystem is not available. Try to read memory using the
+            vm interface.
+        */
+
+        int       buffer;
+        vm_size_t count;
+        rc =
+            vm_read_overwrite(
+                mach_task_self(),           // Task to read memory of.
+                (vm_address_t)( addr ),     // Address to read from.
+                1,                          // Number of bytes to be read.
+                (vm_address_t)( & buffer ), // Address of buffer to save read bytes in.
+                & count                     // Address of var to save number of read bytes in.
+            );
+        if ( rc == 0 ) {
+            // Memory successfully read.
+            found = 1;
+        }; // if
+
+    #elif KMP_OS_FREEBSD
+
+        // FIXME(FreeBSD*): Implement this
+        found = 1;
+
+    #else
+
+        #error "Unknown or unsupported OS"
+
+    #endif
+
+    return found;
+
+} // __kmp_is_address_mapped
+
+#ifdef USE_LOAD_BALANCE
+
+
+# if KMP_OS_DARWIN
+
+// The function returns the rounded value of the system load average
+// over a given time interval, which depends on the value of the
+// __kmp_load_balance_interval variable (default is 60 sec; other values
+// may be 300 sec or 900 sec).
+// It returns -1 in case of error.
+int
+__kmp_get_load_balance( int max )
+{
+    double averages[3];
+    int ret_avg = 0;
+
+    int res = getloadavg( averages, 3 );
+
+    // Check __kmp_load_balance_interval to determine which of the averages to use.
+    // getloadavg() may return fewer samples than requested, i.e. fewer
+    // than 3.
+    if ( __kmp_load_balance_interval < 180 && ( res >= 1 ) ) {
+        ret_avg = averages[0];// 1 min
+    } else if ( ( __kmp_load_balance_interval >= 180
+                  && __kmp_load_balance_interval < 600 ) && ( res >= 2 ) ) {
+        ret_avg = averages[1];// 5 min
+    } else if ( ( __kmp_load_balance_interval >= 600 ) && ( res == 3 ) ) {
+        ret_avg = averages[2];// 15 min
+    } else {// Error occurred
+        return -1;
+    }
+
+    return ret_avg;
+}
+
+# else // Linux* OS
+
+// The function returns the number of running (not sleeping) threads, or -1 in case of error.
+// An error can be reported if the Linux* OS kernel is too old (without "/proc" support).
+// Counting running threads stops once max running threads have been encountered.
+int
+__kmp_get_load_balance( int max )
+{
+    static int permanent_error = 0;
+
+    static int     glb_running_threads          = 0;  /* Saved count of the running threads for the thread balance algorithm */
+    static double  glb_call_time = 0;  /* Thread balance algorithm call time */
+
+    int running_threads = 0;              // Number of running threads in the system.
+
+    DIR  *          proc_dir   = NULL;    // Handle of "/proc/" directory.
+    struct dirent * proc_entry = NULL;
+
+    kmp_str_buf_t   task_path;            // "/proc/<pid>/task/<tid>/" path.
+    DIR  *          task_dir   = NULL;    // Handle of "/proc/<pid>/task/<tid>/" directory.
+    struct dirent * task_entry = NULL;
+    int             task_path_fixed_len;
+
+    kmp_str_buf_t   stat_path;            // "/proc/<pid>/task/<tid>/stat" path.
+    int             stat_file = -1;
+    int             stat_path_fixed_len;
+
+    int total_processes = 0;              // Total number of processes in system.
+    int total_threads   = 0;              // Total number of threads in system.
+
+    double call_time = 0.0;
+
+    __kmp_str_buf_init( & task_path );
+    __kmp_str_buf_init( & stat_path );
+
+     __kmp_elapsed( & call_time );
+
+    if ( glb_call_time &&
+            ( call_time - glb_call_time < __kmp_load_balance_interval ) ) {
+        running_threads = glb_running_threads;
+        goto finish;
+    }
+
+    glb_call_time = call_time;
+
+    // Do not spend time on scanning "/proc/" if we have a permanent error.
+    if ( permanent_error ) {
+        running_threads = -1;
+        goto finish;
+    }; // if
+
+    if ( max <= 0 ) {
+        max = INT_MAX;
+    }; // if
+
+    // Open "/proc/" directory.
+    proc_dir = opendir( "/proc" );
+    if ( proc_dir == NULL ) {
+        // Cannot open "/proc/". Probably the kernel does not support it. Return an error now and
+        // in subsequent calls.
+        running_threads = -1;
+        permanent_error = 1;
+        goto finish;
+    }; // if
+
+    // Initialize fixed part of task_path. This part will not change.
+    __kmp_str_buf_cat( & task_path, "/proc/", 6 );
+    task_path_fixed_len = task_path.used;    // Remember number of used characters.
+
+    proc_entry = readdir( proc_dir );
+    while ( proc_entry != NULL ) {
+        // The proc entry is a directory whose name starts with a digit. Assume it is a
+        // process' directory.
+        if ( proc_entry->d_type == DT_DIR && isdigit( proc_entry->d_name[ 0 ] ) ) {
+
+            ++ total_processes;
+            // Make sure the init process is the very first entry in "/proc", so we can replace
+            // strcmp( proc_entry->d_name, "1" ) == 0 with the simpler total_processes == 1.
+            // We are going to check that total_processes == 1 => d_name == "1" holds (where
+            // "=>" is implication). Since C++ has no "=>" operator, we use its equivalent:
+            // a => b  ==  ! a || b.
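+            // E.g. total_processes == 5 passes regardless of d_name, while
+            // total_processes == 1 passes only if d_name is "1".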
+            KMP_DEBUG_ASSERT( total_processes != 1 || strcmp( proc_entry->d_name, "1" ) == 0 );
+
+            // Construct task_path.
+            task_path.used = task_path_fixed_len;    // Reset task_path to "/proc/".
+            __kmp_str_buf_cat( & task_path, proc_entry->d_name, KMP_STRLEN( proc_entry->d_name ) );
+            __kmp_str_buf_cat( & task_path, "/task", 5 );
+
+            task_dir = opendir( task_path.str );
+            if ( task_dir == NULL ) {
+                // A process can finish between reading the "/proc/" directory entry and opening its
+                // "task/" directory. So, in the general case, we should not complain, but just skip
+                // this process and read the next one.
+                // But on systems with no "task/" support we would spend a lot of time scanning the
+                // "/proc/" tree again and again without any benefit. The "init" process (pid 1)
+                // should always exist, so if we cannot open "/proc/1/task/", it means "task/"
+                // is not supported by the kernel. Report an error now and in the future.
+                if ( strcmp( proc_entry->d_name, "1" ) == 0 ) {
+                    running_threads = -1;
+                    permanent_error = 1;
+                    goto finish;
+                }; // if
+            } else {
+                 // Construct fixed part of stat file path.
+                __kmp_str_buf_clear( & stat_path );
+                __kmp_str_buf_cat( & stat_path, task_path.str, task_path.used );
+                __kmp_str_buf_cat( & stat_path, "/", 1 );
+                stat_path_fixed_len = stat_path.used;
+
+                task_entry = readdir( task_dir );
+                while ( task_entry != NULL ) {
+                    // The task entry is a directory whose name starts with a digit.
+                    if ( task_entry->d_type == DT_DIR && isdigit( task_entry->d_name[ 0 ] ) ) {
+
+                        ++ total_threads;
+
+                        // Construct the complete stat file path. The easiest way would be:
+                        //  __kmp_str_buf_print( & stat_path, "%s/%s/stat", task_path.str, task_entry->d_name );
+                        // but a series of __kmp_str_buf_cat calls works a bit faster.
+                        stat_path.used = stat_path_fixed_len;    // Reset stat path to its fixed part.
+                        __kmp_str_buf_cat( & stat_path, task_entry->d_name, KMP_STRLEN( task_entry->d_name ) );
+                        __kmp_str_buf_cat( & stat_path, "/stat", 5 );
+
+                        // Note: Low-level API (open/read/close) is used. High-level API
+                        // (fopen/fclose)  works ~ 30 % slower.
+                        stat_file = open( stat_path.str, O_RDONLY );
+                        if ( stat_file == -1 ) {
+                            // We cannot report an error because task (thread) can terminate just
+                            // before reading this file.
+                        } else {
+                            /*
+                                Content of "stat" file looks like:
+
+                                    24285 (program) S ...
+
+                                It is a single line (if the program name does not include funny
+                                symbols). The first number is a thread id, then the name of the
+                                executable file in parentheses, then the state of the thread. We
+                                need just the thread state.
+
+                                Good news: Length of program name is 15 characters max. Longer
+                                names are truncated.
+
+                                Thus, we need a rather short buffer: 15 chars for the program name +
+                                2 parentheses + 3 spaces + ~7 digits of pid = 37.
+
+                                Bad news: The program name may contain special symbols like space,
+                                closing parenthesis, or even new line. This makes parsing the "stat"
+                                file not 100 % reliable. In case of funny program names, parsing
+                                may fail (and report an incorrect thread state).
+
+                                Parsing the "status" file looks more promising (due to its different
+                                file structure and escaping of special symbols), but reading and
+                                parsing the "status" file works slower.
+
+                                -- ln
+                            */
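+                            /*
+                                Illustrative failure case (hypothetical program name): a sleeping
+                                thread of a program named "a) R x" yields a line like
+                                    24285 (a) R x) S ...
+                                and searching for the first ") " would misreport its state as 'R'.
+                            */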
+                            char buffer[ 65 ];
+                            int len;
+                            len = read( stat_file, buffer, sizeof( buffer ) - 1 );
+                            if ( len >= 0 ) {
+                                buffer[ len ] = 0;
+                                // Using scanf:
+                                //     sscanf( buffer, "%*d (%*s) %c ", & state );
+                                // looks very nice, but searching for a closing parenthesis works a
+                                // bit faster.
+                                char * close_parent = strstr( buffer, ") " );
+                                if ( close_parent != NULL ) {
+                                    char state = * ( close_parent + 2 );
+                                    if ( state == 'R' ) {
+                                        ++ running_threads;
+                                        if ( running_threads >= max ) {
+                                            goto finish;
+                                        }; // if
+                                    }; // if
+                                }; // if
+                            }; // if
+                            close( stat_file );
+                            stat_file = -1;
+                        }; // if
+                    }; // if
+                    task_entry = readdir( task_dir );
+                }; // while
+                closedir( task_dir );
+                task_dir = NULL;
+            }; // if
+        }; // if
+        proc_entry = readdir( proc_dir );
+    }; // while
+
+    //
+    // There _might_ be a timing hole where the thread executing this
+    // code gets skipped in the load balance, and running_threads is 0.
+    // Assert in the debug builds only!!!
+    //
+    KMP_DEBUG_ASSERT( running_threads > 0 );
+    if ( running_threads <= 0 ) {
+        running_threads = 1;
+    }
+
+    finish: // Clean up and exit.
+        if ( proc_dir != NULL ) {
+            closedir( proc_dir );
+        }; // if
+        __kmp_str_buf_free( & task_path );
+        if ( task_dir != NULL ) {
+            closedir( task_dir );
+        }; // if
+        __kmp_str_buf_free( & stat_path );
+        if ( stat_file != -1 ) {
+            close( stat_file );
+        }; // if
+
+    glb_running_threads = running_threads;
+
+    return running_threads;
+
+} // __kmp_get_load_balance
+
+# endif // KMP_OS_DARWIN
+
+#endif // USE_LOAD_BALANCE
+
+
+#if KMP_COMPILER_GCC && !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64)
+
+int __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid, int argc,
+        void *p_argv[] 
+#if OMPT_SUPPORT
+        , void **exit_frame_ptr
+#endif
+)
+{
+    int argc_full = argc + 2;
+    int i;
+    ffi_cif cif;
+    ffi_type *types[argc_full];
+    void *args[argc_full];
+    void *idp[2];
+
+#if OMPT_SUPPORT
+    *exit_frame_ptr = __builtin_frame_address(0);
+#endif
+    /* We're only passing pointers to the target. */
+    for (i = 0; i < argc_full; i++)
+        types[i] = &ffi_type_pointer;
+
+    /* Ugly double-indirection, but that's how it goes... */
+    idp[0] = &gtid;
+    idp[1] = &tid;
+    args[0] = &idp[0];
+    args[1] = &idp[1];
+
+    for (i = 0; i < argc; i++)
+        args[2 + i] = &p_argv[i];
+
+    if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, argc_full,
+                &ffi_type_void, types) != FFI_OK)
+        abort();
+
+    ffi_call(&cif, (void (*)(void))pkfn, NULL, args);
+
+#if OMPT_SUPPORT
+    *exit_frame_ptr = 0;
+#endif
+
+    return 1;
+}
+
+#endif // KMP_COMPILER_GCC && !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64)
+
+#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64
+
+// We really only need the case with 1 argument, because clang always builds
+// a struct of pointers to the shared variables referenced in the outlined function.
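+// For illustration only (hypothetical types and names, not part of this file):
+// for a parallel region sharing variables 'a' and 'b', the compiler conceptually
+// builds
+//     struct shareds { int *a; double *b; } s = { &a, &b };
+// and invokes the microtask with the single argument &s, hence argc == 1.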
+int
+__kmp_invoke_microtask( microtask_t pkfn,
+                        int gtid, int tid,
+                        int argc, void *p_argv[] 
+#if OMPT_SUPPORT
+                        , void **exit_frame_ptr
+#endif
+) 
+{
+#if OMPT_SUPPORT
+  *exit_frame_ptr = __builtin_frame_address(0);
+#endif
+
+  switch (argc) {
+  default:
+    fprintf(stderr, "Too many args to microtask: %d!\n", argc);
+    fflush(stderr);
+    exit(-1);
+  case 0:
+    (*pkfn)(&gtid, &tid);
+    break;
+  case 1:
+    (*pkfn)(&gtid, &tid, p_argv[0]);
+    break;
+  case 2:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
+    break;
+  case 3:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
+    break;
+  case 4:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3]);
+    break;
+  case 5:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4]);
+    break;
+  case 6:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+            p_argv[5]);
+    break;
+  case 7:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+            p_argv[5], p_argv[6]);
+    break;
+  case 8:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+            p_argv[5], p_argv[6], p_argv[7]);
+    break;
+  case 9:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+            p_argv[5], p_argv[6], p_argv[7], p_argv[8]);
+    break;
+  case 10:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9]);
+    break;
+  case 11:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
+    break;
+  case 12:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+            p_argv[11]);
+    break;
+  case 13:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+            p_argv[11], p_argv[12]);
+    break;
+  case 14:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+            p_argv[11], p_argv[12], p_argv[13]);
+    break;
+  case 15:
+    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+            p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
+    break;
+  }
+
+#if OMPT_SUPPORT
+  *exit_frame_ptr = 0;
+#endif
+
+  return 1;
+}
+
+#endif
+
+// end of file //
+
diff --git a/final/runtime/src/z_Windows_NT-586_asm.asm b/final/runtime/src/z_Windows_NT-586_asm.asm
new file mode 100644
index 0000000..a4f9a38
--- /dev/null
+++ b/final/runtime/src/z_Windows_NT-586_asm.asm
@@ -0,0 +1,1402 @@
+;  z_Windows_NT-586_asm.asm:  - microtasking routines specifically
+;    written for IA-32 architecture and Intel(R) 64 running Windows* OS
+
+;
+;//===----------------------------------------------------------------------===//
+;//
+;//                     The LLVM Compiler Infrastructure
+;//
+;// This file is dual licensed under the MIT and the University of Illinois Open
+;// Source Licenses. See LICENSE.txt for details.
+;//
+;//===----------------------------------------------------------------------===//
+;
+
+        TITLE   z_Windows_NT-586_asm.asm
+
+; ============================= IA-32 architecture ==========================
+ifdef _M_IA32
+
+        .586P
+
+if @Version gt 510
+        .model HUGE
+else
+_TEXT   SEGMENT PARA USE32 PUBLIC 'CODE'
+_TEXT   ENDS
+_DATA   SEGMENT DWORD USE32 PUBLIC 'DATA'
+_DATA   ENDS
+CONST   SEGMENT DWORD USE32 PUBLIC 'CONST'
+CONST   ENDS
+_BSS    SEGMENT DWORD USE32 PUBLIC 'BSS'
+_BSS    ENDS
+$$SYMBOLS       SEGMENT BYTE USE32 'DEBSYM'
+$$SYMBOLS       ENDS
+$$TYPES SEGMENT BYTE USE32 'DEBTYP'
+$$TYPES ENDS
+_TLS    SEGMENT DWORD USE32 PUBLIC 'TLS'
+_TLS    ENDS
+FLAT    GROUP _DATA, CONST, _BSS
+        ASSUME  CS: FLAT, DS: FLAT, SS: FLAT
+endif
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_x86_pause
+;
+; void
+; __kmp_x86_pause( void )
+;
+
+PUBLIC  ___kmp_x86_pause
+_p$ = 4
+_d$ = 8
+_TEXT   SEGMENT
+        ALIGN 16
+___kmp_x86_pause PROC NEAR
+
+        db      0f3H
+        db      090H    ;; pause
+        ret
+
+___kmp_x86_pause ENDP
+_TEXT   ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_x86_cpuid
+;
+; void
+; __kmp_x86_cpuid( int mode, int mode2, struct kmp_cpuid *p );
+;
+
+PUBLIC  ___kmp_x86_cpuid
+_TEXT   SEGMENT
+        ALIGN 16
+_mode$  = 8
+_mode2$ = 12
+_p$     = 16
+_eax$   = 0
+_ebx$   = 4
+_ecx$   = 8
+_edx$   = 12
+
+___kmp_x86_cpuid PROC NEAR
+
+        push      ebp
+        mov       ebp, esp
+
+        push      edi
+        push      ebx
+        push      ecx
+        push      edx
+
+        mov	  eax, DWORD PTR _mode$[ebp]
+        mov	  ecx, DWORD PTR _mode2$[ebp]
+	cpuid					; Query the CPUID for the current processor
+
+        mov       edi, DWORD PTR _p$[ebp]
+	mov 	  DWORD PTR _eax$[ edi ], eax
+	mov 	  DWORD PTR _ebx$[ edi ], ebx
+	mov 	  DWORD PTR _ecx$[ edi ], ecx
+	mov 	  DWORD PTR _edx$[ edi ], edx
+
+        pop       edx
+        pop       ecx
+        pop       ebx
+        pop       edi
+
+        mov       esp, ebp
+        pop       ebp
+        ret
+
+___kmp_x86_cpuid ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_test_then_add32
+;
+; kmp_int32
+; __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
+;
+
+PUBLIC  ___kmp_test_then_add32
+_p$ = 4
+_d$ = 8
+_TEXT   SEGMENT
+        ALIGN 16
+___kmp_test_then_add32 PROC NEAR
+
+        mov     eax, DWORD PTR _d$[esp]
+        mov     ecx, DWORD PTR _p$[esp]
+lock    xadd    DWORD PTR [ecx], eax
+        ret
+
+___kmp_test_then_add32 ENDP
+_TEXT   ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_compare_and_store8
+;
+; kmp_int8
+; __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+;
+
+PUBLIC  ___kmp_compare_and_store8
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store8 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       al, BYTE PTR _cv$[esp]
+        mov       dl, BYTE PTR _sv$[esp]
+lock    cmpxchg   BYTE PTR [ecx], dl
+        sete      al           ; if al == [ecx] set al = 1 else set al = 0
+        and       eax, 1       ; zero-extend al into eax
+        ret
+
+___kmp_compare_and_store8 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_compare_and_store16
+;
+; kmp_int16
+; __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+;
+
+PUBLIC  ___kmp_compare_and_store16
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store16 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       ax, WORD PTR _cv$[esp]
+        mov       dx, WORD PTR _sv$[esp]
+lock    cmpxchg   WORD PTR [ecx], dx
+        sete      al           ; if ax == [ecx] set al = 1 else set al = 0
+        and       eax, 1       ; zero-extend al into eax
+        ret
+
+___kmp_compare_and_store16 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_compare_and_store32
+;
+; kmp_int32
+; __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+;
+
+PUBLIC  ___kmp_compare_and_store32
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store32 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       eax, DWORD PTR _cv$[esp]
+        mov       edx, DWORD PTR _sv$[esp]
+lock    cmpxchg   DWORD PTR [ecx], edx
+        sete      al           ; if eax == [ecx] set al = 1 else set al = 0
+        and       eax, 1       ; zero-extend al into eax
+        ret
+
+___kmp_compare_and_store32 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_compare_and_store64
+;
+; kmp_int32
+; __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+;
+
+PUBLIC  ___kmp_compare_and_store64
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 8
+_cv_low$ = 12
+_cv_high$ = 16
+_sv_low$ = 20
+_sv_high$ = 24
+
+___kmp_compare_and_store64 PROC NEAR
+
+        push      ebp
+        mov       ebp, esp
+        push      ebx
+        push      edi
+        mov       edi, DWORD PTR _p$[ebp]
+        mov       eax, DWORD PTR _cv_low$[ebp]
+        mov       edx, DWORD PTR _cv_high$[ebp]
+        mov       ebx, DWORD PTR _sv_low$[ebp]
+        mov       ecx, DWORD PTR _sv_high$[ebp]
+lock    cmpxchg8b QWORD PTR [edi]
+        sete      al           ; if edx:eax == [edi] set al = 1 else set al = 0
+        and       eax, 1       ; zero-extend al into eax
+        pop       edi
+        pop       ebx
+        mov       esp, ebp
+        pop       ebp
+        ret
+
+___kmp_compare_and_store64 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_xchg_fixed8
+;
+; kmp_int8
+; __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
+;
+
+PUBLIC  ___kmp_xchg_fixed8
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_d$ = 8
+
+___kmp_xchg_fixed8 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       al,  BYTE PTR _d$[esp]
+lock    xchg      BYTE PTR [ecx], al
+        ret
+
+___kmp_xchg_fixed8 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_xchg_fixed16
+;
+; kmp_int16
+; __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
+;
+
+PUBLIC  ___kmp_xchg_fixed16
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_d$ = 8
+
+___kmp_xchg_fixed16 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       ax,  WORD PTR  _d$[esp]
+lock    xchg      WORD PTR [ecx], ax
+        ret
+
+___kmp_xchg_fixed16 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_xchg_fixed32
+;
+; kmp_int32
+; __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
+;
+
+PUBLIC  ___kmp_xchg_fixed32
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_d$ = 8
+
+___kmp_xchg_fixed32 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       eax, DWORD PTR _d$[esp]
+lock    xchg      DWORD PTR [ecx], eax
+        ret
+
+___kmp_xchg_fixed32 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_xchg_real32
+;
+; kmp_real32
+; __kmp_xchg_real32( volatile kmp_real32 *p, kmp_real32 d );
+;
+
+PUBLIC  ___kmp_xchg_real32
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 8
+_d$ = 12
+_old_value$ = -4
+
+___kmp_xchg_real32 PROC NEAR
+
+        push    ebp
+        mov     ebp, esp
+        sub     esp, 4
+        push    esi
+        mov     esi, DWORD PTR _p$[ebp]
+
+        fld     DWORD PTR [esi]
+                        ;; load <addr>
+        fst     DWORD PTR _old_value$[ebp]
+                        ;; store into old_value
+
+        mov     eax, DWORD PTR _d$[ebp]
+
+lock    xchg    DWORD PTR [esi], eax
+
+        fld     DWORD PTR _old_value$[ebp]
+                        ;; return old_value
+        pop     esi
+        mov     esp, ebp
+        pop     ebp
+        ret
+
+___kmp_xchg_real32 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_compare_and_store_ret8
+;
+; kmp_int8
+; __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+;
+
+PUBLIC  ___kmp_compare_and_store_ret8
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store_ret8 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       al, BYTE PTR _cv$[esp]
+        mov       dl, BYTE PTR _sv$[esp]
+lock    cmpxchg   BYTE PTR [ecx], dl
+        ret
+
+___kmp_compare_and_store_ret8 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_compare_and_store_ret16
+;
+; kmp_int16
+; __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+;
+
+PUBLIC  ___kmp_compare_and_store_ret16
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store_ret16 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       ax, WORD PTR _cv$[esp]
+        mov       dx, WORD PTR _sv$[esp]
+lock    cmpxchg   WORD PTR [ecx], dx
+        ret
+
+___kmp_compare_and_store_ret16 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_compare_and_store_ret32
+;
+; kmp_int32
+; __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+;
+
+PUBLIC  ___kmp_compare_and_store_ret32
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store_ret32 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       eax, DWORD PTR _cv$[esp]
+        mov       edx, DWORD PTR _sv$[esp]
+lock    cmpxchg   DWORD PTR [ecx], edx
+        ret
+
+___kmp_compare_and_store_ret32 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_compare_and_store_ret64
+;
+; kmp_int64
+; __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+;
+
+PUBLIC  ___kmp_compare_and_store_ret64
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 8
+_cv_low$ = 12
+_cv_high$ = 16
+_sv_low$ = 20
+_sv_high$ = 24
+
+___kmp_compare_and_store_ret64 PROC NEAR
+
+        push      ebp
+        mov       ebp, esp
+        push      ebx
+        push      edi
+        mov       edi, DWORD PTR _p$[ebp]
+        mov       eax, DWORD PTR _cv_low$[ebp]
+        mov       edx, DWORD PTR _cv_high$[ebp]
+        mov       ebx, DWORD PTR _sv_low$[ebp]
+        mov       ecx, DWORD PTR _sv_high$[ebp]
+lock    cmpxchg8b QWORD PTR [edi]
+        pop       edi
+        pop       ebx
+        mov       esp, ebp
+        pop       ebp
+        ret
+
+___kmp_compare_and_store_ret64 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_load_x87_fpu_control_word
+;
+; void
+; __kmp_load_x87_fpu_control_word( kmp_int16 *p );
+;
+; parameters:
+;       p:      4(%esp)
+
+PUBLIC  ___kmp_load_x87_fpu_control_word
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+
+___kmp_load_x87_fpu_control_word PROC NEAR
+
+        mov       eax, DWORD PTR _p$[esp]
+        fldcw     WORD PTR [eax]
+        ret
+
+___kmp_load_x87_fpu_control_word ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_store_x87_fpu_control_word
+;
+; void
+; __kmp_store_x87_fpu_control_word( kmp_int16 *p );
+;
+; parameters:
+;       p:      4(%esp)
+
+PUBLIC  ___kmp_store_x87_fpu_control_word
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+
+___kmp_store_x87_fpu_control_word PROC NEAR
+
+        mov       eax, DWORD PTR _p$[esp]
+        fstcw     WORD PTR [eax]
+        ret
+
+___kmp_store_x87_fpu_control_word ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_clear_x87_fpu_status_word
+;
+; void
+; __kmp_clear_x87_fpu_status_word();
+;
+
+PUBLIC  ___kmp_clear_x87_fpu_status_word
+_TEXT   SEGMENT
+        ALIGN 16
+
+___kmp_clear_x87_fpu_status_word PROC NEAR
+
+        fnclex
+        ret
+
+___kmp_clear_x87_fpu_status_word ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_invoke_microtask
+;
+; typedef void  (*microtask_t)( int *gtid, int *tid, ... );
+;
+; int
+; __kmp_invoke_microtask( microtask_t pkfn,
+;                         int gtid, int tid,
+;                         int argc, void *p_argv[] )
+;
+
+PUBLIC  ___kmp_invoke_microtask
+_TEXT   SEGMENT
+        ALIGN 16
+_pkfn$ = 8
+_gtid$ = 12
+_tid$ = 16
+_argc$ = 20
+_argv$ = 24
+if OMPT_SUPPORT
+_exit_frame$ = 28
+endif
+_i$ = -8
+_stk_adj$ = -16
+_vptr$ = -12
+_qptr$ = -4
+
+___kmp_invoke_microtask PROC NEAR
+; Line 102
+        push    ebp
+        mov     ebp, esp
+        sub     esp, 16                                 ; 00000010H
+        push    ebx
+        push    esi
+        push    edi
+if OMPT_SUPPORT
+        mov     eax, DWORD PTR _exit_frame$[ebp]
+        mov     DWORD PTR [eax], ebp
+endif
+; Line 114
+        mov     eax, DWORD PTR _argc$[ebp]
+        mov     DWORD PTR _i$[ebp], eax
+
+;; ------------------------------------------------------------
+	lea     edx, DWORD PTR [eax*4+8]
+	mov     ecx, esp                                ; Save current SP into ECX
+	mov	eax,edx		; Save the size of the args in eax
+	sub	ecx,edx		; esp-((#args+2)*4) -> ecx -- without mods, stack ptr would be this
+	mov	edx,ecx		; Save to edx
+	and	ecx,-128	; Mask off low 7 bits (align to 128 bytes)
+	sub	edx,ecx		; Amount to subtract from esp
+	sub	esp,edx		; Prepare stack ptr-- Now it will be aligned on 128-byte boundary at the call
+
+	add	edx,eax		; Calculate total size of the stack decrement.
+        mov     DWORD PTR _stk_adj$[ebp], edx
+;; ------------------------------------------------------------
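+;; Worked example (illustrative values only): with #args = 3 and esp = 0012FF70H on
+;; entry, edx = 3*4+8 = 20, ecx = esp-20 = 0012FF5CH, ecx AND -128 = 0012FF00H, so
+;; edx = 5CH is subtracted from esp. After the 20 bytes of pushes below, the call
+;; site sees esp = 0012FF00H (128-byte aligned), and _stk_adj$ = 70H (5CH + 14H)
+;; restores esp after the call.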
+
+        jmp     SHORT $L22237
+$L22238:
+        mov     ecx, DWORD PTR _i$[ebp]
+        sub     ecx, 1
+        mov     DWORD PTR _i$[ebp], ecx
+$L22237:
+        cmp     DWORD PTR _i$[ebp], 0
+        jle     SHORT $L22239
+; Line 116
+        mov     edx, DWORD PTR _i$[ebp]
+        mov     eax, DWORD PTR _argv$[ebp]
+        mov     ecx, DWORD PTR [eax+edx*4-4]
+        mov     DWORD PTR _vptr$[ebp], ecx
+; Line 123
+        mov     eax, DWORD PTR _vptr$[ebp]
+; Line 124
+        push    eax
+; Line 127
+        jmp     SHORT $L22238
+$L22239:
+; Line 129
+        lea     edx, DWORD PTR _tid$[ebp]
+        mov     DWORD PTR _vptr$[ebp], edx
+; Line 130
+        lea     eax, DWORD PTR _gtid$[ebp]
+        mov     DWORD PTR _qptr$[ebp], eax
+; Line 143
+        mov     eax, DWORD PTR _vptr$[ebp]
+; Line 144
+        push    eax
+; Line 145
+        mov     eax, DWORD PTR _qptr$[ebp]
+; Line 146
+        push    eax
+; Line 147
+        call    DWORD PTR _pkfn$[ebp]
+; Line 148
+        add     esp, DWORD PTR _stk_adj$[ebp]
+; Line 152
+        mov     eax, 1
+; Line 153
+        pop     edi
+        pop     esi
+        pop     ebx
+        mov     esp, ebp
+        pop     ebp
+        ret     0
+___kmp_invoke_microtask ENDP
+_TEXT   ENDS
+
+endif
+
+; ==================================== Intel(R) 64 ===================================
+
+ifdef _M_AMD64
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_x86_cpuid
+;
+; void
+; __kmp_x86_cpuid( int mode, int mode2, struct kmp_cpuid *p );
+;
+; parameters:
+;	mode:		ecx
+;	mode2:		edx
+;	cpuid_buffer: 	r8
+
+PUBLIC  __kmp_x86_cpuid
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_x86_cpuid PROC FRAME ;NEAR
+
+        push      rbp
+        .pushreg  rbp
+        mov       rbp, rsp
+        .setframe rbp, 0
+        push      rbx				; callee-save register
+        .pushreg  rbx
+        .ENDPROLOG
+
+	mov	  r10, r8                       ; p parameter
+        mov	  eax, ecx			; mode parameter
+        mov	  ecx, edx                      ; mode2 parameter
+	cpuid					; Query the CPUID for the current processor
+
+	mov 	  DWORD PTR 0[ r10 ], eax	; store results into buffer
+	mov 	  DWORD PTR 4[ r10 ], ebx
+	mov 	  DWORD PTR 8[ r10 ], ecx
+	mov 	  DWORD PTR 12[ r10 ], edx
+
+        pop       rbx				; callee-save register
+        mov       rsp, rbp
+        pop       rbp
+        ret
+
+__kmp_x86_cpuid ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_test_then_add32
+;
+; kmp_int32
+; __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
+;
+; parameters:
+;	p:	rcx
+;	d:	edx
+;
+; return: 	eax
+
+PUBLIC  __kmp_test_then_add32
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_test_then_add32 PROC ;NEAR
+
+        mov     eax, edx
+lock    xadd    DWORD PTR [rcx], eax
+        ret
+
+__kmp_test_then_add32 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_test_then_add64
+;
+; kmp_int32
+; __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
+;
+; parameters:
+;	p:	rcx
+;	d:	rdx
+;
+; return: 	rax
+
+PUBLIC  __kmp_test_then_add64
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_test_then_add64 PROC ;NEAR
+
+        mov     rax, rdx
+lock    xadd    QWORD PTR [rcx], rax
+        ret
+
+__kmp_test_then_add64 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_compare_and_store8
+;
+; kmp_int8
+; __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+
+PUBLIC  __kmp_compare_and_store8
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store8 PROC ;NEAR
+
+        mov       al, dl	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   BYTE PTR [rcx], dl
+        sete      al           	; if al == [rcx] set al = 1 else set al = 0
+        and       rax, 1       	; zero-extend al into rax
+        ret
+
+__kmp_compare_and_store8 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_compare_and_store16
+;
+; kmp_int16
+; __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+
+PUBLIC  __kmp_compare_and_store16
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store16 PROC ;NEAR
+
+        mov       ax, dx	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   WORD PTR [rcx], dx
+        sete      al           	; if ax == [rcx] set al = 1 else set al = 0
+        and       rax, 1       	; zero-extend al into rax
+        ret
+
+__kmp_compare_and_store16 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_compare_and_store32
+;
+; kmp_int32
+; __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+
+PUBLIC  __kmp_compare_and_store32
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store32 PROC ;NEAR
+
+        mov       eax, edx	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   DWORD PTR [rcx], edx
+        sete      al           	; if eax == [rcx] set al = 1 else set al = 0
+        and       rax, 1       	; zero-extend al into rax
+        ret
+
+__kmp_compare_and_store32 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_compare_and_store64
+;
+; kmp_int32
+; __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+; parameters:
+;	p:	rcx
+;	cv:	rdx
+;	sv:	r8
+;
+; return:	eax
+
+PUBLIC  __kmp_compare_and_store64
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store64 PROC ;NEAR
+
+        mov       rax, rdx	; "cv"
+	mov	  rdx, r8	; "sv"
+lock    cmpxchg   QWORD PTR [rcx], rdx
+        sete      al           ; if rax == [rcx] set al = 1 else set al = 0
+        and       rax, 1       ; zero-extend al into rax
+        ret
+
+__kmp_compare_and_store64 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_xchg_fixed8
+;
+; kmp_int8
+; __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
+;
+; parameters:
+;	p:	rcx
+;	d:	dl
+;
+; return: 	al
+
+PUBLIC  __kmp_xchg_fixed8
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_xchg_fixed8 PROC ;NEAR
+
+        mov       al,  dl
+lock    xchg      BYTE PTR [rcx], al
+        ret
+
+__kmp_xchg_fixed8 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_xchg_fixed16
+;
+; kmp_int16
+; __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
+;
+; parameters:
+;	p:	rcx
+;	d:	dx
+;
+; return: 	ax
+
+PUBLIC  __kmp_xchg_fixed16
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_xchg_fixed16 PROC ;NEAR
+
+        mov       ax,  dx
+lock    xchg      WORD PTR [rcx], ax
+        ret
+
+__kmp_xchg_fixed16 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_xchg_fixed32
+;
+; kmp_int32
+; __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
+;
+; parameters:
+;	p:	rcx
+;	d:	edx
+;
+; return: 	eax
+
+PUBLIC  __kmp_xchg_fixed32
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_xchg_fixed32 PROC ;NEAR
+
+        mov     eax, edx
+lock    xchg    DWORD PTR [rcx], eax
+        ret
+
+__kmp_xchg_fixed32 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION ___kmp_xchg_fixed64
+;
+; kmp_int64
+; __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
+;
+; parameters:
+;	p:	rcx
+;	d:	rdx
+;
+; return: 	rax
+
+PUBLIC  __kmp_xchg_fixed64
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_xchg_fixed64 PROC ;NEAR
+
+        mov     rax, rdx
+lock    xchg    QWORD PTR [rcx], rax
+        ret
+
+__kmp_xchg_fixed64 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_compare_and_store_ret8
+;
+; kmp_int8
+; __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+
+PUBLIC  __kmp_compare_and_store_ret8
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store_ret8 PROC ;NEAR
+        mov       al, dl	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   BYTE PTR [rcx], dl
+                        ; Compare AL with [rcx].  If equal set
+                        ; ZF and exchange DL with [rcx].  Else, clear
+                        ; ZF and load [rcx] into AL.
+        ret
+
+__kmp_compare_and_store_ret8 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_compare_and_store_ret16
+;
+; kmp_int16
+; __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+
+PUBLIC  __kmp_compare_and_store_ret16
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store_ret16 PROC ;NEAR
+
+        mov       ax, dx	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   WORD PTR [rcx], dx
+        ret
+
+__kmp_compare_and_store_ret16 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_compare_and_store_ret32
+;
+; kmp_int32
+; __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+
+PUBLIC  __kmp_compare_and_store_ret32
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store_ret32 PROC ;NEAR
+
+        mov       eax, edx	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   DWORD PTR [rcx], edx
+        ret
+
+__kmp_compare_and_store_ret32 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_compare_and_store_ret64
+;
+; kmp_int64
+; __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+; parameters:
+;	p:	rcx
+;	cv:	rdx
+;	sv:	r8
+;
+; return:	rax
+
+PUBLIC  __kmp_compare_and_store_ret64
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store_ret64 PROC ;NEAR
+
+        mov       rax, rdx	; "cv"
+	mov	  rdx, r8	; "sv"
+lock    cmpxchg   QWORD PTR [rcx], rdx
+        ret
+
+__kmp_compare_and_store_ret64 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_compare_and_store_loop8
+;
+; kmp_int8
+; __kmp_compare_and_store_loop8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	al
+
+PUBLIC  __kmp_compare_and_store_loop8
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store_loop8 PROC ;NEAR
+$__kmp_loop:
+        mov       al, dl	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   BYTE PTR [rcx], dl
+                        ; Compare AL with [rcx].  If equal set
+                        ; ZF and exchange DL with [rcx].  Else, clear
+                        ; ZF and load [rcx] into AL.
+        jz     	SHORT $__kmp_success
+
+        db      0f3H
+        db      090H    		; pause
+
+	jmp	SHORT $__kmp_loop
+
+$__kmp_success:
+        ret
+
+__kmp_compare_and_store_loop8 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_xchg_real32
+;
+; kmp_real32
+; __kmp_xchg_real32( volatile kmp_real32 *p, kmp_real32 d );
+;
+; parameters:
+;	p:	rcx
+;       d:	xmm1 (lower 4 bytes)
+;
+; return:	xmm0 (lower 4 bytes)
+
+PUBLIC  __kmp_xchg_real32
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_xchg_real32 PROC ;NEAR
+
+	movd	eax, xmm1		; load d
+
+lock    xchg    DWORD PTR [rcx], eax
+
+	movd	xmm0, eax		; load old value into return register
+        ret
+
+__kmp_xchg_real32 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_xchg_real64
+;
+; kmp_real64
+; __kmp_xchg_real64( volatile kmp_real64 *p, kmp_real64 d );
+;
+; parameters:
+;	p:	rcx
+;	d:	xmm1 (lower 8 bytes)
+;
+; return:	xmm0 (lower 8 bytes)
+
+PUBLIC  __kmp_xchg_real64
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_xchg_real64 PROC ;NEAR
+
+	movd	rax, xmm1		; load "d"
+
+lock    xchg    QWORD PTR [rcx], rax
+
+	movd	xmm0, rax		; load old value into return register
+        ret
+
+__kmp_xchg_real64 ENDP
+_TEXT   ENDS
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_load_x87_fpu_control_word
+;
+; void
+; __kmp_load_x87_fpu_control_word( kmp_int16 *p );
+;
+; parameters:
+;	p:	rcx
+;
+
+PUBLIC  __kmp_load_x87_fpu_control_word
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_load_x87_fpu_control_word PROC ;NEAR
+
+        fldcw   WORD PTR [rcx]
+        ret
+
+__kmp_load_x87_fpu_control_word ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_store_x87_fpu_control_word
+;
+; void
+; __kmp_store_x87_fpu_control_word( kmp_int16 *p );
+;
+; parameters:
+;	p:	rcx
+;
+
+PUBLIC  __kmp_store_x87_fpu_control_word
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_store_x87_fpu_control_word PROC ;NEAR
+
+        fstcw   WORD PTR [rcx]
+        ret
+
+__kmp_store_x87_fpu_control_word ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_clear_x87_fpu_status_word
+;
+; void
+; __kmp_clear_x87_fpu_status_word()
+;
+
+PUBLIC  __kmp_clear_x87_fpu_status_word
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_clear_x87_fpu_status_word PROC ;NEAR
+
+        fnclex
+        ret
+
+__kmp_clear_x87_fpu_status_word ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+;
+; FUNCTION __kmp_invoke_microtask
+;
+; typedef void  (*microtask_t)( int *gtid, int *tid, ... );
+;
+; int
+; __kmp_invoke_microtask( microtask_t pkfn,
+;                         int gtid, int tid,
+;                         int argc, void *p_argv[] ) {
+;
+;     (*pkfn) ( &gtid, &tid, argv[0], ... );
+;     return 1;
+; }
+;
+; note:
+;      rsp must be 128-byte aligned just before the call to pkfn (compiler requirement)
+;
+; parameters:
+;      rcx:   pkfn	16[rbp]
+;      edx:   gtid	24[rbp]
+;      r8d:   tid	32[rbp]
+;      r9d:   argc	40[rbp]
+;      [st]:  p_argv	48[rbp]
+;
+; reg temps:
+;      rax:   used all over the place
+;      rdx:   used all over the place
+;      rcx:   used as argument counter for push parms loop
+;      r10:   used to hold pkfn function pointer argument
+;
+; return:      eax    (always 1/TRUE)
+;
+
+$_pkfn   = 16
+$_gtid   = 24
+$_tid    = 32
+$_argc   = 40
+$_p_argv = 48
+if OMPT_SUPPORT
+$_exit_frame = 56
+endif
+
+PUBLIC  __kmp_invoke_microtask
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_invoke_microtask PROC FRAME ;NEAR
+	mov	QWORD PTR 16[rsp], rdx	; home gtid parameter
+	mov 	QWORD PTR 24[rsp], r8	; home tid parameter
+        push    rbp		; save base pointer
+        .pushreg rbp
+	sub	rsp, 0		; no fixed allocation necessary - end prolog
+
+        lea     rbp, QWORD PTR [rsp]   	; establish the base pointer
+        .setframe rbp, 0
+        .ENDPROLOG
+if OMPT_SUPPORT
+        mov     rax, QWORD PTR $_exit_frame[rbp]
+        mov     QWORD PTR [rax], rbp
+endif
+	mov	r10, rcx	; save pkfn pointer for later
+
+;; ------------------------------------------------------------
+        mov     rax, r9		; rax <= argc
+        cmp     rax, 2
+        jge     SHORT $_kmp_invoke_stack_align
+        mov     rax, 2          ; reserve 4 homes if fewer than 2 parms
+$_kmp_invoke_stack_align:
+	lea     rdx, QWORD PTR [rax*8+16] ; rdx <= (argc + 2) * 8
+	mov     rax, rsp        ; Save current SP into rax
+	sub	rax, rdx	; rsp - ((argc+2)*8) -> rax
+				; without align, rsp would be this
+	and     rax, -128       ; Mask off 7 bits (128-byte align)
+	add     rax, rdx        ; add space for push's in a loop below
+	mov     rsp, rax        ; Prepare the stack ptr
+				; Now it will align to 128-byte at the call
+;; ------------------------------------------------------------
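+;; Worked example (illustrative values only): with argc = 5 and rsp = 7FFF1234H at
+;; this point, rdx = (5+2)*8 = 38H; rsp - 38H = 7FFF11FCH; AND -128 -> 7FFF1180H;
+;; plus 38H -> rsp = 7FFF11B8H. The three pushes below (18H) plus the 32-byte home
+;; area (20H) bring rsp to 7FFF1180H, 128-byte aligned at the call.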
+        			; setup pkfn parameter stack
+	mov	rax, r9		; rax <= argc
+	shl	rax, 3		; rax <= argc*8
+	mov	rdx, QWORD PTR $_p_argv[rbp]	; rdx <= p_argv
+	add	rdx, rax	; rdx <= &p_argv[argc]
+	mov	rcx, r9		; rcx <= argc
+	jecxz	SHORT $_kmp_invoke_pass_parms	; nothing to push if argc=0
+	cmp	ecx, 1		; if argc=1 branch ahead
+	je	SHORT $_kmp_invoke_one_parm
+	sub	ecx, 2		; subtract two from ecx; if argc was 2 branch ahead
+	je	SHORT $_kmp_invoke_two_parms
+
+$_kmp_invoke_push_parms:	; push the 5th through last parms to pkfn on the stack
+	sub	rdx, 8		; decrement p_argv pointer to previous parm
+	mov 	r8, QWORD PTR [rdx] ; r8 <= p_argv[rcx-1]
+	push	r8		; push p_argv[rcx-1] onto stack (reverse order)
+	sub	ecx, 1
+	jecxz	SHORT $_kmp_invoke_two_parms
+	jmp	SHORT $_kmp_invoke_push_parms
+
+$_kmp_invoke_two_parms:
+	sub	rdx, 8		; put 4th parm to pkfn in r9
+	mov	r9, QWORD PTR [rdx] ; r9 <= p_argv[1]
+
+$_kmp_invoke_one_parm:
+        sub	rdx, 8		; put 3rd parm to pkfn in r8
+	mov	r8, QWORD PTR [rdx] ; r8 <= p_argv[0]
+
+$_kmp_invoke_pass_parms:	; put 1st & 2nd parms to pkfn in registers
+	lea	rdx, QWORD PTR $_tid[rbp]  ; rdx <= &tid (2nd parm to pkfn)
+	lea	rcx, QWORD PTR $_gtid[rbp] ; rcx <= &gtid (1st parm to pkfn)
+        sub     rsp, 32         ; add stack space for first four parms
+	mov	rax, r10	; rax <= pkfn
+	call	rax		; call (*pkfn)()
+	mov	rax, 1		; move 1 into return register;
+
+        lea     rsp, QWORD PTR [rbp]	; restore stack pointer
+
+;	add	rsp, 0		; no fixed allocation necessary - start epilog
+        pop     rbp		; restore frame pointer
+        ret
+__kmp_invoke_microtask ENDP
+_TEXT   ENDS
+
+endif
+
+END
diff --git a/final/runtime/src/z_Windows_NT-586_util.c b/final/runtime/src/z_Windows_NT-586_util.c
new file mode 100644
index 0000000..3aeafae
--- /dev/null
+++ b/final/runtime/src/z_Windows_NT-586_util.c
@@ -0,0 +1,163 @@
+/*
+ * z_Windows_NT-586_util.c -- platform specific routines.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+/* Only a 32-bit "add-exchange" instruction is available on the IA-32
+ * architecture, which is why we use compare_and_store loops for these
+ * routines.
+ */
+
+kmp_int8
+__kmp_test_then_or8( volatile kmp_int8 *p, kmp_int8 d )
+{
+    kmp_int8 old_value, new_value;
+
+    old_value = TCR_1( *p );
+    new_value = old_value | d;
+
+    while ( ! __kmp_compare_and_store8 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_1( *p );
+        new_value = old_value | d;
+    }
+    return old_value;
+}
+
+kmp_int8
+__kmp_test_then_and8( volatile kmp_int8 *p, kmp_int8 d )
+{
+    kmp_int8 old_value, new_value;
+
+    old_value = TCR_1( *p );
+    new_value = old_value & d;
+
+    while ( ! __kmp_compare_and_store8 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_1( *p );
+        new_value = old_value & d;
+    }
+    return old_value;
+}
+
+kmp_int32
+__kmp_test_then_or32( volatile kmp_int32 *p, kmp_int32 d )
+{
+    kmp_int32 old_value, new_value;
+
+    old_value = TCR_4( *p );
+    new_value = old_value | d;
+
+    while ( ! __kmp_compare_and_store32 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_4( *p );
+        new_value = old_value | d;
+    }
+    return old_value;
+}
+
+kmp_int32
+__kmp_test_then_and32( volatile kmp_int32 *p, kmp_int32 d )
+{
+    kmp_int32 old_value, new_value;
+
+    old_value = TCR_4( *p );
+    new_value = old_value & d;
+
+    while ( ! __kmp_compare_and_store32 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_4( *p );
+        new_value = old_value & d;
+    }
+    return old_value;
+}
+
+kmp_int8
+__kmp_test_then_add8( volatile kmp_int8 *p, kmp_int8 d )
+{
+    kmp_int8 old_value, new_value;
+
+    old_value = TCR_1( *p );
+    new_value = old_value + d;
+    while ( ! __kmp_compare_and_store8 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_1( *p );
+        new_value = old_value + d;
+    }
+    return old_value;
+}
+
+#if KMP_ARCH_X86
+kmp_int64
+__kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d )
+{
+    kmp_int64 old_value, new_value;
+
+    old_value = TCR_8( *p );
+    new_value = old_value + d;
+    while ( ! __kmp_compare_and_store64 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_8( *p );
+        new_value = old_value + d;
+    }
+    return old_value;
+}
+#endif /* KMP_ARCH_X86 */
+
+kmp_int64
+__kmp_test_then_or64( volatile kmp_int64 *p, kmp_int64 d )
+{
+    kmp_int64 old_value, new_value;
+
+    old_value = TCR_8( *p );
+    new_value = old_value | d;
+    while ( ! __kmp_compare_and_store64 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_8( *p );
+        new_value = old_value | d;
+    }
+
+    return old_value;
+}
+
+kmp_int64
+__kmp_test_then_and64( volatile kmp_int64 *p, kmp_int64 d )
+{
+    kmp_int64 old_value, new_value;
+
+    old_value = TCR_8( *p );
+    new_value = old_value & d;
+    while ( ! __kmp_compare_and_store64 ( p, old_value, new_value ) )
+    {
+        KMP_CPU_PAUSE();
+        old_value = TCR_8( *p );
+        new_value = old_value & d;
+    }
+
+    return old_value;
+}
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
diff --git a/final/runtime/src/z_Windows_NT_util.c b/final/runtime/src/z_Windows_NT_util.c
new file mode 100644
index 0000000..bfeb1b1
--- /dev/null
+++ b/final/runtime/src/z_Windows_NT_util.c
@@ -0,0 +1,1932 @@
+/*
+ * z_Windows_NT_util.c -- platform specific routines.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "kmp.h"
+#include "kmp_itt.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_wait_release.h"
+
+
+
+/* ----------------------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------------------- */
+
+/* This code is related to the NtQuerySystemInformation() function, which is
+   used by the load balance algorithm (OMP_DYNAMIC=true) to find the number
+   of running threads in the system. */
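+
+/* The enum/struct declarations below mirror the undocumented layouts that
+   ntdll returns for SystemProcessInformation.  The KMP_BUILD_ASSERT checks
+   further down pin the expected field offsets for 32- and 64-bit builds, so
+   a layout mismatch breaks the build instead of silently corrupting reads. */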
+
+#include <ntstatus.h>
+#include <ntsecapi.h>   // UNICODE_STRING
+
+enum SYSTEM_INFORMATION_CLASS {
+    SystemProcessInformation = 5
+}; // SYSTEM_INFORMATION_CLASS
+
+struct CLIENT_ID {
+    HANDLE UniqueProcess;
+    HANDLE UniqueThread;
+}; // struct CLIENT_ID
+
+enum THREAD_STATE {
+    StateInitialized,
+    StateReady,
+    StateRunning,
+    StateStandby,
+    StateTerminated,
+    StateWait,
+    StateTransition,
+    StateUnknown
+}; // enum THREAD_STATE
+
+struct VM_COUNTERS {
+    SIZE_T        PeakVirtualSize;
+    SIZE_T        VirtualSize;
+    ULONG         PageFaultCount;
+    SIZE_T        PeakWorkingSetSize;
+    SIZE_T        WorkingSetSize;
+    SIZE_T        QuotaPeakPagedPoolUsage;
+    SIZE_T        QuotaPagedPoolUsage;
+    SIZE_T        QuotaPeakNonPagedPoolUsage;
+    SIZE_T        QuotaNonPagedPoolUsage;
+    SIZE_T        PagefileUsage;
+    SIZE_T        PeakPagefileUsage;
+    SIZE_T        PrivatePageCount;
+}; // struct VM_COUNTERS
+
+struct SYSTEM_THREAD {
+  LARGE_INTEGER   KernelTime;
+  LARGE_INTEGER   UserTime;
+  LARGE_INTEGER   CreateTime;
+  ULONG           WaitTime;
+  LPVOID          StartAddress;
+  CLIENT_ID       ClientId;
+  DWORD           Priority;
+  LONG            BasePriority;
+  ULONG           ContextSwitchCount;
+  THREAD_STATE    State;
+  ULONG           WaitReason;
+}; // SYSTEM_THREAD
+
+KMP_BUILD_ASSERT( offsetof( SYSTEM_THREAD, KernelTime ) == 0 );
+#if KMP_ARCH_X86
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_THREAD, StartAddress ) == 28 );
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_THREAD, State        ) == 52 );
+#else
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_THREAD, StartAddress ) == 32 );
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_THREAD, State        ) == 68 );
+#endif
+
+struct SYSTEM_PROCESS_INFORMATION {
+  ULONG           NextEntryOffset;
+  ULONG           NumberOfThreads;
+  LARGE_INTEGER   Reserved[ 3 ];
+  LARGE_INTEGER   CreateTime;
+  LARGE_INTEGER   UserTime;
+  LARGE_INTEGER   KernelTime;
+  UNICODE_STRING  ImageName;
+  DWORD           BasePriority;
+  HANDLE          ProcessId;
+  HANDLE          ParentProcessId;
+  ULONG           HandleCount;
+  ULONG           Reserved2[ 2 ];
+  VM_COUNTERS     VMCounters;
+  IO_COUNTERS     IOCounters;
+  SYSTEM_THREAD   Threads[ 1 ];
+}; // SYSTEM_PROCESS_INFORMATION
+typedef SYSTEM_PROCESS_INFORMATION * PSYSTEM_PROCESS_INFORMATION;
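+
+// Threads[ 1 ] is the old C flexible-array idiom: each process record in the
+// returned buffer is followed by NumberOfThreads SYSTEM_THREAD entries, and
+// NextEntryOffset gives the byte distance to the next process record.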
+
+KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, NextEntryOffset ) ==  0 );
+KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, CreateTime      ) == 32 );
+KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, ImageName       ) == 56 );
+#if KMP_ARCH_X86
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, ProcessId       ) ==  68 );
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, HandleCount     ) ==  76 );
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, VMCounters      ) ==  88 );
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, IOCounters      ) == 136 );
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, Threads         ) == 184 );
+#else
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, ProcessId       ) ==  80 );
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, HandleCount     ) ==  96 );
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, VMCounters      ) == 112 );
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, IOCounters      ) == 208 );
+    KMP_BUILD_ASSERT( offsetof( SYSTEM_PROCESS_INFORMATION, Threads         ) == 256 );
+#endif
+
+typedef NTSTATUS (NTAPI *NtQuerySystemInformation_t)( SYSTEM_INFORMATION_CLASS, PVOID, ULONG, PULONG );
+NtQuerySystemInformation_t NtQuerySystemInformation = NULL;
+
+HMODULE ntdll = NULL;
+
+/* End of NtQuerySystemInformation()-related code */
+
+#if KMP_GROUP_AFFINITY
+static HMODULE kernel32 = NULL;
+#endif /* KMP_GROUP_AFFINITY */
+
+/* ----------------------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------------------- */
+
+#if KMP_HANDLE_SIGNALS
+    typedef void    (* sig_func_t )( int );
+    static sig_func_t  __kmp_sighldrs[ NSIG ];
+    static int         __kmp_siginstalled[ NSIG ];
+#endif
+
+static HANDLE   __kmp_monitor_ev;
+static kmp_int64 __kmp_win32_time;
+double __kmp_win32_tick;
+
+int __kmp_init_runtime = FALSE;
+CRITICAL_SECTION __kmp_win32_section;
+
+void
+__kmp_win32_mutex_init( kmp_win32_mutex_t *mx )
+{
+    InitializeCriticalSection( & mx->cs );
+#if USE_ITT_BUILD
+    __kmp_itt_system_object_created( & mx->cs, "Critical Section" );
+#endif /* USE_ITT_BUILD */
+}
+
+void
+__kmp_win32_mutex_destroy( kmp_win32_mutex_t *mx )
+{
+    DeleteCriticalSection( & mx->cs );
+}
+
+void
+__kmp_win32_mutex_lock( kmp_win32_mutex_t *mx )
+{
+    EnterCriticalSection( & mx->cs );
+}
+
+void
+__kmp_win32_mutex_unlock( kmp_win32_mutex_t *mx )
+{
+    LeaveCriticalSection( & mx->cs );
+}
+
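+/* Windows* OS has no native condition variables (before Windows Vista), so
+   one is emulated here with a manual-reset event plus three counters:
+   waiters_count_ (threads currently blocked), release_count_ (how many
+   waiters may pass after a broadcast), and wait_generation_count_ (which
+   broadcast "generation" a waiter belongs to, so that late arrivals cannot
+   steal an earlier release). */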
+void
+__kmp_win32_cond_init( kmp_win32_cond_t *cv )
+{
+    cv->waiters_count_         = 0;
+    cv->wait_generation_count_ = 0;
+    cv->release_count_         = 0;
+
+    /* Initialize the critical section */
+    __kmp_win32_mutex_init( & cv->waiters_count_lock_ );
+
+    /* Create a manual-reset event. */
+    cv->event_ = CreateEvent( NULL,     // no security
+                              TRUE,     // manual-reset
+                              FALSE,    // non-signaled initially
+                              NULL );   // unnamed
+#if USE_ITT_BUILD
+    __kmp_itt_system_object_created( cv->event_, "Event" );
+#endif /* USE_ITT_BUILD */
+}
+
+void
+__kmp_win32_cond_destroy( kmp_win32_cond_t *cv )
+{
+    __kmp_win32_mutex_destroy( & cv->waiters_count_lock_ );
+    __kmp_free_handle( cv->event_ );
+    memset( cv, '\0', sizeof( *cv ) );
+}
+
+/* TODO associate cv with a team instead of a thread so as to optimize
+ * the case where we wake up a whole team */
+
+void
+__kmp_win32_cond_wait( kmp_win32_cond_t *cv, kmp_win32_mutex_t *mx, kmp_info_t *th, int need_decrease_load )
+{
+    int my_generation;
+    int last_waiter;
+
+    /* Avoid race conditions */
+    __kmp_win32_mutex_lock( &cv->waiters_count_lock_ );
+
+    /* Increment count of waiters */
+    cv->waiters_count_++;
+
+    /* Store current generation in our activation record. */
+    my_generation = cv->wait_generation_count_;
+
+    __kmp_win32_mutex_unlock( &cv->waiters_count_lock_ );
+    __kmp_win32_mutex_unlock( mx );
+
+
+    for (;;) {
+        int wait_done;
+
+        /* Wait until the event is signaled */
+        WaitForSingleObject( cv->event_, INFINITE );
+
+        __kmp_win32_mutex_lock( &cv->waiters_count_lock_ );
+
+        /* Exit the loop once the event has been signaled, releases are
+         * still outstanding (release_count_ > 0), and a new generation has
+         * started -- i.e. the broadcast targeted this waiter's generation. */
+        wait_done = ( cv->release_count_ > 0 ) &&
+                    ( cv->wait_generation_count_ != my_generation );
+
+        __kmp_win32_mutex_unlock( &cv->waiters_count_lock_);
+
+        /* A stray semicolon used to follow this if statement; it looked
+         * like a bug, so it was removed. */
+        if( wait_done )
+            break;
+    }
+
+    __kmp_win32_mutex_lock( mx );
+    __kmp_win32_mutex_lock( &cv->waiters_count_lock_ );
+
+    cv->waiters_count_--;
+    cv->release_count_--;
+
+    last_waiter =  ( cv->release_count_ == 0 );
+
+    __kmp_win32_mutex_unlock( &cv->waiters_count_lock_ );
+
+    if( last_waiter ) {
+        /* We're the last waiter to be notified, so reset the manual event. */
+        ResetEvent( cv->event_ );
+    }
+}
+
+void
+__kmp_win32_cond_broadcast( kmp_win32_cond_t *cv )
+{
+    __kmp_win32_mutex_lock( &cv->waiters_count_lock_ );
+
+    if( cv->waiters_count_ > 0 ) {
+        SetEvent( cv->event_ );
+        /* Release all the threads in this generation. */
+
+        cv->release_count_ = cv->waiters_count_;
+
+        /* Start a new generation. */
+        cv->wait_generation_count_++;
+    }
+
+    __kmp_win32_mutex_unlock( &cv->waiters_count_lock_ );
+}
+
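+/* "Signal" is implemented as a broadcast: all waiters are woken and each
+   re-checks its own sleep flag, so correctness is preserved at the cost of
+   spurious wakeups. */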
+void
+__kmp_win32_cond_signal( kmp_win32_cond_t *cv )
+{
+    __kmp_win32_cond_broadcast( cv );
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_enable( int new_state )
+{
+    if (__kmp_init_runtime)
+        LeaveCriticalSection( & __kmp_win32_section );
+}
+
+void
+__kmp_disable( int *old_state )
+{
+    *old_state = 0;
+
+    if (__kmp_init_runtime)
+        EnterCriticalSection( & __kmp_win32_section );
+}
+
+void
+__kmp_suspend_initialize( void )
+{
+    /* do nothing */
+}
+
+static void
+__kmp_suspend_initialize_thread( kmp_info_t *th )
+{
+    if ( ! TCR_4( th->th.th_suspend_init ) ) {
+      /* this means we haven't initialized the suspension synchronization
+         objects for this thread in this instance of the process */
+        __kmp_win32_cond_init(  &th->th.th_suspend_cv );
+        __kmp_win32_mutex_init( &th->th.th_suspend_mx );
+        TCW_4( th->th.th_suspend_init, TRUE );
+    }
+}
+
+void
+__kmp_suspend_uninitialize_thread( kmp_info_t *th )
+{
+    if ( TCR_4( th->th.th_suspend_init ) ) {
+      /* this means we have initialized the suspension synchronization
+         objects for this thread in this instance of the process */
+      __kmp_win32_cond_destroy( & th->th.th_suspend_cv );
+      __kmp_win32_mutex_destroy( & th->th.th_suspend_mx );
+      TCW_4( th->th.th_suspend_init, FALSE );
+    }
+}
+
+/* This routine puts the calling thread to sleep after setting the
+ * sleep bit for the indicated flag variable to true.
+ */
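+/* The suspend/resume handshake in outline:
+ *     suspending thread:  lock(th_suspend_mx); set sleep bit; cond_wait
+ *                         until the bit is cleared
+ *     resuming thread:    lock(th_suspend_mx); clear sleep bit; cond_signal
+ * Because the sleep bit only changes under th_suspend_mx, the is_sleeping()
+ * re-check in the wait loop below is race-free. */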
+template <class C>
+static inline void __kmp_suspend_template( int th_gtid, C *flag )
+{
+    kmp_info_t *th = __kmp_threads[th_gtid];
+    int status;
+    typename C::flag_t old_spin;
+
+    KF_TRACE( 30, ("__kmp_suspend_template: T#%d enter for flag's loc(%p)\n", th_gtid, flag->get() ) );
+
+    __kmp_suspend_initialize_thread( th );
+    __kmp_win32_mutex_lock( &th->th.th_suspend_mx );
+
+    KF_TRACE( 10, ( "__kmp_suspend_template: T#%d setting sleep bit for flag's loc(%p)\n",
+                    th_gtid, flag->get() ) );
+
+    /* TODO: shouldn't this use release semantics to ensure that __kmp_suspend_initialize_thread
+       gets called first?
+    */
+    old_spin = flag->set_sleeping();
+
+    KF_TRACE( 5, ( "__kmp_suspend_template: T#%d set sleep bit for flag's loc(%p)==%d\n",
+                   th_gtid, flag->get(), *(flag->get()) ) );
+
+    if ( flag->done_check_val(old_spin) ) {
+        old_spin = flag->unset_sleeping();
+        KF_TRACE( 5, ( "__kmp_suspend_template: T#%d false alarm, reset sleep bit for flag's loc(%p)\n",
+                       th_gtid, flag->get()) );
+    } else {
+#ifdef DEBUG_SUSPEND
+        __kmp_suspend_count++;
+#endif
+        /* Encapsulate in a loop as the documentation states that this may
+         * "with low probability" return when the condition variable has
+         * not been signaled or broadcast
+         */
+        int deactivated = FALSE;
+        TCW_PTR(th->th.th_sleep_loc, (void *)flag);
+        while ( flag->is_sleeping() ) {
+            KF_TRACE( 15, ("__kmp_suspend_template: T#%d about to perform kmp_win32_cond_wait()\n",
+                     th_gtid ) );
+            // Mark the thread as no longer active (only in the first iteration of the loop).
+            if ( ! deactivated ) {
+                th->th.th_active = FALSE;
+                if ( th->th.th_active_in_pool ) {
+                    th->th.th_active_in_pool = FALSE;
+                    KMP_TEST_THEN_DEC32(
+                      (kmp_int32 *) &__kmp_thread_pool_active_nth );
+                    KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
+                }
+                deactivated = TRUE;
+
+
+                __kmp_win32_cond_wait( &th->th.th_suspend_cv, &th->th.th_suspend_mx, 0, 0 );
+            }
+            else {
+                __kmp_win32_cond_wait( &th->th.th_suspend_cv, &th->th.th_suspend_mx, 0, 0 );
+            }
+
+#ifdef KMP_DEBUG
+            if( flag->is_sleeping() ) {
+                KF_TRACE( 100, ("__kmp_suspend_template: T#%d spurious wakeup\n", th_gtid ));
+            }
+#endif /* KMP_DEBUG */
+
+        } // while
+
+        // Mark the thread as active again (if it was previously marked as inactive)
+        if ( deactivated ) {
+            th->th.th_active = TRUE;
+            if ( TCR_4(th->th.th_in_pool) ) {
+                KMP_TEST_THEN_INC32(
+                  (kmp_int32 *) &__kmp_thread_pool_active_nth );
+                th->th.th_active_in_pool = TRUE;
+            }
+        }
+    }
+
+
+    __kmp_win32_mutex_unlock( &th->th.th_suspend_mx );
+
+    KF_TRACE( 30, ("__kmp_suspend_template: T#%d exit\n", th_gtid ) );
+}
+
+void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) {
+    __kmp_suspend_template(th_gtid, flag);
+}
+void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) {
+    __kmp_suspend_template(th_gtid, flag);
+}
+void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) {
+    __kmp_suspend_template(th_gtid, flag);
+}
+
+
+/* This routine signals the thread specified by target_gtid to wake up
+ * after setting the sleep bit indicated by the flag argument to FALSE
+ */
+template <class C>
+static inline void __kmp_resume_template( int target_gtid, C *flag )
+{
+    kmp_info_t *th = __kmp_threads[target_gtid];
+    int status;
+
+#ifdef KMP_DEBUG
+    int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
+#endif
+
+    KF_TRACE( 30, ( "__kmp_resume_template: T#%d wants to wakeup T#%d enter\n", gtid, target_gtid ) );
+
+    __kmp_suspend_initialize_thread( th );
+    __kmp_win32_mutex_lock( &th->th.th_suspend_mx );
+
+    if (!flag) {
+        flag = (C *)th->th.th_sleep_loc;
+    }
+
+    if (!flag) {
+        KF_TRACE( 5, ( "__kmp_resume_template: T#%d exiting, thread T#%d already awake: flag's loc(%p)\n",
+                       gtid, target_gtid, NULL ) );
+        __kmp_win32_mutex_unlock( &th->th.th_suspend_mx );
+        return;
+    }
+    else {
+        typename C::flag_t old_spin = flag->unset_sleeping();
+        if ( !flag->is_sleeping_val(old_spin) ) {
+            KF_TRACE( 5, ( "__kmp_resume_template: T#%d exiting, thread T#%d already awake: flag's loc(%p): "
+                           "%u => %u\n",
+                           gtid, target_gtid, flag->get(), old_spin, *(flag->get()) ) );
+            __kmp_win32_mutex_unlock( &th->th.th_suspend_mx );
+            return;
+        }
+    }
+    TCW_PTR(th->th.th_sleep_loc, NULL);
+
+    KF_TRACE( 5, ( "__kmp_resume_template: T#%d about to wakeup T#%d, reset sleep bit for flag's loc(%p)\n",
+                   gtid, target_gtid, flag->get() ) );
+
+
+    __kmp_win32_cond_signal(  &th->th.th_suspend_cv );
+    __kmp_win32_mutex_unlock( &th->th.th_suspend_mx );
+
+    KF_TRACE( 30, ( "__kmp_resume_template: T#%d exiting after signaling wake up for T#%d\n",
+                    gtid, target_gtid ) );
+}
+
+void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) {
+    __kmp_resume_template(target_gtid, flag);
+}
+void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) {
+    __kmp_resume_template(target_gtid, flag);
+}
+void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) {
+    __kmp_resume_template(target_gtid, flag);
+}
+
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_yield( int cond )
+{
+    if (cond)
+        Sleep(0);
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
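+/* The TLS slot stores gtid+1 rather than gtid: TlsGetValue() returns 0 both
+   for a never-set slot and on failure, so a raw value of 0 maps unambiguously
+   to KMP_GTID_DNE in __kmp_gtid_get_specific(). */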
+void
+__kmp_gtid_set_specific( int gtid )
+{
+    KA_TRACE( 50, ("__kmp_gtid_set_specific: T#%d key:%d\n",
+                gtid, __kmp_gtid_threadprivate_key ));
+    KMP_ASSERT( __kmp_init_runtime );
+    if( ! TlsSetValue( __kmp_gtid_threadprivate_key, (LPVOID)(gtid+1)) )
+        KMP_FATAL( TLSSetValueFailed );
+}
+
+int
+__kmp_gtid_get_specific()
+{
+    int gtid;
+    if( !__kmp_init_runtime ) {
+        KA_TRACE( 50, ("__kmp_get_specific: runtime shutdown, returning KMP_GTID_SHUTDOWN\n" ) );
+        return KMP_GTID_SHUTDOWN;
+    }
+    gtid = (int)(kmp_intptr_t)TlsGetValue( __kmp_gtid_threadprivate_key );
+    if ( gtid == 0 ) {
+        gtid = KMP_GTID_DNE;
+    }
+    else {
+        gtid--;
+    }
+    KA_TRACE( 50, ("__kmp_gtid_get_specific: key:%d gtid:%d\n",
+                __kmp_gtid_threadprivate_key, gtid ));
+    return gtid;
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#if KMP_GROUP_AFFINITY
+
+//
+// Only one mask word (one processor group) should have any procs set.
+// Return the appropriate group index, or -1 for an invalid mask.
+//
+int
+__kmp_get_proc_group( kmp_affin_mask_t const *mask )
+{
+    int i;
+    int group = -1;
+    for (i = 0; i < __kmp_num_proc_groups; i++) {
+        if (mask[i] == 0) {
+            continue;
+        }
+        if (group >= 0) {
+            return -1;
+        }
+        group = i;
+    }
+    return group;
+}
+
+#endif /* KMP_GROUP_AFFINITY */
+
+int
+__kmp_set_system_affinity( kmp_affin_mask_t const *mask, int abort_on_error )
+{
+
+#if KMP_GROUP_AFFINITY
+
+    if (__kmp_num_proc_groups > 1) {
+        //
+        // Check for a valid mask.
+        //
+        GROUP_AFFINITY ga;
+        int group = __kmp_get_proc_group( mask );
+        if (group < 0) {
+            if (abort_on_error) {
+                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+            }
+            return -1;
+        }
+
+        //
+        // Transform the bit vector into a GROUP_AFFINITY struct
+        // and make the system call to set affinity.
+        //
+        ga.Group = group;
+        ga.Mask = mask[group];
+        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
+
+        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
+        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
+            DWORD error = GetLastError();
+            if (abort_on_error) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG( CantSetThreadAffMask ),
+                    KMP_ERR( error ),
+                    __kmp_msg_null
+                );
+            }
+            return error;
+        }
+    }
+    else
+
+#endif /* KMP_GROUP_AFFINITY */
+
+    {
+        if (!SetThreadAffinityMask( GetCurrentThread(), *mask )) {
+            DWORD error = GetLastError();
+            if (abort_on_error) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG( CantSetThreadAffMask ),
+                    KMP_ERR( error ),
+                    __kmp_msg_null
+                );
+            }
+            return error;
+        }
+    }
+    return 0;
+}
+
+int
+__kmp_get_system_affinity( kmp_affin_mask_t *mask, int abort_on_error )
+{
+
+#if KMP_GROUP_AFFINITY
+
+    if (__kmp_num_proc_groups > 1) {
+        KMP_CPU_ZERO(mask);
+        GROUP_AFFINITY ga;
+        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
+
+        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
+            DWORD error = GetLastError();
+            if (abort_on_error) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
+                    KMP_ERR(error),
+                    __kmp_msg_null
+                );
+            }
+            return error;
+        }
+
+        if ((ga.Group < 0) || (ga.Group >= __kmp_num_proc_groups)
+          || (ga.Mask == 0)) {
+            return -1;
+        }
+
+        mask[ga.Group] = ga.Mask;
+    }
+    else
+
+#endif /* KMP_GROUP_AFFINITY */
+
+    {
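+        // Win32 has no GetThreadAffinityMask(); the current thread mask is
+        // only observable as the return value of SetThreadAffinityMask().
+        // So set the thread mask to the full process mask, capture the
+        // previous mask from the return value, and then restore it.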
+        kmp_affin_mask_t newMask, sysMask, retval;
+
+        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
+            DWORD error = GetLastError();
+            if (abort_on_error) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
+                    KMP_ERR(error),
+                    __kmp_msg_null
+                );
+            }
+            return error;
+        }
+        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
+        if (! retval) {
+            DWORD error = GetLastError();
+            if (abort_on_error) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
+                    KMP_ERR(error),
+                    __kmp_msg_null
+                );
+            }
+            return error;
+        }
+        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
+        if (! newMask) {
+            DWORD error = GetLastError();
+            if (abort_on_error) {
+                __kmp_msg(
+                    kmp_ms_fatal,
+                    KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
+                    KMP_ERR(error),
+                    __kmp_msg_null
+                );
+            }
+        }
+        *mask = retval;
+    }
+    return 0;
+}
+
+void
+__kmp_affinity_bind_thread( int proc )
+{
+
+#if KMP_GROUP_AFFINITY
+
+    if (__kmp_num_proc_groups > 1) {
+        //
+        // Form the GROUP_AFFINITY struct directly, rather than filling
+        // out a bit vector and calling __kmp_set_system_affinity().
+        //
+        GROUP_AFFINITY ga;
+        KMP_DEBUG_ASSERT((proc >= 0) && (proc < (__kmp_num_proc_groups
+           * CHAR_BIT * sizeof(DWORD_PTR))));
+        ga.Group = proc / (CHAR_BIT * sizeof(DWORD_PTR));
+        ga.Mask = (unsigned long long)1 << (proc % (CHAR_BIT * sizeof(DWORD_PTR)));
+        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
+
+        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
+        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
+            DWORD error = GetLastError();
+            if (__kmp_affinity_verbose) { // AC: continue silently if not verbose
+                __kmp_msg(
+                    kmp_ms_warning,
+                    KMP_MSG( CantSetThreadAffMask ),
+                    KMP_ERR( error ),
+                    __kmp_msg_null
+                );
+            }
+        }
+    }
+    else
+
+#endif /* KMP_GROUP_AFFINITY */
+
+    {
+        kmp_affin_mask_t mask;
+        KMP_CPU_ZERO(&mask);
+        KMP_CPU_SET(proc, &mask);
+        __kmp_set_system_affinity(&mask, TRUE);
+    }
+}
+
+void
+__kmp_affinity_determine_capable( const char *env_var )
+{
+    //
+    // All versions of Windows* OS (since Win '95) support SetThreadAffinityMask().
+    //
+
+#if KMP_GROUP_AFFINITY
+    KMP_AFFINITY_ENABLE(__kmp_num_proc_groups*sizeof(kmp_affin_mask_t));
+#else
+    KMP_AFFINITY_ENABLE(sizeof(kmp_affin_mask_t));
+#endif
+
+    KA_TRACE( 10, (
+        "__kmp_affinity_determine_capable: "
+            "Windows* OS affinity interface functional (mask size = %" KMP_SIZE_T_SPEC ").\n",
+        __kmp_affin_mask_size
+    ) );
+}
+
+double
+__kmp_read_cpu_time( void )
+{
+    FILETIME    CreationTime, ExitTime, KernelTime, UserTime;
+    int         status;
+    double      cpu_time;
+
+    cpu_time = 0;
+
+    status = GetProcessTimes( GetCurrentProcess(), &CreationTime,
+                              &ExitTime, &KernelTime, &UserTime );
+
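+    // Each FILETIME counts 100-nanosecond intervals split across two 32-bit
+    // halves; combine the kernel and user times, then convert to seconds.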
+    if (status) {
+        double  sec = 0;
+
+        sec += KernelTime.dwHighDateTime;
+        sec += UserTime.dwHighDateTime;
+
+        /* Shift left by 32 bits */
+        sec *= (double) (1 << 16) * (double) (1 << 16);
+
+        sec += KernelTime.dwLowDateTime;
+        sec += UserTime.dwLowDateTime;
+
+        cpu_time += (sec * 100.0) / KMP_NSEC_PER_SEC;
+    }
+
+    return cpu_time;
+}
+
+int
+__kmp_read_system_info( struct kmp_sys_info *info )
+{
+    info->maxrss  = 0;                   /* the maximum resident set size utilized (in kilobytes)     */
+    info->minflt  = 0;                   /* the number of page faults serviced without any I/O        */
+    info->majflt  = 0;                   /* the number of page faults serviced that required I/O      */
+    info->nswap   = 0;                   /* the number of times a process was "swapped" out of memory */
+    info->inblock = 0;                   /* the number of times the file system had to perform input  */
+    info->oublock = 0;                   /* the number of times the file system had to perform output */
+    info->nvcsw   = 0;                   /* the number of times a context switch was performed voluntarily */
+    info->nivcsw  = 0;                   /* the number of times a context switch was forced           */
+
+    return 1;
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+
+void
+__kmp_runtime_initialize( void )
+{
+    SYSTEM_INFO info;
+    kmp_str_buf_t path;
+    UINT path_size;
+
+    if ( __kmp_init_runtime ) {
+        return;
+    };
+
+#if KMP_DYNAMIC_LIB
+    /* Pin dynamic library for the lifetime of application */
+    {
+        // First, turn off error message boxes
+        UINT err_mode = SetErrorMode (SEM_FAILCRITICALERRORS);
+        HMODULE h;
+        BOOL ret = GetModuleHandleEx( GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS
+                                     |GET_MODULE_HANDLE_EX_FLAG_PIN,
+                                     (LPCTSTR)&__kmp_serial_initialize, &h);
+        KMP_DEBUG_ASSERT2(h && ret, "OpenMP RTL cannot find itself loaded");
+        SetErrorMode (err_mode);   // Restore error mode
+        KA_TRACE( 10, ("__kmp_runtime_initialize: dynamic library pinned\n") );
+    }
+#endif
+
+    InitializeCriticalSection( & __kmp_win32_section );
+#if USE_ITT_BUILD
+    __kmp_itt_system_object_created( & __kmp_win32_section, "Critical Section" );
+#endif /* USE_ITT_BUILD */
+    __kmp_initialize_system_tick();
+
+    #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+        if ( ! __kmp_cpuinfo.initialized ) {
+            __kmp_query_cpuid( & __kmp_cpuinfo );
+        }; // if
+    #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+    /* Set up minimum number of threads to switch to TLS gtid */
+    #if KMP_OS_WINDOWS && ! defined KMP_DYNAMIC_LIB
+        // Windows* OS, static library.
+        /*
+            A new thread may use stack space previously used by another, now
+            terminated, thread. On Windows* OS, in the case of static linking,
+            we do not know the moment of thread termination, and our structures
+            (the __kmp_threads and __kmp_root arrays) still keep info about
+            dead threads. This leads to a problem in the
+            __kmp_get_global_thread_id() function: it wrongly finds a gtid (by
+            searching through the stack addresses of all known threads) for an
+            unregistered foreign thread.
+
+            Setting __kmp_tls_gtid_min to 0 works around this problem:
+            __kmp_get_global_thread_id() does not search through stacks, but
+            gets the gtid from TLS immediately.
+
+            --ln
+        */
+        __kmp_tls_gtid_min = 0;
+    #else
+        __kmp_tls_gtid_min = KMP_TLS_GTID_MIN;
+    #endif
+
+    /* for the static library */
+    if ( !__kmp_gtid_threadprivate_key ) {
+        __kmp_gtid_threadprivate_key = TlsAlloc();
+        if( __kmp_gtid_threadprivate_key == TLS_OUT_OF_INDEXES ) {
+            KMP_FATAL( TLSOutOfIndexes );
+        }
+    }
+
+
+    //
+    // Load ntdll.dll.
+    //
+    /*
+        A simple
+            GetModuleHandle( "ntdll.dll" )
+        is not suitable due to a security issue (see
+        http://www.microsoft.com/technet/security/advisory/2269637.mspx). We
+        have to specify the full path to the library.
+    */
+    __kmp_str_buf_init( & path );
+    path_size = GetSystemDirectory( path.str, path.size );
+    KMP_DEBUG_ASSERT( path_size > 0 );
+    if ( path_size >= path.size ) {
+        //
+        // Buffer is too short.  Expand the buffer and try again.
+        //
+        __kmp_str_buf_reserve( & path, path_size );
+        path_size = GetSystemDirectory( path.str, path.size );
+        KMP_DEBUG_ASSERT( path_size > 0 );
+    }; // if
+    if ( path_size > 0 && path_size < path.size ) {
+        //
+        // Now we have the system directory name in the buffer.
+        // Append a backslash and the DLL name to form the full path.
+        //
+        path.used = path_size;
+        __kmp_str_buf_print( & path, "\\%s", "ntdll.dll" );
+
+        //
+        // Now load ntdll using full path.
+        //
+        ntdll = GetModuleHandle( path.str );
+    }
+
+    KMP_DEBUG_ASSERT( ntdll != NULL );
+    if ( ntdll != NULL ) {
+        NtQuerySystemInformation = (NtQuerySystemInformation_t) GetProcAddress( ntdll, "NtQuerySystemInformation" );
+    }
+    KMP_DEBUG_ASSERT( NtQuerySystemInformation != NULL );
+
+#if KMP_GROUP_AFFINITY
+    //
+    // Load kernel32.dll.
+    // Same caveat - must use full system path name.
+    //
+    if ( path_size > 0 && path_size < path.size ) {
+        //
+        // Truncate the buffer back to just the system path length,
+        // discarding "\\ntdll.dll", and replacing it with "kernel32.dll".
+        //
+        path.used = path_size;
+        __kmp_str_buf_print( & path, "\\%s", "kernel32.dll" );
+
+        //
+        // Load kernel32.dll using full path.
+        //
+        kernel32 = GetModuleHandle( path.str );
+        KA_TRACE( 10, ("__kmp_runtime_initialize: kernel32.dll = %s\n", path.str ) );
+
+        //
+        // Load the function pointers to kernel32.dll routines
+        // that may or may not exist on this system.
+        //
+        if ( kernel32 != NULL ) {
+            __kmp_GetActiveProcessorCount = (kmp_GetActiveProcessorCount_t) GetProcAddress( kernel32, "GetActiveProcessorCount" );
+            __kmp_GetActiveProcessorGroupCount = (kmp_GetActiveProcessorGroupCount_t) GetProcAddress( kernel32, "GetActiveProcessorGroupCount" );
+            __kmp_GetThreadGroupAffinity = (kmp_GetThreadGroupAffinity_t) GetProcAddress( kernel32, "GetThreadGroupAffinity" );
+            __kmp_SetThreadGroupAffinity = (kmp_SetThreadGroupAffinity_t) GetProcAddress( kernel32, "SetThreadGroupAffinity" );
+
+            KA_TRACE( 10, ("__kmp_runtime_initialize: __kmp_GetActiveProcessorCount = %p\n", __kmp_GetActiveProcessorCount ) );
+            KA_TRACE( 10, ("__kmp_runtime_initialize: __kmp_GetActiveProcessorGroupCount = %p\n", __kmp_GetActiveProcessorGroupCount ) );
+            KA_TRACE( 10, ("__kmp_runtime_initialize:__kmp_GetThreadGroupAffinity = %p\n", __kmp_GetThreadGroupAffinity ) );
+            KA_TRACE( 10, ("__kmp_runtime_initialize: __kmp_SetThreadGroupAffinity = %p\n", __kmp_SetThreadGroupAffinity ) );
+            KA_TRACE( 10, ("__kmp_runtime_initialize: sizeof(kmp_affin_mask_t) = %d\n", sizeof(kmp_affin_mask_t) ) );
+
+            //
+            // See if group affinity is supported on this system.
+            // If so, calculate the #groups and #procs.
+            //
+            // Group affinity was introduced with Windows* 7 OS and
+            // Windows* Server 2008 R2 OS.
+            //
+            if ( ( __kmp_GetActiveProcessorCount != NULL )
+              && ( __kmp_GetActiveProcessorGroupCount != NULL )
+              && ( __kmp_GetThreadGroupAffinity != NULL )
+              && ( __kmp_SetThreadGroupAffinity != NULL )
+              && ( ( __kmp_num_proc_groups
+              = __kmp_GetActiveProcessorGroupCount() ) > 1 ) ) {
+                //
+                // Calculate the total number of active OS procs.
+                //
+                int i;
+
+                KA_TRACE( 10, ("__kmp_runtime_initialize: %d processor groups detected\n", __kmp_num_proc_groups ) );
+
+                __kmp_xproc = 0;
+
+                for ( i = 0; i < __kmp_num_proc_groups; i++ ) {
+                    DWORD size = __kmp_GetActiveProcessorCount( i );
+                    __kmp_xproc += size;
+                    KA_TRACE( 10, ("__kmp_runtime_initialize: proc group %d size = %d\n", i, size ) );
+                }
+            }
+            else {
+                KA_TRACE( 10, ("__kmp_runtime_initialize: %d processor groups detected\n", __kmp_num_proc_groups ) );
+            }
+        }
+    }
+    if ( __kmp_num_proc_groups <= 1 ) {
+        GetSystemInfo( & info );
+        __kmp_xproc = info.dwNumberOfProcessors;
+    }
+#else
+    GetSystemInfo( & info );
+    __kmp_xproc = info.dwNumberOfProcessors;
+#endif /* KMP_GROUP_AFFINITY */
+
+    //
+    // If the OS said there were 0 procs, take a guess and use a value of 2.
+    // This is done for Linux* OS, also.  Do we need error / warning?
+    //
+    if ( __kmp_xproc <= 0 ) {
+        __kmp_xproc = 2;
+    }
+
+    KA_TRACE( 5, ("__kmp_runtime_initialize: total processors = %d\n", __kmp_xproc) );
+
+    __kmp_str_buf_free( & path );
+
+#if USE_ITT_BUILD
+    __kmp_itt_initialize();
+#endif /* USE_ITT_BUILD */
+
+    __kmp_init_runtime = TRUE;
+} // __kmp_runtime_initialize
+
+void
+__kmp_runtime_destroy( void )
+{
+    if ( ! __kmp_init_runtime ) {
+        return;
+    }
+
+#if USE_ITT_BUILD
+    __kmp_itt_destroy();
+#endif /* USE_ITT_BUILD */
+
+    /* we can't DeleteCriticalsection( & __kmp_win32_section ); */
+    /* due to the KX_TRACE() commands */
+    KA_TRACE( 40, ("__kmp_runtime_destroy\n" ));
+
+    if( __kmp_gtid_threadprivate_key ) {
+        TlsFree( __kmp_gtid_threadprivate_key );
+        __kmp_gtid_threadprivate_key = 0;
+    }
+
+    __kmp_affinity_uninitialize();
+    DeleteCriticalSection( & __kmp_win32_section );
+
+    ntdll = NULL;
+    NtQuerySystemInformation = NULL;
+
+#if KMP_ARCH_X86_64
+    kernel32 = NULL;
+    __kmp_GetActiveProcessorCount = NULL;
+    __kmp_GetActiveProcessorGroupCount = NULL;
+    __kmp_GetThreadGroupAffinity = NULL;
+    __kmp_SetThreadGroupAffinity = NULL;
+#endif // KMP_ARCH_X86_64
+
+    __kmp_init_runtime = FALSE;
+}
+
+
+void
+__kmp_terminate_thread( int gtid )
+{
+    kmp_info_t  *th = __kmp_threads[ gtid ];
+
+    if( !th ) return;
+
+    KA_TRACE( 10, ("__kmp_terminate_thread: kill (%d)\n", gtid ) );
+
+    if (TerminateThread( th->th.th_info.ds.ds_thread, (DWORD) -1) == FALSE) {
+        /* It's OK, the thread may have exited already */
+    }
+    __kmp_free_handle( th->th.th_info.ds.ds_thread );
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void
+__kmp_clear_system_time( void )
+{
+    BOOL status;
+    LARGE_INTEGER time;
+    status = QueryPerformanceCounter( & time );
+    __kmp_win32_time = (kmp_int64) time.QuadPart;
+}
+
+void
+__kmp_initialize_system_tick( void )
+{
+    BOOL status;
+    LARGE_INTEGER freq;
+
+    status = QueryPerformanceFrequency( & freq );
+    if (! status) {
+        DWORD error = GetLastError();
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( FunctionError, "QueryPerformanceFrequency()" ),
+            KMP_ERR( error ),
+            __kmp_msg_null
+        );
+    }
+    else {
+        __kmp_win32_tick = ((double) 1.0) / (double) freq.QuadPart;
+    }
+}
+
+/* Calculate the elapsed wall clock time for the user */
+
+void
+__kmp_elapsed( double *t )
+{
+    BOOL status;
+    LARGE_INTEGER now;
+    status = QueryPerformanceCounter( & now );
+    *t = ((double) now.QuadPart) * __kmp_win32_tick;
+}
+
+/* Calculate the elapsed wall clock tick for the user */
+
+void
+__kmp_elapsed_tick( double *t )
+{
+    *t = __kmp_win32_tick;
+}
+
+void
+__kmp_read_system_time( double *delta )
+{
+
+    if (delta != NULL) {
+        BOOL status;
+        LARGE_INTEGER now;
+
+        status = QueryPerformanceCounter( & now );
+
+        *delta = ((double) (((kmp_int64) now.QuadPart) - __kmp_win32_time))
+                 * __kmp_win32_tick;
+    }
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void * __stdcall
+__kmp_launch_worker( void *arg )
+{
+    volatile void *stack_data;
+    void *exit_val;
+    void *padding = 0;
+    kmp_info_t *this_thr = (kmp_info_t *) arg;
+    int gtid;
+
+    gtid = this_thr->th.th_info.ds.ds_gtid;
+    __kmp_gtid_set_specific( gtid );
+#ifdef KMP_TDATA_GTID
+    #error "This define causes problems with LoadLibrary() + declspec(thread) " \
+        "on Windows* OS.  See CQ50564, tests kmp_load_library*.c and this MSDN " \
+        "reference: http://support.microsoft.com/kb/118816"
+    //__kmp_gtid = gtid;
+#endif
+
+#if USE_ITT_BUILD
+    __kmp_itt_thread_name( gtid );
+#endif /* USE_ITT_BUILD */
+
+    __kmp_affinity_set_init_mask( gtid, FALSE );
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    //
+    // Set the FP control regs to be a copy of
+    // the parallel initialization thread's.
+    //
+    __kmp_clear_x87_fpu_status_word();
+    __kmp_load_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
+    __kmp_load_mxcsr( &__kmp_init_mxcsr );
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+    if ( __kmp_stkoffset > 0 && gtid > 0 ) {
+        padding = KMP_ALLOCA( gtid * __kmp_stkoffset );
+    }
+
+    KMP_FSYNC_RELEASING( &this_thr -> th.th_info.ds.ds_alive );
+    this_thr -> th.th_info.ds.ds_thread_id = GetCurrentThreadId();
+    TCW_4( this_thr -> th.th_info.ds.ds_alive, TRUE );
+
+    if ( TCR_4(__kmp_gtid_mode) < 2 ) { // check stack only if it is used to get gtid
+        TCW_PTR(this_thr->th.th_info.ds.ds_stackbase, &stack_data);
+        KMP_ASSERT( this_thr -> th.th_info.ds.ds_stackgrow == FALSE );
+        __kmp_check_stack_overlap( this_thr );
+    }
+    KMP_MB();
+    exit_val = __kmp_launch_thread( this_thr );
+    KMP_FSYNC_RELEASING( &this_thr -> th.th_info.ds.ds_alive );
+    TCW_4( this_thr -> th.th_info.ds.ds_alive, FALSE );
+    KMP_MB();
+    return exit_val;
+}
+
+
+/* The monitor thread controls all of the threads in the complex */
+
+void * __stdcall
+__kmp_launch_monitor( void *arg )
+{
+    DWORD        wait_status;
+    kmp_thread_t monitor;
+    int          status;
+    int          interval;
+    kmp_info_t *this_thr = (kmp_info_t *) arg;
+
+    KMP_DEBUG_ASSERT(__kmp_init_monitor);
+    TCW_4( __kmp_init_monitor, 2 );    // AC: Signal the library that monitor has started
+                                       // TODO: hide "2" in enum (like {true,false,started})
+    this_thr -> th.th_info.ds.ds_thread_id = GetCurrentThreadId();
+    TCW_4( this_thr -> th.th_info.ds.ds_alive, TRUE );
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+    KA_TRACE( 10, ("__kmp_launch_monitor: launched\n" ) );
+
+    monitor = GetCurrentThread();
+
+    /* set thread priority */
+    status = SetThreadPriority( monitor, THREAD_PRIORITY_HIGHEST );
+    if (! status) {
+        DWORD error = GetLastError();
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( CantSetThreadPriority ),
+            KMP_ERR( error ),
+            __kmp_msg_null
+        );
+    }
+
+    /* register us as monitor */
+    __kmp_gtid_set_specific( KMP_GTID_MONITOR );
+#ifdef KMP_TDATA_GTID
+    #error "This define causes problems with LoadLibrary() + declspec(thread) " \
+        "on Windows* OS.  See CQ50564, tests kmp_load_library*.c and this MSDN " \
+        "reference: http://support.microsoft.com/kb/118816"
+    //__kmp_gtid = KMP_GTID_MONITOR;
+#endif
+
+#if USE_ITT_BUILD
+    __kmp_itt_thread_ignore();    // Instruct Intel(R) Threading Tools to ignore monitor thread.
+#endif /* USE_ITT_BUILD */
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    interval = ( 1000 / __kmp_monitor_wakeups ); /* in milliseconds */
+
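+    // Each timeout of the wait below bumps the global time counter, so
+    // t_value ticks roughly __kmp_monitor_wakeups times per second until
+    // the event is signaled (see __kmp_reap_monitor) or g_done is set.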
+    while (! TCR_4(__kmp_global.g.g_done)) {
+        /*  This thread monitors the state of the system */
+
+        KA_TRACE( 15, ( "__kmp_launch_monitor: update\n" ) );
+
+        wait_status = WaitForSingleObject( __kmp_monitor_ev, interval );
+
+        if (wait_status == WAIT_TIMEOUT) {
+            TCW_4( __kmp_global.g.g_time.dt.t_value,
+              TCR_4( __kmp_global.g.g_time.dt.t_value ) + 1 );
+        }
+
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+    }
+
+    KA_TRACE( 10, ("__kmp_launch_monitor: finished\n" ) );
+
+    status = SetThreadPriority( monitor, THREAD_PRIORITY_NORMAL );
+    if (! status) {
+        DWORD error = GetLastError();
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( CantSetThreadPriority ),
+            KMP_ERR( error ),
+            __kmp_msg_null
+        );
+    }
+
+    if (__kmp_global.g.g_abort != 0) {
+        /* now we need to terminate the worker threads   */
+        /* the value of t_abort is the signal we caught */
+
+        int gtid;
+
+        KA_TRACE( 10, ("__kmp_launch_monitor: terminate sig=%d\n", (__kmp_global.g.g_abort) ) );
+
+        /* terminate the OpenMP worker threads */
+        /* TODO this is not valid for sibling threads!!
+         * the uber master might not be 0 anymore.. */
+        for (gtid = 1; gtid < __kmp_threads_capacity; ++gtid)
+            __kmp_terminate_thread( gtid );
+
+        __kmp_cleanup();
+
+        Sleep( 0 );
+
+        KA_TRACE( 10, ("__kmp_launch_monitor: raise sig=%d\n", (__kmp_global.g.g_abort) ) );
+
+        if (__kmp_global.g.g_abort > 0) {
+            raise( __kmp_global.g.g_abort );
+        }
+    }
+
+    TCW_4( this_thr -> th.th_info.ds.ds_alive, FALSE );
+
+    KMP_MB();
+    return arg;
+}
+
+void
+__kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size )
+{
+    kmp_thread_t   handle;
+    DWORD          idThread;
+
+    KA_TRACE( 10, ("__kmp_create_worker: try to create thread (%d)\n", gtid ) );
+
+    th->th.th_info.ds.ds_gtid = gtid;
+
+    if ( KMP_UBER_GTID(gtid) ) {
+        int     stack_data;
+
+        /* TODO: GetCurrentThread() returns a pseudo-handle that is unsuitable for other threads to use.
+           Is it appropriate to just use GetCurrentThread?  When should we close this handle?  When
+           unregistering the root?
+        */
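+        /* DuplicateHandle() converts the pseudo-handle into a real handle
+           that other threads can wait on; it is presumably released via
+           __kmp_free_handle() when the root thread is reaped. */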
+        {
+            BOOL rc;
+            rc = DuplicateHandle(
+                                 GetCurrentProcess(),
+                                 GetCurrentThread(),
+                                 GetCurrentProcess(),
+                                 &th->th.th_info.ds.ds_thread,
+                                 0,
+                                 FALSE,
+                                 DUPLICATE_SAME_ACCESS
+                                 );
+            KMP_ASSERT( rc );
+            KA_TRACE( 10, (" __kmp_create_worker: ROOT Handle duplicated, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
+                           (LPVOID)th,
+                           th->th.th_info.ds.ds_thread ) );
+            th->th.th_info.ds.ds_thread_id = GetCurrentThreadId();
+        }
+        if ( TCR_4(__kmp_gtid_mode) < 2 ) { // check stack only if it is used to get gtid
+            /* we will dynamically update the stack range if gtid_mode == 1 */
+            TCW_PTR(th->th.th_info.ds.ds_stackbase, &stack_data);
+            TCW_PTR(th->th.th_info.ds.ds_stacksize, 0);
+            TCW_4(th->th.th_info.ds.ds_stackgrow, TRUE);
+            __kmp_check_stack_overlap( th );
+        }
+    }
+    else {
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+        /* Set stack size for this thread now. */
+        KA_TRACE( 10, ( "__kmp_create_worker: stack_size = %" KMP_SIZE_T_SPEC
+                        " bytes\n", stack_size ) );
+
+        stack_size += gtid * __kmp_stkoffset;
+
+        TCW_PTR(th->th.th_info.ds.ds_stacksize, stack_size);
+        TCW_4(th->th.th_info.ds.ds_stackgrow, FALSE);
+
+        KA_TRACE( 10, ( "__kmp_create_worker: (before) stack_size = %"
+                        KMP_SIZE_T_SPEC
+                        " bytes, &__kmp_launch_worker = %p, th = %p, "
+                        "&idThread = %p\n",
+                        (SIZE_T) stack_size,
+                        (LPTHREAD_START_ROUTINE) & __kmp_launch_worker,
+                        (LPVOID) th, &idThread ) );
+
+        handle = CreateThread( NULL, (SIZE_T) stack_size,
+                               (LPTHREAD_START_ROUTINE) __kmp_launch_worker,
+                               (LPVOID) th, STACK_SIZE_PARAM_IS_A_RESERVATION, &idThread );
+
+        KA_TRACE( 10, ( "__kmp_create_worker: (after) stack_size = %"
+                        KMP_SIZE_T_SPEC
+                        " bytes, &__kmp_launch_worker = %p, th = %p, "
+                        "idThread = %u, handle = %" KMP_UINTPTR_SPEC "\n",
+                        (SIZE_T) stack_size,
+                        (LPTHREAD_START_ROUTINE) & __kmp_launch_worker,
+                        (LPVOID) th, idThread, handle ) );
+
+        if ( handle == 0 ) {
+            DWORD error = GetLastError();
+            __kmp_msg(
+                kmp_ms_fatal,
+                KMP_MSG( CantCreateThread ),
+                KMP_ERR( error ),
+                __kmp_msg_null
+            );
+        } else {
+            th->th.th_info.ds.ds_thread = handle;
+        }
+        KMP_MB();       /* Flush all pending memory write invalidates.  */
+    }
+
+    KA_TRACE( 10, ("__kmp_create_worker: done creating thread (%d)\n", gtid ) );
+}
+
+int
+__kmp_still_running(kmp_info_t *th) {
+    return (WAIT_TIMEOUT == WaitForSingleObject( th->th.th_info.ds.ds_thread, 0));
+}
+
+void
+__kmp_create_monitor( kmp_info_t *th )
+{
+    kmp_thread_t        handle;
+    DWORD               idThread;
+    int                 ideal, new_ideal;
+    int     caller_gtid = __kmp_get_gtid();
+
+    KA_TRACE( 10, ("__kmp_create_monitor: try to create monitor\n" ) );
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    __kmp_monitor_ev = CreateEvent( NULL, TRUE, FALSE, NULL );
+    if ( __kmp_monitor_ev == NULL ) {
+        DWORD error = GetLastError();
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( CantCreateEvent ),
+            KMP_ERR( error ),
+            __kmp_msg_null
+        );
+    }; // if
+#if USE_ITT_BUILD
+    __kmp_itt_system_object_created( __kmp_monitor_ev, "Event" );
+#endif /* USE_ITT_BUILD */
+
+    th->th.th_info.ds.ds_tid  = KMP_GTID_MONITOR;
+    th->th.th_info.ds.ds_gtid = KMP_GTID_MONITOR;
+
+    // FIXME - on Windows* OS, if __kmp_monitor_stksize = 0, figure out how
+    // to automatically expand stacksize based on CreateThread error code.
+    if ( __kmp_monitor_stksize == 0 ) {
+        __kmp_monitor_stksize = KMP_DEFAULT_MONITOR_STKSIZE;
+    }
+    if ( __kmp_monitor_stksize < __kmp_sys_min_stksize ) {
+        __kmp_monitor_stksize = __kmp_sys_min_stksize;
+    }
+
+    KA_TRACE( 10, ("__kmp_create_monitor: requested stacksize = %d bytes\n",
+                   (int) __kmp_monitor_stksize ) );
+
+    TCW_4( __kmp_global.g.g_time.dt.t_value, 0 );
+
+    handle = CreateThread( NULL, (SIZE_T) __kmp_monitor_stksize,
+                           (LPTHREAD_START_ROUTINE) __kmp_launch_monitor,
+                           (LPVOID) th, STACK_SIZE_PARAM_IS_A_RESERVATION, &idThread );
+    if (handle == 0) {
+        DWORD error = GetLastError();
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( CantCreateThread ),
+            KMP_ERR( error ),
+            __kmp_msg_null
+        );
+    }
+    else
+        th->th.th_info.ds.ds_thread = handle;
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KA_TRACE( 10, ("__kmp_create_monitor: monitor created %p\n",
+                   (void *) th->th.th_info.ds.ds_thread ) );
+}
+
+/*
+  Check to see if thread is still alive.
+
+  NOTE:  The ExitProcess(code) system call causes all threads to terminate
+         with an exit_val = code.  Because of this we cannot rely on
+         exit_val having any particular value.  So this routine may
+         return STILL_ACTIVE in exit_val even after the thread is dead.
+*/
+
+int
+__kmp_is_thread_alive( kmp_info_t * th, DWORD *exit_val )
+{
+    DWORD rc;
+    rc = GetExitCodeThread( th->th.th_info.ds.ds_thread, exit_val );
+    if ( rc == 0 ) {
+        DWORD error = GetLastError();
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( FunctionError, "GetExitCodeThread()" ),
+            KMP_ERR( error ),
+            __kmp_msg_null
+        );
+    }; // if
+    return ( *exit_val == STILL_ACTIVE );
+}
+
+
+void
+__kmp_exit_thread(
+    int exit_status
+) {
+    ExitThread( exit_status );
+} // __kmp_exit_thread
+
+/*
+    This is a common part for both __kmp_reap_worker() and __kmp_reap_monitor().
+*/
+static void
+__kmp_reap_common( kmp_info_t * th )
+{
+    DWORD exit_val;
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    KA_TRACE( 10, ( "__kmp_reap_common: try to reap (%d)\n", th->th.th_info.ds.ds_gtid ) );
+
+    /*
+        2006-10-19:
+
+        There are two opposite situations:
+
+            1. Windows* OS may keep the thread alive after it resets the ds_alive flag and exits
+               its thread function. (For example, see C70770/Q394281 "unloading of dll based on
+               OMP is very slow".)
+            2. Windows* OS may kill the thread before it resets the ds_alive flag.
+
+        The right solution seems to be to wait for *either* thread termination *or* the resetting
+        of the ds_alive flag.
+
+    */
+
+    {
+        // TODO: This code is very similar to KMP_WAIT_YIELD. Need to generalize KMP_WAIT_YIELD to
+        // cover this usage also.
+        void * obj = NULL;
+        register kmp_uint32 spins;
+#if USE_ITT_BUILD
+        KMP_FSYNC_SPIN_INIT( obj, (void*) & th->th.th_info.ds.ds_alive );
+#endif /* USE_ITT_BUILD */
+        KMP_INIT_YIELD( spins );
+        do {
+#if USE_ITT_BUILD
+            KMP_FSYNC_SPIN_PREPARE( obj );
+#endif /* USE_ITT_BUILD */
+            __kmp_is_thread_alive( th, &exit_val );
+            KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
+            KMP_YIELD_SPIN( spins );
+        } while ( exit_val == STILL_ACTIVE && TCR_4( th->th.th_info.ds.ds_alive ) );
+#if USE_ITT_BUILD
+        if ( exit_val == STILL_ACTIVE ) {
+            KMP_FSYNC_CANCEL( obj );
+        } else {
+            KMP_FSYNC_SPIN_ACQUIRED( obj );
+        }; // if
+#endif /* USE_ITT_BUILD */
+    }
+
+    __kmp_free_handle( th->th.th_info.ds.ds_thread );
+
+    /*
+     * NOTE:  The ExitProcess(code) system call causes all threads to terminate
+     *        with an exit_val = code.  Because of this we cannot rely on
+     *        exit_val having any particular value.
+     */
+    if ( exit_val == STILL_ACTIVE ) {
+        KA_TRACE( 1, ( "__kmp_reap_common: thread still active.\n" ) );
+    } else if ( (void *) exit_val != (void *) th) {
+        KA_TRACE( 1, ( "__kmp_reap_common: ExitProcess / TerminateThread used?\n" ) );
+    }; // if
+
+    KA_TRACE( 10,
+        (
+            "__kmp_reap_common: done reaping (%d), handle = %" KMP_UINTPTR_SPEC "\n",
+            th->th.th_info.ds.ds_gtid,
+            th->th.th_info.ds.ds_thread
+        )
+    );
+
+    th->th.th_info.ds.ds_thread    = 0;
+    th->th.th_info.ds.ds_tid       = KMP_GTID_DNE;
+    th->th.th_info.ds.ds_gtid      = KMP_GTID_DNE;
+    th->th.th_info.ds.ds_thread_id = 0;
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+}
+
+void
+__kmp_reap_monitor( kmp_info_t *th )
+{
+    int status;
+
+    KA_TRACE( 10, ("__kmp_reap_monitor: try to reap %p\n",
+                   (void *) th->th.th_info.ds.ds_thread ) );
+
+    // If monitor has been created, its tid and gtid should be KMP_GTID_MONITOR.
+    // If both tid and gtid are 0, it means the monitor did not ever start.
+    // If both tid and gtid are KMP_GTID_DNE, the monitor has been shut down.
+    KMP_DEBUG_ASSERT( th->th.th_info.ds.ds_tid == th->th.th_info.ds.ds_gtid );
+    if ( th->th.th_info.ds.ds_gtid != KMP_GTID_MONITOR ) {
+        return;
+    }; // if
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+
+    status = SetEvent( __kmp_monitor_ev );
+    if ( status == FALSE ) {
+        DWORD error = GetLastError();
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( CantSetEvent ),
+            KMP_ERR( error ),
+            __kmp_msg_null
+        );
+    }
+    KA_TRACE( 10, ( "__kmp_reap_monitor: reaping thread (%d)\n", th->th.th_info.ds.ds_gtid ) );
+    __kmp_reap_common( th );
+
+    __kmp_free_handle( __kmp_monitor_ev );
+
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+}
+
+void
+__kmp_reap_worker( kmp_info_t * th )
+{
+    KA_TRACE( 10, ( "__kmp_reap_worker: reaping thread (%d)\n", th->th.th_info.ds.ds_gtid ) );
+    __kmp_reap_common( th );
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#if KMP_HANDLE_SIGNALS
+
+
+static void
+__kmp_team_handler( int signo )
+{
+    if ( __kmp_global.g.g_abort == 0 ) {
+        // Stage 1 signal handler, let's shut down all of the threads.
+        if ( __kmp_debug_buf ) {
+            __kmp_dump_debug_buffer();
+        }; // if
+        KMP_MB();       // Flush all pending memory write invalidates.
+        TCW_4( __kmp_global.g.g_abort, signo );
+        KMP_MB();       // Flush all pending memory write invalidates.
+        TCW_4( __kmp_global.g.g_done, TRUE );
+        KMP_MB();       // Flush all pending memory write invalidates.
+    }
+} // __kmp_team_handler
+
+
+
+static
+sig_func_t __kmp_signal( int signum, sig_func_t handler ) {
+    sig_func_t old = signal( signum, handler );
+    if ( old == SIG_ERR ) {
+        int error = errno;
+        __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "signal" ), KMP_ERR( error ), __kmp_msg_null );
+    }; // if
+    return old;
+}
+
+static void
+__kmp_install_one_handler(
+    int           sig,
+    sig_func_t    handler,
+    int           parallel_init
+) {
+    sig_func_t old;
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+    KB_TRACE( 60, ("__kmp_install_one_handler: called: sig=%d\n", sig ) );
+    if ( parallel_init ) {
+        old = __kmp_signal( sig, handler );
+        // SIG_DFL on Windows* OS is NULL or 0.
+        if ( old == __kmp_sighldrs[ sig ] ) {
+            __kmp_siginstalled[ sig ] = 1;
+        } else {
+            // Restore/keep user's handler if one previously installed.
+            old = __kmp_signal( sig, old );
+        }; // if
+    } else {
+        // Save initial/system signal handlers to see if user handlers installed.
+        // 2009-09-23: It is a dead code. On Windows* OS __kmp_install_signals called once with
+        // parallel_init == TRUE.
+        old = __kmp_signal( sig, SIG_DFL );
+        __kmp_sighldrs[ sig ] = old;
+        __kmp_signal( sig, old );
+    }; // if
+    KMP_MB();       /* Flush all pending memory write invalidates.  */
+} // __kmp_install_one_handler
+
+static void
+__kmp_remove_one_handler( int sig ) {
+    if ( __kmp_siginstalled[ sig ] ) {
+        sig_func_t old;
+        KMP_MB();       // Flush all pending memory write invalidates.
+        KB_TRACE( 60, ( "__kmp_remove_one_handler: called: sig=%d\n", sig ) );
+        old = __kmp_signal( sig, __kmp_sighldrs[ sig ] );
+        if ( old != __kmp_team_handler ) {
+            KB_TRACE( 10, ( "__kmp_remove_one_handler: oops, not our handler, restoring: sig=%d\n", sig ) );
+            old = __kmp_signal( sig, old );
+        }; // if
+        __kmp_sighldrs[ sig ] = NULL;
+        __kmp_siginstalled[ sig ] = 0;
+        KMP_MB();       // Flush all pending memory write invalidates.
+    }; // if
+} // __kmp_remove_one_handler
+
+
+void
+__kmp_install_signals( int parallel_init )
+{
+    KB_TRACE( 10, ( "__kmp_install_signals: called\n" ) );
+    if ( ! __kmp_handle_signals ) {
+        KB_TRACE( 10, ( "__kmp_install_signals: KMP_HANDLE_SIGNALS is false - handlers not installed\n" ) );
+        return;
+    }; // if
+    __kmp_install_one_handler( SIGINT,  __kmp_team_handler, parallel_init );
+    __kmp_install_one_handler( SIGILL,  __kmp_team_handler, parallel_init );
+    __kmp_install_one_handler( SIGABRT, __kmp_team_handler, parallel_init );
+    __kmp_install_one_handler( SIGFPE,  __kmp_team_handler, parallel_init );
+    __kmp_install_one_handler( SIGSEGV, __kmp_team_handler, parallel_init );
+    __kmp_install_one_handler( SIGTERM, __kmp_team_handler, parallel_init );
+} // __kmp_install_signals
+
+
+void
+__kmp_remove_signals( void )
+{
+    int sig;
+    KB_TRACE( 10, ("__kmp_remove_signals: called\n" ) );
+    for ( sig = 1; sig < NSIG; ++ sig ) {
+        __kmp_remove_one_handler( sig );
+    }; // for sig
+} // __kmp_remove_signals
+
+
+#endif // KMP_HANDLE_SIGNALS
+
+/* Put the thread to sleep for a time period */
+void
+__kmp_thread_sleep( int millis )
+{
+    DWORD status;
+
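+    /* With bAlertable == FALSE, SleepEx() returns 0 once the interval elapses, so any
+       non-zero status is treated as a failure. */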
+    status = SleepEx( (DWORD) millis, FALSE );
+    if ( status ) {
+        DWORD error = GetLastError();
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( FunctionError, "SleepEx()" ),
+            KMP_ERR( error ),
+            __kmp_msg_null
+        );
+    }
+}
+
+/* Determine whether the given address is mapped into the current address space. */
+int
+__kmp_is_address_mapped( void * addr )
+{
+    DWORD status;
+    MEMORY_BASIC_INFORMATION lpBuffer;
+    SIZE_T dwLength;
+
+    dwLength = sizeof(MEMORY_BASIC_INFORMATION);
+
+    status = VirtualQuery( addr, &lpBuffer, dwLength );
+
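+    /* Note: the return code of VirtualQuery() is not checked. The address is considered
+       mapped only if the region is committed (neither MEM_FREE nor MEM_RESERVE) and its
+       protection is neither PAGE_NOACCESS nor PAGE_EXECUTE. */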
+    return !((( lpBuffer.State == MEM_RESERVE) || ( lpBuffer.State == MEM_FREE )) ||
+       (( lpBuffer.Protect == PAGE_NOACCESS ) || ( lpBuffer.Protect == PAGE_EXECUTE )));
+}
+
+kmp_uint64
+__kmp_hardware_timestamp(void)
+{
+    kmp_uint64 r = 0;
+
+    QueryPerformanceCounter((LARGE_INTEGER*) &r);
+    return r;
+}
+
+/* Free handle and check the error code */
+void
+__kmp_free_handle( kmp_thread_t tHandle )
+{
+/* This function is also called with a parameter of type HANDLE, so kmp_thread_t is
+   assumed to be defined as HANDLE. */
+    BOOL rc;
+    rc = CloseHandle( tHandle );
+    if ( !rc ) {
+        DWORD error = GetLastError();
+        __kmp_msg(
+            kmp_ms_fatal,
+            KMP_MSG( CantCloseHandle ),
+            KMP_ERR( error ),
+            __kmp_msg_null
+        );
+    }
+}
+
+int
+__kmp_get_load_balance( int max ) {
+
+    static ULONG glb_buff_size = 100 * 1024;
+
+    static int     glb_running_threads  = 0;  /* Saved count of the running threads for the thread balance algorithm */
+    static double  glb_call_time        = 0;  /* Thread balance algorithm call time */
+
+    int running_threads = 0;              // Number of running threads in the system.
+    NTSTATUS  status        = 0;
+    ULONG     buff_size     = 0;
+    ULONG     info_size     = 0;
+    void *    buffer        = NULL;
+    PSYSTEM_PROCESS_INFORMATION spi = NULL;
+    int first_time          = 1;
+
+    double call_time = 0.0;
+
+    __kmp_elapsed( & call_time );
+
+    if ( glb_call_time &&
+            ( call_time - glb_call_time < __kmp_load_balance_interval ) ) {
+        running_threads = glb_running_threads;
+        goto finish;
+    }
+    glb_call_time = call_time;
+
+    // Do not spend time running the algorithm if we have a permanent error.
+    if ( NtQuerySystemInformation == NULL ) {
+        running_threads = -1;
+        goto finish;
+    }; // if
+
+    if ( max <= 0 ) {
+        max = INT_MAX;
+    }; // if
+
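+    // The size of the process/thread snapshot is not known in advance, so start with the
+    // cached buffer size and keep doubling it until NtQuerySystemInformation() no longer
+    // reports STATUS_INFO_LENGTH_MISMATCH.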
+    do {
+
+        if ( first_time ) {
+            buff_size = glb_buff_size;
+        } else {
+            buff_size = 2 * buff_size;
+        }
+
+        buffer = KMP_INTERNAL_REALLOC( buffer, buff_size );
+        if ( buffer == NULL ) {
+            running_threads = -1;
+            goto finish;
+        }; // if
+        status = NtQuerySystemInformation( SystemProcessInformation, buffer, buff_size, & info_size );
+        first_time = 0;
+
+    } while ( status == STATUS_INFO_LENGTH_MISMATCH );
+    glb_buff_size = buff_size;
+
+    #define CHECK( cond )                       \
+        {                                       \
+            KMP_DEBUG_ASSERT( cond );           \
+            if ( ! ( cond ) ) {                 \
+                running_threads = -1;           \
+                goto finish;                    \
+            }                                   \
+        }
+
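+    // CHECK() validates the layout of the returned snapshot: it asserts in debug builds and,
+    // in all builds, abandons the scan (running_threads = -1) if the data is malformed.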
+    CHECK( buff_size >= info_size );
+    spi = PSYSTEM_PROCESS_INFORMATION( buffer );
+    for ( ; ; ) {
+        ptrdiff_t offset = uintptr_t( spi ) - uintptr_t( buffer );
+        CHECK( 0 <= offset && offset + sizeof( SYSTEM_PROCESS_INFORMATION ) < info_size );
+        HANDLE pid = spi->ProcessId;
+        ULONG num = spi->NumberOfThreads;
+        CHECK( num >= 1 );
+        size_t spi_size = sizeof( SYSTEM_PROCESS_INFORMATION ) + sizeof( SYSTEM_THREAD ) * ( num - 1 );
+        CHECK( offset + spi_size < info_size );          // Make sure process info record fits the buffer.
+        if ( spi->NextEntryOffset != 0 ) {
+            CHECK( spi_size <= spi->NextEntryOffset );   // And do not overlap with the next record.
+        }; // if
+        // pid == 0 corresponds to the System Idle Process. It always has running threads
+        // on all cores. So, we don't consider the running threads of this process.
+        if ( pid != 0 ) {
+            for ( int i = 0; i < num; ++ i ) {
+                THREAD_STATE state = spi->Threads[ i ].State;
+                // Count threads that have Ready or Running state.
+                // !!! TODO: Why does the comment not match the code (only StateRunning is counted)???
+                if ( state == StateRunning ) {
+                    ++ running_threads;
+                    // Stop counting running threads if the number is already greater than
+                    // the number of available cores
+                    if ( running_threads >= max ) {
+                        goto finish;
+                    }
+                } // if
+            }; // for i
+        } // if
+        if ( spi->NextEntryOffset == 0 ) {
+            break;
+        }; // if
+        spi = PSYSTEM_PROCESS_INFORMATION( uintptr_t( spi ) + spi->NextEntryOffset );
+    }; // forever
+
+    #undef CHECK
+
+    finish: // Clean up and exit.
+
+        if ( buffer != NULL ) {
+            KMP_INTERNAL_FREE( buffer );
+        }; // if
+
+        glb_running_threads = running_threads;
+
+        return running_threads;
+
+} //__kmp_get_load_balance()
+
diff --git a/final/runtime/tools/build.pl b/final/runtime/tools/build.pl
new file mode 100755
index 0000000..53027a5
--- /dev/null
+++ b/final/runtime/tools/build.pl
@@ -0,0 +1,756 @@
+#!/usr/bin/perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Pragmas.
+use strict;
+use warnings;
+
+# Standard modules.
+use Data::Dumper;    # Not actually used, but useful for debugging dumps.
+
+# Enable `libomp/tools/lib/' module directory.
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+# LIBOMP modules.
+use Build;
+use LibOMP;
+use Platform ":vars";
+use Uname;
+use tools;
+
+our $VERSION = "0.017";
+
+# --------------------------------------------------------------------------------------------------
+# Important variables.
+# --------------------------------------------------------------------------------------------------
+
+my $root_dir  = $ENV{ LIBOMP_WORK };
+
+my %makefiles = (
+    rtl       => cat_file( $root_dir, "src",                       "makefile.mk" ),
+    timelimit => cat_file( $root_dir, "tools", "src", "timelimit", "makefile.mk" ),
+);
+
+# --------------------------------------------------------------------------------------------------
+# Parse command line.
+# --------------------------------------------------------------------------------------------------
+
+# Possible options.
+#     * targets: comma-separated list of targets the option has meaning for. For example, the
+#         "version" option (4 or 5) has a meaning only for the "rtl" target, while the "mode"
+#         option has meaning for all targets.
+#     * base: If base is 1 this is a base option. All the possible values of base options are
+#         iterated if the "--all" option is specified. If base is 0, this is an extra option.
+#     * parms: A hash of possible option values. "*" denotes the default option value. For
+#         example, if the "version" option is not specified, "--version=5" is used implicitly.
+#     * suffix: Only for extra options. A subroutine returning the suffix for build and output
+#         directories. ** When you do not want an option to be part of the suffix, set its base=2.
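+# For example, "build.pl --target=rtl --mode=debug" builds a single debug configuration of the
+# runtime, while "build.pl --all" iterates over all values of every base option.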
+my $opts = {
+    "target"          => { targets => "",                  base => 1, parms => { map( ( $_ => "" ), keys( %makefiles ) ), rtl => "*" }, },
+    "version"         => { targets => "rtl",               base => 1, parms => { 5       => "*", 4         => ""              }, },
+    "lib-type"        => { targets => "rtl",               base => 1, parms => { normal  => "*", stubs => ""                  }, },
+    "link-type"       => { targets => "rtl",               base => 1, parms => { dynamic => "*", static    => ""              }, },
+    "mode"            => { targets => "rtl,dsl,timelimit", base => 0, parms => { release => "*", diag      => "", debug => "" }, suffix => sub { substr( $_[ 0 ], 0, 3 );       } },
+    "omp-version"     => { targets => "rtl",               base => 0, parms => { 40      => "", 30        => "", 41 => "*"   }, suffix => sub { $_[ 0 ]; } },
+    "coverage"        => { targets => "rtl",               base => 0, parms => { off     => "*", on        => ""              }, suffix => sub { $_[ 0 ] eq "on" ? "c1" : "c0"; } },
+    "stats"           => { targets => "rtl",               base => 0, parms => { off     => "*", on        => ""              }, suffix => sub { $_[ 0 ] eq "on" ? "s1" : "s0"; } },
+    "ompt-support"    => { targets => "rtl",               base => 0, parms => { off     => "*", on        => ""              }, suffix => sub { $_[ 0 ] eq "on" ? "ompt" : "" } },
+    "ompt-blame"      => { targets => "rtl",               base => 0, parms => { off     => "",  on        => "*"             }, suffix => sub { $_[ 0 ] eq "on" ? "" : "no-ompt-blame" } },
+    "ompt-trace"      => { targets => "rtl",               base => 0, parms => { off     => "",  on        => "*"             }, suffix => sub { $_[ 0 ] eq "on" ? "" : "no-ompt-trace" } },
+};
+my $synonyms = {
+    "debug" => [ qw{ dbg debg } ],
+};
+# This array specifies the order in which options are processed, so it cannot be initialized with keys( %$opts ).
+my @all_opts   = qw{ target version lib-type link-type mode omp-version coverage stats ompt-support ompt-blame ompt-trace };
+# This is the list of base options.
+my @base_opts  = grep( $opts->{ $_ }->{ base } == 1, @all_opts );
+# This is the list of extra options.
+my @extra_opts = grep( $opts->{ $_ }->{ base } == 0, @all_opts );
+
+sub suffix($$$) {
+    my ( $opt, $value, $skip_if_default ) = @_;
+    my $suffix = "";
+    if ( not $skip_if_default or $value ne $opts->{ $opt }->{ dflt } ) {
+        $suffix = $opts->{ $opt }->{ suffix }->( $value );
+    }; # if
+    return $suffix;
+}; # sub suffix
+
+my $scuts = {};     # Shortcuts. Will help to locate proper item in $opts.
+foreach my $opt ( keys( %$opts ) ) {
+    foreach my $parm ( keys( %{ $opts->{ $opt }->{ parms } } ) ) {
+        if ( $parm !~ m{\A(?:[012]|on|off)\z} ) {
+            $scuts->{ $parm } = $opts->{ $opt };
+        }; # if
+        if ( $opts->{ $opt }->{ parms }->{ $parm } eq "*" ) {
+            $opts->{ $opt }->{ dflt } = $parm;
+        }; # if
+    }; # foreach $parm
+}; # foreach $opt
+
+sub parse_option(@) {
+    # This function is called to process every option. $name is the option name, $value is the
+    # option value. For boolean options, $value is either 1 or 0.
+    my ( $name, $value ) = @_;
+    if ( $name eq "all" or $name eq "ALL" ) {
+        foreach my $opt ( keys( %$opts ) ) {
+            if ( $opts->{ $opt }->{ base } or $name eq "ALL" ) {
+                foreach my $parm ( keys( %{ $opts->{ $opt }->{ parms } } ) ) {
+                    $opts->{ $opt }->{ parms }->{ $parm } = 1;
+                }; # foreach $parm
+            }; # if
+        }; # foreach $opt
+        return;
+    }; # if
+    if ( exists( $opts->{ $name } ) ) {
+        # Assume it is an option with an explicit value, like "target=normal".
+        if ( $value eq "all" ) {
+            foreach my $parm ( keys( %{ $opts->{ $name }->{ parms } } ) ) {
+                $opts->{ $name }->{ parms }->{ $parm } = 1;
+            }; # foreach
+            return;
+        } elsif ( exists( $opts->{ $name }->{ parms }->{ $value } ) ) {
+            $opts->{ $name }->{ parms }->{ $value } = 1;
+            return;
+        } elsif ( $value eq "" and exists( $opts->{ $name }->{ parms }->{ on } ) ) {
+            $opts->{ $name }->{ parms }->{ on } = 1;
+            return;
+        } else {
+            cmdline_error( "Illegal value of \"$name\" option: \"$value\"" );
+        }; # if
+    }; # if
+    # Ok, it is not an option with an explicit value. Try to treat it as a boolean option.
+    if ( exists( $scuts->{ $name } ) ) {
+        ( $value eq "1" or $value eq "0" ) or die "Internal error; stopped";
+        $scuts->{ $name }->{ parms }->{ $name } = $value;
+        return;
+    }; # if
+    # No, it is not a valid option at all.
+    cmdline_error( "Illegal option: \"$name\"" );
+}; # sub parse_option
+
+my $clean        = 0;
+my $clean_common = 0;
+my $clobber      = 0;
+my $test_deps    = 1;
+my $test_touch   = 1;
+my @goals;
+
+sub synonyms($) {
+    my ( $opt ) = @_;
+    return exists( $synonyms->{ $opt } ) ? "|" . join( "|", @{ $synonyms->{ $opt } } ) : "";
+}; # sub synonyms
+
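+# Build the Getopt::Long specs: every option takes a string value ("name=s"), every shortcut is
+# a boolean switch ("name!"), and all of them are routed to parse_option().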
+my @specs = (
+    map( ( "$_" . synonyms( $_ ) . "=s" => \&parse_option ), keys( %$opts  ) ),
+    map( ( "$_" . synonyms( $_ ) . "!"  => \&parse_option ), keys( %$scuts ) ),
+);
+my $answer;
+get_options(
+    @specs,
+    Platform::target_options(),
+    "all"           => \&parse_option,
+    "ALL"           => \&parse_option,
+    "answer=s"      => \$answer,
+    "test-deps!"    => \$test_deps,
+    "test-touch!"   => \$test_touch,
+    "version|ver:s" =>
+        sub {
+            # This is a tricky option: it specifies the library version to build, and it is also
+            # a standard option to request the tool version.
+            if ( $_[ 1 ] eq "" ) {
+                # No arguments => version request.
+                print( "$tool version $VERSION\n" );
+                exit( 0 );
+            } else {
+                # Arguments => version to build.
+                parse_option( @_ )
+            };
+        },
+);
+@goals = @ARGV;
+if ( grep( $_ eq "clobber", @goals ) ) {
+    $clobber = 1;
+}; # if
+if ( grep( $_ eq "clean", @goals ) ) {
+    $clean = 1;
+}; # if
+
+# Now $opts is filled with 0 and 1 (explicitly set by the user) and "" and "*" (original
+# values). Each option should have at least one 1 (otherwise there is nothing to build).
+foreach my $opt ( keys( %$opts ) ) {
+    if ( not grep( $_ eq "1", values( %{ $opts->{ $opt }->{ parms } } ) ) ) {
+        # No explicit "1" found. Enable default choice by replacing "*" with "1".
+        foreach my $parm ( keys( %{ $opts->{ $opt }->{ parms } } ) ) {
+            if ( $opts->{ $opt }->{ parms }->{ $parm } eq "*" ) {
+                $opts->{ $opt }->{ parms }->{ $parm } = 1;
+            }; # if
+        }; # foreach $parm
+    }; # if
+}; # foreach $opt
+
+# Clear $opts. Leave only "1".
+foreach my $opt ( keys( %$opts ) ) {
+    foreach my $parm ( keys( %{ $opts->{ $opt }->{ parms } } ) ) {
+        if ( $opts->{ $opt }->{ parms }->{ $parm } ne "1" ) {
+            delete( $opts->{ $opt }->{ parms }->{ $parm } );
+        }; # if
+    }; # foreach $parm
+}; # foreach $opt
+
+# --------------------------------------------------------------------------------------------------
+# Fill job queue.
+# --------------------------------------------------------------------------------------------------
+
+sub enqueue_jobs($$@);
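+# Recursively walk the remaining options, expanding the cartesian product of all selected
+# values; once @rest is empty, push one build job for the accumulated option set.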
+sub enqueue_jobs($$@) {
+    my ( $jobs, $set, @rest ) = @_;
+    if ( @rest ) {
+        my $opt = shift( @rest );
+        if (
+            exists( $set->{ target } )
+            and
+            $opts->{ $opt }->{ targets } !~ m{(?:\A|,)$set->{ target }(?:,|\z)}
+        ) {
+            # This option has no meaning for the target;
+            # do not iterate, just use the default value.
+            enqueue_jobs( $jobs, { $opt => $opts->{ $opt }->{ dflt }, %$set }, @rest );
+        } else {
+            foreach my $parm ( sort( keys( %{ $opts->{ $opt }->{ parms } } ) ) ) {
+                enqueue_jobs( $jobs, { $opt => $parm, %$set }, @rest );
+            }; # foreach $parm
+        }; # if
+    } else {
+        my $makefile  = $makefiles{ $set->{ target } };
+        my @base      = map( substr( $set->{ $_ }, 0, 3 ), @base_opts );
+        my @extra     = map( suffix( $_, $set->{ $_ }, 0 ), @extra_opts );
+        my @ex        = grep( $_ ne "", map( suffix( $_, $set->{ $_ }, 1 ), @extra_opts ) );
+            # Shortened version of @extra -- only non-default values.
+        my $suffix    = ( @extra ? "." . join( ".", @extra ) : "" );
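+        # On non-mic targets, truncate the suffix just before any "kn" (mic arch) component;
+        # this is a no-op when the suffix contains no such component.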
+        my $knights   = index( $suffix, "kn" ) - 1;
+        if ( $target_arch !~ "mic" and $knights > 0 ) {
+            $suffix = substr( $suffix, 0, $knights );
+        }
+        my $suf       = ( @ex ? "." . join( ".", @ex ) : "" );
+            # Shortened version of $suffix -- only non-default values.
+        my $build_dir = join( "-", $target_platform, join( "_", @base ) . $suffix, Uname::host_name() );
+        my $out_arch_dir = cat_dir( $ENV{ LIBOMP_EXPORTS }, $target_platform . $suf );
+        my $out_cmn_dir  = cat_dir( $ENV{ LIBOMP_EXPORTS }, "common" );
+        push(
+            @$jobs,
+            {
+                makefile => $makefile,
+                make_args => [
+                    "os="   . $target_os,
+                    "arch=" . $target_arch,
+                    "MIC_ARCH=" . $target_mic_arch,
+                    "date=" . Build::tstr( $Build::start ),
+                    "TEST_DEPS=" . ( $test_deps   ? "on" : "off" ),
+                    "TEST_TOUCH=" . ( $test_touch ? "on" : "off" ),
+                    "CPLUSPLUS=on",
+                    "COVERAGE=" . $set->{ coverage },
+                    # Option "mode" controls 3 make flags:
+                    #     debug   => Full debugging   :    diagnostics,    debug info, no optimization.
+                    #     diag    => Only diagnostics :    diagnostics,    debug info,    optimization.
+                    #     release => Production build : no diagnostics, no debug info,    optimization.
+                    "DEBUG_INFO=" .   ( $set->{ mode } ne "release" ? "on" : "off" ),
+                    "DIAG=" .         ( $set->{ mode } ne "release" ? "on" : "off" ),
+                    "OPTIMIZATION=" . ( $set->{ mode } ne "debug"   ? "on" : "off" ),
+                    "LIB_TYPE=" . substr( $set->{ "lib-type" }, 0, 4 ),
+                    "LINK_TYPE=" . substr( $set->{ "link-type" }, 0, 4 ),
+                    "OMP_VERSION=" . $set->{ "omp-version" },
+                    "VERSION=" . $set->{ version },
+                    "suffix=" . $suf,
+                    "stats=" . $set->{ stats },
+                    "OMPT_SUPPORT=" . $set->{ "ompt-support" },
+                    "OMPT_BLAME=" . $set->{ "ompt-blame" },
+                    "OMPT_TRACE=" . $set->{ "ompt-trace" },
+                    @goals,
+                ],
+                build_dir  => $build_dir
+            }
+        ); # push
+    }; # if
+}; # sub enqueue_jobs
+
+my @jobs;
+enqueue_jobs( \@jobs, {}, @all_opts );
+
+# --------------------------------------------------------------------------------------------------
+# Do the work.
+# --------------------------------------------------------------------------------------------------
+
+my $exit = 0;
+
+Build::init();
+
+if ( $clobber ) {
+    my @dirs = ( $ENV{ LIBOMP_TMP }, $ENV{ LIBOMP_EXPORTS }, cat_dir( $root_dir, "tools", "bin"  ) );
+    my $rc = 0;
+    question(
+        "Clobber " . join( ", ", map( "\"" . Build::shorter( $_ ) . "\"", @dirs ) ) . " dirs? ",
+        $answer,
+        qr{\A(y|yes|n|no)\z}i
+    );
+    if ( $answer =~ m{\Ay}i ) {
+        info( "Clobbering..." );
+        $rc = Build::clean( @dirs );
+        info( Build::rstr( $rc ) );
+    }; # if
+    if ( $rc != 0 ) {
+        $exit = 3;
+    }; # if
+} else { # Build or clean.
+    if ( @jobs ) {
+        my $total = @jobs;    # Total number of jobs.
+        my $n     = 0;        # Current job number.
+        Build::progress( "", "" );     # Output empty line to log file.
+        my $goals = join( " ", @goals );
+        Build::progress( "Goals", $goals eq "" ? "(all)" : $goals );
+        Build::progress( "Configurations", scalar( @jobs ) );
+        foreach my $job ( @jobs ) {
+            ++ $n;
+            my $base = get_file( $job->{ build_dir } );
+            Build::progress( "Making", "%3d of %3d : %s", $n, $total, $base );
+            $job->{ rc } = Build::make( $job, $clean, sprintf( "%d/%d", $n, $total ) );
+        }; # my $job
+        my $failures = Build::summary();
+        if ( $failures > 0 ) {
+            $exit = 3;
+        }; # if
+    } else {
+        info( "Nothing to do." );
+    }; # if
+}; # if
+
+# And exit.
+exit( $exit );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+
+B<build.pl> -- Build one or more configurations of OMP RTL libraries.
+
+=head1 SYNOPSIS
+
+B<build.pl> I<option>... [B<-->] I<make-option>... I<variable>... I<goal>...
+
+=head1 OPTIONS
+
+=over
+
+=item B<--all>
+
+Build all base configurations.
+
+=item B<--ALL>
+
+Build really all configurations, including extra ones.
+
+=item B<--answer=>I<str>
+
+Use specified string as default answer to all questions.
+
+=item B<--architecture=>I<arch>
+
+Specify the target architecture to build for. The default is the architecture of the host machine.
+I<arch> can be C<32>, C<32e>, C<mic>, or one of the known aliases like C<IA32>.
+
+If the architecture is not specified explicitly, the value of the LIBOMP_ARCH environment variable
+is used. If LIBOMP_ARCH is not defined, the host architecture is detected.
+
+=item B<--os=>I<os>
+
+Specify the target OS. The default is the OS of the host machine. I<os> can be C<lin>, C<mac>,
+C<win>, or one of the known aliases like C<Linux>, C<WinNT>, etc.
+
+=item B<--mic-arch=>I<arch>
+
+Specify the architecture of the Intel(R) Many Integrated Core Architecture card. The default is C<knf>. I<arch> can be C<knf>, C<knc>, or C<knl>.
+
+=item B<-->[B<no->]B<test-deps>
+
+Enable or disable C<test-deps>. The test runs in any case, but the result of a disabled test is
+ignored. By default, the test is enabled.
+
+=item B<-->[B<no->]B<test-touch>
+
+Enable or disable C<test-touch>. The test runs in any case, but the result of a disabled test is
+ignored. By default, the test is enabled.
+
+=item Base Configuration Selection Options
+
+=over
+
+=item B<--target=>I<target>
+
+Build the specified target: either C<rtl> (OMP Runtime Library; default),
+C<timelimit> (a program used in testing), or C<all>.
+
+=item B<--lib-type=>I<lib>
+
+Build the specified library: either C<normal> (default), C<stubs>, or C<all>.
+
+=item B<--link-type=>I<type>
+
+Build the specified link type: either C<dynamic> (default) or C<all>.
+
+=back
+
+=item Extra Configuration Selection Options
+
+=over
+
+=item B<--cover=>I<switch>
+
+Build for code coverage data collection. I<switch> can be C<off> (default), C<on>
+or C<all>.
+
+=item B<--mode=>I<mode>
+
+Build the library in the specified I<mode>: either C<debug>, C<diag>, C<release> (default), or C<all>.
+Mode controls three features:
+
+    ---------------------------------------------------
+    feature/mode                   debug  diag  release
+    ---------------------------------------------------
+    debug info                       o      o
+    diagnostics (asserts, traces)    o      o
+    code optimization                       o      o
+    ---------------------------------------------------
+
+=back
+
+=item Shortcuts
+
+If an option with the C<no> prefix is used, the corresponding configuration will B<not> be built.
+This is useful for excluding some configurations when one or more other options are specified
+with the C<all> value (see Examples).
+
+=over
+
+=item B<-->[B<no>]B<11>
+
+Build files for compiler C<11>.
+
+=item B<-->[B<no>]B<12>
+
+Build files for compiler C<12>.
+
+=item B<-->[B<no>]B<debug>
+
+=item B<-->[B<no>]B<debg>
+
+=item B<-->[B<no>]B<dbg>
+
+Build debuggable library.
+
+=item B<-->[B<no>]B<diag>
+
+Build library with diagnostics enabled.
+
+=item B<-->[B<no>]B<dynamic>
+
+Build dynamic library (default).
+
+=item B<-->[B<no>]B<normal>
+
+Build normal library (default).
+
+=item B<-->[B<no>]B<release>
+
+Build release library (default).
+
+=item B<-->[B<no>]B<rtl>
+
+Build OMP RTL (default).
+
+=item B<-->[B<no>]B<stubs>
+
+Build stubs library.
+
+=item B<-->[B<no>]B<timelimit>
+
+Build timelimit utility program.
+
+=back
+
+=item Standard Options
+
+=over
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full help message and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--usage>
+
+Print very short usage message and exit.
+
+=item B<--verbose>
+
+Do print informational messages.
+
+=item B<--version>
+
+Print program version and exit.
+
+=item B<--quiet>
+
+Work quietly; do not print informational messages.
+
+=back
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<make-option>
+
+Any option for the makefile, for example C<-k> or C<-n>. If you pass options to the makefile, the
+C<--> delimiter is mandatory; otherwise C<build.pl> processes all the options internally.
+
+=item I<variable>
+
+Define a makefile variable in the form I<name>B<=>I<value>. Most makefile capabilities are
+accessible through C<build.pl> options, so there is usually no need to define make variables on
+the command line.
+
+=item I<goal>
+
+Makefile goal to build (or clean).
+
+=over
+
+=item B<all>
+
+Build C<lib>, C<tests>, C<inc>.
+
+=item B<common>
+
+Build common (architecture-independent) files. Common files are not configuration-dependent, so
+there is no point in building them for more than one configuration (though it is harmless).
+However, do not build common files on many machines simultaneously.
+
+=item B<clean>
+
+Delete the export files and clean the build directory of the configuration(s) specified by options.
+Note that the C<clean> goal cannot be mixed with other goals (except for C<clean-common>).
+
+=item B<clean-common>
+
+Delete the common files in the F<exports/> directory.
+
+=item B<clobber>
+
+Clean the F<exports/> and F<tmp/> directories. If C<clobber> is specified, other goals and/or
+options do not matter.
+
+Note: Clobbering is a potentially dangerous operation, because it deletes the content of the
+directory pointed to by the C<LIBOMP_TMP> environment variable, so C<build.pl> asks for
+confirmation before clobbering. To suppress the question, use the option C<--answer=yes>.
+
+=item B<fat>
+
+C<mac_32e> only: Build fat libraries for both mac_32 and mac_32e. Should be run after the C<lib>
+goal has been built on both C<mac_32> and C<mac_32e>.
+
+=item I<file.o>
+
+(Windows* OS: I<file.obj>) Build only the specified object file.
+
+=item I<file.i>
+
+Create preprocessed source file.
+
+=item B<force-tests>
+
+Force performing tests.
+
+=item B<force-test-deps>
+
+Force performing test-deps.
+
+=item B<force-test-instr>
+
+Force performing test-instr.
+
+=item B<force-test-relo>
+
+Force performing test-relo.
+
+=item B<force-test-touch>
+
+Force performing test-touch.
+
+=item B<inc>
+
+Build the Fortran include files: F<omp_lib.h>, F<omp_lib.mod>, and F<omp_lib_kinds.mod>.
+
+=item B<lib>
+
+Build the library (on Windows* OS, in the case of dynamic linking, the import library is also built).
+
+=item B<tests>
+
+Perform tests: C<test-deps>, C<test-instr>, C<test-relo>, and C<test-touch>.
+
+=item B<test-deps>
+
+Check the library dependencies. 
+
+=item B<test-instr>
+
+Intel(R) Many Integrated Core Architecture only: check that the library does not contain undesired instructions.
+
+=item B<test-relo>
+
+Linux* OS with dynamic linking only: check that the library does not contain position-dependent
+code.
+
+=item B<test-touch>
+
+Build a very simple application with the native compiler (GNU on Linux* OS and OS X*, MS
+on Windows* OS), check that it does not depend on the C<libirc> library, and run it.
+
+=back
+
+=back
+
+=head1 DESCRIPTION
+
+C<build.pl> constructs the name of a build directory, creates the directory if it does not exist,
+changes to it, and runs make to build the goals in the specified configuration. If more than one
+configuration is specified on the command line, C<build.pl> builds them all.
+
+When run with the C<clean> goal, C<build.pl> does not build; it deletes export files and cleans
+the build directories of the configurations specified by other options. For example,
+C<build.pl --all clean> means "clean build directories for all configurations";
+it does B<not> mean "clean, then build all".
+
+The C<clean-common> goal deletes the common files in the F<exports/> directory.
+Since common files are really common and not architecture- and/or configuration-dependent,
+there is not much point in combining C<clean-common> with configuration selection options.
+For example, C<build.pl --all clean-common> deletes the same files 13 times.
+However, it does not hurt and can be used in conjunction with the C<clean> goal.
+
+The C<clobber> goal instructs C<build.pl> to clean exports and all build
+directories, i.e. everything under the F<exports/> and F<tmp/> directories.
+
+Logs are saved automatically; there is no need for explicit output redirection.
+The log file for each particular configuration is named F<build.log> and located in the build
+directory. A summary log file (just the result of each configuration) is saved in the F<tmp/> directory.
+
+Log files are never overwritten; C<build.pl> always appends output to log files.
+However (obviously), C<clean> deletes the log files of cleaned configurations,
+and C<clobber> deletes all summary log files.
+
+=head2 Environment Variables
+
+=over
+
+=item B<LIBOMP_ARCH>
+
+Specifies the target architecture. If not present, the host architecture is used. The environment
+variable may be overridden by the C<--architecture> command line option.
+
+=item B<LIBOMP_EXPORTS>
+
+Specifies the directory for output files. If not set, C<$LIBOMP_WORK/exports/> is used by default.
+
+=item B<LIBOMP_OS>
+
+Specifies the target OS. If not present, the host OS is used. The environment variable may
+be overridden by the C<--os> command line option.
+
+=item B<LIBOMP_TMP>
+
+Directory for temporary files. C<build.pl> creates build directories there. If not set,
+C<$LIBOMP_WORK/tmp/> is used by default.
+
+On Windows* OS, placing the F<tmp/> directory on a local drive speeds up the build process.
+
+=item B<LIBOMP_WORK>
+
+Root of the libomp directory tree; contains the F<src/>, F<tools/>, and F<exports/> subdirs.
+If not set, C<build.pl> guesses the root dir (the parent of the dir containing C<build.pl>).
+
+Note: Guessing is not reliable. Please set the C<LIBOMP_WORK> environment variable appropriately.
+
+=back
+
+=head1 EXAMPLES
+
+=head2 Development
+
+Build normal (performance) dynamic library for debugging:
+
+    $ build.pl --debug
+
+Build all libraries (normal, stub; dynamic RTL) for debugging:
+
+    $ build.pl --all --debug
+
+Do a clean build for all:
+
+    $ build.pl --all --debug clean && build.pl --all --debug
+
+Debugging libraries are saved in F<exports/I<platform>.deb/>.
+
+=head2 Promotion
+
+=over
+
+=item 1
+
+Clobber everything; on one machine:
+
+    $ build.pl clobber
+
+=item 2
+
+Build common headers, on one machine:
+
+    $ build.pl common
+
+=item 3
+
+Build all platform-dependent files, on all machines:
+
+    $ build.pl --all
+
+=item 4
+
+Build OS X* universal (fat) libraries, on C<mac_32e>:
+
+    $ build.pl fat
+
+=back
+
+=cut
+
+# end of file #
diff --git a/final/runtime/tools/check-depends.pl b/final/runtime/tools/check-depends.pl
new file mode 100755
index 0000000..b199877
--- /dev/null
+++ b/final/runtime/tools/check-depends.pl
@@ -0,0 +1,497 @@
+#!/usr/bin/env perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+use Platform ":vars";
+
+our $VERSION = "0.005";
+
+# --------------------------------------------------------------------------------------------------
+# Output a parse error.
+#     $tool -- Name of the tool.
+#     @bulk -- Output of the tool.
+#     $n    -- Number of the line that caused the parse error.
+sub parse_error($\@$) {
+    my ( $tool, $bulk, $n ) = @_;
+    my @bulk;
+    for ( my $i = 0; $i < @$bulk; ++ $i ) {
+        push( @bulk, ( $i == $n ? ">>> " : "    " ) . $bulk->[ $i ] );
+    }; # for $i
+    runtime_error( "Fail to parse $tool output:", @bulk, "(eof)" );
+}; # sub parse_error
+
+
+# --------------------------------------------------------------------------------------------------
+# Linux* OS version of get_deps() parses output of ldd:
+#
+# $ ldd libname.so
+#   libc.so.6 => /lib64/libc.so.6 (0x00002b60fedd8000)
+#   libdl.so.2 => /lib64/libdl.so.2 (0x00002b60ff12b000)
+#   libpthread.so.0 => /lib64/libpthread.so.0 (0x00002b60ff32f000)
+#   /lib64/ld-linux-x86-64.so.2 (0x0000003879400000)
+#
+# Note: ldd prints all the dependencies, direct and indirect. (For example, if the specified
+# library requires libdl.so, and libdl.so requires /lib/ld-linux.so, ldd prints both libdl.so and
+# /lib/ld-linux.so.) If you do not want indirect dependencies, look at the readelf tool.
+#
+sub get_deps_ldd($) {
+
+    my $lib = shift ( @_ );
+    my $tool = "ldd";
+    my @bulk;
+    my @deps;
+
+    execute( [ $tool, $lib ], -stdout => \@bulk );
+    debug( @bulk, "(eof)" );
+
+    foreach my $i ( 0 .. @bulk - 1 ) {
+        my $line = $bulk[ $i ];
+        if ( $line !~ m{^\s*(?:([_a-z0-9.+-/]*)\s+=>\s+)?([_a-z0-9.+-/]*)\s+\(0x[0-9a-z]*\)$}i ) {
+            parse_error( $tool, @bulk, $i );
+        }; # if
+        my $dep = ( defined( $1 ) ? $1 : $2 );
+        push( @deps, $dep );
+    }; # foreach $i
+
+    return @deps;
+
+}; # sub get_deps_ldd
+
+
+# --------------------------------------------------------------------------------------------------
+# Another Linux* OS version of get_deps() parses output of readelf:
+#
+# $ readelf -d exports/lin_32e/lib/libomp.so
+#
+# Dynamic segment at offset 0x87008 contains 24 entries:
+#   Tag        Type                         Name/Value
+#  0x0000000000000001 (NEEDED)             Shared library: [libc.so.6]
+#  0x0000000000000001 (NEEDED)             Shared library: [libdl.so.2]
+#  0x0000000000000001 (NEEDED)             Shared library: [libpthread.so.0]
+#  0x000000000000000e (SONAME)             Library soname: [libomp.so]
+#  0x000000000000000d (FINI)               0x51caa
+#  0x0000000000000004 (HASH)               0x158
+#  0x0000000000000005 (STRTAB)             0x9350
+#  ...
+#
+# Note: In contrast to ldd, readelf shows only direct dependencies.
+#
+sub get_deps_readelf($) {
+
+    my $file = shift ( @_ );
+    my $tool;
+    my @bulk;
+    my @deps;
+
+    if($target_arch eq "mic") {
+        $tool = "x86_64-k1om-linux-readelf";
+    } else {
+        $tool = "readelf";
+    }
+
+    execute( [ $tool, "-d", $file ], -stdout => \@bulk );
+    debug( @bulk, "(eof)" );
+
+    my $i = 0;
+    # Parse header.
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*$} )
+        or parse_error( $tool, @bulk, $i );
+    ++ $i;
+    if ( $i == @bulk - 1 and $bulk[ $i ] =~ m{^There is no dynamic section in this file\.\s*$} ) {
+        # This is not a dynamic executable => no dependencies.
+        return @deps;
+    }; # if
+    ( $i < @bulk and $bulk[ $i ] =~ m{^Dynamic (?:segment|section) at offset 0x[0-9a-f]+ contains \d+ entries:\s*$} )
+        or parse_error( $tool, @bulk, $i );
+    ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*Tag\s+Type\s+Name/Value\s*$} )
+        or parse_error( $tool, @bulk, $i );
+    ++ $i;
+    # Parse body.
+    while ( $i < @bulk ) {
+        my $line = $bulk[ $i ];
+        if ( $line !~ m{^\s*0x[0-9a-f]+\s+\(([_A-Z0-9]+)\)\s+(.*)\s*$}i ) {
+            parse_error( $tool, @bulk, $i );
+        }; # if
+        my ( $type, $value ) = ( $1, $2 );
+        if ( $type eq "NEEDED" ) {
+            if ( $value !~ m{\AShared library: \[(.*)\]\z} ) {
+                parse_error( $tool, @bulk, $i );
+            }; # if
+            my $dep = $1;
+            push( @deps, $dep );
+        }; # if
+        ++ $i;
+    }; # foreach $i
+
+    return @deps;
+
+}; # sub get_deps_readelf
+
+
+# --------------------------------------------------------------------------------------------------
+# OS X* version of get_deps() parses output of otool:
+#
+# $ otool -L libname.dylib
+# exports/mac_32/lib.thin/libomp.dylib:
+#        libomp.dylib (compatibility version 5.0.0, current version 5.0.0)
+#        /usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 88.1.3)
+#
+sub get_deps_otool($) {
+
+    my $file = shift ( @_ );
+    my $name = get_file( $file );
+    my $tool = "otool";
+    my @bulk;
+    my @deps;
+
+    if ( $target_arch eq "32e" ) {
+        # On older (Tiger) systems otool does not recognize 64-bit binaries, so try to locate
+        # otool64.
+        my $path = which( "otool64" );
+        if ( defined ( $path ) ) {
+            $tool = "otool64";
+        }; # if
+    }; # if
+
+    execute( [ $tool, "-L", $file ], -stdout => \@bulk );
+    debug( @bulk, "(eof)" );
+
+    my $i = 0;
+    # Parse the first one or two lines separately.
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\Q$file\E:$} )
+        or parse_error( $tool, @bulk, $i );
+    ++ $i;
+    if ( $name =~ m{\.dylib\z} ) {
+        # In the case of a dynamic library, otool prints the library itself as a dependent library.
+        ( $i < @bulk and $bulk[ $i ] =~ m{^\s+\Q$name\E\s+\(compatibility version.*\)$} )
+            or parse_error( $tool, @bulk, $i );
+        ++ $i;
+    }; # if
+
+    # Then parse the rest.
+    while ( $i < @bulk ) {
+        my $line = $bulk[ $i ];
+        if ( $line !~ m/^\s*(.*)\s+\(compatibility version\s.*\)$/ ) {
+            parse_error( $tool, @bulk, $i );
+        }; # if
+        my ( $dep ) = ( $1 );
+        push( @deps, $dep );
+        ++ $i;
+    }; # while
+
+    return @deps;
+
+}; # sub get_deps_otool
+
+
+# --------------------------------------------------------------------------------------------------
+# Windows* OS version of get_deps() parses output of link:
+#
+# > link -dump -dependents libname.dll
+# Microsoft (R) COFF/PE Dumper Version 8.00.40310.39
+# Copyright (C) Microsoft Corporation.  All rights reserved.
+# Dump of file S:\Projects.OMP\users\omalyshe\omp\libomp\exports\win_64\lib\libompmd.dll
+# File Type: DLL
+#   Image has the following dependencies:
+#     KERNEL32.dll
+#   Summary
+#         C000 .data
+#         6000 .pdata
+#        18000 .rdata
+#        ...
+#
+# > link -dump -directives libname.lib
+# Microsoft (R) COFF/PE Dumper Version 8.00.40310.39
+# Copyright (C) Microsoft Corporation.  All rights reserved.
+# Dump of file S:\Projects.OMP\users\omalyshe\omp\libomp\exports\win_32e\lib\libimp5mt.lib
+# File Type: LIBRARY
+#   Linker Directives
+#   -----------------
+#   -defaultlib:"uuid.lib"
+#   -defaultlib:"uuid.lib"
+#   .....
+#   Summary
+#       3250 .bss
+#       3FBC .data
+#         34 .data1
+#       ....
+sub get_deps_link($) {
+
+    my ( $lib ) = @_;
+    my $tool = "link";
+    my @bulk;
+    my @deps;
+
+    my $ext = lc( get_ext( $lib ) );
+    if ( $ext !~ m{\A\.(?:lib|dll|exe)\z}i ) {
+        runtime_error( "Incorrect file is specified: `$lib'; only `lib', `dll' or `exe' file expected" );
+    }; # if
+
+    execute(
+        [ $tool, "/dump", ( $ext eq ".lib" ? "/directives" : "/dependents" ), $lib ],
+        -stdout => \@bulk
+    );
+
+    debug( @bulk, "(eof)" );
+
+    my $i = 0;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^Microsoft \(R\) COFF\/PE Dumper Version.*$} ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^Copyright \(C\) Microsoft Corporation\..*$} ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*$}                                       ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*$}                                       ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^Dump of file\s\Q$lib\E$}                    ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*$}                                       ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^File Type:\s(.*)$}                          ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*$}                                       ) or parse_error( $tool, @bulk, $i ); ++ $i;
+
+    if ( $ext eq ".lib" ) {
+
+        my %deps;
+        while ( $i < @bulk ) {
+            my $line = $bulk[ $i ];
+            if ( 0 ) {
+            } elsif ( $line =~ m{^\s*[-/]defaultlib\:(.*)\s*$}i ) {
+                my $dep = $1;
+                # Normalize library name:
+                $dep = lc( $1 );              # Convert to lower case.
+                $dep =~ s{\A"(.*)"\z}{$1};    # Drop surrounding quotes (if any).
+                $dep =~ s{\.lib\z}{};         # Drop .lib suffix (if any).
+                $deps{ $dep } = 1;
+            } elsif ( $line =~ m{^\s*Linker Directives\s*$} ) {
+            } elsif ( $line =~ m{^\s*-+\s*$} ) {
+            } elsif ( $line =~ m{^\s*/alternatename\:.*$} ) {
+            } elsif ( $line =~ m{^\s*$} ) {
+            } elsif ( $line =~ m{^\s*/FAILIFMISMATCH\:.*$} ) {
+                # This directive is produced only by _MSC_VER=1600
+            } elsif ( $line =~ m{^\s*Summary\s*$} ) {
+                last;
+            } else {
+                parse_error( $tool, @bulk, $i );
+            }; # if
+            ++ $i;
+        } # while
+        @deps = keys( %deps );
+
+    } else {
+
+        ( $i < @bulk and $bulk[ $i ] =~ m{\s*Image has the following dependencies\:$} )
+            or parse_error( $tool, @bulk, $i );
+        ++ $i;
+        while ( $i < @bulk ) {
+            my $line = $bulk[ $i ];
+            if ( 0 ) {
+            } elsif ( $line =~ m{^\s*$} ) {
+                # Ignore empty lines.
+            } elsif ( $line =~ m{^\s*(.*\.dll)$}i ) {
+                my $dep = lc( $1 );
+                push( @deps, $dep );
+            } elsif ( $line =~ m{^\s*Summary$} ) {
+                last;
+            } else {
+                parse_error( $tool, @bulk, $i );
+            }; # if
+            ++ $i;
+        }; # while
+
+    }; # if
+
+    return @deps;
+
+}; # sub get_deps_link
+
+
+# --------------------------------------------------------------------------------------------------
+# Main.
+# --------------------------------------------------------------------------------------------------
+
+# Parse command line.
+my $expected;
+my $bare;
+Getopt::Long::Configure( "permute" );
+get_options(
+    Platform::target_options(),
+    "bare"       => \$bare,
+    "expected=s" => \$expected,
+);
+my @expected;
+if ( defined( $expected ) ) {
+    if ( $expected ne "none" ) {
+        @expected = sort( split( ",", $expected ) );
+        if ( $target_os eq "win" ) {
+            @expected = map( lc( $_ ), @expected );
+        }; # if
+    }; # if
+}; # if
+if ( @ARGV < 1 ) {
+    cmdline_error( "Specify a library name to check for dependencies" );
+}; # if
+if ( @ARGV > 1 ) {
+    cmdline_error( "Too many arguments" );
+}; # if
+my $lib = shift( @ARGV );
+if ( not -e $lib ){
+    runtime_error( "Specified file does not exist: \"$lib\"" );
+}; # if
+
+# Select appropriate get_deps implementation.
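+# The assignment below aliases get_deps() to the OS-specific sub via a glob, so later calls
+# dispatch to the chosen implementation.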
+if ( 0 ) {
+} elsif ( $target_os eq "lin" ) {
+    *get_deps = \*get_deps_readelf;
+} elsif ( $target_os eq "mac" ) {
+    *get_deps = \*get_deps_otool;
+} elsif ( $target_os eq "win" ) {
+    *get_deps = \*get_deps_link;
+} else {
+    runtime_error( "OS \"$target_os\" not supported" );
+}; # if
+
+# Do the work.
+my @deps = sort( get_deps( $lib ) );
+if ( $bare ) {
+    print( map( "$_\n", @deps ) );
+} else {
+    info( "Dependencies:", @deps ? map( "    $_", @deps ) : "(none)" );
+}; # if
+if ( defined( $expected ) ) {
+    my %deps = map( ( $_ => 1 ), @deps );
+    foreach my $dep ( @expected ) {
+        delete( $deps{ $dep } );
+    }; # foreach
+    my @unexpected = sort( keys( %deps ) );
+    if ( @unexpected ) {
+        runtime_error( "Unexpected dependencies:", map( "    $_", @unexpected ) );
+    }; # if
+}; # if
+
+exit( 0 );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<check-depends.pl> -- Check dependencies for a specified library.
+
+=head1 SYNOPSIS
+
+B<check-depends.pl> I<OPTIONS>... I<library>
+
+=head1 DESCRIPTION
+
+C<check-depends.pl> finds the direct dependencies of a specified library. The list of actual
+dependencies is sorted alphabetically and printed. If a list of expected dependencies is
+specified, the script checks that the library has only allowed dependencies. If unexpected
+dependencies are found, the script issues an error message and exits with a non-zero code.
+
+Linux* OS and OS X*: The script finds dependencies only for dynamic libraries. Windows* OS: The
+script finds dependencies for either static or dynamic libraries.
+
+The script uses external tools. On Linux* OS, it runs F<readelf>, on OS X* -- F<otool> (or F<otool64>),
+on Windows* OS -- F<link>.
+
+On Windows* OS, dependencies are printed in lower case, and the case of expected dependencies is ignored.
+
+=head1 OPTIONS
+
+=over
+
+=item B<--bare>
+
+Do not use fancy formatting; produce plain, bare output: just a list of libraries,
+one library per line.
+
+=item B<--expected=>I<list>
+
+I<list> is a comma-separated list of expected dependencies (or C<none>).
+If the C<--expected> option is specified, C<check-depends.pl> checks that the specified library
+has only the expected dependencies.
+
+=item B<--os=>I<str>
+
+Specify the target OS (tool to use) manually.
+Useful for cross-builds, when the host OS is not the same as the target OS.
+I<str> should be either C<lin>, C<mac>, or C<win>.
+
+=back
+
+=head2 Standard Options
+
+=over
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full documentation and exit.
+
+=item B<--quiet>
+
+Do not output informational messages.
+
+=item B<--version>
+
+Print version and exit.
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<library>
+
+The name of the library whose dependencies should be found or checked.
+
+=back
+
+=head1 EXAMPLES
+
+Just print library dependencies (Windows* OS):
+
+    > check-depends.pl exports/win_32/lib/libompmd.dll
+    check-depends.pl: (i) Dependencies:
+    check-depends.pl: (i)     kernel32.dll
+
+Print library dependencies, use bare output (Linux* OS):
+
+    $ check-depends.pl --bare exports/lin_32e/lib/libomp_db.so
+    libc.so.6
+    libdl.so.2
+    libpthread.so.0
+
+Check the library does not have any dependencies (OS X*):
+
+    $ check-depends.pl --expected=none exports/mac_32/lib/libomp.dylib
+    check-depends.pl: (i) Dependencies:
+    check-depends.pl: (i)     /usr/lib/libSystem.B.dylib
+    check-depends.pl: (x) Unexpected dependencies:
+    check-depends.pl: (x)     /usr/lib/libSystem.B.dylib
+    $ echo $?
+    2
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/check-execstack.pl b/final/runtime/tools/check-execstack.pl
new file mode 100755
index 0000000..43c2bb2
--- /dev/null
+++ b/final/runtime/tools/check-execstack.pl
@@ -0,0 +1,146 @@
+#!/usr/bin/perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+use Platform ":vars";
+
+our $VERSION = "0.002";
+
+sub execstack($) {
+    my ( $file ) = @_;
+    my @output;
+    my @stack;
+    my $tool;
+    if($target_arch eq "mic") {
+        $tool = "x86_64-k1om-linux-readelf";
+    } else {
+        $tool = "readelf";
+    }
+    execute( [ $tool, "-l", "-W", $file ], -stdout => \@output );
+    @stack = grep( $_ =~ m{\A\s*(?:GNU_)?STACK\s+}, @output );
+    if ( not @stack ) {
+        # Interpret a missing "STACK" line as an error.
+        runtime_error( "$file: No stack segment found; looks like the stack would be executable." );
+    }; # if
+    if ( @stack > 1 ) {
+        runtime_error( "$file: More than one stack segment found.", "readelf output:", @output, "(eof)" );
+    }; # if
+    # Typical stack lines are:
+    # Linux* OS IA-32 architecture:
+    #    GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RWE 0x4
+    # Linux* OS Intel(R) 64:
+    #    GNU_STACK      0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RWE 0x8
+    if ( $stack[ 0 ] !~ m{\A\s*(?:GNU_)?STACK(?:\s+0x[0-9a-f]+){5}\s+([R ][W ][E ])\s+0x[0-9a-f]+\s*\z} ) {
+        runtime_error( "$file: Cannot parse stack segment line:", ">>> $stack[ 0 ]" );
+    }; # if
+    my $attrs = $1;
+    if ( $attrs =~ m{E} ) {
+        runtime_error( "$file: Stack is executable" );
+    }; # if
+}; # sub execstack
+
+get_options(
+    Platform::target_options(),
+);
+
+foreach my $file ( @ARGV ) {
+    execstack( $file );
+}; # foreach $file
+
+exit( 0 );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<check-execstack.pl> -- Check whether stack is executable, issue an error if so.
+
+=head1 SYNOPSIS
+
+B<check-execstack.pl> I<option>... I<file>...
+
+=head1 DESCRIPTION
+
+The script checks whether the stack of the specified executable file (or shared object) is
+executable, and issues an error if it is. If the stack is not executable, the script exits
+silently with a zero exit code.
+
+The script runs the C<readelf> utility to get information about the specified file, so it fails
+if C<readelf> is not available. Effectively this means the script works only on Linux* OS
+(and, probably, Intel(R) Many Integrated Core Architecture).
+
+=head1 OPTIONS
+
+=over
+
+=item Standard Options
+
+=over
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full help message and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--usage>
+
+Print very short usage message and exit.
+
+=item B<--verbose>
+
+Do print informational messages.
+
+=item B<--version>
+
+Print program version and exit.
+
+=item B<--quiet>
+
+Work quietly; do not print informational messages.
+
+=back
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<file>
+
+The name of an executable or shared object to check. Multiple files may be specified.
+
+=back
+
+=head1 EXAMPLES
+
+Check libomp.so library:
+
+    $ check-execstack.pl libomp.so
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/check-instruction-set.pl b/final/runtime/tools/check-instruction-set.pl
new file mode 100755
index 0000000..b77e207
--- /dev/null
+++ b/final/runtime/tools/check-instruction-set.pl
@@ -0,0 +1,316 @@
+#!/usr/bin/perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use Platform ":vars";
+use tools;
+
+our $VERSION = "0.004";
+
+my $hex = qr{[0-9a-f]}i;    # hex digit.
+
+# mic-specific details.
+
+sub bad_mic_fmt($) {
+    # Previously both elf64-x86-64-freebsd and elf-l1om-freebsd were allowed.
+    # Now the first one is obsolete; only elf64-l1om-freebsd is allowed.
+    my ( $fmt ) = @_;
+    if ( 0 ) {
+    } elsif ( "$target_mic_arch" eq "knf" ) {
+        return $fmt !~ m{\Aelf64-l1om?\z};
+    } elsif ( "$target_mic_arch" eq "knc" ) {
+        return $fmt !~ m{\Aelf64-k1om?\z};
+    } else {
+        return 1;
+    };
+}; # sub bad_mic_fmt
+
+# Undesired instructions for mic: all x87 and some others.
+# AC: Since the 2010-06-30 compiler, x87 instructions are supported, so the x87 check was removed.
+my $mic_bad_re;
+sub bad_mic_instr($$) {
+    my ( $instr, $args ) = @_;
+    if ( "$target_mic_arch" eq "knc" ) {
+	# workaround of bad code generation on KNF Linux* OS:
+	return ( defined( $instr ) and $instr =~ $mic_bad_re );
+    } else {
+	return ( defined( $instr ) and $instr =~ $mic_bad_re or defined( $args ) and $args =~ m{xmm}i );
+    }
+}; # sub bad_mic_instr
+
+# lin_32-specific details.
+
+sub bad_ia32_fmt($) {
+    my ( $fmt ) = @_;
+    return $fmt !~ m{\Aelf32-i386\z};
+}; # sub bad_ia32_fmt
+
+my @sse2 =
+    qw{
+        movapd movupd movhpd movlpd movmskpd movsd
+        addpd addsd subpd subsd mulpd mulsd divpd divsd sqrtpd sqrtsd maxpd maxsd minpd minsd
+        andpd andnpd orpd xorpd
+        cmppd cmpsd comisd ucomisd
+        shufpd unpckhpd unpcklpd
+        cvtpd2pi cvttpd2pi cvtpi2pd cvtpd2dq cvttpd2dq cvtdq2pd cvtps2pd cvtpd2ps cvtss2sd cvtsd2ss
+        cvtsd2si cvttsd2si cvtsi2sd cvtdq2ps cvtps2dq cvttps2dq movdqa movdqu movq2dq movdq2q
+        pmuludq paddq psubq pshuflw pshufhw pshufd pslldq psrldq punpckhqdq punpcklqdq clflush
+        lfence mfence maskmovdqu movntpd movntdq movnti
+    };
+my @sse3 =
+    qw{
+        fisttp lddqu addsubps addsubpd haddps hsubps haddpd hsubpd movshdup movsldup movddup monitor
+        mwait
+    };
+my @ssse3 =
+    qw{
+        phaddw phaddsw phaddd phsubw phsubsw phsubd pabsb pabsw pabsd pmaddubsw pmulhrsw pshufb
+        psignb psignw psignd palignr
+    };
+my @sse4 =
+    (
+        # SSE4.1
+        qw{
+            pmulld pmuldq dppd dpps movntdqa blendpd blendps blendvpd blendvps pblendvb pblendw pminuw
+            pminud pminsb pminsd pmaxuw pmaxud pmaxsb pmaxsd roundps roundpd roundss roundsd extractps
+            insertps pinsrb pinsrd pinsrq pextrb pextrw pextrd pextrq pmovsxbw pmovzxbw pmovsxbd
+            pmovzxbd pmovsxwd pmovzxwd pmovsxbq pmovzxbq pmovsxwq pmovzxwq pmovsxdq pmovzxdq mpsadbw
+            phminposuw ptest pcmpeqq packusdw
+        },
+        # SSE4.2
+        qw{
+            pcmpestri pcmpestrm pcmpistri pcmpistrm pcmpgtq crc32 popcnt
+        }
+    );
+
+# Undesired instructions for IA-32 architecture: Pentium 4 (SSE2) and newer.
+# TODO: It would be much more reliable to list *allowed* instructions rather than undesired
+# ones. In that case the list would be stable and would not require an update when SSE5 is released.
+my @ia32_bad_list = ( @sse2, @sse3, @ssse3, @sse4 );
+
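+# Join the list into a single anchored alternation, i.e. qr{^(?:movapd|movupd|...)}i.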
+my $ia32_bad_re = qr{@{[ "^(?:" . join( "|", @ia32_bad_list ) . ")" ]}}i;
+
+sub bad_ia32_instr($$) {
+    my ( $instr, $args ) = @_;
+    return ( defined( $instr ) and $instr =~ $ia32_bad_re );
+}; # sub bad_ia32_instr
+
+sub check_file($;$$) {
+
+    my ( $file, $show_instructions, $max_instructions ) = @_;
+    my @bulk;
+
+    if ( not defined( $max_instructions ) ) {
+        $max_instructions = 100;
+    }; # if
+
+    execute( [ "x86_64-k1om-linux-objdump", "-d", $file ], -stdout => \@bulk );
+
+    my $n = 0;
+    my $errors = 0;
+    my $current_func  = "";    # Name of current function.
+    my $reported_func = "";    # Name of last reported function.
+    foreach my $line ( @bulk ) {
+        ++ $n;
+        if ( 0 ) {
+        } elsif ( $line =~ m{^\s*$} ) {
+            # Empty line.
+            # Ignore.
+        } elsif ( $line =~ m{^In archive (.*?):\s*$} ) {
+            # In archive libomp.a:
+        } elsif ( $line =~ m{^(?:.*?):\s*file format (.*?)\s*$} ) {
+            # libomp.so:     file format elf64-x86-64-freebsd
+            # kmp_ftn_cdecl.o:     file format elf64-x86-64
+            my $fmt = $1;
+            if ( bad_fmt( $fmt ) ) {
+                runtime_error( "Invalid file format: $fmt." );
+            }; # if
+        } elsif ( $line =~ m{^Disassembly of section (.*?):\s*$} ) {
+            # Disassembly of section .plt:
+        } elsif ( $line =~ m{^$hex+ <([^>]+)>:\s*$} ) {
+            # 0000000000017e98 <__kmp_str_format@plt-0x10>:
+            $current_func = $1;
+        } elsif ( $line =~ m{^\s*\.{3}\s*$} ) {
+        } elsif ( $line =~ m{^\s*($hex+):\s+($hex$hex(?: $hex$hex)*)\s+(?:lock\s+|rex[.a-z]*\s+)?([^ ]+)(?:\s+([^#]+?))?\s*(?:#|$)} ) {
+            #   17e98:       ff 35 fa 7d 26 00       pushq  0x267dfa(%rip)        # 27fc98 <_GLOBAL_OFFSET_TABLE>
+            my ( $addr, $dump, $instr, $args ) = ( $1, $2, $3, $4 );
+            # Check that this is not a bad instruction and that xmm registers are not used.
+            if ( bad_instr( $instr, $args ) ) {
+                if ( $errors == 0 ) {
+                    warning( "Invalid instructions found in `$file':" );
+                }; # if
+                if ( $current_func ne $reported_func ) {
+                    warning( "    $current_func" );
+                    $reported_func = $current_func;
+                }; # if
+                ++ $errors;
+                if ( $show_instructions ) {
+                    warning( "        $line" );
+                }; # if
+                if ( $errors >= $max_instructions ) {
+                    info( "$errors invalid instructions found; scanning stopped." );
+                    last;
+                }; # if
+            }; # if
+        } else {
+            runtime_error( "Error parsing objdump output line $n:\n>>>> $line\n" );
+        }; # if
+    }; # foreach $line
+
+    return $errors;
+
+}; # sub check_file
+
+# --------------------------------------------------------------------------------------------------
+
+# Parse command line.
+my $max_instructions;
+my $show_instructions;
+get_options(
+    "max-instructions=i" => \$max_instructions,
+    "show-instructions!" => \$show_instructions,
+    Platform::target_options(),
+);
+if ( "$target_os" eq "lin" and "$target_mic_arch" eq "knf" ) {
+    $mic_bad_re = qr{^(?:pause|[slm]fence|scatter|gather|cmpxchg16b|clevict[12])}i;
+} else {
+    $mic_bad_re = qr{^(?:pause|[slm]fence|scatter|gather|cmov|cmpxchg16b|clevict[12])}i;
+};
+if ( 0 ) {
+} elsif ( $target_os eq "lin" and $target_arch eq "mic" ) {
+    *bad_instr = \*bad_mic_instr;
+    *bad_fmt   = \*bad_mic_fmt;
+} elsif ( $target_platform eq "lin_32" ) {
+    *bad_instr = \*bad_ia32_instr;
+    *bad_fmt   = \*bad_ia32_fmt;
+} else {
+    runtime_error( "Only works on lin_32 and lin_mic platforms." );
+}; # if
+
+# Do the work.
+my $rc = 0;
+if ( not @ARGV ) {
+    info( "No arguments specified -- nothing to do." );
+} else {
+    foreach my $arg ( @ARGV ) {
+        my $errs = check_file( $arg, $show_instructions, $max_instructions );
+        if ( $errs > 0 ) {
+            $rc = 3;
+        }; # if
+    }; # foreach $arg
+}; # if
+
+exit( $rc );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<check-instruction-set.pl> -- Make sure a binary file does not contain undesired instructions.
+
+=head1 SYNOPSIS
+
+B<check-instruction-set.pl> I<option>... I<file>...
+
+=head1 OPTIONS
+
+=over
+
+=item B<--architecture=>I<arch>
+
+Specify target architecture.
+
+=item B<--max-instructions=>I<number>
+
+Stop scanning after I<number> invalid instructions are found. The default is 100.
+
+=item B<--os=>I<os>
+
+Specify target OS.
+
+=item B<-->[B<no->]B<show-instructions>
+
+Show invalid instructions found in the file. By default, instructions are not shown.
+
+=item Standard Options
+
+=over
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full help message and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--usage>
+
+Print very short usage message and exit.
+
+=item B<--verbose>
+
+Do print informational messages.
+
+=item B<--version>
+
+Print program version and exit.
+
+=item B<--quiet>
+
+Work quietly; do not print informational messages.
+
+=back
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<file>
+
+File (object file or library, either static or dynamic) to check.
+
+=back
+
+=head1 DESCRIPTION
+
+The script runs the F<objdump> utility to obtain a disassembly listing and checks that the file
+does not contain unwanted instructions.
+
+Currently the script works only for:
+
+=over
+
+=item C<lin_mic>
+
+Intel(R) Many Integrated Core Architecture target platform. Undesired instructions are all x87 instructions and some others.
+
+=item C<lin_32>
+
+Undesired instructions are those not valid for the Pentium III processor (i.e. SSE2 and newer).
+
+=back
+
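+=head1 EXAMPLES
+
+A hypothetical invocation checking a 32-bit Linux* OS library (the file name is illustrative):
+
+    $ check-instruction-set.pl --os=lin --architecture=32 --show-instructions libomp.so
+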
+=cut
+
diff --git a/final/runtime/tools/check-openmp-test.pl b/final/runtime/tools/check-openmp-test.pl
new file mode 100755
index 0000000..2e64a74
--- /dev/null
+++ b/final/runtime/tools/check-openmp-test.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+# LIBOMP modules.
+use Build;
+use LibOMP;
+use Platform ":vars";
+use Uname;
+use tools;
+
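+# Print the target platform prefix, e.g. "lin_32e/" (illustrative value); the output
+# is presumably consumed by the surrounding test machinery.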
+my $root_dir  = $ENV{ LIBOMP_WORK };
+print join('', $target_platform, "/");
+
diff --git a/final/runtime/tools/check-openmp.pl b/final/runtime/tools/check-openmp.pl
new file mode 100755
index 0000000..60334d7
--- /dev/null
+++ b/final/runtime/tools/check-openmp.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+# LIBOMP modules.
+use Build;
+use LibOMP;
+use Platform ":vars";
+use Uname;
+use tools;
+
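+# Print the library directory of the current target under $LIBOMP_WORK, e.g.
+# ".../exports/lin_32e/lib" (illustrative value); presumably consumed by the
+# surrounding test machinery.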
+my $root_dir  = $ENV{ LIBOMP_WORK };
+print join('', $root_dir, "/", "exports", "/", $target_platform, "/", "lib");
+
diff --git a/final/runtime/tools/check-tools.pl b/final/runtime/tools/check-tools.pl
new file mode 100755
index 0000000..5f6e53a
--- /dev/null
+++ b/final/runtime/tools/check-tools.pl
@@ -0,0 +1,605 @@
+#!/usr/bin/perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Pragmas.
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+# LIBOMP modules.
+use Platform ":vars";
+use tools;
+
+our $VERSION = "0.015";
+
+my $pedantic;
+
+# --------------------------------------------------------------------------------------------------
+# Helper functions
+# --------------------------------------------------------------------------------------------------
+
+
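+# run( [ @cmd ], $stdout, $stderr[, $path] ) -- Run a command, capturing its stdout
+# and stderr into the passed scalars (the prototype makes Perl pass them by
+# reference). Optionally stores the full path of the executed tool in $path.
+# Returns the exit status, or a negative value if the tool cannot be found or run.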
+sub run($\$\$;\$) {
+    my ( $cmd, $stdout, $stderr, $path ) = @_;
+    my ( @path, $rc );
+    @path = which( $cmd->[ 0 ], -all => 1 );
+    if ( @path > 0 ) {
+        if ( @path > 1 and $pedantic ) {
+            warning( "More than one \"$cmd->[ 0 ]\" found in PATH:", map( "    $_", @path ) );
+        }; # if
+        debug( "\"$cmd->[ 0 ]\" full path is \"$path[ 0 ]\"." );
+        if ( defined( $path ) ) {
+            $$path = $path[ 0 ];
+        }; # if
+        debug( "Executing command: \""  . join ( " ", @$cmd ) . "\"." );
+        $rc =
+            execute(
+                $cmd,
+                -ignore_signal => 1, -ignore_status => 1,
+                -stdout => $stdout, -stderr => $stderr, -stdin => undef
+            );
+        if ( $rc < 0 ) {
+            warning( "Cannot run \"$cmd->[ 0 ]\": $@" );
+        }; # if
+        debug( "stdout:", $$stdout, "(eof)", "stderr:", $$stderr, "(eof)" );
+    } else {
+        warning( "No \"$cmd->[ 0 ]\" found in PATH." );
+        $rc = -1;
+    }; # if
+    return $rc;
+}; # sub run
+
+
+sub get_arch($$$) {
+    my ( $name, $str, $exps ) = @_;
+    my ( $arch, $count );
+    $count = 0;
+    foreach my $re ( keys( %$exps ) ) {
+        if ( $str =~ $re ) {
+            $arch = $exps->{ $re };
+            ++ $count;
+        }; # if
+    }; # for
+    if ( $count != 1 or not Platform::canon_arch( $arch ) ) {
+        warning( "Cannot detect $name architecture: $str" );
+        return undef;
+    }; # if
+    return $arch;
+}; # sub get_arch
+
+sub encode($) {
+    my ( $str ) = @_;
+    $str =~ s{ }{_}g;
+    return $str;
+}; # sub encode
+
+
+# --------------------------------------------------------------------------------------------------
+# get_xxx_version subroutines.
+# --------------------------------------------------------------------------------------------------
+#
+# Some of the get_xxx_version() subroutines accept an argument -- a tool name. For example,
+# get_intel_compiler_version() can report the version of the C, C++, or Fortran compiler. The tool
+# to report on is specified by the argument, for example: get_intel_compiler_version( "ifort" ).
+#
+# get_xxx_version() subroutines return a list of one or two elements:
+#     1. The first element is the short tool name (like "gcc", "g++", "icl", etc.).
+#     2. The second element is the version string.
+# If the returned list contains just one element, there is a problem with the tool.
+#
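+# Example (hypothetical):
+#     my ( $tool, $version ) = get_gnu_make_version();
+#     print( "$tool: ", ( defined( $version ) ? $version : "not found" ), "\n" );
+#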
+
+sub get_perl_version() {
+    my ( $rc, $stdout, $stderr, $version );
+    my $tool = "perl";
+    my ( @ret ) = ( $tool );
+    $rc = run( [ $tool, "--version" ], $stdout, $stderr );
+    if ( $rc >= 0 ) {
+        # Typical perl output:
+        #    This is perl, v5.10.0 built for x86_64-linux-thread-multi
+        #    This is perl, v5.8.8 built for MSWin32-x64-multi-thread
+        #    This is perl, v5.10.1 (*) built for x86_64-linux-thread-multi
+        if ( $stdout !~ m{^This is perl.*v(\d+\.\d+(?:\.\d+)).*built for}m ) {
+            warning( "Cannot parse perl output:", $stdout, "(oef)" );
+        }; # if
+        $version = $1;
+        if ( $target_os eq "win" ) {
+            if ( $stdout !~ m{Binary build (.*) provided by ActiveState } ) {
+                warning( "Perl is not ActiveState one" );
+            }; # if
+        }; # if
+    }; # if
+    push( @ret, $version );
+    return @ret;
+}; # sub get_perl_version
+
+
+sub get_gnu_make_version() {
+    my ( $rc, $stdout, $stderr, $version );
+    my $tool = "make";
+    my ( @ret ) = ( $tool );
+    my ( $path );
+    $rc = run( [ $tool, "--version" ], $stdout, $stderr, $path );
+    if ( $rc >= 0 ) {
+        # Typical make output:
+        #     GNU Make version 3.79.1, by Richard Stallman and Roland McGrath.
+        #     GNU Make 3.81
+        if ( $stdout =~ m{^GNU Make (?:version )?(\d+\.\d+(?:\.\d+)?)(?:,|\s)} ) {
+            $version = $1;
+        }; # if
+        if ( $target_os eq "win" and $stdout =~ m{built for ([a-z0-9-]+)} ) {
+            my $built_for = $1;
+            debug( "GNU Make built for: \"$built_for\"." );
+            if ( $built_for =~ m{cygwin}i ) {
+                warning( "\"$path\" is a Cygwin make, it is *not* suitable." );
+                return @ret;
+            }; # if
+        }; # if
+    }; # if
+    push( @ret, $version );
+    return @ret;
+}; # sub get_gnu_make_version
+
+
+sub get_intel_compiler_version($) {
+    my ( $tool ) = @_;    # Tool name, like "icc", "icpc", "icl", or "ifort".
+    my ( @ret ) = ( $tool );
+    my ( $rc, $stdout, $stderr, $tool_re );
+    my $version;
+    my $ic_archs = {
+        qr{32-bit|IA-32}        => "32",
+        qr{Intel\(R\) 64} => "32e",
+        qr{Intel\(R\) [M][I][C] Architecture} => "32e",
+    };
+    $tool_re = quotemeta( $tool );
+    $rc = run( [ $tool, ( $target_os eq "win" ? () : ( "-V" ) ) ], $stdout, $stderr );
+    if ( $rc < 0 ) {
+        return @ret;
+    }; # if
+    # Intel compiler version string is in the first line of stderr. Get it.
+    #$stderr =~ m{\A(.*\n?)};
+    # AC: Let's look for the version string in the first line which contains "Intel".
+    #     This allows using 11.1 and 12.0 compilers on new Mac machines by ignoring
+    #     the huge number of warnings issued by old compilers.
+    $stderr =~ m{^(Intel.*)$}m;
+    my $vstr = $1;
+    my ( $apl, $ver, $bld, $pkg );
+    if ( 0 ) {
+    } elsif ( $vstr =~ m{^Intel.*?Compiler\s+(.*?),?\s+Version\s+(.*?)\s+Build\s+(\S+)(?:\s+Package ID: (\S+))?} ) {
+        # 9.x, 10.x, 11.0.
+        ( $apl, $ver, $bld, $pkg ) = ( $1, $2, $3, $4 );
+    } elsif ( $vstr =~ m{^Intel's (.*?) Compiler,?\s+Version\s+(.*?)\s+Build\s+(\S+)} ) {
+        # 11.1
+        ( $apl, $ver, $bld ) = ( $1, $2, $3 );
+    } else {
+        warning( "Cannot parse ${tool}'s stderr:", $stderr, "(eof)" );
+        return @ret;
+    }; # if
+    my $ic_arch = get_arch( "Intel compiler", $apl, $ic_archs );
+    if ( not defined( $ic_arch ) ) {
+        return @ret;
+    }; # if
+    if ( Platform::canon_arch( $ic_arch ) ne $target_arch and not (Platform::canon_arch($ic_arch) eq "32e" and $target_arch eq "mic" )) {
+        warning( "Target architecture is $target_arch, $tool for $ic_arch found." );
+        return @ret;
+    }; # if
+    # Normalize version.
+    my $stage;
+    $ver =~ s{\s+}{ }g;
+    $ver = lc( $ver );
+    if ( $ver =~ m{\A(\d+\.\d+(?:\.\d+)?) ([a-z]+)\z}i ) {
+        ( $version, $stage ) = ( $1, $2 );
+    } else {
+        ( $version, $stage ) = ( $ver, "" );
+    }; # if
+    # Parse package.
+    if ( defined( $pkg ) ) {
+        if ( $pkg !~ m{\A[lwm]_[a-z]+_[a-z]_(\d+\.\d+\.\d+)\z}i ) {
+            warning( "Cannot parse Intel compiler package: $pkg" );
+            return @ret;
+        }; # if
+        $pkg = $1;
+        $version = $pkg;
+    }; # if
+    push( @ret, "$version " . ( $stage ? "$stage " : "" ) . "($bld) for $ic_arch" );
+    # Ok, the Intel compiler version was found successfully. Now look at the config file.
+    # The Intel compiler installer tends to add a path to the MS linker into the compiler
+    # config file. This leads to trouble. For example, the whole environment is set up for
+    # MS VS 2005, but the Intel compiler uses the linker from MS VS 2003 because it is
+    # specified in the config file.
+    # To avoid such troubles, make sure:
+    #     ICLCFG/IFORTCFG environment variable exists or
+    #     compiler config file does not exist, or
+    #     compiler config file does not specify linker.
+    if ( $target_os eq "win" ) {
+        if ( not exists( $ENV{ uc( $tool . "cfg" ) } ) ) {
+            # If the ICLCFG/IFORTCFG environment variable exists, everything is ok.
+            # Otherwise check compiler's config file.
+            my $path = which( $tool );
+            $path =~ s{\.exe\z}{}i;     # Drop ".exe" suffix.
+            $path .= ".cfg";            # And add ".cfg" one.
+            if ( -f $path ) {
+                # If no config file exists, it is ok.
+                # Otherwise analyze its content.
+                my $bulk = read_file( $path );
+                $bulk =~ s{#.*\n}{}g;    # Remove comments.
+                my @options = ( "Qvc", "Qlocation,link," );
+                foreach my  $opt ( @options ) {
+                    if ( $bulk =~ m{[-/]$opt} ) {
+                        warning( "Compiler config file \"$path\" contains \"-$opt\" option." );
+                    }; # if
+                }; # foreach
+            }; # if
+        }; # if
+    }; # if
+    return @ret;
+}; # sub get_intel_compiler_version
+
+
+sub get_gnu_compiler_version($) {
+    my ( $tool ) = @_;
+    my ( @ret ) = ( $tool );
+    my ( $rc, $stdout, $stderr, $version );
+    $rc = run( [ $tool, "--version" ], $stdout, $stderr );
+    if ( $rc >= 0 ) {
+        my ( $ver, $bld );
+        if ( $target_os eq "mac" ) {
+            # i686-apple-darwin8-gcc-4.0.1 (GCC) 4.0.1 (Apple Computer, Inc. build 5367)
+            # i686-apple-darwin9-gcc-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5484)
+            # i686-apple-darwin11-llvm-gcc-4.2 (GCC) 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2336.9.00)
+            $stdout =~ m{^.*? \(GCC\) (\d+\.\d+\.\d+) \(.*Apple.*?Inc\. build (\d+)\)}m;
+            ( $ver, $bld ) = ( $1, $2 );
+        } else {
+            if ( 0 ) {
+            } elsif ( $stdout =~ m{^.*? \(GCC\) (\d+\.\d+\.\d+)(?: (\d+))?}m ) {
+                # g++ (GCC) 3.2.3 20030502 (Red Hat Linux 3.2.3-20)
+                # GNU Fortran (GCC) 4.3.2 20081105 (Red Hat 4.3.2-7)
+                ( $ver, $bld ) = ( $1, $2 );
+            } elsif ( $stdout =~ m{^.*? \(SUSE Linux\) (\d+\.\d+\.\d+)\s+\[.*? (\d+)\]}m ) {
+                # gcc (SUSE Linux) 4.3.2 [gcc-4_3-branch revision 141291]
+                ( $ver, $bld ) = ( $1, $2 );
+            } elsif ( $stdout =~ m{^.*? \(SUSE Linux\) (\d+\.\d+\.\d+)\s+\d+\s+\[.*? (\d+)\]}m ) {
+                # gcc (SUSE Linux) 4.7.2 20130108 [gcc-4_7-branch revision 195012]
+                ( $ver, $bld ) = ( $1, $2 );
+            } elsif ( $stdout =~ m{^.*? \((Debian|Ubuntu).*?\) (\d+\.\d+\.\d+)}m ) {
+                # gcc (Debian 4.7.2-22) 4.7.2
+                # Debian support from Sylvestre Ledru 
+                # Thanks!
+                $ver = $2;
+            }; # if
+        }; # if
+        if ( defined( $ver ) ) {
+            $version = $ver . ( defined( $bld ) ? " ($bld)" : "" );
+        } else {
+            warning( "Cannot parse GNU compiler version:", $stdout, "(eof)" );
+        }; # if
+    }; # if
+    push( @ret, $version );
+    return @ret;
+}; # sub get_gnu_compiler_version
+
+
+sub get_clang_compiler_version($) {
+    my ( $tool ) = @_;
+    my ( @ret ) = ( $tool );
+    my ( $rc, $stdout, $stderr, $version );
+    $rc = run( [ $tool, "--version" ], $stdout, $stderr );
+    if ( $rc >= 0 ) {
+        my ( $ver, $bld );
+        if ( $target_os eq "mac" ) {
+            # Apple LLVM version 4.2 (clang-425.0.28) (based on LLVM 3.2svn)
+            $stdout =~ m{^.*? (\d+\.\d+) \(.*-(\d+\.\d+\.\d+)\)}m;
+            ( $ver, $bld ) = ( $1, $2 );
+            # For custom clang versions.
+            if ( not defined($ver) and $stdout =~ m{^.*? (\d+\.\d+)( \((.*)\))?}m ) {
+                ( $ver, $bld ) = ( $1, $3 );
+            }
+        } else {
+            if ( 0 ) {
+            } elsif ( $stdout =~ m{^.*? (\d+\.\d+)( \((.*)\))?}m ) {
+                # clang version 3.3 (tags/RELEASE_33/final)
+                ( $ver, $bld ) = ( $1, $3 );
+            } 
+        }; # if
+        if ( defined( $ver ) ) {
+            $version = $ver . ( defined( $bld ) ? " ($bld)" : "" );
+        } else {
+            warning( "Cannot parse Clang compiler version:", $stdout, "(eof)" );
+        }; # if
+    }; # if
+    push( @ret, $version );
+    return @ret;
+}; # sub get_clang_compiler_version
+
+
+sub get_ms_compiler_version() {
+    my ( $rc, $stdout, $stderr, $version );
+    my $tool = "cl";
+    my ( @ret ) = ( $tool );
+    my $mc_archs = {
+        qr{80x86|x86}     => "IA-32 architecture",
+        qr{AMD64|x64}     => "Intel(R) 64",
+    };
+    $rc = run( [ $tool ], $stdout, $stderr );
+    if ( $rc < 0 ) {
+        return @ret;
+    }; # if
+    if ( $stderr !~ m{^Microsoft .* Compiler Version (.*?) for (.*)\s*$}m ) {
+        warning( "Cannot parse MS compiler output:", $stderr, "(eof)" );
+        return @ret;
+    }; # if
+    my ( $ver, $apl ) = ( $1, $2 );
+    if ( $ver !~ m{\A\d+(?:\.\d+)+\z} ) {
+        warning( "Cannot parse MS compiler version: $ver" );
+        return @ret;
+    }; # if
+    my $mc_arch = get_arch( "MS compiler", $apl, $mc_archs );
+    if ( not defined( $mc_arch ) ) {
+        return @ret;
+    }; # if
+    if ( Platform::canon_arch( $mc_arch ) ne $target_arch ) {
+        warning( "Target architecture is $target_arch, $tool for $mc_arch found" );
+        return @ret;
+    }; # if
+    $version = "$ver for $target_arch";
+    push( @ret, $version );
+    return @ret;
+}; # sub get_ms_compiler_version
+
+
+sub get_ms_linker_version() {
+    my ( $rc, $stdout, $stderr, $version );
+    my $tool = "link";
+    my ( @ret ) = ( $tool );
+    my ( $path );
+    $rc = run( [ $tool ], $stdout, $stderr, $path );
+    if ( $rc < 0 ) {
+        return @ret;
+    }; # if
+    if ( $stdout !~ m{^Microsoft \(R\) Incremental Linker Version (\d+(?:\.\d+)+)\s*$}m ) {
+        warning( "Cannot parse MS linker output:", $stdout, "(eof)" );
+        if ( $stderr =~ m{^link: missing operand} ) {
+            warning( "Seems \"$path\" is a Unix-like \"link\" program, not MS linker." );
+        }; # if
+        return @ret;
+    }; # if
+    $version = ( $1 );
+    push( @ret, $version );
+    return @ret;
+}; # sub get_ms_linker_version
+
+
+# --------------------------------------------------------------------------------------------------
+# "main" program.
+# --------------------------------------------------------------------------------------------------
+
+my $make;
+my $intel       = 1;             # Check Intel compilers.
+my $fortran     = 0;             # Check for corresponding Fortran compiler, ifort for intel 
+                                 #                                           gfortran for gnu 
+                                 #                                           gfortran for clang 
+my $clang       = 0;             # Check Clang Compilers.
+my $intel_compilers = {
+    "lin" => { c => "icc", cpp => "icpc", f => "ifort" },
+    "mac" => { c => "icc", cpp => "icpc", f => "ifort" },
+    "win" => { c => "icl", cpp => undef,  f => "ifort" },
+};
+my $gnu_compilers = {
+    "lin" => { c => "gcc", cpp =>  "g++", f => "gfortran" },
+    "mac" => { c => "gcc", cpp =>  "g++", f => "gfortran" },
+};
+my $clang_compilers = {
+    "lin" => { c => "clang", cpp =>  "clang++" },
+    "mac" => { c => "clang", cpp =>  "clang++" },
+};
+
+get_options(
+    Platform::target_options(),
+    "intel!"         => \$intel,
+    "fortran"        => \$fortran,
+    "clang"          => \$clang,
+    "make"           => \$make,
+    "pedantic"       => \$pedantic,
+);
+
+my @versions;
+push( @versions, [ "Perl",     get_perl_version() ] );
+push( @versions, [ "GNU Make", get_gnu_make_version() ] );
+if ( $intel ) {
+    my $ic = $intel_compilers->{ $target_os };
+    push( @versions, [ "Intel C Compiler",       get_intel_compiler_version( $ic->{ c } ) ] );
+    if ( defined( $ic->{ cpp } ) ) {
+        # If Intel C++ compiler has a name different from C compiler, check it as well.
+        push( @versions, [ "Intel C++ Compiler", get_intel_compiler_version( $ic->{ cpp } ) ] );
+    }; # if
+    # fortran check must be explicitly specified on command line with --fortran
+    if ( $fortran ) {
+        if ( defined( $ic->{ f } ) ) {
+            push( @versions, [ "Intel Fortran Compiler", get_intel_compiler_version( $ic->{ f } ) ] );
+        }; # if
+    };
+}; # if
+if ( $target_os eq "lin" or $target_os eq "mac" ) {
+    # check for clang/gnu tools because touch-test.c is compiled with them.
+    if ( $clang or $target_os eq "mac" ) { # OS X* >= 10.9 discarded GNU compilers.
+        push( @versions, [ "Clang C Compiler",     get_clang_compiler_version( $clang_compilers->{ $target_os }->{ c   } ) ] );
+        push( @versions, [ "Clang C++ Compiler",   get_clang_compiler_version( $clang_compilers->{ $target_os }->{ cpp } ) ] );
+    } else {
+        push( @versions, [ "GNU C Compiler",     get_gnu_compiler_version( $gnu_compilers->{ $target_os }->{ c   } ) ] );
+        push( @versions, [ "GNU C++ Compiler",   get_gnu_compiler_version( $gnu_compilers->{ $target_os }->{ cpp } ) ] );
+    };
+    # If the Intel Fortran compiler has been checked, then GNU Fortran is unnecessary.
+    # Also, if the user specifies clang as the build compiler, gfortran is assumed to be
+    # the Fortran compiler.
+    if ( $fortran and not $intel ) {
+        push( @versions, [ "GNU Fortran Compiler", get_gnu_compiler_version( $gnu_compilers->{ $target_os }->{ f } ) ] );
+    }; 
+}; 
+if ( $target_os eq "win" ) {
+    push( @versions, [ "MS C/C++ Compiler",  get_ms_compiler_version() ] );
+    push( @versions, [ "MS Linker",          get_ms_linker_version() ] );
+}; # if
+
+my $count = 0;
+foreach my $item ( @versions ) {
+    my ( $title, $tool, $version ) = @$item;
+    if ( not defined( $version ) ) {
+        $version = "--- N/A ---";
+        ++ $count;
+    }; # if
+    if ( $make ) {
+        printf( "%s=%s\n", encode( $tool ), encode( $version ) );
+    } else {
+        printf( "%-25s: %s\n", $title, $version );
+    }; # if
+}; # foreach
+
+exit( $count == 0 ? 0 : 1 );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<check-tools.pl> -- Check development tools availability and versions.
+
+=head1 SYNOPSIS
+
+B<check-tools.pl> I<OPTION>...
+
+=head1 OPTIONS
+
+=over
+
+=item B<--make>
+
+Produce output suitable for use in a makefile: short tool names (e.g. "icc" instead of "Intel C
+Compiler"), with spaces in version strings replaced by underscores.
+
+=item Tools selection
+
+=over
+
+=item B<--fortran>
+
+Check the corresponding Fortran compiler (ifort when Intel compilers are checked, gfortran
+otherwise). By default, it is not checked.
+
+=item B<-->[B<no->]B<intel>
+
+Check Intel C and C++ compilers (and, together with B<--fortran>, the Intel Fortran compiler). This is the default.
+
+=back
+
+=item Platform selection
+
+=over
+
+=item B<--architecture=>I<str>
+
+Specify the target architecture. Used in cross-builds, for example when building 32-bit applications
+on an Intel(R) 64 machine.
+
+If the architecture is not specified explicitly, the value of the LIBOMP_ARCH environment variable
+is used. If LIBOMP_ARCH is not defined, the host architecture is detected.
+
+=item B<--os=>I<str>
+
+Specify the target OS name. Used in cross-builds, for example when building Intel(R) Many Integrated
+Core Architecture applications on Windows* OS.
+
+If the OS is not specified explicitly, the value of the LIBOMP_OS environment variable is used.
+If LIBOMP_OS is not defined, the host OS is detected.
+
+=back
+
+=back
+
+=head2 Standard Options
+
+=over
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full help message and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--usage>
+
+Print very short usage message and exit.
+
+=item B<--verbose>
+
+Do print informational messages.
+
+=item B<--version>
+
+Print version and exit.
+
+=item B<--quiet>
+
+Work quietly; do not print informational messages.
+
+=back
+
+=head1 DESCRIPTION
+
+This script checks the availability and versions of development tools. By default, the script checks:
+Perl, GNU Make, Intel compilers, GNU or Clang C and C++ compilers (Linux* OS and OS X*),
+and the Microsoft C/C++ compiler and linker (Windows* OS).
+
+The script prints a nice-looking table or machine-readable strings.
+
+=head2 EXIT
+
+=over
+
+=item *
+
+0 -- All programs found.
+
+=item *
+
+1 -- Some tools were not found.
+
+=back
+
+=head1 EXAMPLES
+
+    $ check-tools.pl
+    Perl                     : 5.8.0
+    GNU Make                 : 3.79.1
+    Intel C Compiler         : 11.0 (20080930) for 32e
+    Intel C++ Compiler       : 11.0 (20080930) for 32e
+    Intel Fortran Compiler   : 10.1.008 (20070913) for 32e
+    GNU C Compiler           : 3.2.3 (20030502)
+    GNU C++ Compiler         : 3.2.3 (20030502)
+
+    > check-tools.pl --make
+    perl=5.8.8
+    make=3.81
+    icl=10.1_(20070913)_for_32e
+    ifort=10.1_(20070913)_for_32e
+    cl=14.00.40310.41_for_32e
+    link=8.00.40310.39
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/common.inc b/final/runtime/tools/common.inc
new file mode 100644
index 0000000..2e51a69
--- /dev/null
+++ b/final/runtime/tools/common.inc
@@ -0,0 +1,109 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+ifndef omp_os
+
+  # Windows sets environment variable OS; for other systems, ask uname
+  ifeq ($(OS),)
+    OS:=$(shell uname)
+    ifeq ($(OS),)
+      $(error "Cannot detect operating system")
+    endif
+    export omp_os=$(OS)
+  endif
+
+  ifeq ($(OS), Windows_NT)
+    export omp_os=windows
+  endif
+  ifeq ($(OS), Linux)
+    export omp_os=linux
+  endif
+  ifeq ($(OS), FreeBSD)
+    export omp_os=freebsd
+  endif
+  ifeq ($(OS), Darwin)
+    export omp_os=macos
+  endif
+
+endif # !omp_os
+
+# Compiling for the Intel(R) Many Integrated Core architecture is non-trivial at the next layer
+# of script down, but we can make it consistent here.
+ifneq "$(filter knf knc knl, $(arch))" ""
+    # I really do mean this...
+    # have top-level arch=mic and then mic_arch = flavor of mic
+    override mic_arch:=$(arch)
+    override arch:=mic
+    override mic:=yes
+else
+    ifeq "$(arch)" "mic"
+        # default flavor of mic is knc
+        mic_arch?=knc
+        override mic:=yes
+    else
+        override mic:=no
+    endif
+endif
+
+ifeq (,$(wildcard $(omp_root)/tools/$(omp_os).inc))
+  $(error "$(omp_os)" is not supported. Add tools/$(omp_os).inc file with os-specific settings )
+endif
+
+# detect arch and runtime versions, provide common host-specific definitions
+include $(omp_root)/tools/$(omp_os).inc
+
+ifeq ($(arch),)
+ $(error Architecture not detected)
+endif
+
+# Setting defaults
+mode?=release
+
+ifeq "$(filter 32 32e 64 mic,$(arch))" ""
+    compiler?=gcc
+else
+    ifeq "$(omp_os)" "windows"
+        compiler?=icl
+    else
+        compiler?=icc
+    endif
+endif
+
+ifneq "$(mic)" "no"
+    ifeq "$(compiler)" "gcc"
+        $(error Compiling the runtime with gcc is not supported on Intel(R) Many Integrated Core Architecture)
+    endif
+
+    # Add Intel(R) Many Integrated Core Architecture kind (knf, knc, knl, etc.)
+    build_args += --mic-arch=$(mic_arch)
+
+    # Check that the binutils for Intel(R) Many Integrated Core Architecture are available
+    # First we see whether the objdump on the user's path supports the k1om architecture.
+    hask1om = $(shell if (x86_64-k1om-linux-objdump --help | grep -s k1om); then echo OK; else echo KO; fi)
+    ifneq "$(hask1om)" "OK"
+        # Appropriate binutils are not already set up, so try to add them from the default place.
+        micBinPath = /usr/linux-k1om-4.7/bin
+        micBinPresent = $(shell if test -d $(micBinPath); then echo OK; else echo KO; fi)
+        ifneq "$(micBinPresent)" "OK"
+            # We can't find them in the normal place, so complain.
+            $(error Compiling for Intel(R) Many Integrated Core Architecture requires that the cross-hosted binutils are available in $(micBinPath).\
+See the Tools tab at http://software.intel.com/mic-developer)
+        endif
+        export PATH := $(micBinPath):${PATH}
+    endif
+endif
+
+# number of parallel build jobs
+jobs?=1
+
+export BUILD_COMPILER := $(compiler)
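+
+# Example (hypothetical): a typical invocation that relies on the defaults above,
+# overriding the variables this file consumes:
+#     make compiler=icc arch=mic mic_arch=knc mode=release jobs=4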
+
+
diff --git a/final/runtime/tools/expand-vars.pl b/final/runtime/tools/expand-vars.pl
new file mode 100755
index 0000000..40a6490
--- /dev/null
+++ b/final/runtime/tools/expand-vars.pl
@@ -0,0 +1,306 @@
+#!/usr/bin/perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+
+our $VERSION = "0.005";
+
+my $name_rexp    = qr{[A-Za-z_]+[A-Za-z0-9_]*};
+my $keyword_rexp = qr{if|else|end|omp};
+
+sub error($$$) {
+    my ( $input, $msg, $bulk ) = @_;
+    my $pos = pos( $$bulk );
+    $$bulk =~ m{^(.*?)\G(.*?)$}m or die "Internal error";
+    my ( $pre, $post ) = ( $1, $2 );
+    my $n = scalar( @{ [ substr( $$bulk, 0, $pos ) =~ m{\n}g ] } ) + 1;
+    runtime_error( "\"$input\" line $n: $msg:", ">>> " . $pre . "--[HERE]-->" . $post );
+}; # sub error
+
+sub evaluate($$$\$) {
+    my ( $expr, $strict, $input, $bulk ) = @_;
+    my $value;
+    { # Signal handler will be restored on exit from this block.
+        # In case of "use strict; use warnings", eval issues warnings to stderr. This direct
+        # output may confuse the user, so we catch it and prepend our info.
+        local $SIG{ __WARN__ } = sub { die @_; };
+        $value =
+            eval(
+                "package __EXPAND_VARS__;\n" .
+                ( $strict ? "use strict; use warnings;\n" : "no strict; no warnings;\n" ) .
+                $expr
+            );
+    };
+    if ( $@ ) {
+        # Drop location information -- the increasing eval number and the constant "line 3"
+        # are useless for the user.
+        $@ =~ s{ at \(eval \d+\) line \d+}{}g;
+        $@ =~ s{\s*\z}{};
+        error( $input, "Cannot evaluate expression \"\${{$expr}}\": $@", $bulk );
+    }; # if
+    if ( $strict and not defined( $value ) ) {
+        error( $input, "Substitution value is undefined", $bulk );
+    }; # if
+    return $value;
+}; # sub evaluate
+
+#
+# Parse command line.
+#
+
+my ( @defines, $input, $output, $strict );
+get_options(
+    "D|define=s" => \@defines,
+    "strict!"    => \$strict,
+);
+if ( @ARGV < 2 ) {
+    cmdline_error( "Not enough argument" );
+}; # if
+if ( @ARGV > 2 ) {
+    cmdline_error( "Too many argument(s)" );
+}; # if
+( $input, $output ) = @ARGV;
+
+foreach my $define ( @defines ) {
+    my ( $equal, $name, $value );
+    $equal = index( $define, "=" );
+    if ( $equal < 0 ) {
+        $name = $define;
+        $value = "";
+    } else {
+        $name = substr( $define, 0, $equal );
+        $value = substr( $define, $equal + 1 );
+    }; # if
+    if ( $name eq "" ) {
+        cmdline_error( "Illegal definition: \"$define\": variable name should not be empty." );
+    }; # if
+    if ( $name !~ m{\A$name_rexp\z} ) {
+        cmdline_error(
+            "Illegal definition: \"$define\": " .
+                "variable name should consist of alphanumeric characters."
+        );
+    }; # if
+    eval( "\$__EXPAND_VARS__::$name = \$value;" );
+    if ( $@ ) {
+        die( "Internal error: $@" );
+    }; # if
+}; # foreach $define
+
+#
+# Do the work.
+#
+
+my $bulk;
+
+# Read input file.
+$bulk = read_file( $input );
+
+# Do the replacements.
+$bulk =~
+    s{(?:\$($keyword_rexp)|\$($name_rexp)|\${{(.*?)}})}
+    {
+        my $value;
+        if ( defined( $1 ) ) {
+            # Keyword. Leave it as is.
+            $value = "\$$1";
+        } elsif ( defined( $2 ) ) {
+            # Variable to expand.
+            my $name = $2;
+            $value = eval( "\$__EXPAND_VARS__::$name" );
+            if ( $@ ) {
+                die( "Internal error" );
+            }; # if
+            if ( $strict and not defined( $value ) ) {
+                error( $input, "Variable \"\$$name\" not defined", \$bulk );
+            }; # if
+        } else {
+            # Perl code to evaluate.
+            my $expr = $3;
+            $value = evaluate( $expr, $strict, $input, $bulk );
+        }; # if
+        $value;
+    }ges;
+
+# Process conditionals.
+# Dirty patch! Nested conditionals not supported!
+# TODO: Implement nested constructs.
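+# The recognized shape is exactly one level, with a mandatory $else part:
+#     $if expr
+#     ... text used when expr is true ...
+#     $else
+#     ... text used when expr is false ...
+#     $end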
+$bulk =~
+    s{^\$if +([^\n]*) *\n(.*\n)\$else *\n(.*\n)\$end *\n}
+    {
+        my ( $expr, $then_part, $else_part ) = ( $1, $2, $3 );
+        my $value = evaluate( $expr, $strict, $input, $bulk );
+        if ( $value ) {
+            $value = $then_part;
+        } else {
+            $value = $else_part;
+        }; # if
+    }gesm;
+
+# Write output.
+write_file( $output, \$bulk );
+
+exit( 0 );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<expand-vars.pl> -- Simple text preprocessor.
+
+=head1 SYNOPSIS
+
+B<expand-vars.pl> I<OPTION>... I<input> I<output>
+
+=head1 OPTIONS
+
+=over
+
+=item B<-D> I<name>[B<=>I<value>]
+
+=item B<--define=>I<name>[B<=>I<value>]
+
+Define variable.
+
+=item B<--strict>
+
+In strict mode, the script issues error on using undefined variables and executes Perl code
+with C<use strict; use warnings;> pragmas.
+
+=back
+
+=head2 Standard Options
+
+=over
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full help message and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--usage>
+
+Print very short usage message and exit.
+
+=item B<--verbose>
+
+Do print informational messages.
+
+=item B<--version>
+
+Print version and exit.
+
+=item B<--quiet>
+
+Work quietly; do not print informational messages.
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<input>
+
+Input file name.
+
+=item I<output>
+
+Output file name.
+
+=back
+
+=head1 DESCRIPTION
+
+This script reads the input file, performs substitutions, and writes the output file.
+
+There are two forms of substitution:
+
+=over
+
+=item Variables
+
+Variables are referenced in the input file in the form:
+
+    $name
+
+The name of a variable should consist of alphanumeric characters (Latin letters, digits, and underscores).
+Variables are defined on the command line with the C<-D> or C<--define> options.
+
+=item Perl Code
+
+Perl code is specified in the input file in the form:
+
+    ${{ ...code... }}
+
+The code is evaluated and replaced with its result. Note: in strict mode, you should declare
+a variable before use. See examples.
+
+=back
+
+=head1 EXAMPLES
+
+Replace occurrences of C<$year>, C<$month>, and C<$day> in the C<input.var> file with C<2007>, C<09>,
+and C<01> respectively, and write the result to the C<output.txt> file:
+
+    $ cat input.var
+    Today is $year-$month-$day.
+    $ expand-vars.pl -D year=2007 -D month=09 -D day=01 input.var output.txt && cat output.txt
+    Today is 2007-09-01.
+
+Using Perl code:
+
+    $ cat input.var
+    ${{ localtime(); }}
+    $ expand-vars.pl -D year=2007 -D month=09 -D day=01 input.var output.txt && cat output.txt
+    Now Tue May  5 20:54:13 2009
+
+Using strict mode for catching bugs:
+
+    $ cat input.var
+    ${{ "year : " . substr( $date, 0, 4 ); }}
+    $ expand-vars.pl input.var output.txt && cat output.txt
+    year :
+
+Oops, why does it not print the year? Let us use strict mode:
+
+    $ expand-vars.pl --strict input.var output.txt && cat output.txt
+    expand-vars.pl: (x) "input.var": Cannot evaluate expression "${{ "year : " . substr( $date, 0, 4 ); }}": Global symbol "$date" requires explicit package name
+
+Ok, the variable is not defined. Let us define it:
+
+    $ expand-vars.pl --strict -D date=20090501 input.var output.txt && cat output.txt
+    expand-vars.pl: (x) "input.var": Cannot evaluate expression "${{ "year : " . substr( $date, 0, 4 ); }}": Variable "$date" is not imported
+
+What is wrong? The variable should be declared:
+
+    $ cat input.var
+    ${{ our $date; "year : " . substr( $date, 0, 4 ); }}
+    $ expand-vars.pl --strict -D date=20090501 input.var output.txt && cat output.txt
+    year : 2009
+
+=cut
+
+# end of file #
diff --git a/final/runtime/tools/extract-objects.pl b/final/runtime/tools/extract-objects.pl
new file mode 100755
index 0000000..e9eaa3d
--- /dev/null
+++ b/final/runtime/tools/extract-objects.pl
@@ -0,0 +1,258 @@
+#!/usr/bin/env perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use File::Glob ":glob";
+use File::Temp;
+use Cwd;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+use Uname;
+use Platform ":vars";
+
+our $VERSION = "0.005";
+
+# --------------------------------------------------------------------------------------------------
+# Subroutines.
+# --------------------------------------------------------------------------------------------------
+
+sub windows {
+    my ( $arch, $output, @args ) = @_;
+    my %files;
+    # TODO: Check the archives are of specified architecture.
+    foreach my $arg ( @args ) {
+        foreach my $archive ( bsd_glob( $arg ) ) {
+            info( "Processing \"$archive\"..." );
+            my $bulk;
+            execute( [ "lib.exe", "/nologo", "/list", $archive ], -stdout => \$bulk );
+            my @members = split( "\n", $bulk );
+            foreach my $member ( @members ) {
+                my $file = get_file( $member );
+                my $path = cat_file( $output, $file );
+                if ( exists( $files{ $file } ) ) {
+                    runtime_error(
+                        "Extraction \"$file\" member from \"$archive\" archive failed:",
+                        "\"$file\" member has already been extracted from \"$files{ $file }\" archive"
+                    );
+                }; # if
+                $files{ $file } = $archive;
+                info( "    Writing \"$path\"..." );
+                execute( [ "lib.exe", "/nologo", "/extract:" . $member, "/out:" . $path, $archive ] );
+            }; # foreach $member
+        }; # foreach $archive
+    }; # foreach $arg
+}; # sub windows
+
+sub linux {
+    my ( $arch, $output, @archives ) = @_;
+    # TODO: Check the archives are of specified architecture.
+    my $cwd = Cwd::cwd();
+    change_dir( $output );
+    foreach my $archive ( @archives ) {
+        info( "Processing \"$archive\"..." );
+        my $path = abs_path( $archive, $cwd );
+        execute( [ "ar", "xo", $path ] );
+    }; # foreach $archive
+    change_dir( $cwd );
+}; # sub linux
+
+my %mac_arch = (
+    "32"  => "i386",
+    "32e" => "x86_64"
+);
+
+sub darwin {
+    my ( $arch, $output, @archives ) = @_;
+    my $cwd = getcwd();
+    change_dir( $output );
+    if ( defined( $arch ) ) {
+        if ( not defined( $mac_arch{ $arch } ) ) {
+            runtime_error( "Architecture \"$arch\" is not a valid one for OS X*" );
+        }; # if
+        $arch = $mac_arch{ $arch };
+    }; # if
+    foreach my $archive ( @archives ) {
+        info( "Processing \"$archive\"..." );
+        my $path = abs_path( $archive, $cwd );
+        my $temp;
+        # Whether archive is a fat or thin?
+        my $bulk;
+        execute( [ "file", $path ], -stdout => \$bulk );
+        if ( $bulk =~ m{Mach-O universal binary} ) {
+            # The archive is fat; extract a thin archive first.
+            if ( not defined( $arch ) ) {
+                runtime_error(
+                    "\"$archive\" archive is universal binary, " .
+                        "please specify architecture to work with"
+                );
+            }; # if
+            ( undef, $temp ) = File::Temp::tempfile();
+            execute( [ "libtool", "-static", "-arch_only", $arch, "-o", $temp, $path ] );
+            $path = $temp;
+        }; # if
+        execute( [ "ar", "xo", $path ] );     # Extract members.
+        if ( defined( $temp ) ) {             # Delete temp file, if any.
+            del_file( $temp );
+        }; # if
+    }; # foreach $archive
+    change_dir( $cwd );
+}; # sub darwin
+
+
+# --------------------------------------------------------------------------------------------------
+# Main.
+# --------------------------------------------------------------------------------------------------
+
+# Parse command line.
+
+my $output = ".";
+my @args;
+
+get_options(
+    Platform::target_options(),
+    "o|output-directory=s" => \$output,
+);
+@args = @ARGV;
+
+if ( not -e $output ) {
+    runtime_error( "Output directory \"$output\" does not exist" );
+}; # if
+if ( not -d $output ) {
+    runtime_error( "\"$output\" is not a directory" );
+}; # if
+if ( not -w $output ) {
+    runtime_error( "Output directory \"$output\" is not writable" );
+}; # if
+
+if ( $target_os eq "win" ) {
+    *process = \&windows;
+} elsif ( $target_os eq "lin") {
+    *process = \&linux;
+} elsif ( $target_os eq "mac" ) {
+    *process = \&darwin;
+} else {
+    runtime_error( "OS \"$target_os\" not supported" );
+}; # if
+
+
+# Do the work.
+process( $target_arch, $output, @args );
+exit( 0 );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<extract-objects.pl> -- Extract all object files from static library.
+
+=head1 SYNOPSIS
+
+B<extract-objects.pl> I<option>... I<archive>...
+
+=head1 OPTIONS
+
+=over
+
+=item B<--architecture=>I<arch>
+
+Specify the architecture to work with. The option is mandatory on OS X* in the case of a universal
+archive. In other cases the option should not be used. I<arch> may be one of C<32> or C<32e>.
+
+=item B<--os=>I<str>
+
+Specify OS name. By default OS is autodetected.
+
+Depending on OS, B<extract-objects.pl> uses different external tools for handling static
+libraries: F<ar> (in case of "lin" and "mac") or F<lib.exe> (in case of "win").
+
+=item B<--output-directory=>I<dir>
+
+Specify directory to write extracted members to. Current directory is used by default.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full documentation and exit.
+
+=item B<--quiet>
+
+Do not print information messages.
+
+=item B<--version>
+
+Print version and exit.
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<archive>
+
+A name of archive file (static library). Multiple archives may be specified.
+
+=back
+
+=head1 DESCRIPTION
+
+The script extracts all the members (object files) from an archive (static library) to the
+specified directory. Commands to perform this action differ across OSes. On Linux* OS, the simple
+command
+
+    ar xo libfile.a
+
+is enough (when extracting files to the current directory).
+
+On OS X*, it is a bit more complicated with universal ("fat") binaries -- C<ar> cannot
+operate on fat archives, so a "thin" archive must be extracted from the universal binary first.
+
+On Windows* OS, the library manager (C<lib.exe>) can extract only one object file at a time, so the
+operation must be repeated for every object file in the library.
+
+B<extract-objects.pl> detects the OS automatically, but detection can be overridden with the
+B<--os> option. This may be helpful in cross-build environments.
+
+B<extract-objects.pl> effectively encapsulates all these details and provides a uniform way to
+extract object files from static libraries, which helps keep makefiles simple and clean.
+
+=head1 EXAMPLES
+
+Extract object files from library F<libirc.lib>, and put them into F<obj/> directory:
+
+    $ extract-objects.pl --output=obj libirc.lib
+
+Extract object files from library F<libirc.a>. Use Linux* OS tools (F<ar>), even if run on another OS:
+
+    $ extract-objects.pl --os=lin libirc.a
+
+Extract object files from library F<libirc.a>; if it is an OS X* universal binary, use the i386
+architecture. Be quiet:
+
+    $ extract-objects.pl --quiet --arch=i386 libirc.a
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/freebsd.inc b/final/runtime/tools/freebsd.inc
new file mode 100644
index 0000000..7b60eec
--- /dev/null
+++ b/final/runtime/tools/freebsd.inc
@@ -0,0 +1,12 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+include tools/linux.inc
diff --git a/final/runtime/tools/generate-def.pl b/final/runtime/tools/generate-def.pl
new file mode 100755
index 0000000..7c2b0f3
--- /dev/null
+++ b/final/runtime/tools/generate-def.pl
@@ -0,0 +1,321 @@
+#!/usr/bin/env perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Some pragmas.
+use strict;          # Restrict unsafe constructs.
+use warnings;        # Enable all warnings.
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+
+our $VERSION = "0.004";
+
+#
+# Subroutines.
+#
+
+sub parse_input($\%) {
+
+    my ( $input, $defs ) = @_;
+    my @bulk = read_file( $input );
+    my %entries;
+    my %ordinals;
+    my @dirs;
+    my $value = 1;
+
+    my $error =
+        sub {
+            my ( $msg, $l, $line ) = @_;
+            runtime_error(
+                "Error parsing file \"$input\" line $l:\n" .
+                "    $line" .
+                ( $msg ? $msg . "\n" : "" )
+            );
+        }; # sub
+
+    my $n = 0;    # Line number.
+    foreach my $line ( @bulk ) {
+        ++ $n;
+        if ( 0 ) {
+        } elsif ( $line =~ m{^\s*(?:#|\n)} ) {
+            # Empty line or comment. Skip it.
+        } elsif ( $line =~ m{^\s*%} ) {
+            # A directive.
+            if ( 0  ) {
+            } elsif ( $line =~ m{^\s*%\s*if(n)?def\s+([A-Za-z0-9_]+)\s*(?:#|\n)} ) {
+                my ( $negation, $name ) = ( $1, $2 );
+                my $dir = { n => $n, line => $line, name => $name, value => $value };
+                push( @dirs, $dir );
+                $value = ( $value and ( $negation xor $defs->{ $name } ) );
+            } elsif ( $line =~ m{^\s*%\s*endif\s*(?:#|\n)} ) {
+                if ( not @dirs ) {
+                    $error->( "Orphan %endif directive.", $n, $line );
+                }; # if
+                my $dir = pop( @dirs );
+                $value = $dir->{ value };
+            } else {
+                $error->( "Bad directive.", $n, $line );
+            }; # if
+        } elsif ( $line =~ m{^\s*(-)?\s*([A-Za-z0-9_]+)(?:\s+(\d+|DATA))?\s*(?:#|\n)} ) {
+            my ( $obsolete, $entry, $ordinal ) = ( $1, $2, $3 );
+            if ( $value ) {
+                if ( exists( $entries{ $entry } ) ) {
+                    $error->( "Entry \"$entry\" has already been specified.", $n, $line );
+                }; # if
+                $entries{ $entry } = { ordinal => $ordinal, obsolete => defined( $obsolete ) };
+                if ( defined( $ordinal ) and $ordinal ne "DATA" ) {
+                    if ( $ordinal >= 1000 and $entry =~ m{\A[ok]mp_} ) {
+                        $error->( "Ordinal of user-callable entry must be < 1000", $n, $line );
+                    }; # if
+                    if ( $ordinal >= 1000 and $ordinal < 2000 ) {
+                        $error->( "Ordinals between 1000 and 1999 are reserved.", $n, $line );
+                    }; # if
+                    if ( exists( $ordinals{ $ordinal } ) ) {
+                        $error->( "Ordinal $ordinal has already been used.", $n, $line );
+                    }; # if
+                    $ordinals{ $ordinal } = $entry;
+                }; # if
+            }; # if
+        } else {
+            $error->( "", $n, $line );
+        }; # if
+    }; # foreach
+
+    if ( @dirs ) {
+        my $dir = pop( @dirs );
+        $error->( "Unterminated %if direcive.", $dir->{ n }, $dir->{ line } );
+    }; # while
+
+    return %entries;
+
+}; # sub parse_input
+
+sub process(\%) {
+
+    my ( $entries ) = @_;
+
+    foreach my $entry ( keys( %$entries ) ) {
+        if ( not $entries->{ $entry }->{ obsolete } ) {
+            my $ordinal = $entries->{ $entry }->{ ordinal };
+            if ( $entry =~ m{\A[ok]mp_} ) {
+                if ( not defined( $ordinal ) or $ordinal eq "DATA" ) {
+                    runtime_error(
+                        "Bad entry \"$entry\": ordinal number is not specified."
+                    );
+                }; # if
+                $entries->{ uc( $entry ) } = { ordinal => 1000 + $ordinal };
+            }; # if
+        }; # if
+    }; # foreach
+
+    return %$entries;
+
+}; # sub process
+
+sub generate_output(\%$) {
+
+    my ( $entries, $output ) = @_;
+    my $bulk;
+
+    $bulk = "EXPORTS\n";
+    foreach my $entry ( sort( keys( %$entries ) ) ) {
+        if ( not $entries->{ $entry }->{ obsolete } ) {
+            $bulk .= sprintf( "    %-40s ", $entry );
+            my $ordinal = $entries->{ $entry }->{ ordinal };
+            if ( defined( $ordinal ) ) {
+                if ( $ordinal eq "DATA" ) {
+                    $bulk .= "DATA";
+                } else {
+                    $bulk .= "\@" . $ordinal;
+                }; # if
+            }; # if
+            $bulk .= "\n";
+        }; # if
+    }; # foreach
+    if ( defined( $output ) ) {
+        write_file( $output, \$bulk );
+    } else {
+        print( $bulk );
+    }; # if
+
+}; # sub generate_output
+
+#
+# Parse command line.
+#
+
+my $input;   # The name of input file.
+my $output;  # The name of output file.
+my %defs;
+
+get_options(
+    "output=s"    => \$output,
+    "D|define=s"  =>
+        sub {
+            my ( $opt_name, $opt_value ) = @_;
+            my ( $def_name, $def_value );
+            if ( $opt_value =~ m{\A(.*?)=(.*)\z} ) {
+                ( $def_name, $def_value ) = ( $1, $2 );
+            } else {
+                ( $def_name, $def_value ) = ( $opt_value, 1 );
+            }; # if
+            $defs{ $def_name } = $def_value;
+        },
+);
+
+if ( @ARGV == 0 ) {
+    cmdline_error( "Not enough arguments." );
+}; # if
+if ( @ARGV > 1 ) {
+    cmdline_error( "Too many arguments." );
+}; # if
+$input = shift( @ARGV );
+
+#
+# Work.
+#
+
+my %data = parse_input( $input, %defs );
+%data = process( %data );
+generate_output( %data, $output );
+exit( 0 );
+
+__END__
+
+#
+# Embedded documentation.
+#
+
+=pod
+
+=head1 NAME
+
+B<generate-def.pl> -- Generate def file for OpenMP RTL.
+
+=head1 SYNOPSIS
+
+B<generate-def.pl> I<OPTION>... I<file>
+
+=head1 OPTIONS
+
+=over
+
+=item B<--define=>I<name>[=I<value>]
+
+=item B<-D> I<name>[=I<value>]
+
+Define the specified name. If I<value> is omitted, I<name> is defined as 1. If I<value> is 0 or
+empty, the name is B<not> defined.
+
+=item B<--output=>I<file>
+
+=item B<-o> I<file>
+
+Specify the output file name. If the option is not present, the result is printed to stdout.
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full help message and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--usage>
+
+Print very short usage message and exit.
+
+=item B<--verbose>
+
+Do print informational messages.
+
+=item B<--version>
+
+Print version and exit.
+
+=item B<--quiet>
+
+Work quietly; do not print informational messages.
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<file>
+
+A name of input file.
+
+=back
+
+=head1 DESCRIPTION
+
+The script reads the input file, processes conditional directives, checks the content for
+consistency, and generates an output file suitable for the linker.
+
+=head2 Input File Format
+
+=over
+
+=item Comments
+
+    # It's a comment.
+
+Comments start with the C<#> symbol and continue to the end of the line.
+
+=item Conditional Directives
+
+    %ifdef name
+    %ifndef name
+    %endif
+
+A part of the file surrounded by C<%ifdef I<name>> and C<%endif> directives is a conditional part --
+it has effect only if I<name> is defined on the command line by the B<--define> option. C<%ifndef>
+is a negated version of C<%ifdef> -- the conditional part has an effect only if I<name> is B<not>
+defined.
+
+Conditional parts may be nested.
+
+=item Export Definitions
+
+    symbol
+    symbol ordinal
+    symbol DATA
+
+Symbols starting with C<omp_> or C<kmp_> must have an ordinal specified. They are subject to special
+processing: each such symbol generates two output lines: the original one and an upper-case version.
+The ordinal number of the second is the original ordinal increased by 1000.
+
+=item Obsolete Symbols
+
+    - symbol
+    - symbol ordinal
+    - symbol DATA
+
+Obsolete symbols look like export definitions prefixed with a minus sign. They do not affect the
+output, but their names and ordinals cannot be (re)used in export definitions.
+
+=back
+
+=head1 EXAMPLES
+
+    $ generate-def.pl -D stub -D USE_TCHECK=0 -o libguide.def dllexport
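+
+A hypothetical input fragment and the resulting output (the symbol and ordinal are
+illustrative; output spacing is approximate):
+
+    # Input:
+    omp_get_wtime 15
+
+    # Output:
+    EXPORTS
+        OMP_GET_WTIME                            @1015
+        omp_get_wtime                            @15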
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/lib/Build.pm b/final/runtime/tools/lib/Build.pm
new file mode 100644
index 0000000..cf67156
--- /dev/null
+++ b/final/runtime/tools/lib/Build.pm
@@ -0,0 +1,264 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+package Build;
+
+use strict;
+use warnings;
+
+use Cwd qw{};
+use Symbol qw{};    # Symbol::gensym() is used in tee() below.
+
+use LibOMP;
+use tools;
+use Uname;
+use Platform ":vars";
+
+my $host = Uname::host_name();
+my $root = $ENV{ LIBOMP_WORK    };
+my $tmp  = $ENV{ LIBOMP_TMP     };
+my $out  = $ENV{ LIBOMP_EXPORTS };
+
+my @jobs;
+our $start = time();
+
+# --------------------------------------------------------------------------------------------------
+# Helper functions.
+# --------------------------------------------------------------------------------------------------
+
+# tstr -- Time string. Returns string "yyyy-mm-dd hh:mm:ss UTC".
+sub tstr(;$) {
+    my ( $time ) = @_;
+    if ( not defined( $time ) ) {
+        $time = time();
+    }; # if
+    my ( $sec, $min, $hour, $day, $month, $year ) = gmtime( $time );
+    $month += 1;
+    $year  += 1900;
+    my $str = sprintf( "%04d-%02d-%02d %02d:%02d:%02d UTC", $year, $month, $day, $hour, $min, $sec );
+    return $str;
+}; # sub tstr
+
+# dstr -- Duration string. Returns string "hh:mm:ss".
+sub dstr($) {
+    # Get time in seconds and format it as time in hours, minutes, seconds.
+    my ( $sec ) = @_;
+    my ( $h, $m, $s );
+    $h   = int( $sec / 3600 );
+    $sec = $sec - $h * 3600;
+    $m   = int( $sec / 60 );
+    $sec = $sec - $m * 60;
+    $s   = int( $sec );
+    $sec = $sec - $s;
+    return sprintf( "%02d:%02d:%02d", $h, $m, $s );
+}; # sub dstr
+
+# rstr -- Result string.
+sub rstr($) {
+    my ( $rc ) = @_;
+    return ( $rc == 0 ? "+++ Success +++" : "--- Failure ---" );
+}; # sub rstr
+
+sub shorter($;$) {
+    # Return shorter variant of path -- either absolute or relative.
+    my ( $path, $base ) = @_;
+    my $abs = abs_path( $path );
+    my $rel = rel_path( $path, $base );
+    if ( $rel eq "" ) {
+        $rel = ".";
+    }; # if
+    $path = ( length( $rel ) < length( $abs ) ? $rel : $abs );
+    if ( $target_os eq "win" ) {
+        $path =~ s{\\}{/}g;
+    }; # if
+    return $path;
+}; # sub shorter
+
+sub tee($$) {
+
+    my ( $action, $file ) = @_;
+    my $pid = 0;
+
+    my $save_stdout = Symbol::gensym();
+    my $save_stderr = Symbol::gensym();
+
+    # --- redirect stdout ---
+    STDOUT->flush();
+    # Save stdout in $save_stdout.
+    open( $save_stdout, ">&" . STDOUT->fileno() )
+        or die( "Cannot dup filehandle: $!; stopped" );
+    # Redirect stdout to tee or to file.
+    if ( $tools::verbose ) {
+        $pid = open( STDOUT, "| tee -a \"$file\"" )
+            or die "Cannot open pipe to \"tee\": $!; stopped";
+    } else {
+        open( STDOUT, ">>$file" )
+            or die "Cannot open file \"$file\" for writing: $!; stopped";
+    }; # if
+
+    # --- redirect stderr ---
+    STDERR->flush();
+    # Save stderr in $save_stderr.
+    open( $save_stderr, ">&" . STDERR->fileno() )
+        or die( "Cannot dup filehandle: $!; stopped" );
+    # Redirect stderr to stdout.
+    open( STDERR, ">&" . STDOUT->fileno() )
+        or die( "Cannot dup filehandle: $!; stopped" );
+
+    # Perform actions.
+    $action->();
+
+    # --- restore stderr ---
+    STDERR->flush();
+    # Restore stderr from $save_stderr.
+    open( STDERR, ">&" . $save_stderr->fileno() )
+        or die( "Cannot dup filehandle: $!; stopped" );
+    # Close $save_stderr.
+    $save_stderr->close() or die ( "Cannot close filehandle: $!; stopped" );
+
+    # --- restore stdout ---
+    STDOUT->flush();
+    # Restore stdout from $save_stdout.
+    open( STDOUT, ">&" . $save_stdout->fileno() )
+        or die( "Cannot dup filehandle: $!; stopped" );
+    # Close $save_stdout.
+    $save_stdout->close() or die ( "Cannot close filehandle: $!; stopped" );
+
+    # Wait for the child tee process, otherwise output of make and build.pl interleaves.
+    if ( $pid != 0 ) {
+        waitpid( $pid, 0 );
+    }; # if
+
+}; # sub tee
+
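+# Usage sketch (the action and the file name are illustrative): run an action while sending its
+# stdout and stderr to a log file, echoing to the terminal when in verbose mode:
+#
+#     tee( sub { print( "Building...\n" ); }, "build.log" );
+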
+sub log_it($$@) {
+    my ( $title, $format, @args ) = @_;
+    my $message  = sprintf( $format, @args );
+    my $progress = cat_file( $tmp, sprintf( "%s-%s.log", $target_platform, Uname::host_name() ) );
+    if ( $title ne "" and $message ne "" ) {
+        my $line = sprintf( "%-15s : %s\n", $title, $message );
+        info( $line );
+        write_file( $progress, tstr() . ": " . $line, -append => 1 );
+    } else {
+        write_file( $progress, "\n", -append => 1 );
+    }; # if
+}; # sub log_it
+
+sub progress($$@) {
+    my ( $title, $format, @args ) = @_;
+    log_it( $title, $format, @args );
+}; # sub progress
+
+sub summary() {
+    my $total   = @jobs;
+    my $success = 0;
+    my $finish = time();
+    foreach my $job ( @jobs ) {
+        my ( $build_dir, $rc ) = ( $job->{ build_dir }, $job->{ rc } );
+        progress( rstr( $rc ), "%s", $build_dir );
+        if ( $rc == 0 ) {
+            ++ $success;
+        }; # if
+    }; # foreach $job
+    my $failure = $total - $success;
+    progress( "Successes",      "%3d of %3d", $success, $total );
+    progress( "Failures",       "%3d of %3d", $failure, $total );
+    progress( "Time elapsed",   "  %s", dstr( $finish - $start ) );
+    progress( "Overall result", "%s", rstr( $failure ) );
+    return $failure;
+}; # sub summary
+
+# --------------------------------------------------------------------------------------------------
+# Worker functions.
+# --------------------------------------------------------------------------------------------------
+
+sub init() {
+    make_dir( $tmp );
+}; # sub init
+
+sub clean(@) {
+    # Clean directories.
+    my ( @dirs ) = @_;
+    my $exit = 0;
+    # Mimic a makefile -- print the command.
+    print( "rm -f -r " . join( " ", map( shorter( $_ ) . "/*", @dirs ) ) . "\n" );
+    $exit =
+        execute(
+            [ $^X, cat_file( $ENV{ LIBOMP_WORK }, "tools", "clean-dir.pl" ), @dirs ],
+            -ignore_status => 1,
+            ( $tools::verbose ? () : ( -stdout => undef, -stderr => "" ) ),
+        );
+    return $exit;
+}; # sub clean
+
+sub make($$$) {
+    # Change dir to build one and run make.
+    my ( $job, $clean, $marker ) = @_;
+    my $dir      = $job->{ build_dir };
+    my $makefile = $job->{ makefile };
+    my $args     = $job->{ make_args };
+    my $cwd      = Cwd::cwd();
+    my $width    = -10;
+
+    my $exit;
+    $dir = cat_dir( $tmp, $dir );
+    make_dir( $dir );
+    change_dir( $dir );
+
+    my $actions =
+        sub {
+            my $start = time();
+            $makefile = shorter( $makefile );
+            print( "-" x 79, "\n" );
+            printf( "%${width}s: %s\n", "Started",   tstr( $start ) );
+            printf( "%${width}s: %s\n", "Root dir",  $root );
+            printf( "%${width}s: %s\n", "Build dir", shorter( $dir, $root ) );
+            printf( "%${width}s: %s\n", "Makefile",  $makefile );
+            print( "-" x 79, "\n" );
+            {
+                # Use shorter LIBOMP_WORK to have shorter command lines.
+                # Note: Some tools may not work if current dir is changed.
+                local $ENV{ LIBOMP_WORK } = shorter( $ENV{ LIBOMP_WORK } );
+                $exit =
+                    execute(
+                        [
+                            "make",
+                            "-r",
+                            "-f", $makefile,
+                            "arch=" . $target_arch,
+                            "marker=$marker",
+                            @$args
+                        ],
+                        -ignore_status => 1
+                    );
+                if ( $clean and $exit == 0 ) {
+                    $exit = clean( $dir );
+                }; # if
+            }
+            my $finish = time();
+            print( "-" x 79, "\n" );
+            printf( "%${width}s: %s\n", "Finished", tstr( $finish ) );
+            printf( "%${width}s: %s\n", "Elapsed", dstr( $finish - $start ) );
+            printf( "%${width}s: %s\n", "Result", rstr( $exit ) );
+            print( "-" x 79, "\n" );
+            print( "\n" );
+        }; # sub
+    tee( $actions, "build.log" );
+
+    change_dir( $cwd );
+
+    # Save completed job to be able print summary later.
+    $job->{ rc } = $exit;
+    push( @jobs, $job );
+
+    return $exit;
+
+}; # sub make
+
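+# A job is a hash with the keys used above. Call sketch (all values are illustrative):
+#
+#     Build::make(
+#         { build_dir => "lin_32e", makefile => "$root/src/makefile.mk", make_args => [] },
+#         0,             # Do not clean the build directory on success.
+#         "marker.txt",  # Marker file name passed to make (hypothetical).
+#     );
+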
+1;
diff --git a/final/runtime/tools/lib/LibOMP.pm b/final/runtime/tools/lib/LibOMP.pm
new file mode 100644
index 0000000..06a371f
--- /dev/null
+++ b/final/runtime/tools/lib/LibOMP.pm
@@ -0,0 +1,85 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+package LibOMP;
+
+use strict;
+use warnings;
+
+use tools;
+
+sub empty($) {
+    my ( $var ) = @_;
+    # Parentheses are essential: a bare "return EXPR or ..." returns the first operand only,
+    # leaving the rest of the condition as dead code.
+    return ( not exists( $ENV{ $var } ) or not defined( $ENV{ $var } ) or $ENV{ $var } eq "" );
+}; # sub empty
+
+my ( $base, $out, $tmp );
+if ( empty( "LIBOMP_WORK" ) ) {
+    # $FindBin::Bin is not used intentionally because it gives real path. I want to use absolute,
+    # but not real one (real path does not contain symlinks while absolute path may contain
+    # symlinks).
+    $base = get_dir( get_dir( abs_path( $0 ) ) );
+} else {
+    $base = abs_path( $ENV{ LIBOMP_WORK } );
+}; # if
+
+if ( empty( "LIBOMP_EXPORTS" ) ) {
+    $out = cat_dir( $base, "exports" );
+} else {
+    $out = abs_path( $ENV{ LIBOMP_EXPORTS } );
+}; # if
+
+if ( empty( "LIBOMP_TMP" ) ) {
+    $tmp = cat_dir( $base, "tmp" );
+} else {
+    $tmp = abs_path( $ENV{ LIBOMP_TMP } );
+}; # if
+
+$ENV{ LIBOMP_WORK    } = $base;
+$ENV{ LIBOMP_EXPORTS } = $out;
+$ENV{ LIBOMP_TMP     } = $tmp;
+
+return 1;
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<LibOMP.pm> -- Check and set the C<LIBOMP_*> environment variables.
+
+=head1 SYNOPSIS
+
+    use FindBin;
+    use lib "$FindBin::Bin/lib";
+    use LibOMP;
+
+    $ENV{ LIBOMP_WORK    }
+    $ENV{ LIBOMP_TMP     }
+    $ENV{ LIBOMP_EXPORTS }
+
+=head1 DESCRIPTION
+
+The module checks the C<LIBOMP_WORK>, C<LIBOMP_EXPORTS>, and C<LIBOMP_TMP> environment variables.
+If a variable is set, the module makes sure its value is an absolute path. If a variable does not
+exist, the module sets it to a default value.
+
+The default value for C<LIBOMP_EXPORTS> is C<$LIBOMP_WORK/exports>; for C<LIBOMP_TMP> it is
+C<$LIBOMP_WORK/tmp>.
+
+The value for C<LIBOMP_WORK> is guessed. The module assumes the script (which uses the module) is
+located in the C<tools/> directory of the libomp directory tree, and uses the path of the script to
+calculate C<LIBOMP_WORK>.
+
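+For example (a sketch; the paths are hypothetical), for a script located in
+C</home/user/libomp/tools/build.pl> and run with no C<LIBOMP_*> variables set, the module would end
+up with:
+
+    LIBOMP_WORK    = /home/user/libomp
+    LIBOMP_EXPORTS = /home/user/libomp/exports
+    LIBOMP_TMP     = /home/user/libomp/tmp
+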
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/lib/Platform.pm b/final/runtime/tools/lib/Platform.pm
new file mode 100644
index 0000000..8b701c2
--- /dev/null
+++ b/final/runtime/tools/lib/Platform.pm
@@ -0,0 +1,470 @@
+#
+# This is not a runnable script, it is a Perl module, a collection of variables, subroutines, etc.
+# to be used in Perl scripts.
+#
+# To get help about exported variables and subroutines, execute the following command:
+#
+#     perldoc Platform.pm
+#
+# or see POD (Plain Old Documentation) embedded in the source...
+#
+#
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+package Platform;
+
+use strict;
+use warnings;
+
+use base "Exporter";
+
+use Uname;
+
+my @vars;
+
+BEGIN {
+    @vars = qw{ $host_arch $host_os $host_platform $target_arch $target_mic_arch $target_os $target_platform };
+}
+
+our $VERSION     = "0.014";
+our @EXPORT      = qw{};
+our @EXPORT_OK   = ( qw{ canon_arch canon_os canon_mic_arch legal_arch arch_opt }, @vars );
+our %EXPORT_TAGS = ( all => [ @EXPORT_OK ], vars => \@vars );
+
+# Canonize architecture name.
+sub canon_arch($) {
+    my ( $arch ) = @_;
+    if ( defined( $arch ) ) {
+        if ( $arch =~ m{\A\s*(?:32|IA-?32|IA-?32 architecture|i[3456]86|x86)\s*\z}i ) {
+            $arch = "32";
+        } elsif ( $arch =~ m{\A\s*(?:48|(?:ia)?32e|Intel\s*64|Intel\(R\)\s*64|x86[_-]64|x64|AMD64)\s*\z}i ) {
+            $arch = "32e";
+        } elsif ( $arch =~ m{\Aarm(?:v7\D*)?\z} ) {
+            $arch = "arm";
+        } elsif ( $arch =~ m{\Appc64le} ) {
+            $arch = "ppc64le";
+        } elsif ( $arch =~ m{\Appc64} ) {
+            $arch = "ppc64";
+        } elsif ( $arch =~ m{\Aaarch64} ) {
+            $arch = "aarch64";
+        } elsif ( $arch =~ m{\Amic} ) {
+            $arch = "mic";
+        } else {
+            $arch = undef;
+        }; # if
+    }; # if
+    return $arch;
+}; # sub canon_arch
+
+# Canonize Intel(R) Many Integrated Core Architecture name.
+sub canon_mic_arch($) {
+    my ( $mic_arch ) = @_;
+    if ( defined( $mic_arch ) ) {
+        if ( $mic_arch =~ m{\Aknf} ) {
+            $mic_arch = "knf";
+        } elsif ( $mic_arch =~ m{\Aknc}) {
+            $mic_arch = "knc";
+        } elsif ( $mic_arch =~ m{\Aknl} ) {
+            $mic_arch = "knl";
+        } else {
+            $mic_arch = undef;
+        }; # if
+    }; # if
+    return $mic_arch;
+}; # sub canon_mic_arch
+
+{  # Return legal approved architecture name.
+    my %legal = (
+        "32"  => "IA-32 architecture",
+        "32e" => "Intel(R) 64",
+        "arm" => "ARM",
+        "aarch64" => "AArch64",
+        "mic" => "Intel(R) Many Integrated Core Architecture",
+    );
+
+    sub legal_arch($) {
+        my ( $arch ) = @_;
+        $arch = canon_arch( $arch );
+        if ( defined( $arch ) ) {
+            $arch = $legal{ $arch };
+        }; # if
+        return $arch;
+    }; # sub legal_arch
+}
+
+{  # Return architecture name suitable for Intel compiler setup scripts.
+    my %option = (
+        "32"  => "ia32",
+        "32e" => "intel64",
+        "64"  => "ia64",
+        "arm" => "arm",
+        "aarch64" => "aarch",
+        "mic" => "intel64",
+    );
+
+    sub arch_opt($) {
+        my ( $arch ) = @_;
+        $arch = canon_arch( $arch );
+        if ( defined( $arch ) ) {
+            $arch = $option{ $arch };
+        }; # if
+        return $arch;
+    }; # sub arch_opt
+}
+
+# Canonize OS name.
+sub canon_os($) {
+    my ( $os ) = @_;
+    if ( defined( $os ) ) {
+        if ( $os =~ m{\A\s*(?:Linux|lin|l)\s*\z}i ) {
+            $os = "lin";
+        } elsif ( $os =~ m{\A\s*(?:Mac(?:\s*OS(?:\s*X)?)?|mac|m|Darwin)\s*\z}i ) {
+            $os = "mac";
+        } elsif ( $os =~ m{\A\s*(?:Win(?:dows)?(?:(?:_|\s*)?(?:NT|XP|95|98|2003))?|w)\s*\z}i ) {
+            $os = "win";
+        } else {
+            $os = undef;
+        }; # if
+    }; # if
+    return $os;
+}; # sub canon_os
+
+my ( $_host_os, $_host_arch, $_target_os, $_target_arch, $_target_mic_arch, $_default_mic_arch);
+
+# Set the default mic-arch value.
+$_default_mic_arch = "knc";
+
+sub set_target_arch($) {
+    my ( $arch ) = canon_arch( $_[ 0 ] );
+    if ( defined( $arch ) ) {
+        $_target_arch       = $arch;
+        $ENV{ LIBOMP_ARCH } = $arch;
+    }; # if
+    return $arch;
+}; # sub set_target_arch
+
+sub set_target_mic_arch($) {
+    my ( $mic_arch ) = canon_mic_arch( $_[ 0 ] );
+    if ( defined( $mic_arch ) ) {
+        $_target_mic_arch       = $mic_arch;
+        $ENV{ LIBOMP_MIC_ARCH } = $mic_arch;
+    }; # if
+    return $mic_arch;
+}; # sub set_target_mic_arch
+
+sub set_target_os($) {
+    my ( $os ) = canon_os( $_[ 0 ] );
+    if ( defined( $os ) ) {
+        $_target_os       = $os;
+        $ENV{ LIBOMP_OS } = $os;
+    }; # if
+    return $os;
+}; # sub set_target_os
+
+sub target_options() {
+    my @options = (
+        "target-os|os=s" =>
+            sub {
+                set_target_os( $_[ 1 ] ) or
+                    die "Bad value of --target-os option: \"$_[ 1 ]\"\n";
+            },
+        "target-architecture|target-arch|architecture|arch=s" =>
+           sub {
+               set_target_arch( $_[ 1 ] ) or
+                   die "Bad value of --target-architecture option: \"$_[ 1 ]\"\n";
+           },
+        "target-mic-architecture|target-mic-arch|mic-architecture|mic-arch=s" =>
+           sub {
+               set_target_mic_arch( $_[ 1 ] ) or
+                   die "Bad value of --target-mic-architecture option: \"$_[ 1 ]\"\n";
+           },
+    );
+    return @options;
+}; # sub target_options
+
+# Detect host arch.
+{
+    my $hardware_platform = Uname::hardware_platform();
+    if ( 0 ) {
+    } elsif ( $hardware_platform eq "i386" ) {
+        $_host_arch = "32";
+    } elsif ( $hardware_platform eq "ia64" ) {
+        $_host_arch = "64";
+    } elsif ( $hardware_platform eq "x86_64" ) {
+        $_host_arch = "32e";
+    } elsif ( $hardware_platform eq "arm" ) {
+        $_host_arch = "arm";
+    } elsif ( $hardware_platform eq "ppc64le" ) {
+        $_host_arch = "ppc64le";
+    } elsif ( $hardware_platform eq "ppc64" ) {
+        $_host_arch = "ppc64";
+    } elsif ( $hardware_platform eq "aarch64" ) {
+        $_host_arch = "aarch64";
+    } else {
+        die "Unsupported host hardware platform: \"$hardware_platform\"; stopped";
+    }; # if
+}
+
+# Detect host OS.
+{
+    my $operating_system = Uname::operating_system();
+    if ( 0 ) {
+    } elsif ( $operating_system eq "GNU/Linux" ) {
+        $_host_os = "lin";
+    } elsif ( $operating_system eq "FreeBSD" ) {
+        # Host OS resembles Linux.
+        $_host_os = "lin";
+    } elsif ( $operating_system eq "Darwin" ) {
+        $_host_os = "mac";
+    } elsif ( $operating_system eq "MS Windows" ) {
+        $_host_os = "win";
+    } else {
+        die "Unsupported host operating system: \"$operating_system\"; stopped";
+    }; # if
+}
+
+# Detect target arch.
+if ( defined( $ENV{ LIBOMP_ARCH } ) ) {
+    # Use arch specified in LIBOMP_ARCH.
+    $_target_arch = canon_arch( $ENV{ LIBOMP_ARCH } );
+    if ( not defined( $_target_arch ) ) {
+        die "Unknown architecture specified in LIBOMP_ARCH environment variable: \"$ENV{ LIBOMP_ARCH }\"";
+    }; # if
+} else {
+    # Otherwise use host architecture.
+    $_target_arch = $_host_arch;
+}; # if
+$ENV{ LIBOMP_ARCH } = $_target_arch;
+
+# Detect target Intel(R) Many Integrated Core Architecture.
+if ( defined( $ENV{ LIBOMP_MIC_ARCH } ) ) {
+    # Use mic arch specified in LIBOMP_MIC_ARCH.
+    $_target_mic_arch = canon_mic_arch( $ENV{ LIBOMP_MIC_ARCH } );
+    if ( not defined( $_target_mic_arch ) ) {
+        die "Unknown architecture specified in LIBOMP_MIC_ARCH environment variable: \"$ENV{ LIBOMP_MIC_ARCH }\"";
+    }; # if
+} else {
+    # Otherwise use default Intel(R) Many Integrated Core Architecture.
+    $_target_mic_arch = $_default_mic_arch;
+}; # if
+$ENV{ LIBOMP_MIC_ARCH } = $_target_mic_arch;
+
+# Detect target OS.
+if ( defined( $ENV{ LIBOMP_OS } ) ) {
+    # Use OS specified in LIBOMP_OS.
+    $_target_os = canon_os( $ENV{ LIBOMP_OS } );
+    if ( not defined( $_target_os ) ) {
+        die "Unknown OS specified in LIBOMP_OS environment variable: \"$ENV{ LIBOMP_OS }\"";
+    }; # if
+} else {
+    # Otherwise use host OS.
+    $_target_os = $_host_os;
+}; # if
+$ENV{ LIBOMP_OS } = $_target_os;
+
+use vars @vars;
+
+tie( $host_arch,       "Platform::host_arch" );
+tie( $host_os,         "Platform::host_os" );
+tie( $host_platform,   "Platform::host_platform" );
+tie( $target_arch,     "Platform::target_arch" );
+tie( $target_mic_arch, "Platform::target_mic_arch" );
+tie( $target_os,       "Platform::target_os" );
+tie( $target_platform, "Platform::target_platform" );
+
+{ package Platform::base;
+
+    use Carp;
+
+    use Tie::Scalar;
+    use base "Tie::StdScalar";
+
+    sub STORE {
+        my $self = shift( @_ );
+        croak( "Modifying \$" . ref( $self ) . " is not allowed; stopped" );
+    }; # sub STORE
+
+} # package Platform::base
+
+{ package Platform::host_arch;
+    use base "Platform::base";
+    sub FETCH {
+        return $_host_arch;
+    }; # sub FETCH
+} # package Platform::host_arch
+
+{ package Platform::host_os;
+    use base "Platform::base";
+    sub FETCH {
+        return $_host_os;
+    }; # sub FETCH
+} # package Platform::host_os
+
+{ package Platform::host_platform;
+    use base "Platform::base";
+    sub FETCH {
+        return "${_host_os}_${_host_arch}";
+    }; # sub FETCH
+} # package Platform::host_platform
+
+{ package Platform::target_arch;
+    use base "Platform::base";
+    sub FETCH {
+        return $_target_arch;
+    }; # sub FETCH
+} # package Platform::target_arch
+
+{ package Platform::target_mic_arch;
+    use base "Platform::base";
+    sub FETCH {
+        return $_target_mic_arch;
+    }; # sub FETCH
+} # package Platform::target_mic_arch
+
+{ package Platform::target_os;
+    use base "Platform::base";
+    sub FETCH {
+        return $_target_os;
+    }; # sub FETCH
+} # package Platform::target_os
+
+{ package Platform::target_platform;
+    use base "Platform::base";
+    sub FETCH {
+        if ($_target_arch eq "mic") {
+            return "${_target_os}_${_target_mic_arch}";
+        } else {
+            return "${_target_os}_${_target_arch}";
+        }
+    }; # sub FETCH
+} # package Platform::target_platform
+
+
+return 1;
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<Platform.pm> -- A few subroutines to get OS, architecture, and platform names in a form suitable
+for naming files, directories, macros, etc.
+
+=head1 SYNOPSIS
+
+    use Platform ":all";
+    use tools;
+
+    my $arch   = canon_arch( "Intel64" );      # Returns "32e".
+    my $legal  = legal_arch( "Intel64" );      # Returns "Intel(R) 64".
+    my $option = arch_opt( "Intel64" );        # Returns "intel64".
+    my $os     = canon_os( "Windows NT" );     # Returns "win".
+
+    print( $host_arch, $host_os, $host_platform );
+    print( $target_arch, $target_os, $target_platform );
+
+    tools::get_options(
+        Platform::target_options(),
+        ...
+    );
+
+
+=head1 DESCRIPTION
+
+The environment variable LIBOMP_OS specifies the target OS to report. If LIBOMP_OS is not defined,
+the script assumes the host OS is the target OS.
+
+The environment variable LIBOMP_ARCH specifies the target architecture to report. If LIBOMP_ARCH is
+not defined, the script assumes the host architecture is the target one.
+
+=head2 Functions
+
+=over
+
+=item B<canon_arch( $arch )>
+
+The input string is an architecture name to canonize. The function recognizes many variants, for
+example: C<32e>, C<Intel64>, C<Intel(R) 64>, etc. The returned string is a canonized architecture
+name, one of: C<32>, C<32e>, C<arm>, C<aarch64>, C<ppc64le>, C<ppc64>, C<mic>, or C<undef> if the
+input string is not recognized.
+
+=item B<legal_arch( $arch )>
+
+The input string is an architecture name. The function recognizes the same variants as
+C<canon_arch()> does. The returned string is a name approved by Intel Legal (e. g.
+C<IA-32 architecture> or C<Intel(R) 64>), or C<undef> if the input string is not recognized.
+
+=item B<arch_opt( $arch )>
+
+The input string is an architecture name. The function recognizes the same variants as
+C<canon_arch()> does. The returned string is an architecture name suitable for passing to compiler
+setup scripts (e. g. C<iccvars.sh>), such as C<ia32> or C<intel64>, or C<undef> if the input string
+is not recognized.
+
+=item B<canon_os( $os )>
+
+The input string is an OS name to canonize. The function recognizes many variants, for example:
+C<mac>, C<OS X>, etc. The returned string is a canonized OS name, one of: C<lin>, C<mac>, C<win>,
+or C<undef> if the input string is not recognized.
+
+=item B<target_options()>
+
+Returns array suitable for passing to C<tools::get_options()> to let a script recognize
+C<--target-architecture=I<str>> and C<--target-os=I<str>> options. Typical usage is:
+
+    use tools;
+    use Platform;
+
+    my ( $os, $arch, $platform );    # Global variables, not initialized.
+
+    ...
+
+    get_options(
+        Platform::target_options(),  # Let script recognize --target-os and --target-arch options.
+        ...
+    );
+    # Initialize the variables after parsing the command line.
+    ( $os, $arch, $platform ) = ( $Platform::target_os, $Platform::target_arch, $Platform::target_platform );
+
+=back
+
+=head2 Variables
+
+=over
+
+=item B<$host_arch>
+
+Canonized name of host architecture.
+
+=item B<$host_os>
+
+Canonized name of host OS.
+
+=item B<$host_platform>
+
+Host platform name (concatenated canonized OS name, underscore, and canonized architecture name).
+
+=item B<$target_arch>
+
+Canonized name of target architecture.
+
+=item B<$target_os>
+
+Canonized name of target OS.
+
+=item B<$target_platform>
+
+Target platform name (concatenated canonized OS name, underscore, and canonized architecture name).
+
+=back
+
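+All these variables are read-only; an attempt to assign to one croaks. A short sketch of their use
+(the printed value is an example):
+
+    print( "$target_platform\n" );    # Prints, e. g., "lin_32e".
+    $target_arch = "arm";             # Croaks: modifying the variable is not allowed.
+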
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/lib/Uname.pm b/final/runtime/tools/lib/Uname.pm
new file mode 100644
index 0000000..78ea31d
--- /dev/null
+++ b/final/runtime/tools/lib/Uname.pm
@@ -0,0 +1,633 @@
+#
+# This is not a runnable script, it is a Perl module, a collection of variables, subroutines, etc.
+# To get help about exported variables and subroutines, execute the following command:
+#
+#     perldoc Uname.pm
+#
+# or see POD (Plain Old Documentation) embedded in the source...
+#
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+package Uname;
+
+use strict;
+use warnings;
+use warnings::register;
+use Exporter;
+
+use POSIX;
+use File::Glob ":glob";
+use Net::Domain qw{};
+
+# Following code does not work with Perl 5.6 on Linux* OS and Windows* OS:
+#
+#     use if $^O eq "darwin", tools => qw{};
+#
+# The workaround for Perl 5.6:
+#
+BEGIN {
+    if ( $^O eq "darwin" or $^O eq "linux" ) {
+        require tools;
+        import tools;
+    }; # if
+    if ( $^O eq "MSWin32" ) {
+        require Win32;
+    }; # if
+}; # BEGIN
+
+my $mswin = qr{\A(?:MSWin32|Windows_NT)\z};
+
+my @posix = qw{ kernel_name fqdn kernel_release kernel_version machine };
+    # Properties supported by POSIX::uname().
+my @linux =
+    qw{ processor hardware_platform operating_system };
+    # Properties reported by uname in Linux* OS.
+my @base = ( @posix, @linux );
+    # Base properties.
+my @aux =
+    (
+        qw{ host_name domain_name },
+        map( "operating_system_$_", qw{ name release codename description } )
+    );
+    # Auxiliary properties.
+my @all = ( @base, @aux );
+    # All the properties.
+my @meta = qw{ base_names all_names value };
+    # Meta functions.
+
+our $VERSION     = "0.07";
+our @ISA         = qw{ Exporter };
+our @EXPORT      = qw{};
+our @EXPORT_OK   = ( @all, @meta );
+our %EXPORT_TAGS =
+    (
+        base => [ @base ],
+        all  => [ @all  ],
+        meta => [ @meta ],
+    );
+
+my %values;
+    # Hash of values. Some values are strings, some may be references to code which should be
+    # evaluated to get the real value. This trick is implemented because a call to
+    # Net::Domain::hostfqdn() is relatively slow.
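+    # For example (a sketch; "expensive" and compute_expensive() are hypothetical):
+    #     $values{ expensive } = sub { return compute_expensive(); };
+    #     my $x = value( "expensive" );    # Calls the sub once and caches the string result.
+    #     my $y = value( "expensive" );    # Returns the cached value.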
+
+# Get values from POSIX::uname().
+@values{ @posix } = POSIX::uname();
+
+# On some systems POSIX::uname() returns a "short" node name (without the domain name). To be
+# consistent on all systems, we will get the node name from an alternative source.
+if ( $^O =~ m/cygwin/i ) {
+    # Function from Net::Domain module works well, but on Cygwin it prints to
+    # stderr "domainname: not found". So we will use environment variables for now.
+    $values{ fqdn } = lc( $ENV{ COMPUTERNAME } . "." . $ENV{ USERDNSDOMAIN } );
+} else {
+    # On systems other than Cygwin, let us use Net::Domain::hostfqdn(), but do it only when the
+    # node name is really requested.
+    my $uname_fqdn = $values{ fqdn };    # Save the node name reported by POSIX::uname().
+    $values{ fqdn } =
+        sub {
+            my $fqdn = Net::Domain::hostfqdn(); # "fqdn" stands for "fully qualified domain name".
+            # On some systems POSIX::uname() and Net::Domain::hostfqdn() report different names.
+            # Let us issue a warning if they are significantly different. Names are insignificantly
+            # different if POSIX::uname() matches the beginning of Net::Domain::hostfqdn().
+            if (
+                $uname_fqdn eq substr( $fqdn, 0, length( $uname_fqdn ) )
+                &&
+                (
+                    length( $fqdn ) == length( $uname_fqdn )
+                    ||
+                    substr( $fqdn, length( $uname_fqdn ), 1 ) eq "."
+                )
+            ) {
+                # Ok.
+            } else {
+                warnings::warnif(
+                    "POSIX::uname() and Net::Domain::hostfqdn() reported different names: " .
+                        "\"$uname_fqdn\" and \"$fqdn\" respectively\n"
+                );
+            }; # if
+            return $fqdn;
+        }; # sub
+}; # if
+
+if ( $^O =~ $mswin ) {
+    if (
+        $values{ machine } =~ m{\A(?:x86|[56]86)\z}
+        and
+        exists( $ENV{ PROCESSOR_ARCHITECTURE } ) and $ENV{ PROCESSOR_ARCHITECTURE } eq "x86"
+        and
+        exists( $ENV{ PROCESSOR_ARCHITEW6432 } )
+    ) {
+        if ( $ENV{ PROCESSOR_ARCHITEW6432 } eq "AMD64" ) {
+            $values{ machine } = "x86_64";
+        }; # if
+    }; # if
+}; # if
+
+# Some values are not returned by POSIX::uname(), let us compute them.
+
+# processor.
+$values{ processor } = $values{ machine };
+
+# hardware_platform.
+if ( 0 ) {
+} elsif ( $^O eq "linux" or $^O eq "freebsd" ) {
+    if ( 0 ) {
+    } elsif ( $values{ machine } =~ m{\Ai[3456]86\z} ) {
+        $values{ hardware_platform } = "i386";
+    } elsif ( $values{ machine } =~ m{\A(x86_64|amd64)\z} ) {
+        $values{ hardware_platform } = "x86_64";
+    } elsif ( $values{ machine } =~ m{\Aarmv7\D*\z} ) {
+        $values{ hardware_platform } = "arm";
+    } elsif ( $values{ machine } =~ m{\Appc64le\z} ) {
+        $values{ hardware_platform } = "ppc64le";
+    } elsif ( $values{ machine } =~ m{\Appc64\z} ) {
+        $values{ hardware_platform } = "ppc64";
+    } elsif ( $values{ machine } =~ m{\Aaarch64\z} ) {
+        $values{ hardware_platform } = "aarch64";
+    } else {
+        die "Unsupported machine (\"$values{ machine }\") returned by POSIX::uname(); stopped";
+    }; # if
+} elsif ( $^O eq "darwin" ) {
+    if ( 0 ) {
+    } elsif ( $values{ machine } eq "x86" or $values{ machine } eq "i386" ) {
+        $values{ hardware_platform } =
+            sub {
+                my $platform = "i386";
+                # Some OSes on Intel(R) 64 still report "i386" machine. Verify it by using
+                # the value returned by 'sysctl -n hw.optional.x86_64'. On Intel(R) 64-bit systems the
+                # value == 1; on 32-bit systems the 'hw.optional.x86_64' property either does not exist
+                # or the value == 0. The path variable does not contain a path to sysctl when
+                # started by crontab.
+                my $sysctl = ( which( "sysctl" ) or "/usr/sbin/sysctl" );
+                my $output;
+                debug( "Executing $sysctl..." );
+                execute( [ $sysctl, "-n", "hw.optional.x86_64" ], -stdout => \$output, -stderr => undef );
+                chomp( $output );
+                if ( 0 ) {
+                } elsif ( "$output" eq "" or "$output" eq "0" ) {
+                    $platform = "i386";
+                } elsif ( "$output" eq "1" ) {
+                    $platform = "x86_64";
+                } else {
+                    die "Unsupported value (\"$output\") returned by \"$sysctl -n hw.optional.x86_64\"; stopped";
+                }; # if
+                return $platform;
+            }; # sub {
+    } elsif ( $values{ machine } eq "x86_64" ) {
+        # Some OS X* versions report "x86_64".
+        $values{ hardware_platform } = "x86_64";
+    } else {
+        die "Unsupported machine (\"$values{ machine }\") returned by POSIX::uname(); stopped";
+    }; # if
+} elsif ( $^O =~ $mswin ) {
+    if ( 0 ) {
+    } elsif ( $values{ machine } =~ m{\A(?:x86|[56]86)\z} ) {
+        $values{ hardware_platform } = "i386";
+    } elsif ( $values{ machine } eq "x86_64" or $values{ machine } eq "amd64" ) {
+        # ActivePerl for IA-32 architecture returns "x86_64", while ActivePerl for Intel(R) 64 returns "amd64".
+        $values{ hardware_platform } = "x86_64";
+    } else {
+        die "Unsupported machine (\"$values{ machine }\") returned by POSIX::uname(); stopped";
+    }; # if
+} elsif ( $^O eq "cygwin" ) {
+    if ( 0 ) {
+    } elsif ( $values{ machine } =~ m{\Ai[3456]86\z} ) {
+        $values{ hardware_platform } = "i386";
+    } elsif ( $values{ machine } eq "x86_64" ) {
+        $values{ hardware_platform } = "x86_64";
+    } else {
+        die "Unsupported machine (\"$values{ machine }\") returned by POSIX::uname(); stopped";
+    }; # if
+} else {
+    die "Unsupported OS (\"$^O\"); stopped";
+}; # if
+
+# operating_system.
+if ( 0 ) {
+} elsif ( $values{ kernel_name } eq "Linux" ) {
+    $values{ operating_system } = "GNU/Linux";
+    my $release;    # Name of chosen "*-release" file.
+    my $bulk;       # Content of release file.
+    # On Ubuntu, lsb-release is quite informative, e. g.:
+    #     DISTRIB_ID=Ubuntu
+    #     DISTRIB_RELEASE=9.04
+    #     DISTRIB_CODENAME=jaunty
+    #     DISTRIB_DESCRIPTION="Ubuntu 9.04"
+    # Try lsb-release first. But on some older systems lsb-release is not informative.
+    # It may contain just one line:
+    #     LSB_VERSION="1.3"
+    $release = "/etc/lsb-release";
+    if ( -e $release ) {
+        $bulk = read_file( $release );
+    } else {
+        $bulk = "";
+    }; # if
+    if ( $bulk =~ m{^DISTRIB_} ) {
+        # Ok, this lsb-release is informative.
+        $bulk =~ m{^DISTRIB_ID\s*=\s*(.*?)\s*$}m
+            or runtime_error( "$release: There is no DISTRIB_ID:", $bulk, "(eof)" );
+        $values{ operating_system_name } = $1;
+        $bulk =~ m{^DISTRIB_RELEASE\s*=\s*(.*?)\s*$}m
+            or runtime_error( "$release: There is no DISTRIB_RELEASE:", $bulk, "(eof)" );
+        $values{ operating_system_release } = $1;
+        $bulk =~ m{^DISTRIB_CODENAME\s*=\s*(.*?)\s*$}m
+            or runtime_error( "$release: There is no DISTRIB_CODENAME:", $bulk, "(eof)" );
+        $values{ operating_system_codename } = $1;
+        $bulk =~ m{^DISTRIB_DESCRIPTION\s*="?\s*(.*?)"?\s*$}m
+            or runtime_error( "$release: There is no DISTRIB_DESCRIPTION:", $bulk, "(eof)" );
+        $values{ operating_system_description } = $1;
+    } else {
+        # Oops. lsb-release is missed or not informative. Try other *-release files.
+        $release = "/etc/system-release";
+        if ( not -e $release ) {    # Use /etc/system-release if such a file exists.
+            # Otherwise try other "/etc/*-release" files, but ignore "/etc/lsb-release".
+            my @releases = grep( $_ ne "/etc/lsb-release", bsd_glob( "/etc/*-release" ) );
+            # On some Fedora systems there are two files: fedora-release and redhat-release
+            # with identical content. If fedora-release is present, ignore redhat-release.
+            if ( grep( $_ eq "/etc/fedora-release", @releases ) ) {
+                @releases = grep( $_ ne "/etc/redhat-release", @releases );
+            }; # if
+            if ( @releases == 1 ) {
+                $release = $releases[ 0 ];
+            } else {
+                if ( @releases == 0 ) {
+                    # No *-release files found, try debian_version.
+                    $release = "/etc/debian_version";
+                    if ( not -e $release ) {
+                        $release = undef;
+                        warning( "No release files found in \"/etc/\" directory." );
+                    }; # if
+                } else {
+                    $release = undef;
+                    warning( "More than one release files found in \"/etc/\" directory:", @releases );
+                }; # if
+            }; # if
+        }; # if
+        if ( defined( $release ) ) {
+            $bulk = read_file( $release );
+            if ( $release =~ m{system|redhat|fedora} ) {
+                # Red Hat or Fedora. Parse the first line of file.
+                # Typical values of *-release (one of):
+                #     Red Hat Enterprise Linux* OS Server release 5.2 (Tikanga)
+                #     Red Hat Enterprise Linux* OS AS release 3 (Taroon Update 4)
+                #     Fedora release 10 (Cambridge)
+                $bulk =~ m{\A(.*)$}m
+                    or runtime_error( "$release: Cannot find the first line:", $bulk, "(eof)" );
+                my $first_line = $1;
+                $values{ operating_system_description } = $first_line;
+                $first_line =~ m{\A(.*?)\s+release\s+(.*?)(?:\s+\((.*?)(?:\s+Update\s+(.*?))?\))?\s*$}
+                    or runtime_error( "$release:1: Cannot parse line:", $first_line );
+                $values{ operating_system_name    }  = $1;
+                $values{ operating_system_release }  = $2 . ( defined( $4 ) ? ".$4" : "" );
+                $values{ operating_system_codename } = $3;
+            } elsif ( $release =~ m{SuSE} ) {
+                # Typical SuSE-release:
+                #     SUSE Linux* OS Enterprise Server 10 (x86_64)
+                #     VERSION = 10
+                #     PATCHLEVEL = 2
+                $bulk =~ m{\A(.*)$}m
+                    or runtime_error( "$release: Cannot find the first line:", $bulk, "(eof)" );
+                my $first_line = $1;
+                $values{ operating_system_description } = $first_line;
+                $first_line =~ m{^(.*?)\s*(\d+)\s*\(.*?\)\s*$}
+                    or runtime_error( "$release:1: Cannot parse line:", $first_line );
+                $values{ operating_system_name } = $1;
+                $bulk =~ m{^VERSION\s*=\s*(.*)\s*$}m
+                    or runtime_error( "$release: There is no VERSION:", $bulk, "(eof)" );
+                $values{ operating_system_release } = $1;
+                if ( $bulk =~ m{^PATCHLEVEL\s*=\s*(.*)\s*$}m ) {
+                    $values{ operating_system_release } .= ".$1";
+                }; # if
+            } elsif ( $release =~ m{debian_version} ) {
+                # Debian. The file debian_version contains just version number, nothing more:
+                #     4.0
+                my $name = "Debian";
+                $bulk =~ m{\A(.*)$}m
+                    or runtime_error( "$release: Cannot find the first line:", $bulk, "(eof)" );
+                my $version = $1;
+                $values{ operating_system_name        } = $name;
+                $values{ operating_system_release     } = $version;
+                $values{ operating_system_codename    } = "unknown";
+                $values{ operating_system_description } = sprintf( "%s %s", $name, $version );
+            }; # if
+        }; # if
+    }; # if
+    if ( not defined( $values{ operating_system_name } ) ) {
+        $values{ operating_system_name } = "GNU/Linux";
+    }; # if
+} elsif ( $values{ kernel_name } eq "Darwin" ) {
+    my %codenames = (
+        10.4 => "Tiger",
+        10.5 => "Leopard",
+        10.6 => "Snow Leopard",
+    );
+   my $darwin;
+   my $get_os_info =
+       sub {
+           my ( $name ) = @_;
+           if ( not defined $darwin ) {
+               $darwin->{ operating_system } = "Darwin";
+               # sw_vers prints OS X* version to stdout:
+               #     ProductName:       OS X*
+               #     ProductVersion:    10.4.11
+               #     BuildVersion:      8S2167
+               # It does not print codename, so we code OS X* codenames here.
+               my $sw_vers = which( "sw_vers" ) || "/usr/bin/sw_vers";
+               my $output;
+               debug( "Executing $sw_vers..." );
+               execute( [ $sw_vers ], -stdout => \$output, -stderr => undef );
+               $output =~ m{^ProductName:\s*(.*)\s*$}m
+                   or runtime_error( "There is no ProductName in sw_vers output:", $output, "(eof)" );
+               my $product_name = $1;    # Distinct name: do not shadow the outer $name (the requested property).
+               $output =~ m{^ProductVersion:\s*(.*)\s*$}m
+                   or runtime_error( "There is no ProductVersion in sw_vers output:", $output, "(eof)" );
+               my $release = $1;
+               # Sometimes the release is reported as "10.4.11" (3 components), sometimes as "10.6".
+               # Handle both variants.
+               $release =~ m{^(\d+\.\d+)(?:\.\d+)?(?=\s|$)}
+                   or runtime_error( "Cannot parse OS X* version: $release" );
+               my $version = $1;
+               my $codename = ( $codenames{ $version } or "unknown" );
+               $darwin->{ operating_system_name        } = $product_name;
+               $darwin->{ operating_system_release     } = $release;
+               $darwin->{ operating_system_codename    } = $codename;
+               $darwin->{ operating_system_description } = sprintf( "%s %s (%s)", $product_name, $release, $codename );
+           }; # if
+           return $darwin->{ $name };
+       }; # sub
+    $values{ operating_system             } = sub { $get_os_info->( "operating_system"             ); };
+    $values{ operating_system_name        } = sub { $get_os_info->( "operating_system_name"        ); };
+    $values{ operating_system_release     } = sub { $get_os_info->( "operating_system_release"     ); };
+    $values{ operating_system_codename    } = sub { $get_os_info->( "operating_system_codename"    ); };
+    $values{ operating_system_description } = sub { $get_os_info->( "operating_system_description" ); };
+} elsif ( $values{ kernel_name } =~ m{\AWindows[ _]NT\z} ) {
+    $values{ operating_system } = "MS Windows";
+    # my @os_name = Win32::GetOSName();
+    # $values{ operating_system_release } = $os_name[ 0 ];
+    # $values{ operating_system_update  } = $os_name[ 1 ];
+} elsif ( $values{ kernel_name } =~ m{\ACYGWIN_NT-} ) {
+    $values{ operating_system } = "MS Windows";
+} elsif ( $values{ kernel_name } =~ m{\AFreeBSD} ) {
+    $values{ operating_system } = "FreeBSD";
+} else {
+    die "Unsupported kernel_name (\"$values{ kernel_name }\") returned by POSIX::uname(); stopped";
+}; # if
+
+# host_name and domain_name
+$values{ host_name } =
+    sub {
+        my $fqdn = value( "fqdn" );
+        $fqdn =~ m{\A([^.]*)(?:\.(.*))?\z};
+        my $host_name = $1;
+        if ( not defined( $host_name ) or $host_name eq "" ) {
+            die "Unexpected error: undefined or empty host name; stopped";
+        }; # if
+        return $host_name;
+    };
+$values{ domain_name } =
+    sub {
+        my $fqdn = value( "fqdn" );
+        $fqdn =~ m{\A([^.]*)(?:\.(.*))?\z};
+        my $domain_name = $2;
+        if ( not defined( $domain_name ) or $domain_name eq "" ) {
+            die "Unexpected error: undefined or empty domain name; stopped";
+        }; # if
+        return $domain_name;
+    };
+
+# Replace undefined values with "unknown".
+foreach my $name ( @all ) {
+    if ( not defined( $values{ $name } ) ) {
+        $values{ $name } = "unknown";
+    }; # if
+}; # foreach $name
+
+# Export functions reporting properties.
+foreach my $name ( @all ) {
+    no strict "refs";
+    *$name = sub { return value( $name ); };
+}; # foreach $name
+
+# This function returns base names.
+sub base_names {
+    return @base;
+}; # sub base_names
+
+# This function returns all the names.
+sub all_names {
+    return @all;
+}; # sub all_names
+
+# This function returns value by the specified name.
+sub value($) {
+    my $name = shift( @_ );
+    if ( ref( $values{ $name } ) ) {
+        my $value = $values{ $name }->();
+        $values{ $name } = $value;
+    }; # if
+    return $values{ $name };
+}; # sub value
+
+return 1;
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<Uname.pm> -- A few subroutines to get system information usually provided by
+C</bin/uname> and C<POSIX::uname()>.
+
+=head1 SYNOPSIS
+
+    use Uname;
+
+    # Base property functions.
+    $kernel_name       = Uname::kernel_name();
+    $fqdn              = Uname::fqdn();
+    $kernel_release    = Uname::kernel_release();
+    $kernel_version    = Uname::kernel_version();
+    $machine           = Uname::machine();
+    $processor         = Uname::processor();
+    $hardware_platform = Uname::hardware_platform();
+    $operating_system  = Uname::operating_system();
+
+    # Auxiliary property functions.
+    $host_name         = Uname::host_name();
+    $domain_name       = Uname::domain_name();
+    $os_name           = Uname::operating_system_name();
+    $os_release        = Uname::operating_system_release();
+    $os_codename       = Uname::operating_system_codename();
+    $os_description    = Uname::operating_system_description();
+
+    # Meta functions.
+    @base_names  = Uname::base_names();
+    @all_names   = Uname::all_names();
+    $kernel_name = Uname::value( "kernel_name" );
+
+=head1 DESCRIPTION
+
+B<Uname.pm> resembles functionality found in C<POSIX::uname()> function or in C<uname> program.
+However, both C<POSIX::uname()> and C</bin/uname> have some disadvantages:
+
+=over
+
+=item *
+
+C<uname> may not be available in some environments, for example, in Windows* OS
+(C<uname> may be found in some third-party software packages, like MKS Toolkit or Cygwin, but it is
+not a part of the OS).
+
+=item *
+
+There are many different versions of C<uname>. For example, C<uname> on OS X* does not
+recognize options C<-i>, C<-o>, and any long options.
+
+=item *
+
+Different versions of C<uname> may report the same property differently. For example,
+C<uname> on Linux* OS reports machine as C<i686>, while C<uname> on OS X* reports the same machine as
+C<x86>.
+
+=item *
+
+C<POSIX::uname()> returns a list of values. I cannot recall what the fourth element of the list is.
+
+=back
+
+=head2 Base Functions
+
+Base property functions provide the same information as the C<uname> program.
+
+=over
+
+=item B<kernel_name()>
+
+Returns the kernel name, as reported by C<POSIX::uname()>.
+
+=item B<fqdn()>
+
+Returns the FQDN, the fully qualified domain name. On some systems C<POSIX::uname()> reports a
+short node name (with no domain name), on others C<POSIX::uname()> reports the full node name. This
+function strives to always return the FQDN (by refining C<POSIX::uname()> with
+C<Net::Domain::hostfqdn()>).
+
+=item B<kernel_release()>
+
+Returns the kernel release string, as reported by C<POSIX::uname()>. Usually the string consists of
+several numbers, separated by dots and dashes, but may also include some non-numeric substrings like
+"smp".
+
+=item B<kernel_version()>
+
+Returns the kernel version string, as reported by C<POSIX::uname()>. It is B<not> several
+dot-separated numbers but much longer string describing the kernel.
+For example, on Linux* OS it includes build date.
+If you look for something identifying the kernel, look at L<kernel_release>.
+
+=item B<machine()>
+
+Returns the machine hardware name, as reported by POSIX::uname(). Not reliable. Different OSes may
+report the same machine hardware name differently. For example, Linux* OS reports C<i686>, while OS X*
+reports C<x86> on the same machine.
+
+=item B<processor()>
+
+Returns the processor type. Not reliable. Usually the same as C<machine>.
+
+=item B<hardware_platform()>
+
+One of: C<i386> or C<x86_64>.
+
+=item B<operating_system()>
+
+One of: C<GNU/Linux>, C<OS X*>, or C<MS Windows>.
+
+=back
+
+=head2 Auxiliary Functions
+
+Auxiliary functions extend the base functions with information not reported by the C<uname> program.
+
+Auxiliary functions collect information from different sources. For example, on OS X*, they may
+call C<sw_vers> program to find out OS release; on Linux* OS they may parse C</etc/redhat-release> file,
+etc.
+
+=over
+
+=item B<host_name()>
+
+Returns host name (FQDN with dropped domain part).
+
+=item B<domain_name()>
+
+Returns domain name (FQDN with dropped host part).
+
+=item B<operating_system_name>
+
+Name of operating system or name of Linux* OS distribution, like "Fedora" or
+"Red Hat Enterprise Linux* OS Server".
+
+=item B<operating_system_release>
+
+Release (version) of operating system or Linux* OS distribution. Usually it is a series of
+dot-separated numbers.
+
+=item B<operating_system_codename>
+
+Codename of operating system release or Linux* OS distribution. For example, Fedora 10 is "Cambridge"
+while OS X* 10.4 is "Tiger".
+
+=item B<operating_system_description>
+
+A longer string. Usually it includes all the operating system properties mentioned above -- name,
+release, and codename in parentheses.
+
+=back
+
+=head2 Meta Functions
+
+=over
+
+=item B<base_names()>
+
+This function returns the list of base property names.
+
+=item B<all_names()>
+
+This function returns the list of all property names.
+
+=item B<value(> I<name> B<)>
+
+This function returns the value of the property specified by I<name>.
+
+=back
+
+=head1 EXAMPLES
+
+    use Uname;
+
+    print( Uname::operating_system(), "\n" );
+
+    foreach my $name ( Uname::all_names() ) {
+        print( "$name=\"" . Uname::value( $name ) . "\"\n" );
+    }; # foreach $name
+
+=head1 SEE ALSO
+
+L<POSIX::uname>, L<uname>.
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/lib/tools.pm b/final/runtime/tools/lib/tools.pm
new file mode 100644
index 0000000..ce5cf44
--- /dev/null
+++ b/final/runtime/tools/lib/tools.pm
@@ -0,0 +1,1981 @@
+#
+# This is not a runnable script, it is a Perl module, a collection of variables, subroutines, etc.
+# to be used in other scripts.
+#
+# To get help about exported variables and subroutines, please execute the following command:
+#
+#     perldoc tools.pm
+#
+# or see POD (Plain Old Documentation) embedded in the source...
+#
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+=head1 NAME
+
+B<tools.pm> -- A collection of subroutines which are widely used in Perl scripts.
+
+=head1 SYNOPSIS
+
+    use FindBin;
+    use lib "$FindBin::Bin/lib";
+    use tools;
+
+=head1 DESCRIPTION
+
+B<Note:> Because this collection is small and intended for wide use in a particular project,
+all variables and functions are exported by default.
+
+B<Note:> I have some ideas on how to improve this collection, but they are in my long-term plans.
+The current shape is not ideal, but it is good enough to use.
+
+=cut
+
+package tools;
+
+use strict;
+use warnings;
+
+use vars qw( @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS );
+require Exporter;
+@ISA = qw( Exporter );
+
+my @vars   = qw( $tool );
+my @utils  = qw( check_opts validate );
+my @opts   = qw( get_options );
+my @print  = qw( debug info warning cmdline_error runtime_error question );
+my @name   = qw( get_vol get_dir get_file get_name get_ext cat_file cat_dir );
+my @file   = qw( which abs_path rel_path real_path make_dir clean_dir copy_dir move_dir del_dir change_dir copy_file move_file del_file );
+my @io     = qw( read_file write_file );
+my @exec   = qw( execute backticks );
+my @string = qw{ pad };
+@EXPORT = ( @utils, @opts, @vars, @print, @name, @file, @io, @exec, @string );
+
+use UNIVERSAL    ();
+
+use FindBin;
+use IO::Handle;
+use IO::File;
+use IO::Dir;
+# Not available on some machines: use IO::Zlib;
+
+use Getopt::Long ();
+use Pod::Usage   ();
+use Carp         ();
+use File::Copy   ();
+use File::Path   ();
+use File::Temp   ();
+use File::Spec   ();
+use POSIX        qw{ :fcntl_h :errno_h };
+use Cwd          ();
+use Symbol       ();
+
+use Data::Dumper;
+
+use vars qw( $tool $verbose $timestamps );
+$tool = $FindBin::Script;
+
+my @warning = ( sub {}, \&warning, \&runtime_error );
+
+
+sub check_opts(\%$;$) {
+
+    my $opts = shift( @_ );  # Reference to a hash containing real options and their values.
+    my $good = shift( @_ );  # Reference to an array containing all known option names.
+    my $msg  = shift( @_ );  # Optional (non-mandatory) message.
+
+    if ( not defined( $msg ) ) {
+        $msg = "unknown option(s) passed";   # Default value for $msg.
+    }; # if
+
+    # I'll use these hashes as sets of options.
+    my %good = map( ( $_ => 1 ), @$good );   # %good now is filled with all known options.
+    my %bad;                                 # %bad is empty.
+
+    foreach my $opt ( keys( %$opts ) ) {     # For each real option...
+        if ( not exists( $good{ $opt } ) ) { # Look its name in the set of known options...
+            $bad{ $opt } = 1;                # Add unknown option to %bad set.
+            delete( $opts->{ $opt } );       # And delete original option.
+        }; # if
+    }; # foreach $opt
+    if ( %bad ) {                            # If %bad set is not empty...
+        my @caller = caller( 1 );            # Issue a warning.
+        local $Carp::CarpLevel = 2;
+        Carp::cluck( $caller[ 3 ] . ": " . $msg . ": " . join( ", ", sort( keys( %bad ) ) ) );
+    }; # if
+
+    return 1;
+
+}; # sub check_opts
+
+
+# --------------------------------------------------------------------------------------------------
+# Purpose:
+#     Check subroutine arguments.
+# Synopsis:
+#     my %opts = validate( params => \@_, spec => { ... }, caller => n );
+# Arguments:
+#     params -- A reference to subroutine's actual arguments.
+#     spec   -- Specification of expected arguments.
+#     caller -- ...
+# Return value:
+#     A hash of validated options.
+# Description:
+#     I would like to use Params::Validate module, but it is not a part of default Perl
+#     distribution, so I cannot rely on it. This subroutine resembles to some extent to
+#     Params::Validate::validate_with().
+#     Specification of expected arguments:
+#        { $opt => { type => $type, default => $default }, ... }
+#        $opt     -- String, option name.
+#        $type    -- String, expected type(s). Allowed values are "SCALAR", "UNDEF", "BOOLEAN",
+#                    "ARRAYREF", "HASHREF", "CODEREF". Multiple types may be listed using a bar:
+#                    "SCALAR|ARRAYREF". The type string is case-insensitive.
+#        $default -- Default value for an option. Will be used if option is not specified or
+#                    undefined.
+#
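+# Example (a sketch; the subroutine and option names are hypothetical):
+#
+#     sub frobnicate {
+#         my %opts = validate(
+#             params => \@_,
+#             spec   => {
+#                 path   => { type => "scalar" },
+#                 append => { type => "boolean", default => 0 },
+#                 filter => { type => "coderef|undef" },
+#             },
+#         );
+#         ...
+#     }; # sub frobnicate
+#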
+sub validate(@) {
+
+    my %opts = @_;    # Temporarily use %opts for the parameters of the `validate' subroutine.
+    my $params = $opts{ params };
+    my $caller = ( $opts{ caller } or 0 ) + 1;
+    my $spec   = $opts{ spec };
+    undef( %opts );   # Ok, clean %opts; now we will collect the result of the subroutine.
+
+    # Find out caller package, filename, line, and subroutine name.
+    my ( $pkg, $file, $line, $subr ) = caller( $caller );
+    my @errors;    # We will collect errors in an array so we do not stop on the first error found.
+    my $error =
+        sub ($) {
+            my $msg = shift( @_ );
+            push( @errors, "$msg at $file line $line.\n" );
+        }; # sub
+
+    # Check options.
+    while ( @$params ) {
+        # Check option name.
+        my $opt = shift( @$params );
+        if ( not exists( $spec->{ $opt } ) ) {
+            $error->( "Invalid option `$opt'" );
+            shift( @$params ); # Skip the value of the unknown option.
+            next;
+        }; # if
+        # Check option value exists.
+        if ( not @$params ) {
+            $error->( "Option `$opt' does not have a value" );
+            next;
+        }; # if
+        my $val = shift( @$params );
+        # Check option value type.
+        if ( exists( $spec->{ $opt }->{ type } ) ) {
+            # Type specification exists. Check option value type.
+            my $actual_type;
+            if ( ref( $val ) ne "" ) {
+                $actual_type = ref( $val ) . "REF";
+            } else {
+                $actual_type = ( defined( $val ) ? "SCALAR" : "UNDEF" );
+            }; # if
+            my @wanted_types = split( m{\|}, lc( $spec->{ $opt }->{ type } ) );
+            my $wanted_types = join( "|", map( $_ eq "boolean" ? "scalar|undef" : quotemeta( $_ ), @wanted_types ) );
+            if ( $actual_type !~ m{\A(?:$wanted_types)\z}i ) {
+                $actual_type = lc( $actual_type );
+                $wanted_types = lc( join( " or ", map( "`$_'", @wanted_types ) ) );
+                $error->( "Option `$opt' value type is `$actual_type' but expected to be $wanted_types" );
+                next;
+            }; # if
+        }; # if
+        if ( exists( $spec->{ $opt }->{ values } )  ) {
+            my $values = $spec->{ $opt }->{ values };
+            if ( not grep( $_ eq $val, @$values ) ) {
+                $values = join( ", ", map( "`$_'", @$values ) );
+                $error->( "Option `$opt' value is `$val' but expected to be one of $values" );
+                next;
+            }; # if
+        }; # if
+        $opts{ $opt } = $val;
+    }; # while
+
+    # Assign default values.
+    foreach my $opt ( keys( %$spec ) ) {
+        if ( not defined( $opts{ $opt } ) and exists( $spec->{ $opt }->{ default } ) ) {
+            $opts{ $opt } = $spec->{ $opt }->{ default };
+        }; # if
+    }; # foreach $opt
+
+    # If we found any errors, raise them.
+    if ( @errors ) {
+        die join( "", @errors );
+    }; # if
+
+    return %opts;
+
+}; # sub validate
+
+# =================================================================================================
+# Get option helpers.
+# =================================================================================================
+
+=head2 Get option helpers.
+
+=cut
+
+# -------------------------------------------------------------------------------------------------
+
+=head3 get_options
+
+B<Synopsis:>
+
+    get_options( @arguments )
+
+B<Description:>
+
+It is a very simple wrapper around Getopt::Long::GetOptions. It passes all arguments to GetOptions,
+and adds definitions for standard help options: --help, --doc, --verbose, and --quiet.
+When GetOptions finishes, this subroutine checks the exit code; if it is non-zero, a standard error
+message is issued and the script terminates.
+
+If the --verbose or --quiet option is specified, the C<tools.pm_verbose> environment variable is set.
+This is the way to propagate verbose/quiet mode to child Perl scripts.
+
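+B<Example:>
+
+A minimal sketch of a caller; the C<--input> option and the C<$input> variable are illustrative:
+
+    my $input;
+    get_options(
+        "input=s" => \$input,
+    );
+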
+=cut
+
+sub get_options {
+
+    Getopt::Long::Configure( "no_ignore_case" );
+    Getopt::Long::GetOptions(
+        "h0|usage"        => sub { Pod::Usage::pod2usage( -exitval => 0, -verbose => 0 ); },
+        "h1|h|help"       => sub { Pod::Usage::pod2usage( -exitval => 0, -verbose => 1 ); },
+        "h2|doc|manual"   => sub { Pod::Usage::pod2usage( -exitval => 0, -verbose => 2 ); },
+        "version"         => sub { print( "$tool version $main::VERSION\n" ); exit( 0 ); },
+        "v|verbose"       => sub { ++ $verbose;     $ENV{ "tools.pm_verbose"    } = $verbose;    },
+        "quiet"           => sub { -- $verbose;     $ENV{ "tools.pm_verbose"    } = $verbose;    },
+        "with-timestamps" => sub { $timestamps = 1; $ENV{ "tools.pm_timestamps" } = $timestamps; },
+        @_, # Caller arguments are at the end so caller options override the standard ones.
+    ) or cmdline_error();
+
+}; # sub get_options
+
+
+# =================================================================================================
+# Print utilities.
+# =================================================================================================
+
+=pod
+
+=head2 Print utilities.
+
+Each of the print subroutines prepends each line of its output with the name of the current script
+and the type of the information, for example:
+
+    info( "Writing file..." );
+
+will print
+
+    <script>: (i) Writing file...
+
+while
+
+    warning( "File does not exist!" );
+
+will print
+
+    <script>: (!) File does not exist!
+
+Here are exported items:
+
+=cut
+
+# -------------------------------------------------------------------------------------------------
+
+sub _format_message($\@;$) {
+
+    my $prefix  = shift( @_ );
+    my $args    = shift( @_ );
+    my $no_eol  = shift( @_ );  # Do not append "\n" to the last line.
+    my $message = "";
+
+    my $ts = "";
+    if ( $timestamps ) {
+        my ( $sec, $min, $hour, $day, $month, $year ) = gmtime();
+        $month += 1;
+        $year  += 1900;
+        $ts = sprintf( "%04d-%02d-%02d %02d:%02d:%02d UTC: ", $year, $month, $day, $hour, $min, $sec );
+    }; # if
+    for my $i ( 1 .. @$args ) {
+        my @lines = split( "\n", $args->[ $i - 1 ] );
+        for my $j ( 1 .. @lines ) {
+            my $line = $lines[ $j - 1 ];
+            my $last_line = ( ( $i == @$args ) and ( $j == @lines ) );
+            my $eol = ( ( ( substr( $line, -1 ) eq "\n" ) or ( defined( $no_eol ) and $last_line ) ) ? "" : "\n" );
+            $message .= "$ts$tool: ($prefix) " . $line . $eol;
+        }; # foreach $j
+    }; # foreach $i
+    return $message;
+
+}; # sub _format_message
+
+#--------------------------------------------------------------------------------------------------
+
+=pod
+
+=head3 $verbose
+
+B<Synopsis:>
+
+    $verbose
+
+B<Description:>
+
+Package variable. It determines the verbosity level, which affects the C<warning()>, C<info()>, and
+C<debug()> subroutines.
+
+The variable gets its initial value from the C<tools.pm_verbose> environment variable if it exists.
+If the environment variable does not exist, the variable is set to 2.
+
+Initial value may be overridden later directly or by C<get_options> function.
+
+=cut
+
+$verbose = exists( $ENV{ "tools.pm_verbose" } ) ? $ENV{ "tools.pm_verbose" } : 2;
+
+#--------------------------------------------------------------------------------------------------
+
+=pod
+
+=head3 $timestamps
+
+B<Synopsis:>
+
+    $timestamps
+
+B<Description:>
+
+Package variable. It determines whether the C<debug()>, C<info()>, C<warning()>, and C<runtime_error()>
+subroutines print timestamps or not.
+
+The variable gets its initial value from the C<tools.pm_timestamps> environment variable if it exists.
+If the environment variable does not exist, the variable is set to false.
+
+Initial value may be overridden later directly or by C<get_options()> function.
+
+=cut
+
+$timestamps = exists( $ENV{ "tools.pm_timestamps" } ) ? $ENV{ "tools.pm_timestamps" } : 0;
+
+# -------------------------------------------------------------------------------------------------
+
+=pod
+
+=head3 debug
+
+B<Synopsis:>
+
+    debug( @messages )
+
+B<Description:>
+
+If the verbosity level is 3 or higher, print debug information to stderr, prepending it with a "(#)"
+prefix.
+
+=cut
+
+sub debug(@) {
+
+    if ( $verbose >= 3 ) {
+        STDOUT->flush();
+        STDERR->print( _format_message( "#", @_ ) );
+    }; # if
+    return 1;
+
+}; # sub debug
+
+#--------------------------------------------------------------------------------------------------
+
+=pod
+
+=head3 info
+
+B<Synopsis:>
+
+    info( @messages )
+
+B<Description:>
+
+If the verbosity level is 2 or higher, print information to stderr, prepending it with an "(i)" prefix.
+
+=cut
+
+sub info(@) {
+
+    if ( $verbose >= 2 ) {
+        STDOUT->flush();
+        STDERR->print( _format_message( "i", @_  ) );
+    }; # if
+
+}; # sub info
+
+#--------------------------------------------------------------------------------------------------
+
+=head3 warning
+
+B<Synopsis:>
+
+    warning( @messages )
+
+B<Description:>
+
+If the verbosity level is 1 or higher, issue a warning, prepending it with a "(!)" prefix.
+
+=cut
+
+sub warning(@) {
+
+    if ( $verbose >= 1 ) {
+        STDOUT->flush();
+        warn( _format_message( "!", @_  ) );
+    }; # if
+
+}; # sub warning
+
+# -------------------------------------------------------------------------------------------------
+
+=head3 cmdline_error
+
+B<Synopsis:>
+
+    cmdline_error( @message )
+
+B<Description:>
+
+Print error message and exit the program with status 2.
+
+This function is intended to complain about command-line errors, e.g. unknown
+options, invalid arguments, etc.
+
+=cut
+
+sub cmdline_error(;$) {
+
+    my $message = shift( @_ );
+
+    if ( defined( $message ) ) {
+        if ( substr( $message, -1, 1 ) ne "\n" ) {
+            $message .= "\n";
+        }; # if
+    } else {
+        $message = "";
+    }; # if
+    STDOUT->flush();
+    die $message . "Try --help option for more information.\n";
+
+}; # sub cmdline_error
+
+# -------------------------------------------------------------------------------------------------
+
+=head3 runtime_error
+
+B<Synopsis:>
+
+    runtime_error( @message )
+
+B<Description:>
+
+Print error message and exit the program with status 3.
+
+This function is intended to complain about runtime errors, e.g.
+directories which are not found, non-writable files, etc.
+
+=cut
+
+sub runtime_error(@) {
+
+    STDOUT->flush();
+    die _format_message( "x", @_ );
+
+}; # sub runtime_error
+
+#--------------------------------------------------------------------------------------------------
+
+=head3 question
+
+B<Synopsis:>
+
+    question( $prompt, $answer, $choices )
+
+B<Description:>
+
+Print $prompt to stderr, prepending it with a "(?)" prefix. Read a line from stdin and chomp
+the trailing "\n"; that is the answer.
+
+If $answer is defined, it is treated as the first user input.
+
+If $choices is specified, it can be a regexp for validating user input, or a string. In the latter
+case it is interpreted as a list of characters, the acceptable (case-insensitive) choices. If the
+user enters a non-acceptable answer, the question is asked again until the answer is acceptable.
+If $choices is not specified, any answer is acceptable.
+
+In case of end-of-file (or Ctrl+D pressed by user), $answer is C<undef>.
+
+B<Examples:>
+
+    my $answer;
+    question( "Save file [yn]? ", $answer, "yn" );
+        # We accept only "y", "Y", "n", or "N".
+    question( "Press enter to continue or Ctrl+C to abort..." );
+        # We are not interested in answer value -- in case of Ctrl+C the script will be terminated,
+        # otherwise we continue execution.
+    question( "File name? ", $answer );
+        # Any answer is acceptable.
+
+=cut
+
+sub question($;\$$) {
+
+    my $prompt  = shift( @_ );
+    my $answer  = shift( @_ );
+    my $choices = shift( @_ );
+    my $a       = ( defined( $answer ) ? $$answer : undef );
+
+    if ( ref( $choices ) eq "Regexp" ) {
+        # It is already a regular expression, do nothing.
+    } elsif ( defined( $choices ) ) {
+        # Convert string to a regular expression.
+        $choices = qr/[@{ [ quotemeta( $choices ) ] }]/i;
+    }; # if
+
+    for ( ; ; ) {
+        STDERR->print( _format_message( "?", @{ [ $prompt ] }, "no_eol" ) );
+        STDERR->flush();
+        if ( defined( $a ) ) {
+            STDOUT->print( $a . "\n" );
+        } else {
+            $a = <STDIN>;
+        }; # if
+        if ( not defined( $a ) ) {
+            last;
+        }; # if
+        chomp( $a );
+        if ( not defined( $choices ) or ( $a =~ m/^$choices$/ ) ) {
+            last;
+        }; # if
+        $a = undef;
+    }; # forever
+    if ( defined( $answer ) ) {
+        $$answer = $a;
+    }; # if
+
+}; # sub question
+
+# -------------------------------------------------------------------------------------------------
+
+# Returns volume part of path.
+sub get_vol($) {
+
+    my $path = shift( @_ );
+    my ( $vol, undef, undef ) = File::Spec->splitpath( $path );
+    return $vol;
+
+}; # sub get_vol
+
+# Returns directory part of path.
+sub get_dir($) {
+
+    my $path = File::Spec->canonpath( shift( @_ ) );
+    my ( $vol, $dir, undef ) = File::Spec->splitpath( $path );
+    my @dirs = File::Spec->splitdir( $dir );
+    pop( @dirs );
+    $dir = File::Spec->catdir( @dirs );
+    $dir = File::Spec->catpath( $vol, $dir, undef );
+    return $dir;
+
+}; # sub get_dir
+
+# Returns file part of path.
+sub get_file($) {
+
+    my $path = shift( @_ );
+    my ( undef, undef, $file ) = File::Spec->splitpath( $path );
+    return $file;
+
+}; # sub get_file
+
+# Returns file part of path without last suffix.
+sub get_name($) {
+
+    my $path = shift( @_ );
+    my ( undef, undef, $file ) = File::Spec->splitpath( $path );
+    $file =~ s{\.[^.]*\z}{};
+    return $file;
+
+}; # sub get_name
+
+# Returns last suffix of file part of path.
+sub get_ext($) {
+
+    my $path = shift( @_ );
+    my ( undef, undef, $file ) = File::Spec->splitpath( $path );
+    my $ext = "";
+    if ( $file =~ m{(\.[^.]*)\z} ) {
+        $ext = $1;
+    }; # if
+    return $ext;
+
+}; # sub get_ext
+
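+# For example, get_name( "dir/libomp.so.5" ) returns "libomp.so", and
+# get_ext( "dir/libomp.so.5" ) returns ".5" (the path is illustrative).
+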
+sub cat_file(@) {
+
+    my $path = shift( @_ );
+    my $file = pop( @_ );
+    my @dirs = @_;
+
+    my ( $vol, $dirs ) = File::Spec->splitpath( $path, "no_file" );
+    @dirs = ( File::Spec->splitdir( $dirs ), @dirs );
+    $dirs = File::Spec->catdir( @dirs );
+    $path = File::Spec->catpath( $vol, $dirs, $file );
+
+    return $path;
+
+}; # sub cat_file
+
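+# For example, on Linux* OS, cat_file( "/tmp", "dir", "file.txt" ) returns
+# "/tmp/dir/file.txt" (the names are illustrative).
+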
+sub cat_dir(@) {
+
+    my $path = shift( @_ );
+    my @dirs = @_;
+
+    my ( $vol, $dirs ) = File::Spec->splitpath( $path, "no_file" );
+    @dirs = ( File::Spec->splitdir( $dirs ), @dirs );
+    $dirs = File::Spec->catdir( @dirs );
+    $path = File::Spec->catpath( $vol, $dirs, "" );
+
+    return $path;
+
+}; # sub cat_dir
+
+# =================================================================================================
+# File and directory manipulation subroutines.
+# =================================================================================================
+
+=head2 File and directory manipulation subroutines.
+
+=over
+
+=cut
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<which( $file, @options )>
+
+Searches for the specified executable file in the given directories (the PATH by default).
+Raises a runtime error if no executable file is found. Returns the full path of the found executable(s).
+
+Options:
+
+=over
+
+=item C<-all> =E<gt> I<bool>
+
+Do not stop at the first file found. Note that a list of full paths is returned in this case.
+
+=item C<-dirs> =E<gt> I<ref_to_array>
+
+Specify the list of directories to search through. If the option is not passed, the PATH environment
+variable is used as the directory list.
+
+=item C<-exec> =E<gt> I<bool>
+
+Whether to check for executable files or not. By default, C<which> searches for executable files.
+However, on Cygwin the executable check is never performed.
+
+=back
+
+Examples:
+
+Look for "echo" in the directories specified in PATH:
+
+    my $echo = which( "echo" );
+
+Look for all occurrences of "cp" in the PATH:
+
+    my @cps = which( "cp", -all => 1 );
+
+Look for the first occurrence of "icc" in the specified directories:
+
+    my $icc = which( "icc", -dirs => [ ".", "/usr/local/bin", "/usr/bin", "/bin" ] );
+
+Look for the C<omp_lib.f> file:
+
+    my @omp_lib = which( "omp_lib.f", -all => 1, -exec => 0, -dirs => [ @include ] );
+
+=cut
+
+sub which($@) {
+
+    my $file = shift( @_ );
+    my %opts = @_;
+
+    check_opts( %opts, [ qw( -all -dirs -exec ) ] );
+    if ( $opts{ -all } and not wantarray() ) {
+        local $Carp::CarpLevel = 1;
+        Carp::cluck( "`-all' option passed to `which' but list is not expected" );
+    }; # if
+    if ( not defined( $opts{ -exec } ) ) {
+        $opts{ -exec } = 1;
+    }; # if
+
+    my $dirs = ( exists( $opts{ -dirs } ) ? $opts{ -dirs } : [ File::Spec->path() ] );
+    my @found;
+
+    my @exts = ( "" );
+    if ( $^O eq "MSWin32" and $opts{ -exec } ) {
+        if ( defined( $ENV{ PATHEXT } ) ) {
+            push( @exts, split( ";", $ENV{ PATHEXT } ) );
+        } else {
+            # If PATHEXT does not exist, use default value.
+            push( @exts, qw{ .COM .EXE .BAT .CMD } );
+        }; # if
+    }; # if
+
+    loop:
+    foreach my $dir ( @$dirs ) {
+        foreach my $ext ( @exts ) {
+            my $path = File::Spec->catfile( $dir, $file . $ext );
+            if ( -e $path ) {
+                # Executable bit is not reliable on Cygwin, do not check it.
+                if ( not $opts{ -exec } or -x $path or $^O eq "cygwin" ) {
+                    push( @found, $path );
+                    if ( not $opts{ -all } ) {
+                        last loop;
+                    }; # if
+                }; # if
+            }; # if
+        }; # foreach $ext
+    }; # foreach $dir
+
+    if ( not @found ) {
+        # TBD: We need to introduce an option for conditional enabling this error.
+        # runtime_error( "Could not find \"$file\" executable file in PATH." );
+    }; # if
+    if ( @found > 1 ) {
+        # TBD: Issue a warning?
+    }; # if
+
+    if ( $opts{ -all } ) {
+        return @found;
+    } else {
+        return $found[ 0 ];
+    }; # if
+
+}; # sub which
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<abs_path( $path, $base )>
+
+Return absolute path for an argument.
+
+Most of the work is done by C<< File::Spec->rel2abs() >>. C<abs_path()> additionally collapses
+C<dir1/../dir2> to C<dir2>.
+
+This textual collapsing may look naive, but it is intentional. For example, on Linux* OS in Bash,
+if F<link/> is a symbolic link to the directory F<some_dir/>,
+
+    $ cd link
+    $ cd ..
+
+brings you back to F<link/>'s parent, not to the parent of F<some_dir/>.
+
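+For example (a sketch; the paths are illustrative):
+
+    my $path = abs_path( "dir1/../dir2", "/base" );
+    # Now $path is "/base/dir2"; whether dir1 is a symbolic link is not checked.
+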
+=cut
+
+sub abs_path($;$) {
+
+    my ( $path, $base ) = @_;
+    $path = File::Spec->rel2abs( $path, ( defined( $base ) ? $base : $ENV{ PWD } ) );
+    my ( $vol, $dir, $file ) = File::Spec->splitpath( $path );
+    while ( $dir =~ s{/(?!\.\.)[^/]*/\.\.(?:/|\z)}{/} ) {
+    }; # while
+    $path = File::Spec->canonpath( File::Spec->catpath( $vol, $dir, $file ) );
+    return $path;
+
+}; # sub abs_path
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<rel_path( $path, $base )>
+
+Return relative path for an argument.
+
+=cut
+
+sub rel_path($;$) {
+
+    my ( $path, $base ) = @_;
+    $path = File::Spec->abs2rel( abs_path( $path ), $base );
+    return $path;
+
+}; # sub rel_path
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<real_path( $dir )>
+
+Return the real absolute path for an argument. In the result, all relative components (F<.> and F<..>)
+and symbolic links are resolved.
+
+In most cases it is not what you want. Consider using C<abs_path> first.
+
+The C<abs_path> function from the B<Cwd> module works with directories only. This function works
+with files as well. But if the file is a symbolic link, the function does not resolve it (yet).
+
+The function uses C<runtime_error> to raise an error if something goes wrong.
+
+=cut
+
+sub real_path($) {
+
+    my $orig_path = shift( @_ );
+    my $real_path;
+    my $message = "";
+    if ( not -e $orig_path ) {
+        $message = "\"$orig_path\" does not exist";
+    } else {
+        # Cwd::abs_path does not work with files, so in this case we should handle file separately.
+        my $file;
+        if ( not -d $orig_path ) {
+            ( my $vol, my $dir, $file ) = File::Spec->splitpath( File::Spec->rel2abs( $orig_path ) );
+            $orig_path = File::Spec->catpath( $vol, $dir, "" );
+        }; # if
+        {
+            local $SIG{ __WARN__ } = sub { $message = $_[ 0 ]; };
+            $real_path = Cwd::abs_path( $orig_path );
+        };
+        if ( defined( $file ) ) {
+            $real_path = File::Spec->catfile( $real_path, $file );
+        }; # if
+    }; # if
+    if ( not defined( $real_path ) or $message ne "" ) {
+        $message =~ s/^stat\(.*\): (.*)\s+at .*? line \d+\s*\z/$1/;
+        runtime_error( "Could not find real path for \"$orig_path\"" . ( $message ne "" ? ": $message" : "" ) );
+    }; # if
+    return $real_path;
+
+}; # sub real_path
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<make_dir( $dir, @options )>
+
+Make a directory.
+
+This function makes a directory. If necessary, more than one level can be created.
+If the directory already exists, nothing is done. If directory creation fails, or C<$dir> exists
+but is not a directory, an error is issued.
+
+Options:
+
+=over
+
+=item C<mode>
+
+The numeric mode for new directories, 0777 by default (the mode is modified by the process umask).
+
+=back
+
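+Example (the directory name is illustrative):
+
+    make_dir( "release/lib", mode => 0750 );
+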
+=cut
+
+sub make_dir($@) {
+
+    my $dir    = shift( @_ );
+    my %opts   =
+        validate(
+            params => \@_,
+            spec => {
+                parents => { type => "boolean", default => 1    },
+                mode    => { type => "scalar",  default => 0777 },
+            },
+        );
+
+    my $prefix = "Could not create directory \"$dir\"";
+
+    if ( -e $dir ) {
+        if ( not -d $dir ) {
+            runtime_error( "$prefix: it exists, but is not a directory." );
+        }; # if
+    } else {
+        eval {
+            File::Path::mkpath( $dir, 0, $opts{ mode } );
+        }; # eval
+        if ( $@ ) {
+            $@ =~ s{\s+at (?:[a-zA-Z0-9 /_.]*/)?tools\.pm line \d+\s*}{};
+            runtime_error( "$prefix: $@" );
+        }; # if
+        if ( not -d $dir ) { # Just in case, check it one more time...
+            runtime_error( "$prefix." );
+        }; # if
+    }; # if
+
+}; # sub make_dir
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<copy_dir( $src_dir, $dst_dir, @options )>
+
+Copy directory recursively.
+
+This function copies a directory recursively.
+If the source directory does not exist or is not a directory, an error is issued.
+
+Options:
+
+=over
+
+=item C<-overwrite>
+
+Overwrite destination directory, if it exists.
+
+=back
+
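+Example (the directory names are illustrative):
+
+    copy_dir( "exports", "backup/exports", -overwrite => 1 );
+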
+=cut
+
+sub copy_dir($$@) {
+
+    my $src  = shift( @_ );
+    my $dst  = shift( @_ );
+    my %opts = @_;
+    my $prefix = "Could not copy directory \"$src\" to \"$dst\"";
+
+    if ( not -e $src ) {
+        runtime_error( "$prefix: \"$src\" does not exist." );
+    }; # if
+    if ( not -d $src ) {
+        runtime_error( "$prefix: \"$src\" is not a directory." );
+    }; # if
+    if ( -e $dst ) {
+        if ( -d $dst ) {
+            if ( $opts{ -overwrite } ) {
+                del_dir( $dst );
+            } else {
+                runtime_error( "$prefix: \"$dst\" already exists." );
+            }; # if
+        } else {
+            runtime_error( "$prefix: \"$dst\" is not a directory." );
+        }; # if
+    }; # if
+
+    execute( [ "cp", "-R", $src, $dst ] );
+
+}; # sub copy_dir
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<move_dir( $src_dir, $dst_dir, @options )>
+
+Move directory.
+
+Options:
+
+=over
+
+=item C<-overwrite>
+
+Overwrite destination directory, if it exists.
+
+=back
+
+=cut
+
+sub move_dir($$@) {
+
+    my $src  = shift( @_ );
+    my $dst  = shift( @_ );
+    my %opts = @_;
+    my $prefix = "Could not move directory \"$src\" to \"$dst\"";
+
+    if ( not -e $src ) {
+        runtime_error( "$prefix: \"$src\" does not exist." );
+    }; # if
+    if ( not -d $src ) {
+        runtime_error( "$prefix: \"$src\" is not a directory." );
+    }; # if
+    if ( -e $dst ) {
+        if ( -d $dst ) {
+            if ( $opts{ -overwrite } ) {
+                del_dir( $dst );
+            } else {
+                runtime_error( "$prefix: \"$dst\" already exists." );
+            }; # if
+        } else {
+            runtime_error( "$prefix: \"$dst\" is not a directory." );
+        }; # if
+    }; # if
+
+    execute( [ "mv", $src, $dst ] );
+
+}; # sub move_dir
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<clean_dir( $dir, @options )>
+
+Clean a directory: delete all the entries (recursively), but leave the directory.
+
+Options:
+
+=over
+
+=item C<force> =E<gt> I<bool>
+
+If a directory is not writable, try to change permissions first, then clean it.
+
+=item C<skip> =E<gt> I<regexp>
+
+Regexp. If a directory entry matches the regexp, it is skipped, not deleted. (As a consequence,
+a directory containing skipped entries is not deleted.)
+
+=back
+
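+Example (the directory name and the pattern are illustrative):
+
+    # Delete everything in "build/" except *.log files; returns the number of
+    # skipped entries.
+    my $skipped = clean_dir( "build", skip => qr{\.log\z}, force => 1 );
+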
+=cut
+
+sub _clean_dir($);
+
+sub _clean_dir($) {
+    our %_clean_dir_opts;
+    my ( $dir ) = @_;
+    my $skip    = $_clean_dir_opts{ skip };    # Regexp.
+    my $skipped = 0;                           # Number of skipped files.
+    my $prefix  = "Cleaning `$dir' failed:";
+    my @stat    = stat( $dir );
+    my $mode    = $stat[ 2 ];
+    if ( not @stat ) {
+        runtime_error( $prefix, "Cannot stat `$dir': $!" );
+    }; # if
+    if ( not -d _ ) {
+        runtime_error( $prefix, "It is not a directory." );
+    }; # if
+    if ( not -w _ ) {        # Directory is not writable.
+        if ( not -o _ or not $_clean_dir_opts{ force } ) {
+            runtime_error( $prefix, "Directory is not writable." );
+        }; # if
+        # Directory is not writable but mine. Try to change permissions.
+        chmod( $mode | S_IWUSR, $dir )
+            or runtime_error( $prefix, "Cannot make directory writable: $!" );
+    }; # if
+    my $handle   = IO::Dir->new( $dir ) or runtime_error( $prefix, "Cannot read directory: $!" );
+    my @entries  = File::Spec->no_upwards( $handle->read() );
+    $handle->close() or runtime_error( $prefix, "Cannot read directory: $!" );
+    foreach my $entry ( @entries ) {
+        my $path = cat_file( $dir, $entry );
+        if ( defined( $skip ) and $entry =~ $skip ) {
+            ++ $skipped;
+        } else {
+            if ( -l $path ) {
+                unlink( $path ) or runtime_error( $prefix, "Cannot delete symlink `$path': $!" );
+            } else {
+                stat( $path ) or runtime_error( $prefix, "Cannot stat `$path': $! " );
+                if ( -f _ ) {
+                    del_file( $path );
+                } elsif ( -d _ ) {
+                    my $rc = _clean_dir( $path );
+                    if ( $rc == 0 ) {
+                        rmdir( $path ) or runtime_error( $prefix, "Cannot delete directory `$path': $!" );
+                    }; # if
+                    $skipped += $rc;
+                } else {
+                    runtime_error( $prefix, "`$path' is neither a file nor a directory." );
+                }; # if
+            }; # if
+        }; # if
+    }; # foreach
+    return $skipped;
+}; # sub _clean_dir
+
+
+sub clean_dir($@) {
+    my $dir  = shift( @_ );
+    our %_clean_dir_opts;
+    local %_clean_dir_opts =
+        validate(
+            params => \@_,
+            spec => {
+                skip  => { type => "regexpref" },
+                force => { type => "boolean"   },
+            },
+        );
+    my $skipped = _clean_dir( $dir );
+    return $skipped;
+}; # sub clean_dir
+
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<del_dir( $dir, @options )>
+
+Delete a directory recursively.
+
+This function deletes a directory. If the directory cannot be deleted or it is not a directory, an
+error message is issued (and the script exits).
+
+Options:
+
+=over
+
+=item C<force> =E<gt> I<bool>
+
+If the directory is not writable, try to change permissions first, then delete it.
+
+=back
+
+=cut
+
+sub del_dir($@) {
+
+    my $dir  = shift( @_ );
+    my %opts = @_;
+    my $prefix = "Deleting directory \"$dir\" failed";
+    our %_clean_dir_opts;
+    local %_clean_dir_opts =
+        validate(
+            params => \@_,
+            spec => {
+                force => { type => "boolean" },
+            },
+        );
+
+    if ( not -e $dir ) {
+        # Nothing to do.
+        return;
+    }; # if
+    if ( not -d $dir ) {
+        runtime_error( "$prefix: it is not a directory." );
+    }; # if
+    _clean_dir( $dir );
+    rmdir( $dir ) or runtime_error( "$prefix." );
+
+}; # sub del_dir
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<change_dir( $dir )>
+
+Change current directory.
+
+If any error occurs, an error is issued and the script exits.
+
+=cut
+
+sub change_dir($) {
+
+    my $dir = shift( @_ );
+
+    Cwd::chdir( $dir )
+        or runtime_error( "Could not chdir to \"$dir\": $!" );
+
+}; # sub change_dir
+
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<copy_file( $src_file, $dst_file, @options )>
+
+Copy file.
+
+This function copies a file. If the source does not exist or is not a file, an error is issued.
+
+Options:
+
+=over
+
+=item C<-overwrite>
+
+Overwrite destination file, if it exists.
+
+=back
+
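+Example (the file names are illustrative):
+
+    copy_file( "omp.h", "include/omp.h", -overwrite => 1 );
+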
+=cut
+
+sub copy_file($$@) {
+
+    my $src  = shift( @_ );
+    my $dst  = shift( @_ );
+    my %opts = @_;
+    my $prefix = "Could not copy file \"$src\" to \"$dst\"";
+
+    if ( not -e $src ) {
+        runtime_error( "$prefix: \"$src\" does not exist." );
+    }; # if
+    if ( not -f $src ) {
+        runtime_error( "$prefix: \"$src\" is not a file." );
+    }; # if
+    if ( -e $dst ) {
+        if ( -f $dst ) {
+            if ( $opts{ -overwrite } ) {
+                del_file( $dst );
+            } else {
+                runtime_error( "$prefix: \"$dst\" already exists." );
+            }; # if
+        } else {
+            runtime_error( "$prefix: \"$dst\" is not a file." );
+        }; # if
+    }; # if
+
+    File::Copy::copy( $src, $dst ) or runtime_error( "$prefix: $!" );
+    # On Windows* OS File::Copy preserves file attributes, but on Linux* OS it doesn't.
+    # So we should do it manually...
+    if ( $^O =~ m/^linux\z/ ) {
+        my $mode = ( stat( $src ) )[ 2 ]
+            or runtime_error( "$prefix: cannot get status info for source file." );
+        chmod( $mode, $dst )
+            or runtime_error( "$prefix: cannot change mode of destination file." );
+    }; # if
+
+}; # sub copy_file
+
+# -------------------------------------------------------------------------------------------------
+
+sub move_file($$@) {
+
+    my $src  = shift( @_ );
+    my $dst  = shift( @_ );
+    my %opts = @_;
+    my $prefix = "Could not move file \"$src\" to \"$dst\"";
+
+    check_opts( %opts, [ qw( -overwrite ) ] );
+
+    if ( not -e $src ) {
+        runtime_error( "$prefix: \"$src\" does not exist." );
+    }; # if
+    if ( not -f $src ) {
+        runtime_error( "$prefix: \"$src\" is not a file." );
+    }; # if
+    if ( -e $dst ) {
+        if ( -f $dst ) {
+            if ( $opts{ -overwrite } ) {
+                #
+            } else {
+                runtime_error( "$prefix: \"$dst\" already exists." );
+            }; # if
+        } else {
+            runtime_error( "$prefix: \"$dst\" is not a file." );
+        }; # if
+    }; # if
+
+    File::Copy::move( $src, $dst ) or runtime_error( "$prefix: $!" );
+
+}; # sub move_file
+
+# -------------------------------------------------------------------------------------------------
+
+sub del_file($) {
+    my $files = shift( @_ );
+    if ( ref( $files ) eq "" ) {
+        $files = [ $files ];
+    }; # if
+    foreach my $file ( @$files ) {
+        debug( "Deleting file `$file'..." );
+        my $rc = unlink( $file );
+        if ( $rc == 0 && $! != ENOENT ) {
+            # Report an error, but ignore ENOENT, because the goal is achieved.
+            runtime_error( "Deleting file `$file' failed: $!" );
+        }; # if
+    }; # foreach $file
+}; # sub del_file
+
+# -------------------------------------------------------------------------------------------------
+
+=back
+
+=cut
+
+# =================================================================================================
+# File I/O subroutines.
+# =================================================================================================
+
+=head2 File I/O subroutines.
+
+=cut
+
+#--------------------------------------------------------------------------------------------------
+
+=head3 read_file
+
+B<Synopsis:>
+
+    read_file( $file, @options )
+
+B<Description:>
+
+Read a file and return its content. In scalar context the function returns a single scalar; in
+list context it returns a list of lines.
+
+Note: if the last line of the file does not end with a newline, the function will append one.
+
+B<Arguments:>
+
+=over
+
+=item B<$file>
+
+A name or handle of file to read from.
+
+=back
+
+B<Options:>
+
+=over
+
+=item B<-binary>
+
+If true, the file is treated as a binary file: no newline conversion, trailing-space truncation, or
+newline removal is performed. The entire file is returned as a scalar.
+
+=item B<-bulk>
+
+This option is allowed only in binary mode. The option's value should be a reference to a scalar.
+If the option is present, the file content is placed into the referenced scalar and the function
+returns true (1).
+
+=item B<-chomp>
+
+If true, newline characters are removed from the file content. By default, newline characters remain.
+This option is not applicable in binary mode.
+
+=item B<-keep_trailing_space>
+
+If true, trailing spaces remain at the ends of lines. By default, all trailing spaces are removed.
+This option is not applicable in binary mode.
+
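+=item B<-error>
+
+Controls error handling. The value C<"error"> (the default) raises a runtime error, C<"warning">
+issues a warning, C<"ignore"> suppresses the error, and a reference to an array collects error
+messages into that array.
+
+=item B<-layer>
+
+An I/O layer to apply to the file handle, for example C<":utf8"> (in which case a leading BOM,
+if any, is removed). This option is not applicable in binary mode.
+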
+=back
+
+B<Examples:>
+
+Return the file as a single scalar, removing trailing spaces.
+
+    my $bulk = read_file( "message.txt" );
+
+Return the file as a list of lines with trailing spaces and
+newline characters removed.
+
+    my @bulk = read_file( "message.txt", -chomp => 1 );
+
+Read a binary file:
+
+    my $bulk = read_file( "message.txt", -binary => 1 );
+
+Read a big binary file:
+
+    my $bulk;
+    read_file( "big_binary_file", -binary => 1, -bulk => \$bulk );
+
+Read from standard input:
+
+    my @bulk = read_file( \*STDIN );
+
+=cut
+
+sub read_file($@) {
+
+    my $file = shift( @_ );  # The name or handle of file to read from.
+    my %opts = @_;           # Options.
+
+    my $name;
+    my $handle;
+    my @bulk;
+    my $error = \&runtime_error;
+
+    my @binopts = qw( -binary -error -bulk );                       # Options available in binary mode.
+    my @txtopts = qw( -binary -error -keep_trailing_space -chomp -layer ); # Options available in text (non-binary) mode.
+    check_opts( %opts, [ @binopts, @txtopts ] );
+    if ( $opts{ -binary } ) {
+        check_opts( %opts, [ @binopts ], "these options cannot be used with -binary" );
+    } else {
+        check_opts( %opts, [ @txtopts ], "these options cannot be used without -binary" );
+    }; # if
+    if ( not exists( $opts{ -error } ) ) {
+        $opts{ -error } = "error";
+    }; # if
+    if ( $opts{ -error } eq "warning" ) {
+        $error = \&warning;
+    } elsif( $opts{ -error } eq "ignore" ) {
+        $error = sub {};
+    } elsif ( ref( $opts{ -error } ) eq "ARRAY" ) {
+        $error = sub { push( @{ $opts{ -error } }, $_[ 0 ] ); };
+    }; # if
+
+    if ( ( ref( $file ) eq "GLOB" ) or UNIVERSAL::isa( $file, "IO::Handle" ) ) {
+        $name = "unknown";
+        $handle = $file;
+    } else {
+        $name = $file;
+        if ( get_ext( $file ) eq ".gz" and not $opts{ -binary } ) {
+            $handle = IO::Zlib->new( $name, "rb" );
+        } else {
+            $handle = IO::File->new( $name, "r" );
+        }; # if
+        if ( not defined( $handle ) ) {
+            $error->( "File \"$name\" could not be opened for input: $!" );
+        }; # if
+    }; # if
+    if ( defined( $handle ) ) {
+        if ( $opts{ -binary } ) {
+            binmode( $handle );
+            local $/ = undef;   # Set input record separator to undef to read entire file as one line.
+            if ( exists( $opts{ -bulk } ) ) {
+                ${ $opts{ -bulk } } = $handle->getline();
+            } else {
+                $bulk[ 0 ] = $handle->getline();
+            }; # if
+        } else {
+            if ( defined( $opts{ -layer } ) ) {
+                binmode( $handle, $opts{ -layer } );
+            }; # if
+            @bulk = $handle->getlines();
+            # Special trick for UTF-8 files: Delete BOM, if any.
+            if ( defined( $opts{ -layer } ) and $opts{ -layer } eq ":utf8" ) {
+                if ( substr( $bulk[ 0 ], 0, 1 ) eq "\x{FEFF}" ) {
+                    substr( $bulk[ 0 ], 0, 1 ) = "";
+                }; # if
+            }; # if
+        }; # if
+        $handle->close()
+            or $error->( "File \"$name\" could not be closed after input: $!" );
+    } else {
+        if ( $opts{ -binary } and exists( $opts{ -bulk } ) ) {
+            ${ $opts{ -bulk } } = "";
+        }; # if
+    }; # if
+    if ( $opts{ -binary } ) {
+        if ( exists( $opts{ -bulk } ) ) {
+            return 1;
+        } else {
+            return $bulk[ 0 ];
+        }; # if
+    } else {
+        if ( ( @bulk > 0 ) and ( substr( $bulk[ -1 ], -1, 1 ) ne "\n" ) ) {
+            $bulk[ -1 ] .= "\n";
+        }; # if
+        if ( not $opts{ -keep_trailing_space } ) {
+            map( $_ =~ s/\s+\n\z/\n/, @bulk );
+        }; # if
+        if ( $opts{ -chomp } ) {
+            chomp( @bulk );
+        }; # if
+        if ( wantarray() ) {
+            return @bulk;
+        } else {
+            return join( "", @bulk );
+        }; # if
+    }; # if
+
+}; # sub read_file
+
+#--------------------------------------------------------------------------------------------------
+
+=head3 write_file
+
+B<Synopsis:>
+
+    write_file( $file, $bulk, @options )
+
+B<Description:>
+
+Write file.
+
+B<Arguments:>
+
+=over
+
+=item B<$file>
+
+The name or handle of the file to write to.
+
+=item B<$bulk>
+
+The content to write to the file. Can be a scalar, or a reference to a scalar or an array.
+
+=back
+
+B<Options:>
+
+=over
+
+=item B<-backup>
+
+If true, create a backup copy of the file being overwritten. The backup copy is placed into the
+same directory. The name of the backup copy is the name of the file with `~' appended. By default
+a backup copy is not created.
+
+=item B<-append>
+
+If true, the text will be appended to the existing file.
+
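+=item B<-binary>
+
+If true, the file handle is put into binary mode, and no trailing newline is appended to lines
+that lack one.
+
+=item B<-layer>
+
+An I/O layer to apply to the file handle, for example C<":utf8">.
+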
+=back
+
+B<Examples:>
+
+    write_file( "message.txt", \$bulk );
+        # Write file, take content from a scalar.
+
+    write_file( "message.txt", \@bulk, -backup => 1 );
+        # Write file, take content from an array, create a backup copy.
+
+=cut
+
+sub write_file($$@) {
+
+    my $file = shift( @_ );  # The name or handle of file to write to.
+    my $bulk = shift( @_ );  # The text to write. Can be reference to array or scalar.
+    my %opts = @_;           # Options.
+
+    my $name;
+    my $handle;
+
+    check_opts( %opts, [ qw( -append -backup -binary -layer ) ] );
+
+    my $mode = $opts{ -append } ? "a": "w";
+    if ( ( ref( $file ) eq "GLOB" ) or UNIVERSAL::isa( $file, "IO::Handle" ) ) {
+        $name = "unknown";
+        $handle = $file;
+    } else {
+        $name = $file;
+        if ( $opts{ -backup } and ( -f $name ) ) {
+            copy_file( $name, $name . "~", -overwrite => 1 );
+        }; # if
+        $handle = IO::File->new( $name, $mode )
+            or runtime_error( "File \"$name\" could not be opened for output: $!" );
+    }; # if
+    if ( $opts{ -binary } ) {
+        binmode( $handle );
+    } elsif ( $opts{ -layer } ) {
+        binmode( $handle, $opts{ -layer } );
+    }; # if
+    if ( ref( $bulk ) eq "" ) {
+        if ( defined( $bulk ) ) {
+            $handle->print( $bulk );
+            if ( not $opts{ -binary } and ( substr( $bulk, -1 ) ne "\n" ) ) {
+                $handle->print( "\n" );
+            }; # if
+        }; # if
+    } elsif ( ref( $bulk ) eq "SCALAR" ) {
+        if ( defined( $$bulk ) ) {
+            $handle->print( $$bulk );
+            if ( not $opts{ -binary } and ( substr( $$bulk, -1 ) ne "\n" ) ) {
+                $handle->print( "\n" );
+            }; # if
+        }; # if
+    } elsif ( ref( $bulk ) eq "ARRAY" ) {
+        foreach my $line ( @$bulk ) {
+            if ( defined( $line ) ) {
+                $handle->print( $line );
+                if ( not $opts{ -binary } and ( substr( $line, -1 ) ne "\n" ) ) {
+                    $handle->print( "\n" );
+                }; # if
+            }; # if
+        }; # foreach
+    } else {
+        Carp::croak( "write_file: \$bulk must be a scalar or reference to (scalar or array)" );
+    }; # if
+    $handle->close()
+        or runtime_error( "File \"$name\" could not be closed after output: $!" );
+
+}; # sub write_file
+
+#--------------------------------------------------------------------------------------------------
+
+
+# =================================================================================================
+# Execution subroutines.
+# =================================================================================================
+
+=head2 Execution subroutines.
+
+=over
+
+=cut
+
+#--------------------------------------------------------------------------------------------------
+
+sub _pre {
+
+    my $arg = shift( @_ );
+
+    # If redirection is not required, exit.
+    if ( not exists( $arg->{ redir } ) ) {
+        return 0;
+    }; # if
+
+    # Input parameters.
+    my $mode   = $arg->{ mode   }; # Mode, "<" (input ) or ">" (output).
+    my $handle = $arg->{ handle }; # Handle to manipulate.
+    my $redir  = $arg->{ redir  }; # Data, a file name if a scalar, or file contents, if a reference.
+
+    # Output parameters.
+    my $save_handle;
+    my $temp_handle;
+    my $temp_name;
+
+    # Save original handle (by duping it).
+    $save_handle = Symbol::gensym();
+    $handle->flush();
+    open( $save_handle, $mode . "&" . $handle->fileno() )
+        or die( "Cannot dup filehandle: $!" );
+
+    # Prepare a file to IO.
+    if ( UNIVERSAL::isa( $redir, "IO::Handle" ) or ( ref( $redir ) eq "GLOB" ) ) {
+        # $redir is a reference to an object of the IO::Handle class (or its descendant).
+        $temp_handle = $redir;
+    } elsif ( ref( $redir ) ) {
+        # $redir is a reference to content to be read/written.
+        # Prepare temp file.
+        ( $temp_handle, $temp_name ) =
+            File::Temp::tempfile(
+                "$tool.XXXXXXXX",
+                DIR    => File::Spec->tmpdir(),
+                SUFFIX => ".tmp",
+                UNLINK => 1
+            );
+        if ( not defined( $temp_handle ) ) {
+            runtime_error( "Could not create temp file." );
+        }; # if
+        if ( $mode eq "<" ) {
+            # It is a file to be read by child, prepare file content to be read.
+            $temp_handle->print( ref( $redir ) eq "SCALAR" ? ${ $redir } : @{ $redir } );
+            $temp_handle->flush();
+            seek( $temp_handle, 0, 0 );
+                # Unfortunately, I could not use the OO interface to seek.
+                # ActivePerl 5.6.1 complains on both forms:
+                #    $temp_handle->seek( 0 );    # As declared in IO::Seekable.
+                #    $temp_handle->setpos( 0 );  # As described in documentation.
+        } elsif ( $mode eq ">" ) {
+            # It is a file for output. Clear output variable.
+            if ( ref( $redir ) eq "SCALAR" ) {
+                ${ $redir } = "";
+            } else {
+                @{ $redir } = ();
+            }; # if
+        }; # if
+    } else {
+        # $redir is a name of file to be read/written.
+        # Just open file.
+        if ( defined( $redir ) ) {
+            $temp_name = $redir;
+        } else {
+            $temp_name = File::Spec->devnull();
+        }; # if
+        $temp_handle = IO::File->new( $temp_name, $mode )
+            or runtime_error( "File \"$temp_name\" could not be opened for " . ( $mode eq "<" ? "input" : "output" ) . ": $!" );
+    }; # if
+
+    # Redirect handle to temp file.
+    open( $handle, $mode . "&" . $temp_handle->fileno() )
+        or die( "Cannot dup filehandle: $!" );
+
+    # Save output parameters.
+    $arg->{ save_handle } = $save_handle;
+    $arg->{ temp_handle } = $temp_handle;
+    $arg->{ temp_name   } = $temp_name;
+
+}; # sub _pre
+
+
+sub _post {
+
+    my $arg = shift( @_ );
+
+    # Input parameters.
+    my $mode   = $arg->{ mode   }; # Mode, "<" or ">".
+    my $handle = $arg->{ handle }; # Handle to save and set.
+    my $redir  = $arg->{ redir  }; # Data, a file name if a scalar, or file contents, if a reference.
+
+    # Parameters saved during preprocessing.
+    my $save_handle = $arg->{ save_handle };
+    my $temp_handle = $arg->{ temp_handle };
+    my $temp_name   = $arg->{ temp_name   };
+
+    # If no handle was saved, exit.
+    if ( not $save_handle ) {
+        return 0;
+    }; # if
+
+    # Close handle.
+    $handle->close()
+        or die( "$!" );
+
+    # Read the content of temp file, if necessary, and close temp file.
+    if ( ( $mode ne "<" ) and ref( $redir ) ) {
+        $temp_handle->flush();
+        seek( $temp_handle, 0, 0 );
+        if ( $^O =~ m/MSWin/ ) {
+            binmode( $temp_handle, ":crlf" );
+        }; # if
+        if ( ref( $redir ) eq "SCALAR" ) {
+            ${ $redir } .= join( "", $temp_handle->getlines() );
+        } elsif ( ref( $redir ) eq "ARRAY" ) {
+            push( @{ $redir }, $temp_handle->getlines() );
+        }; # if
+    }; # if
+    if ( not UNIVERSAL::isa( $redir, "IO::Handle" ) ) {
+        $temp_handle->close()
+            or die( "$!" );
+    }; # if
+
+    # Restore handle to original value.
+    $save_handle->flush();
+    open( $handle, $mode . "&" . $save_handle->fileno() )
+        or die( "Cannot dup filehandle: $!" );
+
+    # Close save handle.
+    $save_handle->close()
+        or die( "$!" );
+
+    # Delete parameters saved during preprocessing.
+    delete( $arg->{ save_handle } );
+    delete( $arg->{ temp_handle } );
+    delete( $arg->{ temp_name   } );
+
+}; # sub _post
+
+#--------------------------------------------------------------------------------------------------
+
+=item C<execute( [ @command ], @options )>
+
+Execute specified program or shell command.
+
+Program is specified by a reference to an array; that array is passed to the C<system()> function,
+which executes the command. See L<perlfunc> for details on how C<system()> interprets various forms
+of C<@command>.
+
+By default, in case of any error an error message is issued and the script terminates (via
+runtime_error()). The function returns the exit code of the program.
+
+Alternatively, the function may return the exit status of the program (see C<-ignore_status>) or the
+signal number (see C<-ignore_signal>) so the caller may analyze it and continue execution.
+
+Options:
+
+=over
+
+=item C<-stdin>
+
+Redirect stdin of program. The value of option can be:
+
+=over
+
+=item C<undef>
+
+Stdin of the child is attached to the null device.
+
+=item a string
+
+Stdin of the child is attached to a file with the name specified by the option.
+
+=item a reference to a scalar
+
+A dereferenced scalar is written to a temp file, and child's stdin is attached to that file.
+
+=item a reference to an array
+
+A dereferenced array is written to a temp file, and child's stdin is attached to that file.
+
+=back
+
+=item C<-stdout>
+
+Redirect stdout. Possible values are the same as for the C<-stdin> option. The only difference is
+that a reference specifies a variable receiving the program's output.
+
+=item C<-stderr>
+
+It is similar to C<-stdout>, but redirects stderr. There is one additional possible value:
+
+=over
+
+=item an empty string
+
+means that stderr should be redirected to the same place as stdout.
+
+=back
+
+=item C<-append>
+
+The redirected stream will not overwrite the previous content of the file (or variable).
+Note that this option affects both stdout and stderr.
+
+=item C<-ignore_status>
+
+By default, the subroutine raises an error and exits the script if the program returns a non-zero
+exit status. If this option is true, no error is raised. Instead, the status is returned as the
+function result (and $@ is set to the error message).
+
+=item C<-ignore_signal>
+
+By default, the subroutine raises an error and exits the script if the program dies with a signal.
+If this option is true, no error is raised in such a case. Instead, the signal number is returned
+(as a negative value), and the error message is placed into the C<$@> variable.
+
+If the command could not be started at all, -256 is returned.
+
+=back
+
+Examples:
+
+    execute( [ "cmd.exe", "/c", "dir" ] );
+        # Execute NT shell with specified options, no redirections are
+        # made.
+
+    my $output;
+    execute( [ "cvs", "-n", "-q", "update", "." ], -stdout => \$output );
+        # Execute "cvs -n -q update ." command, output is saved
+        # in $output variable.
+
+    my @output;
+    execute( [ qw( cvs -n -q update . ) ], -stdout => \@output, -stderr => undef );
+        # Execute specified command, output is saved in @output
+        # variable, stderr stream is redirected to null device
+        # (/dev/null on Linux* OS and nul on Windows* OS).
+
+=cut
+
+sub execute($@) {
+
+    # !!! Add something to complain on unknown options...
+
+    my $command = shift( @_ );
+    my %opts    = @_;
+    my $prefix  = "Could not execute $command->[ 0 ]";
+
+    check_opts( %opts, [ qw( -stdin -stdout -stderr -append -ignore_status -ignore_signal ) ] );
+
+    if ( ref( $command ) ne "ARRAY" ) {
+        Carp::croak( "execute: $command must be a reference to array" );
+    }; # if
+
+    my $stdin  = { handle => \*STDIN,  mode => "<" };
+    my $stdout = { handle => \*STDOUT, mode => ">" };
+    my $stderr = { handle => \*STDERR, mode => ">" };
+    my $streams = {
+        stdin  => $stdin,
+        stdout => $stdout,
+        stderr => $stderr
+    }; # $streams
+
+    for my $stream ( qw( stdin stdout stderr ) ) {
+        if ( exists( $opts{ "-$stream" } ) ) {
+            if ( ref( $opts{ "-$stream" } ) !~ m/\A(|SCALAR|ARRAY)\z/ ) {
+                Carp::croak( "execute: -$stream option: must have value of scalar, or reference to (scalar or array)." );
+            }; # if
+            $streams->{ $stream }->{ redir } = $opts{ "-$stream" };
+        }; # if
+        if ( $opts{ -append } and ( $streams->{ $stream }->{ mode } ) eq ">" ) {
+            $streams->{ $stream }->{ mode } = ">>";
+        }; # if
+    }; # foreach $stream
+
+    _pre( $stdin  );
+    _pre( $stdout );
+    if ( defined( $stderr->{ redir } ) and not ref( $stderr->{ redir } ) and ( $stderr->{ redir } eq "" ) ) {
+        if ( exists( $stdout->{ redir } ) ) {
+            $stderr->{ redir } = $stdout->{ temp_handle };
+        } else {
+            $stderr->{ redir } = $stdout->{ handle };
+        }; # if
+    }; # if
+    _pre( $stderr );
+    my $rc = system( @$command );
+    my $errno = $!;
+    my $child = $?;
+    _post( $stderr );
+    _post( $stdout );
+    _post( $stdin  );
+
+    my $exit = 0;
+    my $signal_num  = $child & 127;
+    my $exit_status = $child >> 8;
+    $@ = "";
+
+    if ( $rc == -1 ) {
+        $@ = "\"$command->[ 0 ]\" failed: $errno";
+        $exit = -256;
+        if ( not $opts{ -ignore_signal } ) {
+            runtime_error( $@ );
+        }; # if
+    } elsif ( $signal_num != 0 ) {
+        $@ = "\"$command->[ 0 ]\" failed due to signal $signal_num.";
+        $exit = - $signal_num;
+        if ( not $opts{ -ignore_signal } ) {
+            runtime_error( $@ );
+        }; # if
+    } elsif ( $exit_status != 0 ) {
+        $@ = "\"$command->[ 0 ]\" returned non-zero status $exit_status.";
+        $exit = $exit_status;
+        if ( not $opts{ -ignore_status } ) {
+            runtime_error( $@ );
+        }; # if
+    }; # if
+
+    return $exit;
+
+}; # sub execute
+
+#--------------------------------------------------------------------------------------------------
+
+=item C<backticks( [ @command ], @options )>
+
+Run the specified program or shell command and return its output.
+
+In scalar context the entire output is returned as a single string. In list context a list of
+strings is returned. The function issues an error and exits the script if any error occurs.
+
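+Example (assuming a Linux* OS command; any command in the PATH works):
+
+    my @lines = backticks( [ "ls", "-1" ], -chomp => 1 );
+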
+=cut
+
+
+sub backticks($@) {
+
+    my $command = shift( @_ );
+    my %opts    = @_;
+    my @output;
+
+    check_opts( %opts, [ qw( -chomp ) ] );
+
+    execute( $command, -stdout => \@output );
+
+    if ( $opts{ -chomp } ) {
+        chomp( @output );
+    }; # if
+
+    return ( wantarray() ? @output : join( "", @output ) );
+
+}; # sub backticks
+
+#--------------------------------------------------------------------------------------------------
+
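+# Pad $str on the right with $pad (repeated as necessary) up to $length characters;
+# e.g. pad( "Name", 10, "." ) returns "Name......".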
+sub pad($$$) {
+    my ( $str, $length, $pad ) = @_;
+    my $lstr = length( $str );    # Length of source string.
+    if ( $lstr < $length ) {
+        my $lpad  = length( $pad );                         # Length of pad.
+        my $count = int( ( $length - $lstr ) / $lpad );     # Number of pad repetitions.
+        my $tail  = $length - ( $lstr + $lpad * $count );
+        $str = $str . ( $pad x $count ) . substr( $pad, 0, $tail );
+    }; # if
+    return $str;
+}; # sub pad
+
+# --------------------------------------------------------------------------------------------------
+
+=back
+
+=cut
+
+#--------------------------------------------------------------------------------------------------
+
+return 1;
+
+#--------------------------------------------------------------------------------------------------
+
+
+# End of file.
diff --git a/final/runtime/tools/linux.inc b/final/runtime/tools/linux.inc
new file mode 100644
index 0000000..7ad2512
--- /dev/null
+++ b/final/runtime/tools/linux.inc
@@ -0,0 +1,35 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+####### Detections and Commands ###############################################
+
+ifndef arch
+        uname_m:=$(shell uname -m)
+        ifeq ($(uname_m),i686)
+                export arch:=32
+        endif
+        ifeq ($(uname_m),x86_64)
+                export arch:=32e
+        endif
+        ifndef arch
+                export arch:=$(uname_m)
+        endif
+endif
+
+CMD=sh -c
+CWD=$(shell pwd)
+CP=cp
+RM?=rm -f
+RMR?=rm -rf
+RD?=rmdir
+MD?=mkdir -p
+NUL= /dev/null
+SLASH=/
diff --git a/final/runtime/tools/macos.inc b/final/runtime/tools/macos.inc
new file mode 100644
index 0000000..38f86b1
--- /dev/null
+++ b/final/runtime/tools/macos.inc
@@ -0,0 +1,37 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+####### Detections and Commands ###############################################
+
+ifndef arch
+ ifeq ($(shell /usr/sbin/sysctl -n hw.machine),Power Macintosh)
+   ifeq ($(shell /usr/sbin/sysctl -n hw.optional.64bitops),1)
+     export arch:=ppc64
+   else
+     export arch:=ppc32
+   endif
+ else
+   ifeq ($(shell /usr/sbin/sysctl -n hw.optional.x86_64 2>/dev/null),1)
+     export arch:=intel64
+   else
+     export arch:=ia32
+   endif
+ endif
+endif
+
+CMD=$(SHELL) -c
+CWD=$(shell pwd)
+RM?=rm -f
+RMR?=rm -rf
+RD?=rmdir
+MD?=mkdir -p
+NUL= /dev/null
+SLASH=/
diff --git a/final/runtime/tools/make-fat-binaries.pl b/final/runtime/tools/make-fat-binaries.pl
new file mode 100755
index 0000000..b297e31
--- /dev/null
+++ b/final/runtime/tools/make-fat-binaries.pl
@@ -0,0 +1,235 @@
+#!/usr/bin/env perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use IO::Dir;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+
+use tools;
+
+our $VERSION = "0.003";
+
+#
+# Subroutines.
+#
+
+sub check_dir($$) {
+
+    # Make sure a directory is a readable directory.
+
+    my ( $dir, $type ) = @_;
+
+    -e $dir or runtime_error( "Directory \"$dir\" does not exist" );
+    -d $dir or runtime_error( "\"$dir\" is not a directory" );
+    -r $dir or runtime_error( "Directory \"$dir\" is not readable" );
+
+}; # sub check_dir
+
+sub read_dir($) {
+
+    # Return list of files (not subdirectories) of specified directory.
+
+    my ( $dir ) = @_;
+    my $handle;
+    my $entry;
+    my @files;
+
+    $handle = IO::Dir->new( $dir ) or runtime_error( "Cannot open \"$dir\" directory: $!" );
+    while ( defined( $entry = $handle->read() ) ) {
+        my $path = "$dir/$entry";
+        if ( $entry !~ m{\A\.} and -f $path ) {
+            push( @files, $entry );
+        }; # if
+    }; # while
+    $handle->close();
+
+    @files = sort( @files );
+    return @files;
+
+}; # sub read_dir
+
+# --------------------------------------------------------------------------------------------------
+# Main program.
+# --------------------------------------------------------------------------------------------------
+
+#
+# Parse command line.
+#
+my @dirs;    # List of input directories.
+my @files;   # List of files.
+my $output;  # Output directory.
+
+get_options(
+    "output=s" => \$output
+);
+
+@ARGV == 0 and cmdline_error( "No input directories specified" );
+defined( $output ) or cmdline_error( "No output directory specified" );
+
+#
+# Check input and output directories.
+#
+
+# Make sure there are no duplicated directories.
+my %dirs;
+$dirs{ $output } = "";
+foreach my $dir ( @ARGV ) {
+    if ( exists( $dirs{ $dir } ) ) {
+        cmdline_error( "Directory \"$dir\" has already been specified" );
+    }; # if
+    $dirs{ $dir } = "";
+    push( @dirs, $dir );
+}; # foreach $dir
+undef( %dirs );
+
+# Make sure all dirs exist, are directories, and are readable.
+check_dir( $output, "output" );
+foreach my $dir ( @dirs ) {
+    check_dir( $dir,  "input" );
+}; # foreach $dir
+
+# All input dirs should contain exactly the same list of files.
+my @errors;
+@files = read_dir( $dirs[ 0 ] );
+foreach my $dir ( @dirs ) {
+    my %files = map( ( $_ => 0 ), @files );
+    foreach my $file ( read_dir( $dir ) ) {
+        if ( not exists( $files{ $file } ) ) {
+            push( @errors, "Extra file: `" . cat_file( $dir, $file ) . "'." );
+        }; # if
+        $files{ $file } = 1;
+    }; # foreach $file
+    foreach my $file ( keys( %files ) ) {
+        if ( $files{ $file } == 0 ) {
+            push( @errors, "Missed file: `" . cat_file( $dir, $file ) . "'." );
+        }; # if
+    }; # foreach $file
+}; # foreach $dir
+if ( @errors ) {
+    runtime_error( @errors );
+}; # if
+
+#
+# Make fat binaries.
+#
+
+foreach my $file ( sort( @files ) ) {
+    info( "Making \"$file\"..." );
+    my $output_file = cat_file( $output, $file );
+    del_file( $output_file );
+    execute(
+        [
+            "lipo",
+            "-create",
+            "-output", $output_file,
+            map( cat_file( $_, $file ), @dirs )
+        ]
+    );
+}; # foreach $file
+
+exit( 0 );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<make-fat-binaries.pl> -- Make a set of fat (universal) binaries.
+
+=head1 SYNOPSIS
+
+B<make-fat-binaries.pl> I<OPTION>... I<INPUT_DIR>...
+
+=head1 OPTIONS
+
+=over
+
+=item B<--output=>I<DIR>
+
+Name of the output directory in which to place fat binaries. The directory must exist and be writable.
+
+=item Standard Options
+
+=over
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full help message and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--usage>
+
+Print very short usage message and exit.
+
+=item B<--verbose>
+
+Print informational messages.
+
+=item B<--version>
+
+Print program version and exit.
+
+=item B<--quiet>
+
+Work quietly; do not print informational messages.
+
+=back
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<INPUT_DIR>
+
+Name of an input directory to read thin files from. The directory must exist and be readable. At
+least one directory is required.
+
+=back
+
+=head1 DESCRIPTION
+
+The script creates a set of Mach-O fat (universal, multi-architecture) binaries from a set of thin
+(single-architecture) files.
+
+The script reads files from the input directory (or directories). It is assumed that one input
+directory keeps files for one architecture (e. g. i386), another directory contains files for
+another architecture (e. g. x86_64), etc. All input directories must contain the same set of files.
+The script issues an error if the sets of files in the input directories differ.
+
+If the script finishes successfully, the output directory will contain a set of universal binaries
+built from the files with the same name in the input directories.
+
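+Internally, the script invokes Apple's C<lipo> tool once per file. For a file named F<libfoo.a>
+(a hypothetical name) present in both input directories, the call is equivalent to:
+
+    $ lipo -create -output mac.fat/libfoo.a mac_32.thin/libfoo.a mac_32e.thin/libfoo.a
+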
+=head1 EXAMPLES
+
+Get thin binaries from C<mac_32.thin/> and C<mac_32e.thin/> directories, and put fat binaries to
+C<mac.fat/> directory:
+
+    $ make-fat-binaries.pl --output=mac.fat mac_32.thin mac_32e.thin
+
+
+=cut
+
+# end of file #
diff --git a/final/runtime/tools/message-converter.pl b/final/runtime/tools/message-converter.pl
new file mode 100755
index 0000000..f4f4b99
--- /dev/null
+++ b/final/runtime/tools/message-converter.pl
@@ -0,0 +1,775 @@
+#!/usr/bin/perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use File::Glob ":glob";
+use Encode qw{ encode };
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+use Platform ":vars";
+
+our $VERSION = "0.04";
+my $escape      = qr{%};
+my $placeholder = qr{(\d)\$(s|l?[du])};
+
+my $sections =
+    {
+        meta     => { short => "prp" }, # "prp" stands for "property".
+        strings  => { short => "str" },
+        formats  => { short => "fmt" },
+        messages => { short => "msg" },
+        hints    => { short => "hnt" },
+    };
+my @sections = qw{ meta strings formats messages hints };
+# Assign section properties: long name, set number, base number.
+map( $sections->{ $sections[ $_ ] }->{ long } = $sections[ $_ ],      ( 0 .. @sections - 1 ) );
+map( $sections->{ $sections[ $_ ] }->{ set  } = ( $_ + 1 ),           ( 0 .. @sections - 1 ) );
+map( $sections->{ $sections[ $_ ] }->{ base } = ( ( $_ + 1 ) << 16 ), ( 0 .. @sections - 1 ) );
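+# For example, the "strings" section (index 1) is assigned set 2 and base 2 << 16 = 131072,
+# which matches the kmp_i18n_str_first value shown in the POD below.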
+
+# Properties of Meta section.
+my @properties = qw{ Language Country LangId Version Revision };
+
+
+sub _generate_comment($$$) {
+
+    my ( $data, $open, $close ) = @_;
+    my $bulk =
+        $open . " Do not edit this file! " . $close . "\n" .
+        $open . " The file was generated from " . get_file( $data->{ "%meta" }->{ source } ) .
+            " by " . $tool . " on " . localtime() . ". " . $close . "\n";
+    return $bulk;
+
+}; # sub _generate_comment
+
+
+sub msg2sgn($) {
+
+    # Convert message string to signature. Signature is a list of placeholders in sorted order.
+    # For example, signature of "%1$s value \"%2$s\" is invalid." is "%1$s %2$s".
+
+    my ( $msg ) = @_;
+    my @placeholders;
+    pos( $msg ) = 0;
+    while ( $msg =~ m{\G.*?$escape$placeholder}g ) {
+        $placeholders[ $1 - 1 ] = "%$1\$$2";
+    }; # while
+    for ( my $i = 1; $i <= @placeholders; ++ $i ) {
+        if ( not defined( $placeholders[ $i - 1 ] ) ) {
+            $placeholders[ $i - 1 ] = "%$i\$-";
+        }; # if
+    }; # for $i
+    return join( " ", @placeholders );
+
+}; # sub msg2sgn
+
+
+sub msg2src($) {
+
+    # Convert message string to a C string constant.
+
+    my ( $msg ) = @_;
+    if ( $target_os eq "win" ) {
+        $msg =~ s{$escape$placeholder}{\%$1!$2!}g;
+    }; # if
+    return $msg;
+
+}; # sub msg2src
+
+
+my $special =
+    {
+        "n" => "\n",
+        "t" => "\t",
+    };
+
+sub msg2mc($) {
+    my ( $msg ) = @_;
+    $msg = msg2src( $msg ); # Get windows style placeholders.
+    $msg =~ s{\\(.)}{ exists( $special->{ $1 } ) ? $special->{ $1 } : $1 }ge;
+    return $msg;
+}; # sub msg2mc
+
+
+
+sub parse_message($) {
+
+    my ( $msg ) = @_;
+    pos( $msg ) = 0;
+    for ( ; ; ) {
+        if ( $msg !~ m{\G.*?$escape}gc ) {
+            last;
+        }
+        if ( $msg !~ m{\G$placeholder}gc ) {
+            return "Bad %-sequence near \"%" . substr( $msg, pos( $msg ), 7 ) . "\"";
+        }; # if
+    }; # forever
+    return undef;
+
+}; # sub parse_message
+
+
+sub parse_source($) {
+
+    my ( $name ) = @_;
+
+    my @bulk = read_file( $name, -layer => ":utf8" );
+    my $data = {};
+
+    my $line;
+    my $n = 0;         # Line number.
+    my $obsolete = 0;  # Counter of obsolete entries.
+    my $last_idx;
+    my %idents;
+    my $section;
+
+    my $error =
+        sub {
+            my ( $n, $line, $msg ) = @_;
+            runtime_error( "Error parsing $name line $n: " . "$msg:\n" . "    $line" );
+        }; # sub
+
+    foreach $line ( @bulk ) {
+        ++ $n;
+        # Skip empty lines and comments.
+        if ( $line =~ m{\A\s*(\n|#)} ) {
+            $last_idx = undef;
+            next;
+        }; # if
+        # Parse section header.
+        if ( $line =~ m{\A-\*-\s*([A-Z_]*)\s*-\*-\s*\n\z}i ) {
+            $section = ( lc( $1 ) );
+            if ( not grep( $section eq $_, @sections ) ) {
+                $error->( $n, $line, "Unknown section \"$section\" specified" );
+            }; # if
+            if ( exists( $data->{ $section } ) ) {
+                $error->( $n, $line, "Multiple sections of the same type specified" );
+            }; # if
+            %idents = ();     # Clean list of known message identifiers.
+            next;
+        }; # if
+        if ( not defined( $section ) ) {
+            $error->( $n, $line, "Section heading expected" );
+        }; # if
+        # Parse section body.
+        if ( $section eq "meta" ) {
+            if ( $line =~ m{\A([A-Z_][A-Z_0-9]*)\s+"(.*)"\s*?\n?\z}i ) {
+                # Parse meta properties (such as Language, Country, and LangId).
+                my ( $property, $value ) = ( $1, $2 );
+                if ( not grep( $_ eq $property , @properties ) ) {
+                    $error->( $n, $line, "Unknown property \"$property\" specified" );
+                }; # if
+                if ( exists( $data->{ "%meta" }->{ $property } ) ) {
+                    $error->( $n, $line, "Property \"$property\" has already been specified" );
+                }; # if
+                $data->{ "%meta" }->{ $property } = $value;
+                $last_idx = undef;
+                next;
+            }; # if
+            $error->( $n, $line, "Property line expected" );
+        }; # if
+        # Parse message.
+        if ( $line =~ m{\A([A-Z_][A-Z_0-9]*)\s+"(.*)"\s*?\n?\z}i ) {
+            my ( $ident, $message ) = ( $1, $2 );
+            if ( $ident eq "OBSOLETE" ) {
+                # If id is "OBSOLETE", add a unique suffix. It provides convenient way to mark
+                # obsolete messages.
+                ++ $obsolete;
+                $ident .= $obsolete;
+            }; # if
+            if ( exists( $idents{ $ident } ) ) {
+                $error->( $n, $line, "Identifier \"$ident\" is redefined" );
+            }; # if
+            # Check %-sequences.
+            my $err = parse_message( $message );
+            if ( $err ) {
+                $error->( $n, $line, $err );
+            }; # if
+            # Save message.
+            push( @{ $data->{ $section } }, [ $ident, $message ] );
+            $idents{ $ident } = 1;
+            $last_idx = @{ $data->{ $section } } - 1;
+            next;
+        }; # if
+        # Parse continuation line.
+        if ( $line =~ m{\A\s*"(.*)"\s*\z} ) {
+            my $message = $1;
+            if ( not defined( $last_idx )  ) {
+                $error->( $n, $line, "Unexpected continuation line" );
+            }; # if
+            # Check %-sequences.
+            my $err = parse_message( $message );
+            if ( $err ) {
+                $error->( $n, $line, $err );
+            }; # if
+            # Save continuation.
+            $data->{ $section }->[ $last_idx ]->[ 1 ] .= $message;
+            next;
+        }; # if
+        $error->( $n, $line, "Message definition expected" );
+    }; # foreach
+    $data->{ "%meta" }->{ source } = $name;
+    foreach my $section ( @sections ) {
+        if ( not exists( $data->{ $section } ) ) {
+            $data->{ $section } = [];
+        }; # if
+    }; # foreach $section
+
+    foreach my $property ( @properties ) {
+        if ( not defined( $data->{ "%meta" }->{ $property } ) ) {
+            runtime_error(
+                "Error parsing $name: " .
+                    "Required \"$property\" property is not specified"
+            );
+        }; # if
+        push( @{ $data->{ meta } }, [ $property, $data->{ "%meta" }->{ $property } ] );
+    }; # foreach
+
+    return $data;
+
+}; # sub parse_source
+
+
+sub generate_enum($$$) {
+
+    my ( $data, $file, $prefix ) = @_;
+    my $bulk = "";
+
+    $bulk =
+        _generate_comment( $data, "//", "//" ) .
+        "\n" .
+        "enum ${prefix}_id {\n\n" .
+        "    // A special id for absence of message.\n" .
+        "    ${prefix}_null = 0,\n\n";
+
+    foreach my $section ( @sections ) {
+        my $props = $sections->{ $section };    # Section properties.
+        my $short = $props->{ short };          # Short section name, frequently used.
+        $bulk .=
+            "    // Set #$props->{ set }, $props->{ long }.\n" .
+            "    ${prefix}_${short}_first = $props->{ base },\n";
+        foreach my $item ( @{ $data->{ $section } } ) {
+            my ( $ident, undef ) = @$item;
+            $bulk .= "    ${prefix}_${short}_${ident},\n";
+        }; # foreach
+        $bulk .= "    ${prefix}_${short}_last,\n\n";
+    }; # foreach $type
+    $bulk .= "    ${prefix}_xxx_lastest\n\n";
+
+    $bulk .=
+        "}; // enum ${prefix}_id\n" .
+        "\n" .
+        "typedef enum ${prefix}_id  ${prefix}_id_t;\n" .
+        "\n";
+
+    $bulk .=
+        "\n" .
+        "// end of file //\n";
+
+    write_file( $file, \$bulk );
+
+}; # sub generate_enum
+
+
+sub generate_signature($$) {
+
+    my ( $data, $file ) = @_;
+    my $bulk = "";
+
+    $bulk .= "// message catalog signature file //\n\n";
+
+    foreach my $section ( @sections ) {
+        my $props = $sections->{ $section };    # Section properties.
+        my $short = $props->{ short };          # Short section name, frequently used.
+        $bulk .= "-*- " . uc( $props->{ long } ) . "-*-\n\n";
+        foreach my $item ( @{ $data->{ $section } } ) {
+            my ( $ident, $msg ) = @$item;
+            $bulk .= sprintf( "%-40s %s\n", $ident, msg2sgn( $msg ) );
+        }; # foreach
+        $bulk .= "\n";
+    }; # foreach $type
+
+    $bulk .= "// end of file //\n";
+
+    write_file( $file, \$bulk );
+
+}; # sub generate_signature
+
+
+sub generate_default($$$) {
+
+    my ( $data, $file, $prefix ) = @_;
+    my $bulk = "";
+
+    $bulk .=
+        _generate_comment( $data, "//", "//" ) .
+        "\n";
+
+    foreach my $section ( @sections ) {
+        $bulk .=
+            "static char const *\n" .
+            "__${prefix}_default_${section}" . "[] =\n" .
+            "    {\n" .
+            "        NULL,\n";
+        foreach my $item ( @{ $data->{ $section } } ) {
+            my ( undef, $msg ) = @$item;
+            $bulk .= "        \"" . msg2src( $msg ) . "\",\n";
+        }; # while
+        $bulk .=
+            "        NULL\n" .
+            "    };\n" .
+            "\n";
+    }; # foreach $type
+
+    $bulk .=
+        "struct kmp_i18n_section {\n" .
+        "    int           size;\n" .
+        "    char const ** str;\n" .
+        "}; // struct kmp_i18n_section\n" .
+        "typedef struct kmp_i18n_section  kmp_i18n_section_t;\n" .
+        "\n" .
+        "static kmp_i18n_section_t\n" .
+        "__${prefix}_sections[] =\n" .
+        "    {\n" .
+        "        { 0, NULL },\n";
+    foreach my $section ( @sections ) {
+        $bulk .=
+            "        { " . @{ $data->{ $section } } . ", __${prefix}_default_${section} },\n";
+    }; # foreach $type
+    $bulk .=
+        "        { 0, NULL }\n" .
+        "    };\n" .
+        "\n";
+
+    $bulk .=
+        "struct kmp_i18n_table {\n" .
+        "    int                   size;\n" .
+        "    kmp_i18n_section_t *  sect;\n" .
+        "}; // struct kmp_i18n_table\n" .
+        "typedef struct kmp_i18n_table  kmp_i18n_table_t;\n" .
+        "\n" .
+        "static kmp_i18n_table_t __kmp_i18n_default_table =\n" .
+        "    {\n" .
+        "        " . @sections . ",\n" .
+        "        __kmp_i18n_sections\n" .
+        "    };\n" .
+        "\n" .
+        "// end of file //\n";
+
+    write_file( $file, \$bulk );
+
+}; # sub generate_default
+
+
+sub generate_message_unix($$) {
+
+    my ( $data, $file ) = @_;
+    my $bulk     = "";
+
+    $bulk .=
+        _generate_comment( $data, "\$", "\$" ) .
+        "\n" .
+        "\$quote \"\n\n";
+
+    foreach my $section ( @sections ) {
+        $bulk .=
+            "\$ " . ( "-" x 78 ) . "\n\$ $section\n\$ " . ( "-" x 78 ) . "\n\n" .
+            "\$set $sections->{ $section }->{ set }\n" .
+            "\n";
+        my $n = 0;
+        foreach my $item ( @{ $data->{ $section } } ) {
+            my ( undef, $msg ) = @$item;
+            ++ $n;
+            $bulk .= "$n \"" . msg2src( $msg ) . "\"\n";
+        }; # foreach
+        $bulk .= "\n";
+    }; # foreach $type
+
+    $bulk .=
+        "\n" .
+        "\$ end of file \$\n";
+
+    write_file( $file, \$bulk, -layer => ":utf8" );
+
+}; # sub generate_message_unix
+
+
+sub generate_message_windows($$) {
+
+    my ( $data, $file ) = @_;
+    my $bulk = "";
+    my $language = $data->{ "%meta" }->{ Language };
+    my $langid   = $data->{ "%meta" }->{ LangId };
+
+    $bulk .=
+        _generate_comment( $data, ";", ";" ) .
+        "\n" .
+        "LanguageNames = ($language=$langid:msg_$langid)\n" .
+        "\n";
+
+    $bulk .=
+        "FacilityNames=(\n";
+    foreach my $section ( @sections ) {
+        my $props = $sections->{ $section };    # Section properties.
+        $bulk .=
+            " $props->{ short }=" . $props->{ set } ."\n";
+    }; # foreach $section
+    $bulk .=
+        ")\n\n";
+
+    foreach my $section ( @sections ) {
+        my $short = $sections->{ $section }->{ short };
+        my $n = 0;
+        foreach my $item ( @{ $data->{ $section } } ) {
+            my ( undef, $msg ) = @$item;
+            ++ $n;
+            $bulk .=
+                "MessageId=$n\n" .
+                "Facility=$short\n" .
+                "Language=$language\n" .
+                msg2mc( $msg ) . "\n.\n\n";
+        }; # foreach $item
+    }; # foreach $section
+
+    $bulk .=
+        "\n" .
+        "; end of file ;\n";
+
+    $bulk = encode( "UTF-16LE", $bulk ); # Convert text to UTF-16LE used in Windows* OS.
+    write_file( $file, \$bulk, -binary => 1 );
+
+}; # sub generate_message_windows
+
+
+#
+# Parse command line.
+#
+
+my $input_file;
+my $enum_file;
+my $signature_file;
+my $default_file;
+my $message_file;
+my $id;
+my $prefix = "";
+get_options(
+    Platform::target_options(),
+    "enum-file=s"      => \$enum_file,
+    "signature-file=s" => \$signature_file,
+    "default-file=s"   => \$default_file,
+    "message-file=s"   => \$message_file,
+    "id|lang-id"       => \$id,
+    "prefix=s"	       => \$prefix,
+);
+if ( @ARGV == 0 ) {
+    cmdline_error( "No source file specified -- nothing to do" );
+}; # if
+if ( @ARGV > 1 ) {
+    cmdline_error( "Too many source files specified" );
+}; # if
+$input_file = $ARGV[ 0 ];
+
+
+my $generate_message;
+if ( $target_os =~ m{\A(?:lin|mac)\z} ) {
+    $generate_message = \&generate_message_unix;
+} elsif ( $target_os eq "win" ) {
+    $generate_message = \&generate_message_windows;
+} else {
+    runtime_error( "OS \"$target_os\" is not supported" );
+}; # if
+
+
+#
+# Do the work.
+#
+
+my $data = parse_source( $input_file );
+if ( defined( $id ) ) {
+    print( $data->{ "%meta" }->{ LangId }, "\n" );
+}; # if
+if ( defined( $enum_file ) ) {
+    generate_enum( $data, $enum_file, $prefix );
+}; # if
+if ( defined( $signature_file ) ) {
+    generate_signature( $data, $signature_file );
+}; # if
+if ( defined( $default_file ) ) {
+    generate_default( $data, $default_file, $prefix );
+}; # if
+if ( defined( $message_file ) ) {
+    $generate_message->( $data, $message_file );
+}; # if
+
+exit( 0 );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<message-converter.pl> -- Convert a message catalog source file into other text forms.
+
+=head1 SYNOPSIS
+
+B<message-converter.pl> I<option>... <file>
+
+=head1 OPTIONS
+
+=over
+
+=item B<--enum-file=>I<file>
+
+Generate enum file named I<file>.
+
+=item B<--default-file=>I<file>
+
+Generate default messages file named I<file>.
+
+=item B<--lang-id>
+
+Print language identifier of the message catalog source file.
+
+=item B<--message-file=>I<file>
+
+Generate message file.
+
+=item B<--signature-file=>I<file>
+
+Generate signature file.
+
+Signatures are used for checking compatibility. For example, to check a primary catalog and its
+translation to another language, the signatures of both catalogs should be generated and compared.
+If the signatures are identical, the catalogs are compatible.
+
+=item B<--prefix=>I<prefix>
+
+Prefix to be used for all C identifiers (type and variable names) in enum and default messages
+files.
+
+=item B<--os=>I<str>
+
+Specify the OS name for which the message formats are to be converted. If not specified explicitly,
+the value of the LIBOMP_OS environment variable is used. If LIBOMP_OS is not defined, the host OS is
+detected.
+
+Depending on OS, B<message-converter.pl> converts message formats to GNU style or MS style.
+
+=item Standard Options
+
+=over
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full documentation and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--version>
+
+Print version string and exit.
+
+=back
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<file>
+
+The name of the input file.
+
+=back
+
+=head1 DESCRIPTION
+
+=head2 Message Catalog File Format
+
+It is a plain text file in UTF-8 encoding. Empty lines and lines beginning with a sharp sign (C<#>)
+are ignored. EBNF syntax of the content:
+
+    catalog    = { section };
+    section    = header body;
+    header     = "-*- " section-id " -*-" "\n";
+    body       = { message };
+    message    = message-id string "\n" { string "\n" };
+    section-id = identifier;
+    message-id = "OBSOLETE" | identifier;
+    identifier = letter { letter | digit | "_" };
+    string     = """ { character } """;
+
+An identifier starts with a letter and continues with letters, digits, and underscores. Identifiers
+are case-sensitive. Section identifiers are fixed: C<META>, C<STRINGS>, C<FORMATS>, C<MESSAGES> and
+C<HINTS>. Message identifiers must be unique within a section. The special C<OBSOLETE>
+pseudo-identifier may be used many times.
+
+A string is a C string literal which must not cross line boundaries.
+Long messages may occupy multiple lines, one string per line.
+
+A message may include printf-like GNU-style placeholders for arguments: C<%I<n>$I<t>>,
+where I<n> is the argument number (C<1>, C<2>, ...)
+and I<t> is the argument type, C<s> (string) or C<d> (32-bit integer).
+
+See also comments in F<i18n/en_US.txt>.
+
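+For illustration, a minimal catalog sketch (the message identifiers below are hypothetical) could
+look like this:
+
+    -*- META -*-
+    Language "English"
+    Country  "USA"
+    LangId   "1033"
+    Version  "2"
+    Revision "20090806"
+
+    -*- MESSAGES -*-
+    FileNotFound "Cannot open file \"%1$s\"."
+    OutOfMemory  "Out of memory."
+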
+=head2 Output Files
+
+This script can generate four different text files from a single source:
+
+=over
+
+=item Enum file.
+
+Enum file is a C include file, containing definitions of message identifiers, e. g.:
+
+    enum kmp_i18n_id {
+
+        // Set #1, meta.
+        kmp_i18n_prp_first = 65536,
+        kmp_i18n_prp_Language,
+        kmp_i18n_prp_Country,
+        kmp_i18n_prp_LangId,
+        kmp_i18n_prp_Version,
+        kmp_i18n_prp_Revision,
+        kmp_i18n_prp_last,
+
+        // Set #2, strings.
+        kmp_i18n_str_first = 131072,
+        kmp_i18n_str_Error,
+        kmp_i18n_str_UnknownFile,
+        kmp_i18n_str_NotANumber,
+        ...
+
+        // Set #3, formats.
+        ...
+
+        kmp_i18n_xxx_lastest
+
+    }; // enum kmp_i18n_id
+
+    typedef enum kmp_i18n_id  kmp_i18n_id_t;
+
+=item Default messages file.
+
+Default messages file is a C include file containing default messages to be embedded into the
+application (and used if the external message catalog does not exist or cannot be opened):
+
+    static char const *
+    __kmp_i18n_default_meta[] =
+        {
+            NULL,
+            "English",
+            "USA",
+            "1033",
+            "2",
+            "20090806",
+            NULL
+        };
+
+    static char const *
+    __kmp_i18n_default_strings[] =
+        {
+            "Error",
+            "(unknown file)",
+            "not a number",
+            ...
+            NULL
+        };
+
+    ...
+
+=item Message file.
+
+Message file is an input for a message compiler, F<gencat> on Linux* OS and OS X*, or F<mc.exe> on
+Windows* OS.
+
+Here is an example of a Linux* OS message file:
+
+    $quote "
+    1 "Japanese"
+    2 "Japan"
+    3 "1041"
+    4 "2"
+    5 "Based on Enlish message catalog revision 20090806"
+    ...
+
+Example of a Windows* OS message file:
+
+    LanguageNames = (Japanese=1041:msg_1041)
+
+    FacilityNames = (
+     prp=1
+     str=2
+     fmt=3
+     ...
+    )
+
+    MessageId=1
+    Facility=prp
+    Language=Japanese
+    Japanese
+    .
+
+    ...
+
+=item Signature.
+
+Signature is a processed source file: comments stripped, strings deleted, but placeholders kept and
+sorted.
+
+    -*- FORMATS-*-
+
+    Info                                     %1$d %2$s
+    Warning                                  %1$d %2$s
+    Fatal                                    %1$d %2$s
+    SysErr                                   %1$d %2$s
+    Hint                                     %1$- %2$s
+    Pragma                                   %1$s %2$s %3$s %4$s
+
+The purpose of signatures is to compare two message source files for compatibility. If the
+signatures of two message sources are the same, the binary message catalogs will be compatible.
+
+=back
+
+=head1 EXAMPLES
+
+Generate include file containing message identifiers:
+
+    $ message-converter.pl --enum-file=kmp_i18n_id.inc en_US.txt
+
+Generate include file containing default messages:
+
+    $ message-converter.pl --default-file=kmp_i18n_default.inc en_US.txt
+
+Generate input file for message compiler, Linux* OS example:
+
+    $ message-converter.pl --message-file=ru_RU.UTF-8.msg ru_RU.txt
+
+Generate input file for message compiler, Windows* OS example:
+
+    > message-converter.pl --message-file=ru_RU.UTF-8.mc ru_RU.txt
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/objcopy.cpp b/final/runtime/tools/objcopy.cpp
new file mode 100644
index 0000000..a2811c6
--- /dev/null
+++ b/final/runtime/tools/objcopy.cpp
@@ -0,0 +1,520 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include <stdlib.h>
+#include <iostream>
+#include <strstream>
+#include <fstream>
+#include <string>
+#include <set>
+#include <map>
+#include <vector>
+#include <cstring>
+
+using namespace std;
+
+typedef std::string                      string_t;
+typedef std::vector< string_t >          strings_t;
+typedef std::map< string_t, string_t >   str_hash_t;
+typedef std::pair< string_t, string_t >  str_pair_t;
+#ifdef _WIN32
+    typedef long long  int64_t;
+#endif
+
+string_t
+shift( strings_t & strs ) {
+    string_t first = strs.front();
+    strs.erase( strs.begin() );
+    return first;
+} // shift
+
+string_t
+find(
+    str_hash_t const & hash,
+    string_t const &   key
+) {
+    string_t value;
+    str_hash_t::const_iterator it = hash.find( key );
+    if ( it != hash.end() ) {
+        value = it->second;
+    }; // if
+    return value;
+} // find
+
+void die( string_t const & message ) {
+    std::cerr << message << std::endl;
+    exit( 1 );
+} // die
+
+void stop( string_t const & message ) {
+    printf( "%s\n", message.c_str() );
+    exit( 1 );
+}
+
+// An entry in the symbol table of a .obj file.
+struct symbol_t {
+    long long        name;
+    unsigned         value;
+    unsigned  short  section_num;
+    unsigned  short  type;
+    char             storage_class;
+    char             nAux;
+}; // struct symbol_t
+
+
+class _rstream_t : public std::istrstream {
+
+    private:
+
+        const char * buf;
+
+    protected:
+
+        _rstream_t( pair< const char *, streamsize > p )
+            : istrstream( p.first, p.second ), buf( p.first )
+        {
+        }
+
+        ~_rstream_t() {
+            delete [] buf;
+        }
+
+}; // class _rstream_t
+
+/* A stream encapsulating the content of a file or the content of a string, overriding the
+   >> operator to read various integer types in binary form, as well as a symbol table
+   entry.
+*/
+class rstream_t : public _rstream_t {
+private:
+
+    template< typename type_t >
+    inline rstream_t & do_read( type_t & x ) {
+	read( (char*) & x, sizeof( type_t ) );
+	return * this;
+    }
+
+    static pair<const char*, streamsize> getBuf(const char *fileName) {
+	ifstream raw(fileName,ios::binary | ios::in);
+	if(!raw.is_open())
+	    stop("rstream.getBuf: Error opening file");
+	raw.seekg(0,ios::end);
+	streampos fileSize = raw.tellg();
+	if(fileSize < 0)
+	    stop("rstream.getBuf: Error reading file");
+	char *buf = new char[fileSize];
+	raw.seekg(0,ios::beg);
+	raw.read(buf, fileSize);
+	return pair<const char*, streamsize>(buf,fileSize);
+    }
+public:
+    // construct from a string
+    rstream_t( const char * buf, streamsize size ) :
+        _rstream_t( pair< const char *, streamsize >( buf, size ) )
+    {}
+    /* construct from a file whose content is fully read once to initialize the content of
+       this stream
+    */
+    rstream_t( string_t const & fileName )
+        : _rstream_t( getBuf( fileName.c_str() ) )
+    {
+    }
+
+    rstream_t & operator >>( int & x ) {
+	return do_read(x);
+    }
+    rstream_t & operator >>(unsigned &x) {
+	return do_read(x);
+    }
+    rstream_t & operator>>(short &x) {
+	return do_read(x);
+    }
+    rstream_t & operator>>(unsigned short &x) {
+	return do_read(x);
+    }
+    rstream_t & operator>>( symbol_t & e ) {
+	read((char*)&e, 18);
+	return *this;
+    }
+}; // class rstream_t
+
+// string table in a .OBJ file
+class StringTable {
+private:
+    map<string, unsigned> directory;
+    size_t length;
+    char *data;
+
+    // make <directory> from <length> bytes in <data>
+    void makeDirectory(void) {
+	unsigned i = 4;
+	while(i < length) {
+	    string s = string(data + i);
+	    directory.insert(make_pair(s, i));
+	    i += s.size() + 1;
+	}
+    }
+    // initialize <length> and <data> with contents specified by the arguments
+    void init(const char *_data) {
+	unsigned _length = *(unsigned*)_data;
+
+	if(_length < sizeof(unsigned) || _length != *(unsigned*)_data)
+	    stop("StringTable.init: Invalid symbol table");
+	if(_data[_length - 1]) {
+	    // to prevent runaway strings, make sure the data ends with a zero
+	    data = new char[length = _length + 1];
+	    data[_length] = 0;
+	} else {
+	    data = new char[length = _length];
+	}
+	*(unsigned*)data = length;
+	memcpy( data + sizeof(unsigned), _data + sizeof(unsigned), length - sizeof(unsigned) );
+	makeDirectory();
+    }
+public:
+    StringTable( rstream_t & f ) {
+	/* Construct string table by reading from f.
+	 */
+	streampos s;
+	unsigned strSize;
+	char *strData;
+
+	s = f.tellg();
+	f>>strSize;
+	if(strSize < sizeof(unsigned))
+	    stop("StringTable: Invalid string table");
+	strData = new char[strSize];
+	*(unsigned*)strData = strSize;
+	// read the raw data into <strData>
+	f.read(strData + sizeof(unsigned), strSize - sizeof(unsigned));
+	s = f.tellg() - s;
+	if(s < strSize)
+	    stop("StringTable: Unexpected EOF");
+	init(strData);
+	delete[]strData;
+    }
+    StringTable(const set<string> &strings) {
+	/* Construct string table from given strings.
+	 */
+	char *p;
+	set<string>::const_iterator it;
+	size_t s;
+
+	// count required size for data
+	for(length = sizeof(unsigned), it = strings.begin(); it != strings.end(); ++it) {
+	    size_t l = (*it).size();
+
+	    if(l > (unsigned) 0xFFFFFFFF)
+		stop("StringTable: String too long");
+	    if(l > 8) {
+		length += l + 1;
+		if(length > (unsigned) 0xFFFFFFFF)
+		    stop("StringTable: Symbol table too long");
+	    }
+	}
+	data = new char[length];
+	*(unsigned*)data = length;
+	// populate data and directory
+	for(p = data + sizeof(unsigned), it = strings.begin(); it != strings.end(); ++it) {
+	    const string &str = *it;
+	    size_t l = str.size();
+	    if(l > 8) {
+		directory.insert(make_pair(str, p - data));
+		memcpy(p, str.c_str(), l);
+		p[l] = 0;
+		p += l + 1;
+	    }
+	}
+    }
+    ~StringTable() {
+	delete[] data;
+    }
+    /* Returns the encoding of the given string based on this string table.
+       It is an error if the string is longer than 8 characters but is not
+       present in the string table; in that case the program stops.
+    */
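+    // E.g. (illustrative): "foo" fits into the 8-byte name field directly, while a
+    // 20-character symbol name is stored as { 0, <offset into the string table> }.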
+    int64_t encode(const string &str) {
+	int64_t r;
+
+	if(str.size() <= 8) {
+	    // encoded directly
+	    ((char*)&r)[7] = 0;
+	    strncpy((char*)&r, str.c_str(), 8);
+	    return r;
+	} else {
+	    // represented as index into table
+	    map<string,unsigned>::const_iterator it = directory.find(str);
+	    if(it == directory.end())
+		stop("StringTable::encode: String not found in string table");
+	    ((unsigned*)&r)[0] = 0;
+	    ((unsigned*)&r)[1] = (*it).second;
+	    return r;
+	}
+    }
+    /* Returns the string represented by x based on this string table.
+       It is an error if x references an invalid position in the table;
+       in that case the program stops.
+    */
+    string decode(int64_t x) const {
+	if(*(unsigned*)&x == 0) {
+	    // represented as index into table
+	    unsigned &p = ((unsigned*)&x)[1];
+	    if(p >= length)
+		stop("StringTable::decode: Invalid string table lookup");
+	    return string(data + p);
+	} else {
+	    // encoded directly
+	    char *p = (char*)&x;
+	    int i;
+
+	    for(i = 0; i < 8 && p[i]; ++i);
+	    return string(p, i);
+	}
+    }
+    void write(ostream &os) {
+	os.write(data, length);
+    }
+};
+
+
+void
+obj_copy(
+    string_t const &    src,    // Name of source file.
+    string_t const &    dst,    // Name of destination file.
+    str_hash_t const &  redefs  // List of redefinitions.
+) {
+
+    set< string > strings; // set of all occurring symbols, appropriately prefixed
+    streampos fileSize;
+    size_t strTabStart;
+    unsigned symTabStart;
+    unsigned symNEntries;
+    int i;
+
+
+    string const error_reading = "Error reading \"" + src + "\" file: ";
+
+    rstream_t in( src );
+
+    in.seekg( 0, ios::end );
+    fileSize = in.tellg();
+
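+    // COFF layout: the file offset of the symbol table is stored at offset 8 of the
+    // header, immediately followed by the number of symbol entries; each entry is
+    // 18 bytes, so the string table starts right after the last entry.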
+    in.seekg( 8 );
+    in >> symTabStart >> symNEntries;
+    strTabStart = symTabStart + 18 * size_t( symNEntries );
+    in.seekg( strTabStart );
+    if ( in.eof() ) {
+        stop( error_reading + "Unexpected end of file" );
+    }
+    StringTable stringTableOld( in ); // Read original string table.
+
+    if ( in.tellg() != fileSize ) {
+        stop( error_reading + "Unexpected data after string table" );
+    }
+
+    // compute set of occurring strings with prefix added
+    for ( i = 0; i < symNEntries; ++ i ) {
+
+	symbol_t e;
+
+	in.seekg( symTabStart + i * 18 );
+	if ( in.eof() ) {
+            stop("hideSymbols: Unexpected EOF");
+        }
+	in >> e;
+	if ( in.fail() ) {
+            stop("hideSymbols: File read error");
+        }
+	if ( e.nAux ) {
+            i += e.nAux;
+        }
+	const string & s = stringTableOld.decode( e.name );
+	// if the symbol is external and has a redefinition in <redefs>, insert the new name
+	// into strings; otherwise insert the original name
+        string_t name = find( redefs, s );
+	strings.insert( name != "" && e.storage_class == 2 ? name : s );
+    }
+
+    ofstream out( dst.c_str(), ios::trunc | ios::out | ios::binary );
+    if ( ! out.is_open() ) {
+        stop("hideSymbols: Error opening output file");
+    }
+
+    // make new string table from string set
+    StringTable stringTableNew = StringTable( strings );
+
+    // copy input file to output file up to just before the symbol table
+    in.seekg( 0 );
+    char * buf = new char[ symTabStart ];
+    in.read( buf, symTabStart );
+    out.write( buf, symTabStart );
+    delete [] buf;
+
+    // copy input symbol table to output symbol table with name translation
+    for ( i = 0; i < symNEntries; ++ i ) {
+	symbol_t e;
+
+	in.seekg( symTabStart + i * 18 );
+	if ( in.eof() ) {
+            stop("hideSymbols: Unexpected EOF");
+        }
+	in >> e;
+	if ( in.fail() ) {
+            stop("hideSymbols: File read error");
+        }
+	const string & s = stringTableOld.decode( e.name );
+	out.seekp( symTabStart + i * 18 );
+        string_t name = find( redefs, s );
+	e.name = stringTableNew.encode( ( e.storage_class == 2 && name != "" ) ? name : s );
+	out.write( (char*) & e, 18 );
+	if ( out.fail() ) {
+            stop( "hideSymbols: File write error" );
+        }
+	if ( e.nAux ) {
+	    // copy auxiliary symbol table entries
+	    int nAux = e.nAux;
+	    for (int j = 1; j <= nAux; ++j ) {
+		in >> e;
+		out.seekp( symTabStart + ( i + j ) * 18 );
+		out.write( (char*) & e, 18 );
+	    }
+	    i += nAux;
+	}
+    }
+    // output string table
+    stringTableNew.write( out );
+}
+
+
+void
+split( string_t const & str, char ch, string_t & head, string_t & tail ) {
+    string_t::size_type pos = str.find( ch );
+    if ( pos == string_t::npos ) {
+        head = str;
+        tail = "";
+    } else {
+        head = str.substr( 0, pos );
+        tail = str.substr( pos + 1 );
+    }; // if
+} // split
+
+
+void help() {
+    std::cout
+        << "NAME\n"
+        << "    objcopy -- copy and translate object files\n"
+        << "\n"
+        << "SYNOPSIS\n"
+        << "    objcopy options... source destination\n"
+        << "\n"
+        << "OPTIONS\n"
+        << "    --help  Print this help and exit.\n"
+        << "    --redefine-sym old=new\n"
+        << "            Rename \"old\" symbol in source object file to \"new\" symbol in\n"
+        << "            destination object file.\n"
+        << "    --redefine-syms sym_file\n"
+        << "            For each pair \"old new\" in sym_file rename \"old\" symbol in \n"
+        << "            source object file to \"new\" symbol in destination object file.\n"
+        << "\n"
+        << "ARGUMENTS\n"
+        << "    source  The name of source object file.\n"
+        << "    destination\n"
+        << "            The name of destination object file.\n"
+        << "\n"
+        << "DESCRIPTION\n"
+        << "    This program implements a minor bit of Linux* OS's objcopy utility on Windows* OS.\n"
+        << "    It can copy object files and edit its symbol table.\n"
+        << "\n"
+        << "EXAMPLES\n"
+        << "    \n"
+        << "        > objcopy --redefine-sym fastcpy=__xxx_fastcpy a.obj b.obj\n"
+        << "\n";
+} // help
+
+
+int
+main( int argc, char const * argv[] ) {
+
+    strings_t   args( argc - 1 );
+    str_hash_t  redefs;
+    strings_t   files;
+
+    std::copy( argv + 1, argv + argc, args.begin() );
+
+    while ( args.size() > 0 ) {
+        string_t arg = shift( args );
+        if ( arg.substr( 0, 2 ) == "--" ) {
+            // An option.
+            if ( 0  ) {
+            } else if ( arg == "--help" ) {
+                help();
+                return 0;
+            } else if ( arg == "--redefine-sym" ) {
+                if ( args.size() == 0 ) {
+                    die( "\"" + arg + "\" option requires an argument" );
+                }; // if
+                // Read an "old=new" symbol pair from the command line.
+                string_t redef = shift( args );
+                string_t old_sym;
+                string_t new_sym;
+                split( redef, '=', old_sym, new_sym );
+                if ( old_sym.length() == 0 || new_sym.length() == 0 ) {
+                    die( "Illegal redefinition: \"" + redef + "\"; neither old symbol nor new symbol may be empty" );
+                }; // if
+                redefs.insert( str_pair_t( old_sym, new_sym ) );
+            } else if ( arg == "--redefine-syms" ) {
+                if ( args.size() == 0 ) {
+                    die( "\"" + arg + "\" option requires an argument" );
+                }; // if
+                // read list of symbol pairs "old new" from file.
+                string_t fname = shift( args );
+                string_t redef;
+		ifstream ifs( fname.c_str() );
+		while ( ifs.good() ) {
+                    getline( ifs, redef );// get pair of old/new symbols separated by space
+                    string_t old_sym;
+                    string_t new_sym;
+                    // AC: gcount() does not work here (always return 0), so comment it
+                    //if ( ifs.gcount() ) { // skip empty lines
+                    split( redef, ' ', old_sym, new_sym );
+                    if ( old_sym.length() == 0 || new_sym.length() == 0 ) {
+                        break;  // end of file reached (last empty line)
+                        //die( "Illegal redefinition: \"" + redef + "\"; neither old symbol nor new symbol may be empty" );
+                    }; // if
+                    redefs.insert( str_pair_t( old_sym, new_sym ) );
+                    //}
+		}
+            } else {
+                die( "Illegal option: \"" + arg + "\"" );
+            }; // if
+        } else {
+            // Not an option, a file name.
+            if ( files.size() >= 2 ) {
+                die( "Too many files specified; two files required (use --help option for help)" );
+            }; // if
+            files.push_back( arg );
+        }; // if
+    }; // while
+    if ( files.size() < 2 ) {
+        die( "Not enough files specified; two files required (use --help option for help)" );
+    }; // if
+
+    obj_copy( files[ 0 ], files[ 1 ], redefs );
+
+    return 0;
+
+} // main
+
+
+// end of file //
diff --git a/final/runtime/tools/required-objects.pl b/final/runtime/tools/required-objects.pl
new file mode 100755
index 0000000..6102a62
--- /dev/null
+++ b/final/runtime/tools/required-objects.pl
@@ -0,0 +1,642 @@
+#!/usr/bin/perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use File::Glob ":glob";
+use Data::Dumper;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+use Platform ":vars";
+
+our $VERSION = "0.004";
+
+# --------------------------------------------------------------------------------------------------
+# Set of objects:       # Ref to hash, keys are names of objects.
+#     object0:          # Ref to hash of two elements with keys "defined" and "undefined".
+#         defined:      # Ref to array of symbols defined in object0.
+#             - symbol0 # Symbol name.
+#             - ...
+#         undefined:    # Ref to array of symbols referenced in object0.
+#             - symbol0
+#             - ...
+#     object1:
+#         ...
+#     ...
+# --------------------------------------------------------------------------------------------------
+
+# --------------------------------------------------------------------------------------------------
+# Set of symbols:       # Ref to hash, keys are names of symbols.
+#    symbol0:           # Ref to array of object names where the symbol0 is defined.
+#        - object0      # Object file name.
+#        - ...
+#    symbol1:
+#        ...
+#    ...
+# --------------------------------------------------------------------------------------------------
+
+sub dump_objects($$$) {
+
+    my ( $title, $objects, $dump ) = @_;
+
+    if ( $dump > 0 ) {
+        STDERR->print( $title, "\n" );
+        foreach my $object ( sort( keys( %$objects ) ) ) {
+            STDERR->print( "    $object\n" );
+            if ( $dump > 1 ) {
+                STDERR->print( "        Defined symbols:\n" );
+                foreach my $symbol ( sort( @{ $objects->{ $object }->{ defined } } ) ) {
+                    STDERR->print( "            $symbol\n" );
+                }; # foreach $symbol
+                STDERR->print( "        Undefined symbols:\n" );
+                foreach my $symbol ( sort( @{ $objects->{ $object }->{ undefined } } ) ) {
+                    STDERR->print( "            $symbol\n" );
+                }; # foreach $symbol
+            }; # if
+        }; # foreach $object
+    }; # if
+
+}; # sub dump_objects
+
+sub dump_symbols($$$) {
+
+    my ( $title, $symbols, $dump ) = @_;
+
+    if ( $dump > 0 ) {
+        STDERR->print( $title, "\n" );
+        foreach my $symbol ( sort( keys( %$symbols ) ) ) {
+            STDERR->print( "    $symbol\n" );
+            if ( $dump > 1 ) {
+                foreach my $object ( sort( @{ $symbols->{ $symbol } } ) ) {
+                    STDERR->print( "        $object\n" );
+                }; # foreach
+            }; # if
+        }; # foreach $object
+    }; # if
+
+}; # sub dump_symbols
+
+# --------------------------------------------------------------------------------------------------
+# Name:
+#     load_symbols -- Fill the objects data structure with symbol names.
+# Synopsis:
+#     load_symbols( $objects );
+# Arguments:
+#     $objects (in/out) -- Set of objects. On entry, the top-level hash is expected to be filled
+#         with object names only. On exit, it is completely filled with the lists of symbols
+#         defined or referenced in each object file.
+# Returns:
+#     Nothing.
+# Example:
+#     my $objects = { "foo.o" => {} };
+#     load_symbols( $objects );
+#     # Now $objects is { "foo.o" => { defined => [ ... ], undefined => [ ... ] } }.
+#
+# --------------------------------------------------------------------------------------------------
+# This version of load_symbols parses output of nm command and works on Linux* OS and OS X*.
+#
+sub _load_symbols_nm($) {
+
+    my $objects = shift( @_ );
+        # It is a ref to hash. Keys are object names, values are empty hashes (for now).
+    my @bulk;
+
+    if ( %$objects ) {
+        # Do not run nm if a set of objects is empty -- nm will try to open a.out in this case.
+        my $tool;
+        if($target_arch eq "mic") {
+            $tool = "x86_64-k1om-linux-nm"
+        } else {
+            $tool = "nm"
+        }
+        execute(
+            [
+                $tool,
+                "-g",    # Display only external (global) symbols.
+                "-o",    # Precede each symbol by the name of the input file.
+                keys( %$objects )
+                    # Running nm once (rather than once per object) improves performance
+                    # drastically.
+            ],
+            -stdout => \@bulk
+        );
+    }; # if
+
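+    # A typical line of "nm -g -o" output looks like this (illustrative):
+    #     kmp_runtime.o:0000000000001080 T __kmp_fork_call
+    # i.e. "<file>:<address> <tag> <symbol>", which the regex below takes apart.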
+    foreach my $line ( @bulk ) {
+        if ( $line !~ m{^(.*):(?: ?[0-9a-f]*| *) ([A-Za-z]) (.*)$} ) {
+            die "Cannot parse nm output, line:\n    $line\n";
+        }; # if
+        my ( $file, $tag, $symbol ) = ( $1, $2, $3 );
+        if ( not exists( $objects->{ $file } ) ) {
+            die "nm reported unknown object file:\n    $line\n";
+        }; # if
+        # AC: exclude some libc symbols from renaming, otherwise we have problems
+        #     in tests for gfortran + static libomp on Lin_32.
+        #     These symbols came from libtbbmalloc.a
+        if ( $target_os eq "lin" ) {
+            if ( $symbol =~ m{__i686} ) {
+                next;
+            }
+        }
+        # AC: added "w" to tags of undefined symbols, e.g. malloc is weak in libirc v12.1.
+        if ( $tag eq "U" or $tag eq "w" ) { # Symbol not defined.
+            push( @{ $objects->{ $file }->{ undefined } }, $symbol );
+        } else {             # Symbol defined.
+            push( @{ $objects->{ $file }->{ defined } }, $symbol );
+        }; # if
+    }; # foreach
+
+    return undef;
+
+}; # sub _load_symbols_nm
+
+# --------------------------------------------------------------------------------------------------
+# This version of load_symbols parses output of link command and works on Windows* OS.
+#
+sub _load_symbols_link($) {
+
+    my $objects = shift( @_ );
+        # It is a ref to hash. Keys are object names, values are empty hashes (for now).
+    my @bulk;
+
+    if ( %$objects ) {
+        # Do not run link if the set of objects is empty.
+        execute(
+            [
+                "link",
+                "/dump",
+                "/symbols",
+                keys( %$objects )
+                    # Running link once (rather than once per object) improves performance
+                    # drastically.
+            ],
+            -stdout => \@bulk
+        );
+    }; # if
+
+    my $num_re   = qr{[0-9A-F]{3,4}};
+    my $addr_re  = qr{[0-9A-F]{8}};
+    my $tag_re   = qr{DEBUG|ABS|UNDEF|SECT[0-9A-F]+};
+    my $class_re = qr{Static|External|Filename|Label|BeginFunction|EndFunction|WeakExternal|\.bf or\.ef};
+
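+    # A typical symbol line from "link /dump /symbols" looks like this (illustrative):
+    #     008 00000000 UNDEF  notype ()    External     | _foo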
+    my $file;
+    foreach my $line ( @bulk ) {
+        if ( $line =~ m{\ADump of file (.*?)\n\z} ) {
+            $file = $1;
+            if ( not exists( $objects->{ $file } ) ) {
+                die "link reported unknown object file:\n    $line\n";
+            }; # if
+        } elsif ( $line =~ m{\A$num_re } ) {
+            if ( not defined( $file ) ) {
+                die "link reported symbol of unknown object file:\n    $line\n";
+            }; # if
+            if ( $line !~ m{\A$num_re $addr_re ($tag_re)\s+notype(?: \(\))?\s+($class_re)\s+\| (.*?)\n\z} ) {
+                die "Cannot parse link output, line:\n    $line\n";
+            }; # if
+            my ( $tag, $class, $symbol ) = ( $1, $2, $3 );
+            # link.exe /dump sometimes prints comments for symbols, e. g.:
+            # ".?0_memcopyA ([Entry] ?0_memcopyA)", or "??_C@_01A@r?$AA@ (`string')".
+            # Strip these comments.
+            $symbol =~ s{ \(.*\)\z}{};
+            if ( $class eq "External" ) {
+                if ( $tag eq "UNDEF" ) { # Symbol not defined.
+                    push( @{ $objects->{ $file }->{ undefined } }, $symbol );
+                } else {                 # Symbol defined.
+                    push( @{ $objects->{ $file }->{ defined } }, $symbol );
+                }; # if
+            }; # if
+        } else {
+            # Ignore all other lines.
+        }; # if
+    }; # foreach
+
+    return undef;
+
+}; # sub _load_symbols_link
+
+# --------------------------------------------------------------------------------------------------
+# Name:
+#     symbols -- Construct set of symbols with specified tag in the specified set of objects.
+# Synopsis:
+#     my $symbols = symbols( $objects, $tag );
+# Arguments:
+#     $objects (in) -- Set of objects.
+#     $tag (in) -- A tag, "defined" or "undefined".
+# Returns:
+#     Set of symbols with the specified tag.
+#
+sub symbols($$) {
+
+    my $objects = shift( @_ );
+    my $tag     = shift( @_ );
+
+    my $symbols = {};
+
+    foreach my $object ( keys( %$objects ) ) {
+        foreach my $symbol ( @{ $objects->{ $object }->{ $tag } } ) {
+            push( @{ $symbols->{ $symbol } }, $object );
+        }; # foreach $symbol
+    }; # foreach $object
+
+    return $symbols;
+
+}; # sub symbols
+
+sub defined_symbols($) {
+
+    my $objects = shift( @_ );
+    my $defined = symbols( $objects, "defined" );
+    return $defined;
+
+}; # sub defined_symbols
+
+sub undefined_symbols($) {
+
+    my $objects = shift( @_ );
+    my $defined = symbols( $objects, "defined" );
+    my $undefined = symbols( $objects, "undefined" );
+    foreach my $symbol ( keys( %$defined ) ) {
+        delete( $undefined->{ $symbol } );
+    }; # foreach symbol
+    return $undefined;
+
+}; # sub undefined_symbols
+
+# --------------------------------------------------------------------------------------------------
+# Name:
+#     _required_extra_objects -- Select a subset of extra objects required to resolve undefined
+#         symbols in a set of objects. It is a helper sub for required_extra_objects().
+# Synopsis:
+#     my $required = _required_extra_objects( $objects, $extra, $symbols, $dump );
+# Arguments:
+#     $objects (in) -- A set of objects to be searched for undefined symbols.
+#     $extra (in) -- A set of extra objects to be searched for defined symbols to resolve undefined
+#         symbols in objects.
+#     $symbols (in/out) -- Set of symbols defined in the set of external objects. At the first call
+#         it should consist of all the symbols defined in all the extra objects. Symbols defined in
+#         the selected subset of extra objects are removed from the set of defined symbols, because
+#         they are no longer of interest for subsequent calls.
+#     $dump (in) -- Verbosity level for debug dumps (0 means no dumping).
+# Returns:
+#     A subset of extra objects required by the specified set of objects.
+#
+sub _required_extra_objects($$$$) {
+
+    my $objects = shift( @_ );
+    my $extra   = shift( @_ );
+    my $symbols = shift( @_ );
+    my $dump    = shift( @_ );
+
+    my $required = {};
+
+    if ( $dump > 0 ) {
+        STDERR->print( "Required extra objects:\n" );
+    }; # if
+    foreach my $object ( keys( %$objects ) ) {
+        foreach my $symbol ( @{ $objects->{ $object }->{ undefined } } ) {
+            if ( exists( $symbols->{ $symbol } ) ) {
+                # Add all objects where the symbol is defined to the required objects.
+                foreach my $req_obj ( @{ $symbols->{ $symbol } } ) {
+                    if ( $dump > 0 ) {
+                        STDERR->print( "    $req_obj\n" );
+                        if ( $dump > 1 ) {
+                            STDERR->print( "        by $object\n" );
+                            STDERR->print( "            due to $symbol\n" );
+                        }; # if
+                    }; # if
+                    $required->{ $req_obj } = $extra->{ $req_obj };
+                }; # foreach $req_obj
+                # Delete the symbol from list of defined symbols.
+                delete( $symbols->{ $symbol } );
+            }; # if
+        }; # foreach $symbol
+    }; # foreach $object
+
+    return $required;
+
+}; # sub _required_extra_objects
+
+
+# --------------------------------------------------------------------------------------------------
+# Name:
+#     required_extra_objects -- Select a subset of extra objects required to resolve undefined
+#         symbols in a set of base objects and selected extra objects.
+# Synopsis:
+#     my $required = required_extra_objects( $base, $extra, $dump );
+# Arguments:
+#     $base (in/out) -- A set of base objects to be searched for undefined symbols. On entry, the
+#         top-level hash is expected to be filled with object names only. On exit, it is completely
+#         filled with the lists of symbols defined and/or referenced in each object file.
+#     $extra (in/out) -- A set of extra objects to be searched for defined symbols required to
+#         resolve undefined symbols in a set of base objects. Usage is similar to base objects.
+#     $dump (in) -- Verbosity level for debug dumps (0 means no dumping).
+# Returns:
+#     A subset of extra object files.
+#
+sub required_extra_objects($$$) {
+
+    my $base    = shift( @_ );
+    my $extra   = shift( @_ );
+    my $dump    = shift( @_ );
+
+    # Load symbols for each object.
+    load_symbols( $base );
+    load_symbols( $extra );
+    if ( $dump ) {
+        dump_objects( "Base objects:", $base, $dump );
+        dump_objects( "Extra objects:", $extra, $dump );
+    }; # if
+
+    # Collect symbols defined in extra objects.
+    my $symbols = defined_symbols( $extra );
+
+    my $required = {};
+    # Select extra objects required by base objects.
+    my $delta = _required_extra_objects( $base, $extra, $symbols, $dump );
+    while ( %$delta ) {
+        %$required = ( %$required, %$delta );
+        # The just-selected objects may in turn require more objects, so iterate to a fixed point.
+        $delta = _required_extra_objects( $delta, $extra, $symbols, $dump );
+    }; # while
+
+    if ( $dump ) {
+        my $base_undefined = undefined_symbols( $base );
+        my $req_undefined = undefined_symbols( $required );
+        dump_symbols( "Symbols undefined in base objects:", $base_undefined, $dump );
+        dump_symbols( "Symbols undefined in required objects:", $req_undefined, $dump );
+    }; # if
+
+    return $required;
+
+}; # sub required_extra_objects
+
+
+# --------------------------------------------------------------------------------------------------
+# Name:
+#     copy_objects -- Copy (and optionally edit) object files to specified directory.
+# Synopsis:
+#     copy_objects( $objects, $target, $prefix, @symbols );
+# Arguments:
+#     $objects (in) -- A set of object files.
+#     $target (in) -- A name of target directory. Directory must exist.
+#     $prefix (in) -- A prefix to add to all the symbols listed in @symbols. If prefix is undefined,
+#         object files are just copied.
+#     @symbols (in) -- List of symbol names to be renamed.
+# Returns:
+#     None.
+#
+sub copy_objects($$;$\@) {
+
+    my $objects = shift( @_ );
+    my $target  = shift( @_ );
+    my $prefix  = shift( @_ );
+    my $symbols = shift( @_ );
+    my $tool;
+    my @redefine;
+    my @redefine_;
+    my $syms_file = "__kmp_sym_pairs.log";
+
+    if ( $target_arch eq "mic" ) {
+        $tool = "x86_64-k1om-linux-objcopy";
+    } else {
+        $tool = "objcopy";
+    }; # if
+
+    if ( not -e $target ) {
+        die "\"$target\" directory does not exist\n";
+    }; # if
+    if ( not -d $target ) {
+        die "\"$target\" is not a directory\n";
+    }; # if
+
+    if ( defined( $prefix ) and @$symbols ) {
+        my %a = map ( ( "$_ $prefix$_" => 1 ), @$symbols );
+        @redefine_ = keys( %a );
+    }; # if
+    foreach my $line ( @redefine_ ) {
+        $line =~ s{$prefix(\W+)}{$1$prefix};
+        push( @redefine, $line );
+    }
+    write_file( $syms_file, \@redefine );
+    foreach my $src ( sort( keys( %$objects ) ) ) {
+        my $dst = cat_file( $target, get_file( $src ) );
+        if ( @redefine ) {
+            execute( [ $tool, "--redefine-syms", $syms_file, $src, $dst ] );
+        } else {
+            copy_file( $src, $dst, -overwrite => 1 );
+        }; # if
+    }; # foreach $src
+
+}; # sub copy_objects
+
+
+# --------------------------------------------------------------------------------------------------
+# Main.
+# --------------------------------------------------------------------------------------------------
+
+my $base  = {};
+my $extra = {};
+my $switcher = $base;
+my $dump = 0;
+my $print_base;
+my $print_extra;
+my $copy_base;
+my $copy_extra;
+my $prefix;
+
+# Parse command line.
+
+Getopt::Long::Configure( "permute" );
+get_options(
+    Platform::target_options(),
+    "base"         => sub { $switcher = $base;  },
+    "extra"        => sub { $switcher = $extra; },
+    "print-base"   => \$print_base,
+    "print-extra"  => \$print_extra,
+    "print-all"    => sub { $print_base = 1; $print_extra = 1; },
+    "copy-base=s"  => \$copy_base,
+    "copy-extra=s" => \$copy_extra,
+    "copy-all=s"   => sub { $copy_base = $_[ 1 ]; $copy_extra = $_[ 1 ]; },
+    "dump"         => sub { ++ $dump; },
+    "prefix=s"     => \$prefix,
+    "<>"    =>
+        sub {
+            my $arg = $_[ 0 ];
+            my @args;
+            if ( $^O eq "MSWin32" ) {
+                # Windows* OS does not expand wildcards. Do it...
+                @args = bsd_glob( $arg );
+            } else {
+                @args = ( $arg );
+            }; # if
+            foreach my $object ( @args ) {
+                if ( exists( $base->{ $object } ) or exists( $extra->{ $object } ) ) {
+                    die "Object \"$object\" has already been specified.\n";
+                }; # if
+                $switcher->{ $object } = { defined => [], undefined => [] };
+            }; # foreach
+        },
+);
+if ( not %$base ) {
+    cmdline_error( "No base objects specified" );
+}; # if
+
+if ( $target_os eq "win" ) {
+    *load_symbols = \&_load_symbols_link;
+} elsif ( $target_os eq "lin" ) {
+    *load_symbols = \&_load_symbols_nm;
+} elsif ( $target_os eq "mac" ) {
+    *load_symbols = \&_load_symbols_nm;
+} else {
+    runtime_error( "OS \"$target_os\" not supported" );
+}; # if
+
+# Do the work.
+
+my $required = required_extra_objects( $base, $extra, $dump );
+if ( $print_base ) {
+    print( map( "$_\n", sort( keys( %$base ) ) ) );
+}; # if
+if ( $print_extra ) {
+    print( map( "$_\n", sort( keys( %$required ) ) ) );
+}; # if
+my @symbols;
+if ( defined( $prefix ) ) {
+    foreach my $object ( sort( keys( %$required ) ) ) {
+        push( @symbols, @{ $required->{ $object }->{ defined } } );
+    }; # foreach $object
+}; # if
+if ( $copy_base ) {
+    copy_objects( $base, $copy_base, $prefix, @symbols );
+}; # if
+if ( $copy_extra ) {
+    copy_objects( $required, $copy_extra, $prefix, @symbols );
+}; # if
+
+exit( 0 );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<required-objects.pl> -- Select required extra object files.
+
+=head1 SYNOPSIS
+
+B<required-objects.pl> I<option>... [--base] I<file>... --extra I<file>...
+
+=head1 DESCRIPTION
+
+B<required-objects.pl> works with two sets of object files -- a set of I<base> objects
+and a set of I<extra> objects, and selects those extra objects which are required for resolving
+undefined symbols in base objects I<and> selected extra objects.
+
+Selected object files may be copied to a specified location or their names may be printed to
+stdout, one name per line. Additionally, symbols defined in selected extra objects may be renamed.
+
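+As an illustration (hypothetical objects and symbols): if F<base.o> references C<foo>, extra
+object F<a.o> defines C<foo> but itself references C<bar>, and extra object F<b.o> defines
+C<bar>, then both F<a.o> and F<b.o> are selected; selection repeats until no further extra
+objects are required.
+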
+Depending on the OS, different external tools may be used. For example, B<required-objects.pl> uses
+F<link.exe> on "win" and F<nm> on "lin" and "mac" OSes. Normally the OS is autodetected, but
+detection can be overridden with the B<--os> option, which may be helpful in cross-build environments.
+
+=head1 OPTIONS
+
+=over
+
+=item B<--base>
+
+The list of base objects follows this option.
+
+=item B<--extra>
+
+The list of extra objects follows this option.
+
+=item B<--print-all>
+
+Print list of base objects and list of required extra objects.
+
+=item B<--print-base>
+
+Print list of base objects.
+
+=item B<--print-extra>
+
+Print list of selected extra objects.
+
+=item B<--copy-all=>I<dir>
+
+Copy all base and selected extra objects to the specified directory. The directory must exist.
+Existing files are overwritten.
+
+=item B<--copy-base=>I<dir>
+
+Copy all base objects to the specified directory.
+
+=item B<--copy-extra=>I<dir>
+
+Copy selected extra objects to the specified directory.
+
+=item B<--prefix=>I<str>
+
+If a prefix is specified, the copied object files are edited: symbols defined in the selected
+extra object files are renamed (in all the copied object files) by adding this prefix.
+
+The F<objcopy> program must be available to perform this operation.
+
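+As a sketch (hypothetical file and symbol names): with B<--prefix=__kmp_>, a symbol C<foo>
+defined in a selected extra object is renamed to C<__kmp_foo> in every copied object file,
+roughly equivalent to running:
+
+    $ objcopy --redefine-syms __kmp_sym_pairs.log file.o copied/file.o
+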
+=item B<--os=>I<str>
+
+Specify the OS name. By default, the OS is autodetected.
+
+Depending on OS, B<required-objects.pl> uses different external tools.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full documentation and exit.
+
+=item B<--version>
+
+Print version and exit.
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<file>
+
+The name of an object file.
+
+=back
+
+=head1 EXAMPLES
+
+    $ required-objects.pl --base obj/*.o --extra ../lib/obj/*.o --print-extra > required.lst
+    $ ar cr libx.a obj/*.o $(cat required.lst)
+
+    $ required-objects.pl --base internal/*.o --extra external/*.o --prefix=__xyz_ --copy-all=obj
+    $ ar cr xyz.a obj/*.o
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/src/common-checks.mk b/final/runtime/tools/src/common-checks.mk
new file mode 100644
index 0000000..250eaed
--- /dev/null
+++ b/final/runtime/tools/src/common-checks.mk
@@ -0,0 +1,95 @@
+# common-checks.mk #
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# --------------------------------------------------------------------------------------------------
+# This file contains really common definitions used by multiple makefiles. Modify it carefully!
+# --------------------------------------------------------------------------------------------------
+
+#
+# Check tools versions.
+#
+ifeq "$(clean)" ""    # Do not check tools if clean goal specified.
+
+    check_tools_flags = --make
+
+    # determine if fortran check is required from goals
+    # MAKECMDGOALS is like argv for gnu make
+    ifneq "$(filter mod all,$(MAKECMDGOALS))" ""
+        check_tools_flags += --fortran
+    else
+        ifeq "$(MAKECMDGOALS)" "" # will default to all if no goals specified on command line
+            check_tools_flags += --fortran
+        endif
+    endif
+    ifneq "$(filter gcc clang,$(c))" "" # if build compiler is gcc or clang
+        check_tools_flags += --nointel
+    endif
+    ifeq "$(c)" "clang"
+        check_tools_flags += --clang
+    endif
+
+    curr_tools := $(strip $(shell $(perl) $(tools_dir)check-tools.pl $(oa-opts) $(check_tools_flags)))
+
+    ifeq "$(curr_tools)" ""
+        $(error check-tools.pl failed)
+    endif
+    ifneq "$(findstring N/A,$(curr_tools))" ""
+        missed_tools := $(filter %---_N/A_---,$(curr_tools))
+        missed_tools := $(subst =---_N/A_---,,$(missed_tools))
+        missed_tools := $(subst $(space),$(comma)$(space),$(missed_tools))
+        $(error Development tools not found: $(missed_tools))
+    endif
+    prev_tools := $(strip $(shell [ -e tools.cfg ] && cat tools.cfg))
+    $(call say,Tools  : $(curr_tools))
+    ifeq "$(prev_tools)" ""
+        # No saved config file, let us create it.
+        dummy := $(shell echo "$(curr_tools)" > tools.cfg)
+    else
+        # Check the saved config file matches current configuration.
+        ifneq "$(curr_tools)" "$(prev_tools)"
+            # Show the difference between previous and current tools.
+            $(call say,Old tools : $(filter-out $(curr_tools),$(prev_tools)))
+            $(call say,New tools : $(filter-out $(prev_tools),$(curr_tools)))
+            # And initiate rebuild.
+            $(call say,Tools changed$(comma) rebuilding...)
+            dummy := $(shell $(rm) .rebuild && echo "$(curr_tools)" > tools.cfg)
+        endif
+    endif
+endif
+
+# Check config.
+ifeq "$(curr_config)" ""
+    $(error makefile must define `curr_config' variable)
+endif
+prev_config := $(shell [ -e build.cfg ] && cat build.cfg)
+curr_config := $(strip $(curr_config))
+ifeq "$(clean)" ""    # Do not check config if clean goal specified.
+    $(call say,Config : $(curr_config))
+    ifeq "$(prev_config)" ""
+        # No saved config file, let us create it.
+        dummy := $(shell echo "$(curr_config)" > build.cfg)
+    else
+        # Check saved config file matches current configuration.
+        ifneq "$(curr_config)" "$(prev_config)"
+            # Show the difference between previous and current configurations.
+            $(call say,Old config : $(filter-out $(curr_config),$(prev_config)))
+            $(call say,New config : $(filter-out $(prev_config),$(curr_config)))
+            # And initiate rebuild.
+            $(call say,Configuration changed$(comma) rebuilding...)
+            dummy := $(shell $(rm) .rebuild && echo "$(curr_config)" > build.cfg)
+        endif
+    endif
+endif
+
+# end of file #
+
diff --git a/final/runtime/tools/src/common-defs.mk b/final/runtime/tools/src/common-defs.mk
new file mode 100644
index 0000000..4a059d0
--- /dev/null
+++ b/final/runtime/tools/src/common-defs.mk
@@ -0,0 +1,232 @@
+# common-defs.mk #
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# --------------------------------------------------------------------------------------------------
+# This file contains really common definitions used by multiple makefiles. Modify it carefully!
+# --------------------------------------------------------------------------------------------------
+
+# --------------------------------------------------------------------------------------------------
+# Some tricky variables.
+# --------------------------------------------------------------------------------------------------
+empty :=
+space := $(empty) $(empty)
+comma := ,
+ifeq "$(date)" ""
+    $(error Caller should specify "date" variable.)
+endif
+
+# --------------------------------------------------------------------------------------------------
+# Helper functions.
+# --------------------------------------------------------------------------------------------------
+
+# Synopsis:
+#     $(call say,text-to-print-to-the-screen)
+# Description:
+#     The function prints its argument to the screen. With older makes it is an analog of
+#     $(warning); starting from make 3.81 it is an analog of $(info).
+#
+say = $(warning $(1))
+ifneq "$(filter 3.81,$(MAKE_VERSION))" ""
+    say = $(info $(1))
+endif
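+# For example, common-checks.mk uses it as: $(call say,Tools changed$(comma) rebuilding...)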
+
+# Synopsis:
+#     architecture = $(call legal_arch,32)
+# Description:
+#     The function returns the printable name of the specified architecture, e.g. IA-32 architecture or Intel(R) 64.
+#
+legal_arch = $(if $(filter 32,$(1)),IA-32,$(if $(filter 32e,$(1)),Intel(R) 64,$(if $(filter l1,$(1)),L1OM,$(if $(filter arm,$(1)),ARM,$(if $(filter ppc64,$(1)),PPC64,$(if $(filter ppc64le,$(1)),PPC64LE,$(if $(filter aarch64,$(1)),AArch64,$(if $(filter mic,$(1)),Intel(R) Many Integrated Core Architecture,$(error Bad architecture specified: $(1))))))))))
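+# For example: $(call legal_arch,32e) expands to "Intel(R) 64", and $(call legal_arch,mic)
+# expands to "Intel(R) Many Integrated Core Architecture".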
+
+# Synopsis:
+#     var_name = $(call check_variable,var,list)
+# Description:
+#     The function verifies the value of the var variable. If it is empty, the first word from the
+#     list is assigned to var as the default value. Otherwise the value of var must match one of
+#     the words in the list, or an error is issued.
+# Example:
+#     LINK_TYPE = $(call check_variable,LINK_TYPE,static dynamic)
+#
+check_variable = $(call _check_variable_words,$(1))$(call _check_variable_value,$(1),$(2))
+
+# Synopsis:
+#     $(call _check_variable_words,var)
+# Description:
+#     Checks that variable var is empty or a single word. If it contains multiple words, an error
+#     is issued. It is a helper function for check_variable.
+#
+_check_variable_words = $(if $(filter 0 1,$(words $($(1)))),,\
+    $(error Illegal value of $(1): "$($(1))"))
+
+# Synopsis:
+#     $(call _check_variable_value,var,list)
+# Description:
+#     If variable var is empty, the function returns the first word from the list. If the variable
+#     is not empty and matches one of the words in the list, the variable's value is returned.
+#     Otherwise, an error is issued. It is a helper function for check_variable.
+#
+_check_variable_value = $(if $($(1)),$(if $(filter $(2),$($(1))),$($(1)),\
+    $(error Illegal value of $(1): "$($(1))")),$(firstword $(2)))
+
+# Synopsis:
+#     $(call debug,var)
+# Description:
+#     If LIBOMP_MAKE_DEBUG is not empty, the variable's name and value are printed. Use this for debugging.
+#
+ifeq "$(LIBOMP_MAKE_DEBUG)" ""
+    debug =
+else
+    debug = $(call say,debug: $(1)="$($(1))")
+endif
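+# For example, common-tools.mk uses $(call debug,ld-flags-L) to trace collection of library
+# search path flags when LIBOMP_MAKE_DEBUG is set.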
+
+# Synopsis:
+#     $(call header,target)
+# Description:
+#     Returns a string to print to show build progress.
+#
+header = ----- $(marker) --- $(1) -----
+
+# --------------------------------------------------------------------------------------------------
+# Global make settings.
+# --------------------------------------------------------------------------------------------------
+
+# Non-empty CDPATH may lead to problems on some platforms: a simple "cd dir" (where "dir" is an
+# existing subdirectory of the current one) fails. Clearing CDPATH solves the problem.
+CDPATH =
+.SUFFIXES :            # Clean default list of suffixes.
+.DELETE_ON_ERROR :     # Delete target file in case of error.
+
+$(call say,$(call header,making $(if $(MAKECMDGOALS),$(MAKECMDGOALS),all)))
+
+# --------------------------------------------------------------------------------------------------
+# Check clean and clobber goals.
+# --------------------------------------------------------------------------------------------------
+
+# "clean" goal must be specified alone, otherwise we have troubles with dependency files.
+clean := $(filter clean%,$(MAKECMDGOALS))
+ifneq "$(clean)" ""                                    # "clean" goal is present in command line.
+    ifneq "$(filter-out clean%,$(MAKECMDGOALS))" ""    # there are non-clean goals.
+        $(error "clean" goals must not be mixed with other goals)
+    endif
+endif
+# Issue error on "clobber" target.
+ifneq "$(filter clobber,$(MAKECMDGOALS))" ""
+    $(error There is no clobber goal in makefile)
+endif
+
+# --------------------------------------------------------------------------------------------------
+# Mandatory variables passed from build.pl.
+# --------------------------------------------------------------------------------------------------
+
+os       := $(call check_variable,os,lin mac win)
+arch     := $(call check_variable,arch,32 32e 64 arm ppc64 ppc64le aarch64 mic)
+ifeq "$(arch)" "mic" # We want the flavor of mic (knf, knc, knl, etc.)
+    platform := $(os)_$(MIC_ARCH)
+else
+    platform := $(os)_$(arch)
+endif
+platform := $(call check_variable,platform,lin_32 lin_32e lin_64 lin_arm lin_knc lin_knf mac_32 mac_32e win_32 win_32e win_64 lin_ppc64 lin_ppc64le lin_aarch64)
+# oa-opts means "os and arch options". They are passed to almost all perl scripts.
+oa-opts  := --os=$(os) --arch=$(arch)
+
+# --------------------------------------------------------------------------------------------------
+# Directories.
+# --------------------------------------------------------------------------------------------------
+
+ifeq "$(LIBOMP_WORK)" ""
+    $(error Internal error: LIBOMP_WORK variable must be set in makefile.mk)
+endif
+tools_dir = $(LIBOMP_WORK)tools/
+# We do not define src/ and other directories here because they depend on the target (RTL, DSL, tools).
+
+# --------------------------------------------------------------------------------------------------
+# File suffixes.
+# --------------------------------------------------------------------------------------------------
+
+ifeq "$(os)" "win" # win
+    asm = .asm
+    obj = .obj
+    lib = .lib
+    dll = .dll
+    exe = .exe
+    cat = $(dll)
+else # lin, mic or mac
+    asm = .s
+    obj = .o
+    lib = .a
+    ifeq "$(os)" "mac"
+        dll = .dylib
+    else
+        dll = .so
+    endif
+    exe = $(empty)
+    cat = .cat
+endif
+
+# --------------------------------------------------------------------------------------------------
+# File manipulation and misc commands.
+# --------------------------------------------------------------------------------------------------
+
+target = @echo "$(call header,$@)"
+ifeq "$(os)" "win"
+    cp    = cp -f
+    rm    = rm -f
+    mkdir = mkdir -p
+    touch = touch
+    perl  = perl
+    slash = \\
+else # lin, mic or mac
+    cp    = cp -f
+    rm    = rm -f
+    mkdir = mkdir -p
+    touch = touch
+    perl  = perl
+    slash = /
+endif
+
+# --------------------------------------------------------------------------------------------------
+# Common non-configuration options.
+# --------------------------------------------------------------------------------------------------
+# They may affect the build process but do not affect the result.
+
+# If TEST_DEPS is "off", the deps test is still performed, but its result is ignored.
+TEST_DEPS  := $(call check_variable,TEST_DEPS,on off)
+# The same for test touch.
+TEST_TOUCH := $(call check_variable,TEST_TOUCH,on off)
+td-i = $(if $(filter off,$(TEST_DEPS)),-)
+tt-i = $(if $(filter off,$(TEST_TOUCH)),-)
+
+# --------------------------------------------------------------------------------------------------
+# Common targets.
+# --------------------------------------------------------------------------------------------------
+
+# All common targets are defined as phony. This allows "build.pl --all test-xxx".
+# A makefile can define actions for a particular test or leave it a no-op.
+
+# all, the default target, should be the first one.
+.PHONY : all
+all :
+
+.PHONY : common clean clean-common fat inc l10n lib
+
+.PHONY : force-tests          tests
+.PHONY : force-test-touch     test-touch
+.PHONY : force-test-relo      test-relo
+.PHONY : force-test-execstack test-execstack
+.PHONY : force-test-instr     test-instr
+.PHONY : force-test-deps      test-deps
+
+tests = touch relo execstack instr deps
+tests       : $(addprefix test-,$(tests))
+force-tests : $(addprefix force-test-,$(tests))
+
+# end of file #
diff --git a/final/runtime/tools/src/common-rules.mk b/final/runtime/tools/src/common-rules.mk
new file mode 100644
index 0000000..44f97c2
--- /dev/null
+++ b/final/runtime/tools/src/common-rules.mk
@@ -0,0 +1,200 @@
+# common-rules.mk #
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# --------------------------------------------------------------------------------------------------
+# This file contains really common definitions used by multiple makefiles. Modify it carefully!
+# --------------------------------------------------------------------------------------------------
+
+# --- Creating a directory ---
+# A directory cannot be a target, because on Linux* OS a directory's timestamp is updated each
+# time a file is created or deleted in it. We use a ".dir" file in place of the directory. If such
+# a file exists, the directory exists as well.
+
+.PRECIOUS : %/.dir                     # Do not delete automatically created files.
+
+%/.dir :
+	$(target)
+	$(mkdir) $(dir $@)
+	$(touch) $@
+
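+# For example (hypothetical rule), a target created inside "bin/" would list "bin/.dir" as a
+# prerequisite:
+#     bin/foo$(obj) : foo.c bin/.dir .rebuild
+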
+# --- Rebuilding ---
+# Removing or touching .rebuild file causes rebuild.
+# To let it work, .rebuild should be added as prerequisite to every rule (dependency with commands)
+# except clean* and force*, in this and other makefiles.
+.rebuild :
+	$(target)
+	$(touch) $@
+
+# -- Creating dependency file for C/C++ ---
+
+%.d : %.c .rebuild
+	$(target)
+	$(c) $(cpp-flags) $(c-flags) $(c-flags-m) $< > $@
+
+%.d : %.cpp .rebuild
+	$(target)
+	$(cxx) $(cpp-flags) $(cxx-flags) $(cxx-flags-m) $< > $@
+
+# -- Creating preprocessed file for C/C++ ---
+
+%.i : %.c .rebuild
+	$(target)
+	$(c) $(cpp-flags) $(c-flags) -P $(c-out)$@ $<
+
+%.i : %.cpp .rebuild
+	$(target)
+	$(cxx) $(cpp-flags) $(cxx-flags) -P $(cxx-out)$@ $<
+
+# -- Compiling C/C++ files ---
+
+%$(obj) : %.c .rebuild
+	$(target)
+	$(c) $(cpp-flags) $(c-flags) $(c-out)$@ $<
+
+%$(obj) : %.cpp .rebuild
+	$(target)
+	$(cxx) $(cpp-flags) $(cxx-flags) $(cxx-out)$@ $<
+
+# -- Generate assembly files ---
+
+%$(asm) : %.c .rebuild
+	$(target)
+	$(c) $(cpp-flags) $(c-flags) -S $(c-out)$@ $<
+
+%$(asm) : %.cpp .rebuild
+	$(target)
+	$(cxx) $(cpp-flags) $(cxx-flags) -S $(cxx-out)$@ $<
+
+# -- Compiling asm files ---
+
+%$(obj) : %$(asm) .rebuild
+	$(target)
+        # There is a bug on mic: icc does not work with the "-x assembler-with-cpp" option, so we
+        # have to preprocess the file manually and then assemble it.
+        ifeq "$(arch)" "mic"
+	    $(c) -E $(cpp-flags) $< > $@.tmp
+	    $(as) $(as-flags) -x assembler $(as-out)$@ $@.tmp
+        else
+	    $(as) $(as-flags) $(as-out)$@ $<
+        endif
+
+# -- Expanding variables in template files ---
+
+# The general rule "% : %.var" does not work well, so we have to write more specific rules:
+# "%.h : %.h.var", etc.
+
+.PRECIOUS : %.h %.f %.rc               # Do not delete automatically created files.
+
+expand-vars = $(perl) $(tools_dir)expand-vars.pl --strict $(ev-flags) $< $@
+
+# Any generated file depends on kmp_version.c, because we extract the build number from that file.
+
+%.h  : %.h.var  \
+    kmp_version.c $(tools_dir)expand-vars.pl .rebuild
+	$(target)
+	$(expand-vars)
+
+%.f  : %.f.var  \
+    kmp_version.c $(tools_dir)expand-vars.pl .rebuild
+	$(target)
+	$(expand-vars)
+
+%.f90  : %.f90.var  \
+    kmp_version.c $(tools_dir)expand-vars.pl .rebuild
+	$(target)
+	$(expand-vars)
+
+%.rc : %.rc.var \
+   kmp_version.c $(tools_dir)expand-vars.pl .rebuild
+	$(target)
+	$(expand-vars)
+
+# -- Making static library ---
+
+.PRECIOUS : %$(lib)                    # Do not delete automatically created files.
+
+%$(lib) : %$(lib).lst .rebuild
+	$(target)
+	$(rm) $@
+	$(ar) $(ar-flags) $(ar-out)$@ $$(cat $<)
+        # strip debug info in case it is requested (works for Linux* OS only)
+        ifneq "$(dbg_strip)" ""
+            ifeq "$(DEBUG_INFO)" "off"
+                ifeq "$(arch)" "mic"
+	            x86_64-k1om-linux-objcopy --strip-debug $@
+                else
+	            objcopy --strip-debug $@
+                endif
+            endif
+        endif
+
+# -- Making dynamic library ---
+
+.PRECIOUS : %$(dll)                    # Do not delete automatically created files.
+
+# makefile.mk should properly define imp_file, def_file, res_file, and pdb_file:
+#     lin and mac: def_file and res_file should be empty; imp_file and pdb_file do not matter.
+#     win: all the variables may be empty; if a variable is specified, it affects ld-flags.
+# Note: imp_file and pdb_file are side effects of building this target.
+# Note: to work around CQ215229, $(ld-flags-extra) is introduced to keep options placed after objects.
+%$(dll) : %$(dll).lst $(def_file) $(res_file) .rebuild
+	$(target)
+	$(ld) $(ld-flags-dll) $(ld-flags) $(ld-out)$@ $$(cat $<) $(ld-flags-extra) $(res_file)
+        # If a stripped pdb exists, rename it to the normal pdb name. See devtools.mk for explanation.
+        ifneq "$(pdb_file)" ""
+            ifeq "$(DEBUG_INFO)" "off"
+	        mv $(pdb_file) $(pdb_file).nonstripped
+	        mv $(pdb_file).stripped $(pdb_file)
+            endif
+        endif
+
+ifneq "$(pdb_file)" ""
+$(pdb_file) : $(lib_file)
+endif
+
+%.dbg : %$(dll) .rebuild
+	$(target)
+        ifeq "$(arch)" "mic"
+	    x86_64-k1om-linux-objcopy  --only-keep-debug $< $@
+        else
+	objcopy --only-keep-debug $< $@ 
+        endif
+
+
+.PRECIOUS: %.res                       # Do not delete automatically created files.
+
+%.res : %.rc .rebuild
+	$(target)
+	rc -fo$@ $<
+
+# --- Building helper tools from sources ---
+
+.PRECIOUS: %$(exe)                     # Do not delete automatically created files.
+
+%$(exe) : $(tools_dir)%.cpp .rebuild
+	$(target)
+	$(cxx) $(cxx-out)$@ $<
+
+# --- Forcing a test ---
+
+test-%/.force : test-%/.dir
+	$(target)
+	$(rm) $(dir $@).{test,force}
+
+# --- Removing a file in build directory ---
+
+rm-% :
+	$(target)
+	$(rm) $(patsubst rm-%,%,$@)
+
+# end of file #
diff --git a/final/runtime/tools/src/common-tools.mk b/final/runtime/tools/src/common-tools.mk
new file mode 100644
index 0000000..38614d2
--- /dev/null
+++ b/final/runtime/tools/src/common-tools.mk
@@ -0,0 +1,487 @@
+# common-tools.mk #
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# --------------------------------------------------------------------------------------------------
+# Dev tools and general options (like -fpic, -O2 or -g).
+# --------------------------------------------------------------------------------------------------
+
+# c       -- C compiler.
+# cxx     -- C++ compiler.
+# cpp     -- C preprocessor.
+# fort    -- Fortran compiler.
+# as      -- Assembler.
+# ar      -- Librarian (static library maker).
+# ld      -- Linker (dynamic library maker).
+# *-out   -- Flag denoting output file. If space between flag and file name required, add explicit
+#            space to variable, e. g.: "c-out = -o$(space)".
+# *-flags -- Flags to appropriate program, e. g. c-flags -- flags for C compiler, etc.
+
+# --- Common definitions ---
+
+# Add current directory (it contains generated files).
+# Note: It is important to specify current dir as "./" (not just "."). Otherwise Intel compiler
+# on Windows* OS generates such a dependency: "kmp_runtime.obj: .\kmp_i18n.inc", and make complains
+# "No rule to build .\kmp_i18n.inc". Using "./" solves the problem.
+cpp-flags += -I ./
+# For non-x86 architecture
+ifeq "$(filter 32 32e 64 mic,$(arch))" ""
+    cpp-flags += $(shell pkg-config --cflags libffi)
+endif
+# Add all VPATH directories to path for searching include files.
+cpp-flags += $(foreach i,$(VPATH),-I $(i))
+
+
+cpp-flags += -D USE_DEBUGGER=1
+
+# Shouldn't this be being set from the command line somehow?
+cpp-flags += -D USE_ITT_BUILD
+
+ifeq "$(OPTIMIZATION)" "on"
+    cpp-flags += -D NDEBUG
+else
+    cpp-flags += -D _DEBUG -D BUILD_DEBUG
+    ifeq "$(os)" "win"
+        # This is forced since VS2010 tool produces inconsistent directives
+        # between objects, resulting in a link failure.
+        cpp-flags += -D _ITERATOR_DEBUG_LEVEL=0
+    endif
+endif
+
+# --- Linux* OS, Intel(R) Many Integrated Core Architecture and OS X* definitions ---
+
+ifneq "$(filter lin mac,$(os))" ""
+    # --- C/C++ ---
+    ifeq "$(c)" ""
+        c = icc
+    endif
+    # C++ compiler is a complement to C compiler.
+    ifeq "$(c)" "icc"
+        cxx = icpc
+    endif
+    ifeq "$(c)" "gcc"
+        cxx = g++
+    endif
+    ifeq "$(c)" "clang"
+        cxx = clang++
+    endif
+    # Output file flag.
+    c-out   = -o$(space)
+    cxx-out = -o$(space)
+    # Compile only, no link.
+    c-flags   += -c
+    cxx-flags += -c
+    # Generating dependency file.
+    c-flags-m   += -M -MG
+    cxx-flags-m += -M -MG
+    # Enable C99 language.
+    ifneq "$(CPLUSPLUS)" "on"
+        c-flags += -std=gnu99
+    endif
+    # Generate position-independent code (SDL requirements).
+    c-flags   += -fPIC
+    cxx-flags += -fPIC
+    # Emit debugging information.
+    ifeq "$(DEBUG_INFO)" "on"
+        c-flags   += -g
+        cxx-flags += -g
+    endif
+    # Instrument program for profiling, gather extra information.
+    ifeq "$(COVERAGE)" "on"
+        ifeq "$(c)" "icc"
+            c-flags   += -prof_genx
+        endif
+        ifeq "$(cxx)" "icpc"
+            cxx-flags += -prof_genx
+        endif
+    endif
+    # Turn optimization on or off.
+    ifeq "$(OPTIMIZATION)" "on"
+        # -inline-min-size=1 improves performance of PARALLEL EPCC up to 10% on fxi64lin01,
+        # doesn't change performance on fxe64lin01.
+        # Presence of the -inline-min-size=1 switch should only help
+        # to promote performance stability between changes,
+        # even if it has no observable impact right now.
+        ifneq "$(filter icl icl.exe,$(c))" ""
+            c-flags   += -O2 -inline-min-size=1
+        else
+            c-flags   += -O2
+        endif
+        ifneq "$(filter icl icl.exe,$(cxx))" ""
+            cxx-flags += -O2 -inline-min-size=1
+        else
+            cxx-flags += -O2
+        endif
+    else
+        c-flags   += -O0
+        cxx-flags += -O0
+    endif
+    # --- Assembler ---
+    ifeq "$(c)" "icc"
+        as        = icc
+    endif
+    ifeq "$(c)" "gcc"
+        as        = gcc
+    endif
+    ifeq "$(c)" "clang"
+        as        = clang
+    endif
+    as-out    = -o$(space)
+    as-flags += $(cpp-flags)
+    # Compile only, no link.
+    as-flags += -c
+    as-flags += -x assembler-with-cpp
+    # --- Fortran ---
+    ifeq "$(c)" "icc"
+        fort = ifort
+    endif
+    ifeq "$(c)" "gcc"
+        fort = gfortran
+    endif
+    ifeq "$(c)" "clang"
+        fort = gfortran
+    endif
+    ifeq "$(fort)" ""
+        fort = ifort
+    endif
+    fort-out    = -o$(space)
+    fort-flags += -c
+endif
+
+# --- Linux* OS definitions ---
+
+ifeq "$(os)" "lin"
+ifneq "$(arch)" "mic"
+    # --- C/C++ ---
+    # On lin_32, we want to maintain stack alignment to be compatible with GNU binaries built with
+    # compiler.
+    ifeq "$(c)" "icc"
+        ifeq "$(arch)" "32"
+            c-flags   += -falign-stack=maintain-16-byte
+            cxx-flags += -falign-stack=maintain-16-byte
+        endif
+        # Generate code that will run on any Pentium or later processor.
+        ifeq "$(arch)" "32"
+            c-flags   += -mia32
+            cxx-flags += -mia32
+        endif
+    endif
+    ifeq "$(c)" "gcc"
+        ifeq "$(arch)" "arm"
+            c-flags   += -marm
+        endif
+    endif
+    # --- Librarian ---
+    ar        = ar
+    ar-out    = $(empty)
+    ar-flags += cr
+    # --- Linker ---
+    # Use ld by default, however, makefile may specify ld=$(c) before including devtools.mk.
+    ifeq "$(ld)" ""
+        ld = $(c)
+    endif
+    ld-flags-dll += -shared
+    ifeq "$(ld)" "ld"
+        ld-out = -o$(space)
+        ifeq "$(arch)" "32"
+            ld-flags += -m elf_i386
+        endif
+        ifeq "$(arch)" "32e"
+            ld-flags += -m elf_x86_64
+        endif
+        ld-flags     += -x -lc -ldl
+	# SDL (Security Development Lifecycle) flags:
+	# -z noexecstack - Stack execution protection.
+	# -z relro -z now - Data relocation and protection.
+        ld-flags     += -z relro -z now
+        ld-flags     += -z noexecstack
+        ld-flags-dll += -soname=$(@F)
+    endif
+    ifeq "$(ld)" "$(c)"
+        ld-out    = $(c-out)
+	# SDL (Security Development Lifecycle) flags:
+	# -z noexecstack - Stack execution protection.
+	# -z relro -z now - Data relocation and protection.
+        ld-flags     += -Wl,-z,relro -Wl,-z,now
+        ld-flags += -Wl,-z,noexecstack
+        ld-flags-dll += -Wl,-soname=$(@F)
+    endif
+    ifeq "$(ld)" "$(cxx)"
+        ld-out    = $(cxx-out)
+	# SDL (Security Development Lifecycle) flags:
+	# -z noexecstack - Stack execution protection.
+	# -z relro -z now - Data relocation and protection.
+        ld-flags     += -Wl,-z,relro -Wl,-z,now
+        ld-flags += -Wl,-z,noexecstack
+        ld-flags-dll += -Wl,-soname=$(@F)
+    endif
+endif
+endif
+
+# --- Intel(R) Many Integrated Core Architecture definitions ---
+
+ifeq "$(arch)" "mic"
+    # --- C/C++ ---
+    # Intel(R) Many Integrated Core Architecture specific options, need clarification for purpose:
+    #c-flags     += -mmic -mP2OPT_intrin_disable_name=memcpy -mP2OPT_intrin_disable_name=memset -mGLOB_freestanding -mGLOB_nonstandard_lib -nostdlib -fno-builtin
+    #cxx-flags   += -mmic -mP2OPT_intrin_disable_name=memcpy -mP2OPT_intrin_disable_name=memset -mGLOB_freestanding -mGLOB_nonstandard_lib -nostdlib -fno-builtin
+    # icc for mic has a bug: it generates dependencies for target like file.obj, while real object
+    # files are named file.o. -MT is a workaround for the problem.
+    c-flags-m   += -MT $(basename $@).o
+    cxx-flags-m += -MT $(basename $@).o
+    # --- Librarian ---
+    ar        = ar
+    ar-out    = $(empty)
+    ar-flags += cr
+    # --- Linker ---
+    # Use $(c) by default, however, makefile may specify another linker (e.g. ld=ld) before including devtools.mk.
+    ifeq "$(ld)" ""
+        ld = $(c)
+    endif
+    ifeq "$(ld)" "ld"
+        ld-out   = -o$(space)
+        ld-flags += -m elf_l1om_fbsd
+        ld-flags-dll += -shared -x -lc
+	# SDL (Security Development Lifecycle) flags:
+	# -z noexecstack - Stack execution protection.
+	# -z relro -z now - Data relocation and protection.
+        ld-flags     += -z noexecstack
+        ld-flags     += -z relro -z now
+        ld-flags-dll += -soname=$(@F)
+        # Now find out path to libraries.
+            ld-flags-L := $(shell $(c) -Wl,-v -\# 2>&1 | grep -e "-L")
+            $(call debug,ld-flags-L)
+            # Remove standalone line-continuation characters ("\") from the captured output.
+            ld-flags-L := $(filter-out \,$(ld-flags-L))
+            $(call debug,ld-flags-L)
+            # Linker treats backslash ('\') as an escape symbol, so replace it with forward slash.
+            ld-flags-L := $(subst \,/,$(ld-flags-L))
+            $(call debug,ld-flags-L)
+        ld-flags += $(ld-flags-L)
+    endif
+    ifeq "$(ld)" "$(c)"
+        ld-out        = $(c-out)
+        ld-flags-dll += -shared -Wl,-x -Wl,-soname=$(@F)
+	# SDL (Security Development Lifecycle) flags:
+	# -z noexecstack - Stack execution protection.
+	# -z relro -z now - Data relocation and protection.
+        ld-flags     += -Wl,-z,noexecstack
+        ld-flags     += -Wl,-z,relro -Wl,-z,now
+    endif
+    ifeq "$(ld)" "$(cxx)"
+        ld-out        = $(cxx-out)
+        ld-flags-dll += -shared -Wl,-x -Wl,-soname=$(@F)
+	# SDL (Security Development Lifecycle) flags:
+	# -z noexecstack - Stack execution protection.
+	# -z relro -z now - Data relocation and protection.
+        ld-flags     += -Wl,-z,noexecstack
+        ld-flags     += -Wl,-z,relro -Wl,-z,now
+    endif
+endif
+
+# --- OS X* definitions ---
+
+ifeq "$(os)" "mac"
+    # --- Librarian ---
+    ar        = libtool
+    ar-out    = -o$(space)
+    ar-flags += -static
+    # --- Linker ---
+    # Use C compiler as linker by default, however, makefile may specify ld=$(libtool) before
+    # including devtools.mk.
+    ifeq "$(ld)" ""
+        ld = $(c)
+    endif
+    ifeq "$(ld)" "libtool"
+        ld-out        = -o$(space)
+        ld-flags-dll += -dynamic
+        ld-flags     += -lc -ldl
+    endif
+    ifeq "$(ld)" "$(c)"
+        ld-out        = $(c-out)
+        ld-flags-dll += -dynamiclib
+    endif
+    ifeq "$(ld)" "$(cxx)"
+        ld-out        = $(cxx-out)
+        ld-flags-dll += -dynamiclib
+    endif
+    # These options suitable for any linker, either C compiler or libtool.
+    ld-flags-dll += -headerpad_max_install_names
+    ld-flags-dll += -install_name $(@F)
+endif
+
+# --- Windows* OS definitions ---
+
+ifeq "$(os)" "win"
+    # Disable warning "function "..." (declared at line ... of ...) was declared deprecated...".
+    cpp-flags += -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE
+    # --- C/C++ ---
+    ifeq "$(c)" ""
+        c = icl.exe
+    endif
+    cxx    = $(c)
+    # Often default icl.cfg file in compiler bin/ directory contains options -Qvc and
+    # -Qlocation,link. Setting ICLCFG (and IFORTCFG) to specially prepared empty config file
+    # overrides default config.
+    ICLCFG   = $(tools_dir)icc.cfg
+    IFORTCFG = $(tools_dir)icc.cfg
+    export ICLCFG
+    export IFORTCFG
+    # Output file.
+    c-out   = -o$(space)
+    cxx-out = -o$(space)
+    # Disable annoying compiler logo.
+    c-flags   += -nologo
+    cxx-flags += -nologo
+    # Generate code that will run on any Pentium or later processor.
+    ifeq "$(arch)" "32"
+        c-flags   += -arch:ia32
+        cxx-flags += -arch:ia32
+    endif
+    # Compile only, no link.
+    c-flags   += -c
+    cxx-flags += -c
+    # -QM  -- Generate dependency file.
+    # -QMM -- do not include system headers. On Windows* OS, system headers may be located in
+    #         "C:\Program Files\...", but path with space confuses make, so we exclude system
+    #         headers.
+    # -QMG -- Treat missed headers as generated. We do have some generated include files.
+    c-flags-m   += -QM -QMM -QMG
+    cxx-flags-m += -QM -QMM -QMG
+    # Enable C99 language.
+    ifneq "$(CPLUSPLUS)" "on"
+    	c-flags   += -Qstd=gnu99
+    endif
+    # Enable C++ exception handling.
+    # ??? Why do we disable it on Linux* OS?
+    cxx-flags += -EHsc
+    ifeq "$(arch)" "32"
+        ifneq "$(filter icl icl.exe,$(c))" ""
+            c-flags   += -Qsafeseh
+        endif
+        ifneq "$(filter icl icl.exe,$(cxx))" ""
+            cxx-flags += -Qsafeseh
+        endif
+    endif
+    # Emit debugging information.
+    ifeq "$(DEBUG_INFO)" "on"
+        c-flags   += -Zi
+        cxx-flags += -Zi
+    endif
+    # Instrument program for profiling, gather extra information.
+    ifeq "$(COVERAGE)" "on"
+        c-flags   += -Qprof_genx
+        cxx-flags += -Qprof_genx
+    endif
+    # Turn optimization on or off.
+    ifeq "$(OPTIMIZATION)" "on"
+        # Presence of the -inline-min-size=1 switch should only help
+        # to promote performance stability between changes,
+        # even if it has no observable impact right now.
+        # See the Linux* OS section above.
+        ifneq "$(filter icl icl.exe,$(c))" ""
+            c-flags   += -O2 -Qinline-min-size=1
+        else
+            c-flags   += -O2
+        endif
+        ifneq "$(filter icl icl.exe,$(cxx))" ""
+            cxx-flags += -O2 -Qinline-min-size=1
+        else
+            cxx-flags += -O2
+        endif
+    else
+        c-flags   += -Od
+        cxx-flags += -Od
+        # Enable stack frame runtime error checking.
+        # !!! Obsolete option. Should use /RTC instead.
+        c-flags   += -RTC1
+        cxx-flags += -RTC1
+    endif
+    # SDL (Security Development Lifecycle) flags:
+    #   GS - Stack-based Buffer Overrun Detection
+    #   DynamicBase - Image Randomization
+    c-flags   += -GS -DynamicBase  
+    cxx-flags += -GS -DynamicBase  
+    # --- Assembler ---
+    ifeq "$(arch)" "32"
+        as   = ml
+    endif
+    ifeq "$(arch)" "32e"
+        as   = ml64
+    endif
+    ifeq "$(as)" "ias"
+        as-out   = -o$(space)
+    endif
+    ifneq "$(filter ml ml64,$(as))" ""
+        as-out   = -Fo
+        as-flags += -nologo -c
+        # SDL (Security Development Lifecycle) flags:
+        #   DynamicBase - Image Randomization
+	as-flags += -DynamicBase 
+    endif
+    # --- Fortran ---
+    fort        = ifort
+    fort-out    = -o$(space)
+    fort-flags += -nologo
+    fort-flags += -c
+    # SDL (Security Development Lifecycle) flags:
+    #   GS - Stack-based Buffer Overrun Detection
+    #   DynamicBase - Image Randomization
+    fort-flags += -GS -DynamicBase 
+    # --- Librarian ---
+    ar     = link.exe
+    ar-out = -out:
+    # Generate static library. Must be the first option.
+    ar-flags += -lib
+    # Turn off tool banner.
+    ar-flags += -nologo
+    # --- Linker ---
+    ld       = link.exe
+    ld-out   = -out:
+    # Generate dynamic library.
+    ld-flags-dll += -dll
+    # Turn off tool banner.
+    ld-flags += -nologo
+    # Generate pdb (Program DataBase, debug information) file.
+    # If DEBUG_INFO is on, generate normal (full-featured) pdb file. Otherwise, we need only
+    # stripped pdb. But stripped pdb cannot be generated alone, we have to generate normal *and*
+    # stripped pdb. After generating both pdb files we rename stripped pdb to normal pdb name (see
+    # rules.mk).
+    ifeq "$(DEBUG_INFO)" "on"
+        ld-flags += $(if $(pdb_file),-debug -pdb:$(pdb_file))
+    else
+        ld-flags += $(if $(pdb_file),-debug -pdb:$(pdb_file) -pdbstripped:$(pdb_file).stripped)
+    endif
+    # Use def file, if $(def_file) is specified.
+    ld-flags += $(if $(def_file),-def:$(def_file))
+    # Generate import library, if $(imp_file) is specified.
+    ld-flags += $(if $(imp_file),-implib:$(imp_file))
+    # Specify architecture.
+    ifeq "$(arch)" "32"
+        ar-flags += -machine:i386
+        ld-flags += -machine:i386
+    endif
+    ifeq "$(arch)" "32e"
+        ar-flags += -machine:amd64
+        ld-flags += -machine:amd64
+    endif
+    # SAFESEH
+    ifeq "$(arch)" "32"
+        as-flags += -safeseh
+        ld-flags += -safeseh
+    endif
+    # SDL (Security Development Lifecycle) flags:
+    #   NXCompat - Data Execution Prevention
+    ld-flags += -NXCompat -DynamicBase
+endif
+
+# end of file #
diff --git a/final/runtime/tools/windows.inc b/final/runtime/tools/windows.inc
new file mode 100644
index 0000000..3d2e070
--- /dev/null
+++ b/final/runtime/tools/windows.inc
@@ -0,0 +1,27 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+export SHELL = cmd
+
+# TODO: give an error if the arch values do not match.
+ifndef arch
+    $(error Could not detect arch: please specify on command line.)
+endif
+
+
+CMD=cmd /C
+CWD=$(shell cmd /C echo %CD%)
+RM=cmd /C del /Q /F
+RD=cmd /C rmdir
+MD=cmd /c mkdir
+SLASH=\\
+NUL = nul
+
diff --git a/final/runtime/tools/wipe-string.pl b/final/runtime/tools/wipe-string.pl
new file mode 100755
index 0000000..deab95f
--- /dev/null
+++ b/final/runtime/tools/wipe-string.pl
@@ -0,0 +1,183 @@
+#!/usr/bin/perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#//                     The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use File::Glob ":glob";
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+
+our $VERSION = "0.02";
+
+sub wipe($$$) {
+
+    my ( $input, $output, $wipe ) = @_;
+    my $bulk = read_file( $input, -binary => 1 );
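+    # Replace each match with the same number of spaces, so the file size is preserved.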
+    $bulk =~ s{($wipe)}{ " " x length( $1 ) }ge;
+    write_file( $output, \$bulk, -binary => 1 );
+    return undef;
+
+}; # sub wipe
+
+my @wipe;
+my $target = ".";
+get_options(
+    "wipe-literal=s"     =>
+        sub { my $arg = $_[ 1 ]; push( @wipe, qr{@{ [ quotemeta( $arg ) ] }} ); },
+    "wipe-regexp=s"      =>
+        sub { my $arg = $_[ 1 ]; push( @wipe, qr{$arg} ); },
+    "target-directory=s" => \$target,
+);
+
+# Convert strings to regular expression.
+my $wipe = qr{@{ [ join( "|", @wipe ) ] }};
+
+my %jobs;
+
+# Collect files to process.
+# jobs: output -> input.
+foreach my $arg ( @ARGV ) {
+    my @inputs = ( $^O eq "MSWin32" ? bsd_glob( $arg ) : ( $arg ) );
+    foreach my $input ( @inputs ) {
+        my $file   = get_file( $input );
+        my $output = cat_file( $target, $file );
+        if ( exists( $jobs{ $output } ) ) {
+            runtime_error(
+                "\"$jobs{ $output }\" and \"$input\" input files would be written " .
+                    "to the same output file \"$output\""
+            );
+        }; # if
+        $jobs{ $output } = $input;
+    }; # foreach
+}; # foreach $arg
+
+# Process files.
+%jobs = reverse( %jobs ); # jobs: input -> output.
+foreach my $input ( sort( keys( %jobs ) ) ) {
+    my $output = $jobs{ $input };
+    info( "\"$input\" -> \"$output\"" );
+    wipe( $input, $output, $wipe );
+}; # foreach $input
+
+exit( 0 );
+
+__END__
+
+#
+# Embedded documentation.
+#
+
+=pod
+
+=head1 NAME
+
+B<wipe-string.pl> -- Wipe string in text or binary files.
+
+=head1 SYNOPSIS
+
+B<wipe-string.pl> I<OPTION>... I<FILE>...
+
+=head1 OPTIONS
+
+=over
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full help message and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--target-directory=>I<dir>
+
+Directory to put result files to. By default result files are written in the current working
+directory.
+
+=item B<--usage>
+
+Print very short usage message and exit.
+
+=item B<--version>
+
+Print version and exit.
+
+=item B<--wipe-literal=>I<str>
+
+Specify literal string to wipe. Multiple literals are allowed.
+
+=item B<--wipe-regexp=>I<str>
+
+Specify Perl regular expression to wipe. Multiple regular expressions may be specified.
+
+Be careful to protect special characters from being interpreted by the shell.
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<file>
+
+File name to wipe string in.
+
+=back
+
+=head1 DESCRIPTION
+
+The script wipes strings in files. Strings may be specified literally or by Perl regular
+expressions. Strings are wiped by replacing their characters with spaces, so the size of the file
+remains the same. The script may be applied to both text and binary files.
+
+Result files are written by default to the current directory, or to the directory specified by the
+B<--target-directory> option, if any. If multiple input files would be written to the same output
+file (e. g. identically named input files located in different directories), the script generates
+an error.
+
+The script reads the entire file, processes it, and then writes it to disk. Therefore it is
+(almost) safe to update files in-place (see examples).
+
+=head1 EXAMPLES
+
+Wipe "Copyright" word in all the files with "txt" suffix in current directory, overwrite original
+files (update them in-place):
+
+    wipe-string.pl --wipe-literal="Copyright" *.txt
+
+Wipe "Copyright" and "Copyleft" words in all the files with "txt" suffix in current directory,
+write result files to ../wiped directory:
+
+    wipe-string.pl --wipe-literal=Copyright --wipe-literal=Copyleft --target-dir=../wiped *.txt
+
+Wipe "Copyright" and "Copyleft" words in files from the "doc" directory, write result files to the
+current directory:
+
+    wipe-string.pl --wipe-regexp="Copyright|Copyleft" doc/*
+
+Wipe "defaultlib" directive in all the library files:
+
+    wipe-string.pl --wipe-regexp="-defaultlib:[A-Za-z0-9_.]+" *.lib
+
+(Be careful: the script does not analyze the structure of library and object files, it just wipes
+I<strings>, so it wipes all occurrences of strings matching the specified regular expression.)
+
+=cut
+
+# end of file #
diff --git a/final/testsuite/LICENSE b/final/testsuite/LICENSE
new file mode 100644
index 0000000..c19fb01
--- /dev/null
+++ b/final/testsuite/LICENSE
@@ -0,0 +1,34 @@
+  Copyright (c) 2011, 2012 
+  University of Houston System
+  
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+  
+  o Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+  
+  o Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  
+  o Neither the name of the University of Houston System nor the names of its
+    contributors may be used to endorse or promote products derived from this
+    software without specific prior written permission.
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ 
+
diff --git a/final/testsuite/LLVM-IR/lit.cfg b/final/testsuite/LLVM-IR/lit.cfg
new file mode 100644
index 0000000..1957819
--- /dev/null
+++ b/final/testsuite/LLVM-IR/lit.cfg
@@ -0,0 +1,78 @@
+# -*- Python -*-
+
+# Configuration file for the 'lit' test runner.
+
+import os
+import sys
+import re
+import platform
+
+try:
+   import lit.util
+   import lit.formats
+except ImportError:
+   pass
+
+# name: The name of this test suite.
+config.name = 'OpenMPValidationSuite'
+
+# testFormat: The test format to use to interpret tests.
+config.test_format = lit.formats.ShTest(execute_external=False)
+
+# suffixes: A list of file extensions to treat as test files
+# Note this can be overridden by lit.local.cfg files
+config.suffixes = ['.ll']
+
+# test_source_root: The root path where tests are located.
+#config.test_source_root = "/home/ichoyjx/install/openmp/testsuite/bin"
+#os.path.dirname(__file__)
+
+# test_exec_root: The root path where tests should be run.
+#mpvs_obj_root = getattr(config, 'mpvs_obj_root', None)
+#if mpvs_obj_root is not None:
+config.test_exec_root = "./"
+#os.path.join(mpvs_obj_root, 'src')
+
+# Discover the 'clang' compiler to use.
+
+
+def inferClang(PATH):
+    # Determine which clang to use.
+    clang = os.getenv('CLANG')
+
+    # If the user set clang in the environment, definitely use that and don't
+    # try to validate.
+    if clang:
+        return clang
+
+    # Otherwise look in the path.
+    clang = lit.util.which('clang', PATH)
+
+    if not clang:
+        lit_config.fatal("couldn't find 'clang' program, try setting "
+                         "CLANG in your environment")
+
+    return clang
+
+config.clang = inferClang(config.environment['PATH']).replace('\\', '/')
+config.substitutions.append( ('%clang', ' ' + config.clang + ' ') )
+
+# Propagate some environment variables to the test environment.
+def addEnv(name):
+    if name in os.environ:
+        config.environment[name] = os.environ[name]
+
+addEnv('HOME')
+addEnv('PWD')
+
+
+addEnv('C_INCLUDE_PATH')
+addEnv('CPLUS_INCLUDE_PATH')
+addEnv('LIBRARY_PATH')
+addEnv('LD_LIBRARY_PATH')
+addEnv('DYLD_LIBRARY_PATH')
+
+# Check that the object root is known.
+if config.test_exec_root is None:
+    lit_config.fatal('test execution root not set!')
diff --git a/final/testsuite/LLVM-IR/lit.site.cfg.in b/final/testsuite/LLVM-IR/lit.site.cfg.in
new file mode 100644
index 0000000..11866d4
--- /dev/null
+++ b/final/testsuite/LLVM-IR/lit.site.cfg.in
@@ -0,0 +1,30 @@
+## Autogenerated by LLVM/Clang configuration.
+#config.mpvs_src_root = "/home/ichoyjx/install/openmp/testsuite/bin/c"
+config.mpvs_obj_root = "/tmp"
+config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
+
+
+# Needed to check if a hack needs to be applied
+config.llvm_version_major = "@LLVM_VERSION_MAJOR@"
+
+# Compilers
+# FIXME: use llvmcc not llvmgcc
+config.llvmclang = "clang"
+
+# Features
+config.enable_uclibc = True if @ENABLE_UCLIBC@ == 1 else False
+config.enable_posix_runtime = True if @ENABLE_POSIX_RUNTIME@ == 1 else False
+config.have_selinux = True if @HAVE_SELINUX@ == 1 else False
+
+# Current target
+config.target_triple = "@TARGET_TRIPLE@"
+
+# Let the main config do the real work.
+try:
+  lit
+except NameError:
+  # Use lit_config class
+  lit_config.load_config(config, "@MPVS_SOURCE_DIR@/lit.cfg")
+else:
+  # Use old lit class
+  lit.load_config(config, "@MPVS_SOURCE_DIR@/lit.cfg")
diff --git a/final/testsuite/LLVM-IR/lit.tmp b/final/testsuite/LLVM-IR/lit.tmp
new file mode 100644
index 0000000..81d30fb
--- /dev/null
+++ b/final/testsuite/LLVM-IR/lit.tmp
@@ -0,0 +1 @@
+../LLVM-IR/lin_32e/
diff --git a/final/testsuite/Makefile b/final/testsuite/Makefile
new file mode 100644
index 0000000..cdbc3dc
--- /dev/null
+++ b/final/testsuite/Makefile
@@ -0,0 +1,254 @@
+# General Makefile containing all the necessary compiler flags for the tests
+
+# modify CC and CFLAGS for OpenMP C compilers
+# modify FC and FFLAGS for OpenMP Fortran compilers
+
+
+# Content:
+#########################################################
+
+# 1. Basic usage
+# 2. General testsuite settings
+# 3. Compiler selection and Flags
+
+#########################################################
+
+
+#########################################################
+# 1. Basic usage
+#########################################################
+# 	make ctest		generate c test "ctest"
+# 	make ftest		generate fortran test "ftest"
+#	make clean		removes all sources and binaries
+# 	make cleanall	removes sources, binaries and logfiles
+
+
+#########################################################
+# 2. General testsuite settings
+#########################################################
+
+# For general testsuite settings see the configuration file
+# ompts.conf
+
+#########################################################
+# 3. Compiler selection and Flags
+#########################################################
+
+# CLANG Compiler
+CC     = clang
+CFLAGS = -fopenmp -S -emit-llvm
+FC     = gfortran
+# FFLAGS = -fopenmp -lm
+FFLAGS = -fopenmp -lm -O3
+
+# GNU Compiler
+# CC     = gcc
+# CFLAGS = -fopenmp -lm -O3
+# FC     = gfortran 
+# FFLAGS = -fopenmp -lm
+# FFLAGS = -fopenmp -lm -O3
+
+# Fujitsu Compilers:
+#CC = fcc
+#CFLAGS = -KOMP,fast_GP2=2
+#FC=frt
+#FFLAGS=-KOMP,fast_GP2=2 -w -Am -X9 -Fixed
+
+
+# PGI compilers
+#CC = pgcc
+#CFLAGS = -mp
+#CFLAGS = -mp -DVERBOSE
+#CFLAGS = -fast -mp
+
+#FC = pgf90
+#FFLAGS = -fast -mp
+#FFLAGS = -mp -g
+
+
+# Intel compilers:
+#CC = icc
+#CFLAGS = -O3 -ip -openmp
+#CFLAGS = -Wall -O0 -openmp
+#CFLAGS =  -openmp -lm
+#CFLAGS =  -openmp -lm -DVERBOSE
+
+#FC = ifort
+#FFLAGS = -openmp -lm -fpp
+
+# Omni compilers:
+#CC = ompcc
+#CFLAGS = -O3 -lm
+
+
+# Assure compilers:
+#CC = assurec
+#CFLAGS = -O3 -WApname=project -DUSE_ASSURE=1
+#FC =
+#FFLAGS =
+
+# NEC:
+#CC = c++
+#CC = sxc++
+#CFLAGS = -Popenmp
+
+#FC=sxf90
+#FFLAGS= -Popenmp
+
+
+# Hitachi:
+#CC = xcc
+#CFLAGS = -O4 -pvec +Op -parallel -omp
+#FC =
+#FFLAGS =
+
+
+# SGI:
+#CC = cc
+#CFLAGS = -mp -lm
+#FC =
+#FFLAGS =
+
+
+# IBM compilers:
+#CC = xlc_r
+#CFLAGS = -qsmp=omp -lm
+
+
+#FC=xlf90_r
+#FFLAGS=-qsmp=omp -qfixed=132 -qlanglvl=extended
+
+
+# SUN compilers
+#CC = cc
+#CFLAGS = -fast -xopenmp -lm
+
+#FC =f90
+#FFLAGS = -xopenmp -fast -lm
+
+
+# open64 compilers
+# remark: -I. was a workaround because the installation came without omp.h file
+#CC = opencc
+#CFLAGS = -O0 -openmp -lm -I. -lomp -lpthread
+#CFLAGS = -O0 -openmp -lm -I /home/matthew/opt/usr/include -lpthread
+#CFLAGS = -openmp -lm
+
+#FC = openf90
+#FFLAGS = -O0 -openmp -lm  -lomp -lpthread
+
+
+#Pathscale Compiler
+#CC = pathcc
+#CFLAGS = -mp -Ofast -lm
+
+#FC=pathf90
+#FFLAGS= -mp -Ofast -lm
+
+
+#OpenUH Compiler
+#CC = uhcc
+#CFLAGS = -mp
+
+#FC = uhf90
+#FFLAGS = -mp
+
+# Check path
+FILE=LLVM-IR/lit.tmp
+CHECK_PATH=`cat $(FILE)`
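+# (the backquotes are expanded by the shell when CHECK_PATH is used in a
+# recipe, e.g. in the distclean target below)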
+
+#########################################################################
+
+help:
+	@echo "OpenMP Validation Suite, v3.0"
+	@echo "(C) University of Stuttgart, University of Houston"
+	@echo ""
+	@echo "Do make [rule], where rule may be one of:"
+	@echo "  ctest"
+	@echo "    Build and run the OpenMP/C validation tests."
+	@echo "  ftest"
+	@echo "    Build and run the OpenMP/Fortran validation tests."
+	@echo "  print_compile_options"
+	@echo "    Display the default compiler and compiler options being tested"
+	@echo "  cleansrcs"
+	@echo "    Remove sources generated from the templates"
+	@echo "  cleanbins"
+	@echo "    Remove executables from bin/ directory"
+	@echo "  cleanouts"
+	@echo "    Remove all *.out files from bin/ directory"
+	@echo "  cleanlogs"
+	@echo "    Remove all *.log files from bin/ directory"
+	@echo "  clean"
+	@echo "    Clean out and log files"
+	@echo "  distclean"
+	@echo "    Clean arch test directory with clean"
+	@echo "  veryclean"
+	@echo "    Remove the entire bin/c directory with distclean"
+
+omp_my_sleep:
+	mkdir -p bin/c
+	cp omp_my_sleep.h bin/c/
+omp_testsuite: omp_testsuite.h
+	mkdir -p bin/c
+	cp omp_testsuite.h bin/c/
+omp_testsuite.h: ompts-c.conf c/*
+	./ompts_makeHeader.pl -f=ompts-c.conf -t=c
+.c.o: omp_testsuite omp_my_sleep
+	$(CC) $(CFLAGS) -c $<
+
+ctest: omp_my_sleep omp_testsuite
+	./runtest.pl --lang=c testlist-c.txt
+	cd bin/; make -j
+
+ftest:
+	mkdir -p bin/fortran
+	cp omp_testsuite.f bin/fortran
+	cp omp_my_sleep.f bin/fortran
+	./runtest.pl --lang=fortran testlist-f.txt
+
+print_compile_options:
+	@echo "-------------------"
+	@echo "C compiler"
+	@echo "-------------------"
+	@echo "compilation: $(CC) $(CFLAGS)"
+	@$(CC) --version
+	@echo "-------------------"
+	@echo "Fortran compiler"
+	@echo "-------------------"
+	@echo "compilation: $(FC) $(FFLAGS)"
+	@$(FC) --version
+
+
+
+path_to_arch=$(shell ../runtime/tools/check-openmp-test.pl)
+
+cleansrcs:
+	find bin/ -iname "*.[cf]" -exec rm -rf {} \;
+cleanbins:
+	find bin/c -perm /a+x -type f -exec rm -rf {} \;
+	find ./bin/c -not -name "*.c" -a -not -name "*.ll" -type f -exec rm -rf {} \;
+cleanouts:
+	find bin/ -iname "*.out" -exec rm -rf {} \;
+cleanlogs:
+	find bin/ -iname "*.log" -exec rm -rf {} \;
+	rm -rf ./$(path_to_arch)
+
+clean: cleanouts cleanlogs cleanbins cleansrcs
+
+distclean: clean
+	rm -rf bin/c/*
+	rm -rf LLVM-IR/$(CHECK_PATH)
+
+veryclean: distclean
+	rm -rf bin/c/
+	rm -rf bin/fortran/
+
+
+cleanall: 
+	rm -rf ./bin/c/
+	rm -rf ./bin/fortran/
+	rm -rf ./LLVM-IR/$(path_to_arch)
+	rm -rf ./$(path_to_arch)
+	rm -rf ./*.log
+
+
diff --git a/final/testsuite/README_LLVM_OPENMP b/final/testsuite/README_LLVM_OPENMP
new file mode 100644
index 0000000..108cada
--- /dev/null
+++ b/final/testsuite/README_LLVM_OPENMP
@@ -0,0 +1,46 @@
+ ============================================================================
+|              To use the OpenMP Validation Suite for testing                |
+|                        LLVM OpenMP Implementation                          |
+|                  High Performance Computing and Tools,                     |
+|                          University of Houston                             |
+|                               Jan. 2015                                    |
+ ============================================================================
+
+Two new files have been added to openmp/runtime: check-openmp.pl and check-openmp-test.pl.
+Go to openmp/runtime and run:
+make test compiler=clang
+make test-clean
+
+------------------------------------------------------------------------------
+For the OpenMP-LLVM project, the following files and directories are relevant:
+
+
+  c                              Directory containing the templates for the C tests
+  fortran                        Directory containing the templates for the Fortran tests
+  Makefile                       Makefile containing options for compilation
+  README_LLVM_OPENMP             README file for using the OpenMP test suite
+  README_OpenMP_Validation_Suite README file on the Validation Suite, templates, etc.,
+                                 as a basis for testing the LLVM OpenMP implementation.
+  LLVM-IR                        Directory containing target-specific IR files
+  bin/lit files                  In the lit model, every test must exist inside some
+                                 test suite. lit resolves the inputs specified on the
+                                 command line to test suites by searching upwards from
+                                 the input path until it finds a lit.cfg or
+                                 lit.site.cfg file. These files serve as both a marker
+                                 of test suites and as configuration files which lit
+                                 loads in order to understand how to find and run the
+                                 tests inside the test suite.
+  bin/header                     Header for the tests; calls the clang driver for the
+                                 test and executes the resulting executable
+  bin/Makefile                   Generates the LLVM-IR files
+  bin/distribute.sh              Generates the LLVM-IR files, detects the architecture
+                                 and OS, and moves the generated IR files to the
+                                 corresponding folder
+  runtime/tools/check-openmp.pl  Retrieves the right path
+  runtime/tools/check-openmp-test.pl
+                                 Returns the architecture and OS, e.g. lin_32e
+------------------------------------------------------------------------------
+
+Contact and Support
+==============================================================================
+
+Contact: http://www.cs.uh.edu/~hpctools/
+Email: sunita@cs.uh.edu
diff --git a/final/testsuite/README_OpenMP_Validation_Suite b/final/testsuite/README_OpenMP_Validation_Suite
new file mode 100644
index 0000000..8ca7c61
--- /dev/null
+++ b/final/testsuite/README_OpenMP_Validation_Suite
@@ -0,0 +1,334 @@
+ ============================================================================
+|                   OpenMP Validation Suite  V 3.0                           |
+|              High Performance Computing Center, Stuttgart                  |
+|       High Performance Computing and Tools, University of Houston          |
+|                              Jan. 2012                                     |
+ ============================================================================
+
+
+TABLE OF CONTENTS
+
+I    	INTRODUCTION
+I.1.	  Aims and general function
+I.2.	  Files and directories
+
+II		USAGE
+II.1.	  First run with make
+II.2.	  Running custom tests
+II.3.	  Understanding the results
+II.4.	  Cleaning up
+II.5.	  Using the runtest script
+
+III		Adding and modifying tests
+III.1.	  The template structure
+III.2.	  Adding tests to the test lists
+
+IV		Known Issues and Workarounds
+
+V		Contact and Support
+
+------------------------------------------------------------------------------
+
+I.  INTRODUCTION
+==============================================================================
+
+I.1.  Aims and general function
+--------------------------------
+
+The OpenMP validation suite is designed to verify the correct implementation
+of OpenMP in compilers. It is capable of checking C as well as Fortran
+compilers.
+
+Testing the implementation is statistical. Each directive is tested by
+computing a result and verifying it against the known value. Because even a
+wrong implementation can occasionally produce the right values, the tests
+are run several times.
+
+Additionally, the validation suite creates so-called crosstests for each
+directive. These are tests in which either the directive is missing or is
+used with different arguments. If such a crosstest fails, this is strong
+evidence that the corresponding test is really capable of testing the
+directive. For example, the crosstest for omp critical omits the
+#pragma omp critical and is therefore expected to produce a wrong sum.
+
+Lastly, an orphaned test is also run to determine whether the directive being
+tested works correctly when 'orphaned' from the main function.
+Essentially, the directive's code is placed into its own function which is
+called during execution of the main function, often inside a parallel
+region.
+
+
+I.2.  Files and directories
+----------------------------
+
+
+d c                         directory containing the templates for the C tests
+d fortran                   directory containing the templates for the Fortran
+                            tests
+  Makefile                  Makefile containing options for compilation
+  common_utility.f
+  omp_my_sleep.h            thread-safe sleep function
+  omp_testsuite.f           Fortran header file
+  omp_testsuite.h           autogenerated C header file
+  ompts-c.conf              configuration file for the C tests, e.g. how often
+                            the tests shall be executed and how large the loop
+                            size is
+  ompts_makeHeader.pl       Perl script for automatic generation of an
+                            up-to-date header file
+  ompts_parserFunctions.pm  Perl module containing general functions for the
+                            ompts_parser.pl script
+  ompts_parser.pl           script for generating the source code out of the
+                            templates
+  ompts_standaloneProc.c    framework for the C tests
+  ompts_standaloneProc.f    framework for the Fortran tests
+  README                    the README file you've already found ;-)
+  LICENSE                   contains license information
+  runtest.pl                the frame program of the test suite
+  testlist-f.txt            test list containing the available tests for Fortran
+  testlist-c.txt            test list containing the available tests for C
+
+
+------------------------------------------------------------------------------
+
+II.  USAGE
+==============================================================================
+
+
+II.1. First run with make
+--------------------------
+
+
+You can do a first simple run of the testsuite after two short steps of
+configuration:
+
+1) Modify the ompts.conf and ompts-c.conf files to change the number of
+threads and the number of repetitions of each test.
+
+2) Modify the Makefile, uncommenting the CC/FC and CFLAGS/FFLAGS variables for
+the compiler of your choice.
+
+And now you can run the testsuite either for a C compiler with:
+
+>	make ctest
+
+or for a Fortran compiler with:
+
+>	make ftest
+
+
+II.2. Running custom tests
+--------------------------
+
+
+In order to run single tests or custom groups of tests, two make commands
+are defined: make stest and make fstest. These two commands reference the
+file customtest.txt when looking for a testlist to use. Simply edit
+customtest.txt to include the desired test or tests. If customtest.txt
+contains C tests, run
+
+>      make stest
+
+or, for Fortran tests,
+
+>      make fstest
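+
+For example, a customtest.txt selecting two C tests could simply contain the
+test names, one per line:
+
+	omp_critical
+	omp_barrier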
+
+In order to change the number of threads used in the tests, simply edit the
+Makefile variables MINTHREADS and MAXTHREADS. By default, they are configured
+to use 2 threads. To change the number of times each test is run, for C tests
+edit the REPETITIONS variable in the file ompts-c.conf. The LOOPCOUNT and
+SLEEPTIME variables can also be changed there. For Fortran tests, edit the
+file omp_testsuite.f to change both the LOOPCOUNT and the number of times
+each test is run.
+
+
+
+II.3. Understanding the results
+---------------------------------
+
+
+When running the testsuite the results are shown on the screen.
+
+If you need the results for further processing you can use results.txt, which
+is a simple list containing the result for each directive in a single line.
+Each line starts with the name of the directive, followed by the result of
+the test given as the percentage of passed runs. If 100% of the runs passed,
+the second number gives the result of the corresponding crosstest. Crosstests
+are only run if the normal test passes with 100% accuracy. If a crosstest was
+not run or a test does not exist, this is denoted by a "-".
+After the results of the normal tests follow the results of the tests in
+orphaned mode. If no orphaned tests were available, this is shown by a "-".
+
+If you run the testsuite with different numbers of threads (e.g. using the
+runtest.pl script) the results are shown in blocks of 4 columns for each
+number of threads.
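+
+As a purely hypothetical illustration (the exact layout may differ), one such
+block for a single number of threads could look like
+
+	omp_critical	100	100	100	-
+
+i.e. the directive name followed by the test result, the crosstest result and
+the corresponding results in orphaned mode.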
+
+If a test fails you can find more detailed information in the ompts.log,
+bin/c/*.out and *.log files. While the ompts.log file contains all compiler
+error messages for all tests, the *.out and *.log files contain detailed
+information on the execution of the individual tests: the *.out files list
+the results of each single execution of a test, and the *.log files contain
+the error messages of the tests themselves.
+
+
+II.4. Cleaning Up
+-----------------
+
+
+Because many files are generated for each tested directive, it is often necessary
+to clean the main directory after a battery of tests. To clean all generated files
+in the main directory including the results and log files,
+
+>     make clean
+
+
+To clean only the logs and out files,
+
+>     make cleanlogs
+
+To clean only the results,
+
+>     make cleanresults
+
+
+
+II.5. Using the runtest script
+-------------------------------
+
+
+For special purposes you can use the runtest.pl script, which allows a lot
+more options for the execution process than the execution with make.
+
+Using the runtest.pl script is rather easy. You can use the test suite only
+after two steps of modifications:
+
+1.) Modify the Makefile to your wishes, choosing your compiler and the
+    necessary compiler flags.
+2.) If necessary edit one of the test lists (testlist-c.txt) and comment out
+    the tests you do not want to run using # at the beginning of a line. Test
+    lists for Fortran end with -f.txt while test lists for C end with -c.txt.
+
+And now you can run the test suite either for Fortran using
+
+	>  ./runtest.pl --lang=fortran -d=fortran testlist-f.txt
+
+or for C
+
+	>  ./runtest.pl --lang=c -d=c testlist-c.txt
+
+With the --help option you can show the complete list of options and their
+explanations.
+
+The test results are summarized in cresults.txt or fresults.txt, while the
+*.log files keep details for individual tests. There is also a file
+(ompts.log) keeping compilation information (see section II.3).
+
+If you don't want to test the directives in orphaned mode you can use the
+-norphan option. You can also use the runtest.pl script either to only
+compile all tests or to only run already compiled tests, e.g. for cross
+compilation on other platforms. For this there are the options -norun and
+-nocompile.
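+
+For example (a hypothetical invocation combining the options named above),
+to compile the C tests on a build host without running them:
+
+	>  ./runtest.pl --lang=c -norun testlist-c.txt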
+
+Happy testing!
+
+
+------------------------------------------------------------------------------
+
+III.	How to add new tests / The structure of test templates
+==============================================================================
+
+III.1  The template structure
+------------------------------
+
+The test suite is based on templates, so that a single file covers the test,
+the crosstest and the orphaned versions of both.
+
+	A) Description of the template structure
+
+The syntax of the templates is much like XML. Each test begins with
+'<ompts:test>' and ends with '</ompts:test>'.
+
+In between there are several other blocks holding information:
+
+- <ompts:testdescription> </ompts:testdescription> Between these tags you can
+  describe what the test checks and how it works.
+
+- <ompts:ompversion> </ompts:ompversion> This tag is used to specify the
+  OpenMP version which includes the tested directive.
+
+- <ompts:directive> </ompts:directive> Used to specify the directive as it is
+  called in the programming language.
+
+- <ompts:dependences> </ompts:dependences> With this tag you can specify other
+  omp directives which are necessary for the correct execution of your test.
+The directives have to be listed by their names as they are called in the
+programming language. Multiple directives are separated by ','.
+
+- <ompts:testcode> </ompts:testcode> This tag contains the whole source code
+for the test / crosstest.  Each test has to be written as a function. The
+syntax of the functions differs between C and Fortran: In C it has to take a
+file pointer 'FILE * logFile' and return an int. If the test passed
+successfully it has to return a value unequal to 0. The file pointer can be
+used to write information into a log file.  In Fortran the function takes no
+argument and the function name must not exceed XX characters.  The return
+value has to be specified using the '<testfunctionname>' tags. It has to be 1
+if the test succeeded and 0 if the test failed. For details see the example.
+
+To tell the test suite the name of your test function you have to enclose it
+in the '<ompts:testcode:functionname> </ompts:testcode:functionname>' tags.
+
+If there are differences between test and crosstest you can use the
+<ompts:check> </ompts:check> and <ompts:crosscheck> </ompts:crosscheck> tags.
+When generating the test the parser uses the code enclosed in <ompts:check>
+tags and cuts out the code written in <ompts:crosscheck> tags; for the
+crosstest it is the other way round. So you have two possibilities to write
+your template for test and crosstest: The first is to write the complete test
+in one <ompts:check> tag and the complete crosstest in one <ompts:crosscheck>
+tag.  The second is to write both tests at once, enclosing only the differing
+parts in the corresponding tags.
+
+The first method should be preferred if test and crosstest differ much from
+each other, the second e.g. if you only want to change a few options, like
+replacing an omp firstprivate clause by an omp private clause, or to cut out
+a single directive like omp flush.  When you use the first way you have to
+take care of the function name: you have to declare it twice with
+<ompts:testcode:functionname>!
+
+- <ompts:orphan> </ompts:orphan> This tag can be used if you want your test
+  to check the directive in orphaned regions, too.  The code enclosed in this
+part is moved into a separate function which is called in its place.  If you
+have variables which are used outside this region you have to declare them
+as global variables enclosed in an <ompts:orphan:vars> tag. For further
+information see the description of the <ompts:orphan:vars> tag.
+
+- <ompts:orphan:vars> </ompts:orphan:vars> This tag is used to specify global
+  variables for an orphan region which allow the exchange of values between
+the main program and the orphaned functions. The usage differs between C and
+Fortran.  In C you have to use a single declaration for each variable; you
+can declare all variables in one single region or in several different
+regions, but you must not initialize them inside.  In Fortran you have to put
+all declarations in one single tag. Because Fortran has no global variables
+as C does, you have to use common blocks. For further information see the
+examples.
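+
+As an illustration, a minimal (hypothetical) C template combining these tags
+could look as follows; the real templates in the c/ directory are the
+authoritative examples:
+
+    <ompts:test>
+    <ompts:testdescription>Counts up a shared variable inside a critical section.</ompts:testdescription>
+    <ompts:ompversion>2.0</ompts:ompversion>
+    <ompts:directive>omp critical</ompts:directive>
+    <ompts:testcode>
+    #include <stdio.h>
+    #include "omp_testsuite.h"
+
+    int <ompts:testcode:functionname>omp_example</ompts:testcode:functionname>(FILE * logFile){
+        <ompts:orphan:vars>
+        int count;            /* exchanged between main body and orphaned code */
+        </ompts:orphan:vars>
+        count = 0;
+    #pragma omp parallel
+        {
+            <ompts:orphan>
+            int i;
+    #pragma omp for
+            for (i = 0; i < LOOPCOUNT; i++) {
+                <ompts:check>#pragma omp critical</ompts:check>
+                count++;
+            }
+            </ompts:orphan>
+        }
+        /* 1 = passed, 0 = failed; the crosstest (without the critical
+           directive) is expected to lose updates and thus fail */
+        return (count == LOOPCOUNT);
+    }
+    </ompts:testcode>
+    </ompts:test>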
+
+III.2.  Adding tests to the test lists
+--------------------------------------
+
+After you have created a new test you have to add it to a test list. Simply
+add the function name on a new line in the corresponding testlist file.
+
+
+
+------------------------------------------------------------------------------
+
+
+IV.   Known Issues and Workarounds
+==============================================================================
+
+SunOS has a problem with the -maxdepth option used by the 'make cleanall'
+command. This prevents the tests from being removed from the working
+directory and can cause problems with future tests. To remedy this, edit the
+Makefile line under the clean command:
+
+     -rm [cf]test*.[cf] [cf]crosstest*.[cf] [cf]ctest*.[cf] [cf]orphan*.[cf]
+
+Change to:
+
+     -rm [cf]test* [cf]crosstest* [cf]ctest* [cf]orphan*
+
+
+
+------------------------------------------------------------------------------
+
+V.   Contact and Support
+==============================================================================
+
+Contact: http://www.cs.uh.edu/~hpctools
diff --git a/final/testsuite/adding_xfails.py b/final/testsuite/adding_xfails.py
new file mode 100755
index 0000000..2ad48d9
--- /dev/null
+++ b/final/testsuite/adding_xfails.py
@@ -0,0 +1,32 @@
+
+import os
+import commands
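+
+# Insert an "; XFAIL: *" line after the first line of each generated .ll
+# test that is known to fail for the current architecture.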
+
+perl = "/usr/bin/perl"
+LLVM = "./LLVM-IR/"
+temp_filename = "temp"
+XFAIL_text = "; XFAIL: *\n"
+
+
+arch_file_list = dict()
+arch_file_list['lin_32e'] = ['test_omp_task_final.ll', 'test_omp_task_untied.ll']
+
+
+arch_script = "../runtime/tools/check-openmp-test.pl"
+arch_cmd = perl + " " + arch_script
+arch = commands.getoutput(arch_cmd)
+arch = arch[:len(arch)-1]
+
+print "Adding XFAILS ..."
+
+for f in arch_file_list[arch]:
+	filename = LLVM + arch + "/" + f
+	lines = open(filename).readlines()
+	lines.insert(1, XFAIL_text)
+	f2 = open(temp_filename, "w")
+	for l in lines:
+		f2.write(l)
+	f2.close()
+
+	os.system("mv " + temp_filename + " " + filename)
+
diff --git a/final/testsuite/bin/Makefile b/final/testsuite/bin/Makefile
new file mode 100644
index 0000000..1dbd3ba
--- /dev/null
+++ b/final/testsuite/bin/Makefile
@@ -0,0 +1,23 @@
+CC     = clang
+CFLAGS = -fopenmp -S -emit-llvm
+INCL =
+LIBS =
+
+BIN =
+SRC_DIR = ./c
+SRC_FILES := $(foreach DIR, $(SRC_DIR), $(wildcard $(SRC_DIR)/*.c))
+OBJS := $(patsubst %.c, %.o, $(SRC_FILES))
+TARGETLIST := $(patsubst %.c, %, $(SRC_FILES))
+
+all:$(TARGETLIST)
+	./distribute.sh
+	@ echo all done
+
+.PHONY: clean
+
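+# Each target <t> in TARGETLIST is built from <t>.c into <t>.ll.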
+$(TARGETLIST): $(SRC_FILES)
+	$(CC) $(CFLAGS) $(LIBS) $(INCL) $@.c -o $@.ll
+
+clean:
+	@ rm -rf c/*
+	@ echo clean bin done
diff --git a/final/testsuite/bin/distribute.sh b/final/testsuite/bin/distribute.sh
new file mode 100755
index 0000000..dbb0f5f
--- /dev/null
+++ b/final/testsuite/bin/distribute.sh
@@ -0,0 +1,125 @@
+#!/bin/bash
+
+# add header for .ll files
+
+# get tmp header
+cp header /tmp/tmp.header
+echo >> /tmp/tmp.header
+
+# create temporary test package
+mkdir c-$MACHTYPE$OSTYPE
+cp c/*.ll c-$MACHTYPE$OSTYPE/
+
+# add new header into .ll files
+for file in c-$MACHTYPE$OSTYPE/*
+do
+    cp $file /tmp/tmp.ll.bf
+    cat /tmp/tmp.header /tmp/tmp.ll.bf > /tmp/tmp.ll
+    mv /tmp/tmp.ll $file
+done
+
+
+# in bin/, target is ../LLVM-IR/ARCH/OS
+LEVEL=../LLVM-IR/
+ARCH_PATH=../LLVM-IR/
+OS_PATH=../LLVM-IR/
+
+# for Linux systems, add your arch and OS here
+declare -a ARCHes=(x86 x86_64 powerpc arm mips darwin)
+declare -a OSes=(linux macosx windows darwin)
+
+declare lowerARCH
+declare lowerOS
+
+# target directory name
+declare upperARCH
+declare upperOS
+
+lowerARCH=$(echo "$MACHTYPE" | tr '[:upper:]' '[:lower:]')
+lowerOS=$(echo "$OSTYPE" | tr '[:upper:]' '[:lower:]')
+
+# ARCH
+for i in ${ARCHes[@]}
+do
+    result=$(echo "${lowerARCH}" | grep $i)
+    if [[ "$result" != "" ]]
+    then
+        # upperARCH=$i
+		upperARCH=$(echo "$i" | tr '[:lower:]' '[:upper:]')
+    fi
+done
+
+if [[ "$upperARCH" == "" ]]
+then
+    echo "Not found ${lowerARCH} in the [${ARCHes[@]}]!"
+    exit
+fi
+
+# OS
+for i in ${OSes[@]}
+do
+    result=$(echo "${lowerOS}" | grep $i)
+    if [[ "$result" != "" ]]
+    then
+        # upperOS=$i
+		upperOS=$(echo "$i" | tr '[:lower:]' '[:upper:]')
+    fi
+done
+
+if [[ "$upperOS" == "" ]]
+then
+    echo "Not found ${lowerOS} in the [${OSes[@]}]!"
+    exit
+fi
+
+# survived, assemble the path
+# ARCH_PATH+=$upperARCH/
+# OS_PATH+=$upperARCH/$upperOS/
+ARCH_newFormat=.
+if [ "$upperARCH" = "X86" ]; then
+    ARCH_newFormat=32
+else
+    ARCH_newFormat=32e
+fi
+OS_newFormat=.
+if [ "$upperOS" = "LINUX" ]; then
+    OS_newFormat=lin
+elif [ "$upperOS" = "MACOSX" ]; then
+    OS_newFormat=mac
+elif [ "$upperOS" = "WINDOWS" ]; then
+    OS_newFormat=win
+elif [ "$upperOS" = "DARWIN" ]; then
+    OS_newFormat=dar
+else
+    OS_newFormat=unknown
+fi
+OS_PATH+=$OS_newFormat"_"$ARCH_newFormat
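+# e.g. x86_64 Linux yields lin_32e, the directory name recorded in lit.tmp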
+
+# test and create directory
+if [ ! -d "$LEVEL" ]; then
+    mkdir $LEVEL
+    mkdir $OS_PATH
+else
+    if [ ! -d "$OS_PATH" ]; then
+        mkdir $OS_PATH
+    fi
+fi
+
+# reserve the tmp path to LLVM-IR/ARCH/OS
+echo $OS_PATH"/" > lit.tmp
+
+# OS_ARCH=$OS_newFormat"_"$ARCH_newFormat
+# echo -e "if not '$OS_ARCH' in config.root.targets:" > $OS_PATH'/'lit.local.cfg
+# echo -e "\tconfig.unsupported = True" >> $OS_PATH'/'lit.local.cfg
+
+# copy the lit configuration files next to LLVM-IR/
+cp lit.* $LEVEL
+
+# omit the crosstests (ctest_*) and orphaned crosstests (orph_ctest_*)
+rm c-$MACHTYPE$OSTYPE/ctest_*.ll
+rm c-$MACHTYPE$OSTYPE/orph_ctest_*.ll
+cp c-$MACHTYPE$OSTYPE/*.ll $OS_PATH
+
+# clean
+rm /tmp/tmp.*
+rm -rf c-$MACHTYPE$OSTYPE/
diff --git a/final/testsuite/bin/header b/final/testsuite/bin/header
new file mode 100644
index 0000000..3741e62
--- /dev/null
+++ b/final/testsuite/bin/header
@@ -0,0 +1,6 @@
+; RUN: %clang -lomp -lm %s -o %t && %t 2>&1 | FileCheck %s
+; CHECK-NOT: Test failed
+; CHECK: Directive worked without errors
+
+
+
diff --git a/final/testsuite/bin/lit.cfg b/final/testsuite/bin/lit.cfg
new file mode 100644
index 0000000..1957819
--- /dev/null
+++ b/final/testsuite/bin/lit.cfg
@@ -0,0 +1,78 @@
+# -*- Python -*-
+
+# Configuration file for the 'lit' test runner.
+
+import os
+import sys
+import re
+import platform
+
+try:
+   import lit.util
+   import lit.formats
+except ImportError:
+   pass
+
+# name: The name of this test suite.
+config.name = 'OpenMPValidationSuite'
+
+# testFormat: The test format to use to interpret tests.
+config.test_format = lit.formats.ShTest(execute_external=False)
+
+# suffixes: A list of file extensions to treat as test files
+# Note this can be overridden by lit.local.cfg files
+config.suffixes = ['.ll']
+
+# test_source_root: The root path where tests are located.
+#config.test_source_root = "/home/ichoyjx/install/openmp/testsuite/bin"
+#os.path.dirname(__file__)
+
+# test_exec_root: The root path where tests should be run.
+#mpvs_obj_root = getattr(config, 'mpvs_obj_root', None)
+#if mpvs_obj_root is not None:
+config.test_exec_root = "./"
+#os.path.join(mpvs_obj_root, 'src')
+
+# Discover the 'clang' to use.
+
+def inferClang(PATH):
+    # Determine which clang to use.
+    clang = os.getenv('CLANG')
+
+    # If the user set clang in the environment, definitely use that and don't
+    # try to validate.
+    if clang:
+        return clang
+
+    # Otherwise look in the path.
+    clang = lit.util.which('clang', PATH)
+
+    if not clang:
+        lit_config.fatal("couldn't find 'clang' program, try setting "
+                         "CLANG in your environment")
+
+    return clang
+
+config.clang = inferClang(config.environment['PATH']).replace('\\', '/')
+config.substitutions.append( ('%clang', ' ' + config.clang + ' ') )
+
+# Propagate some environment variables to the test environment.
+def addEnv(name):
+    if name in os.environ:
+        config.environment[name] = os.environ[name]
+
+addEnv('HOME')
+addEnv('PWD')
+
+
+addEnv('C_INCLUDE_PATH')
+addEnv('CPLUS_INCLUDE_PATH')
+addEnv('LIBRARY_PATH')
+addEnv('LD_LIBRARY_PATH')
+addEnv('DYLD_LIBRARY_PATH')
+
+# Check that the object root is known.
+if config.test_exec_root is None:
+    lit_config.fatal('test execution root not set!')
diff --git a/final/testsuite/bin/lit.site.cfg.in b/final/testsuite/bin/lit.site.cfg.in
new file mode 100644
index 0000000..11866d4
--- /dev/null
+++ b/final/testsuite/bin/lit.site.cfg.in
@@ -0,0 +1,30 @@
+## Autogenerated by LLVM/Clang configuration.
+#config.mpvs_src_root = "/home/ichoyjx/install/openmp/testsuite/bin/c"
+config.mpvs_obj_root = "/tmp"
+config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
+
+
+# Needed to check if a hack needs to be applied
+config.llvm_version_major = "@LLVM_VERSION_MAJOR@"
+
+# Compilers
+# FIXME: use llvmcc not llvmgcc
+config.llvmclang = "clang"
+
+# Features
+config.enable_uclibc = True if @ENABLE_UCLIBC@ == 1 else False
+config.enable_posix_runtime = True if @ENABLE_POSIX_RUNTIME@ == 1 else False
+config.have_selinux = True if @HAVE_SELINUX@ == 1 else False
+
+# Current target
+config.target_triple = "@TARGET_TRIPLE@"
+
+# Let the main config do the real work.
+try:
+  lit
+except NameError:
+  # Use lit_config class
+  lit_config.load_config(config, "@MPVS_SOURCE_DIR@/lit.cfg")
+else:
+  # Use old lit class
+  lit.load_config(config, "@MPVS_SOURCE_DIR@/lit.cfg")
diff --git a/final/testsuite/bin/lit.tmp b/final/testsuite/bin/lit.tmp
new file mode 100644
index 0000000..81d30fb
--- /dev/null
+++ b/final/testsuite/bin/lit.tmp
@@ -0,0 +1 @@
+../LLVM-IR/lin_32e/
diff --git a/final/testsuite/c/has_openmp.c b/final/testsuite/c/has_openmp.c
new file mode 100644
index 0000000..983a973
--- /dev/null
+++ b/final/testsuite/c/has_openmp.c
@@ -0,0 +1,30 @@
+<ompts:test>
+<ompts:testdescription>Test which checks for OpenMP support.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>_OPENMP</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>has_openmp</ompts:testcode:functionname>(FILE * logFile){
+    <ompts:orphan:vars>
+    int rvalue = 0;
+    </ompts:orphan:vars>
+
+    <ompts:orphan>
+    <ompts:check>
+#ifdef _OPENMP
+	rvalue = 1;
+#endif
+    </ompts:check>
+    <ompts:crosscheck>
+#if 0
+	rvalue = 1;
+#endif
+    </ompts:crosscheck>
+      </ompts:orphan>
+	return (rvalue);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_atomic.c b/final/testsuite/c/omp_atomic.c
new file mode 100644
index 0000000..6c2d1aa
--- /dev/null
+++ b/final/testsuite/c/omp_atomic.c
@@ -0,0 +1,434 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp atomic directive by counting up a variable in a parallelized loop with an atomic directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp atomic</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include <unistd.h>
+#include <math.h>
+
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int <ompts:testcode:functionname>omp_atomic</ompts:testcode:functionname> (FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int sum;
+        int diff;
+        double dsum = 0;
+        double dt = 0.5;	/* base of geometric series for + and - test */
+        double ddiff;
+        int product;
+        int x;
+        int *logics;
+        int bit_and = 1;
+        int bit_or = 0;
+        int exclusiv_bit_or = 0;
+    </ompts:orphan:vars>
+
+#define DOUBLE_DIGITS 20	/* dt^DOUBLE_DIGITS */
+#define MAX_FACTOR 10
+#define KNOWN_PRODUCT 3628800	/* 10! */
+    int j;
+    int known_sum;
+    int known_diff;
+    int known_product;
+    int result = 0;
+    int logic_and = 1;
+    int logic_or = 0;
+    double dknown_sum;
+    double rounding_error = 1.E-9;
+    double dpt, div;
+    int logicsArray[LOOPCOUNT];
+    logics = logicsArray;
+    
+    sum = 0;
+    diff = 0;
+    product = 1;
+
+#pragma omp parallel
+    {
+	<ompts:orphan>
+	    int i;
+#pragma omp for
+	    for (i = 1; i <= LOOPCOUNT; i++)
+	    {
+		<ompts:check>#pragma omp atomic</ompts:check>
+		sum += i;
+	    }
+	</ompts:orphan>
+    }
+    known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+    if (known_sum != sum)
+    {
+        fprintf (logFile, 
+                 "Error in sum with integers: Result was %d instead of %d.\n", 
+                 sum, known_sum);
+        result++;
+    }
+    
+#pragma omp parallel
+    {
+        <ompts:orphan>   
+            int i;
+#pragma omp for
+            for (i = 0; i < LOOPCOUNT; i++)
+            {
+                 <ompts:check>#pragma omp atomic</ompts:check>
+                 diff -= i;
+            }
+        </ompts:orphan>
+    }
+    known_diff = ((LOOPCOUNT - 1) * LOOPCOUNT) / 2 * -1;
+    if (diff != known_diff)
+    {
+        fprintf (logFile,
+              "Error in difference with integers: Result was %d instead of 0.\n",
+               diff);
+        result++;
+    }
+
+    /* Tests for doubles */
+    dsum = 0;
+    dpt = 1;
+
+    for (j = 0; j < DOUBLE_DIGITS; ++j)
+      {
+        dpt *= dt;
+      }
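+    /* dknown_sum is the geometric series dt^0 + dt^1 + ... +
+       dt^(DOUBLE_DIGITS-1) = (1 - dt^DOUBLE_DIGITS) / (1 - dt) */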
+    dknown_sum = (1 - dpt) / (1 - dt);
+#pragma omp parallel
+    {
+        <ompts:orphan>
+            int i;
+#pragma omp for
+            for (i = 0; i < DOUBLE_DIGITS; ++i)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                dsum += pow (dt, i);
+            }
+        </ompts:orphan>
+    }
+
+    if (dsum != dknown_sum && (fabs (dsum - dknown_sum) > rounding_error))
+    {
+        fprintf (logFile,
+                 "Error in sum with doubles: Result was %f instead of: %f (Difference: %E)\n",
+                 dsum, dknown_sum, dsum - dknown_sum);
+        result++;
+    }
+
+    dpt = 1;
+
+    for (j = 0; j < DOUBLE_DIGITS; ++j)
+    {
+        dpt *= dt;
+    }
+    ddiff = (1 - dpt) / (1 - dt);
+#pragma omp parallel
+   {
+         <ompts:orphan>
+            int i;
+#pragma omp for
+            for (i = 0; i < DOUBLE_DIGITS; ++i)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                ddiff -= pow (dt, i);
+            }
+         </ompts:orphan>
+    } 
+    if (fabs (ddiff) > rounding_error)
+    {
+        fprintf (logFile,
+                 "Error in difference with doubles: Result was %E instead of 0.0\n",
+                 ddiff);
+        result++;
+    }
+
+#pragma omp parallel
+    {
+         <ompts:orphan>
+            int i;
+#pragma omp for
+            for (i = 1; i <= MAX_FACTOR; i++)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                product *= i;
+            }
+         </ompts:orphan>
+    }
+    
+    known_product = KNOWN_PRODUCT;
+    if (known_product != product)
+    {
+        fprintf (logFile,
+                 "Error in product with integers: Result was %d instead of %d\n",
+                 product, known_product);
+        result++;
+    }
+
+    product = KNOWN_PRODUCT;
+#pragma omp parallel
+    {
+        <ompts:orphan>
+           int i;
+#pragma omp for
+            for (i = 1; i <= MAX_FACTOR; ++i)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                product /= i;
+            }
+         </ompts:orphan>
+    }
+
+    if (product != 1)
+    {
+        fprintf (logFile,
+                 "Error in product division with integers: Result was %d instead of 1\n",
+                 product);
+        result++;
+    }
+    
+    div = 5.0E+5;
+#pragma omp parallel
+    {
+            int i;
+#pragma omp for
+            for (i = 1; i <= MAX_FACTOR; i++)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                div /= i;
+            }
+    }
+
+    if (fabs(div-0.137787) >= 1.0E-4 )
+    {
+        result++;
+        fprintf (logFile,
+                 "Error in division with double: Result was %f instead of 0.137787\n", div);
+    }
+
+    x = 0;
+
+#pragma omp parallel
+    {
+        <ompts:orphan>
+            int i;
+#pragma omp for
+            for (i = 0; i < LOOPCOUNT; ++i)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                x++;
+            }
+         </ompts:orphan>
+    }
+
+    if (x != LOOPCOUNT)
+    {
+        result++;
+        fprintf (logFile, "Error in ++\n");
+    }
+
+#pragma omp parallel
+    {
+        <ompts:orphan>
+            int i;
+#pragma omp for
+            for (i = 0; i < LOOPCOUNT; ++i)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                x--;
+            }
+        </ompts:orphan>
+    }
+
+    if (x != 0)
+    {
+        result++;
+        fprintf (logFile, "Error in --\n");
+    }
+
+    for (j = 0; j < LOOPCOUNT; ++j)
+    {
+        logics[j] = 1;
+    }
+    bit_and = 1;
+
+#pragma omp parallel
+    {
+        <ompts:orphan>
+           int i;
+#pragma omp for
+            for (i = 0; i < LOOPCOUNT; ++i)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                bit_and &= logics[i];
+            }
+         </ompts:orphan>
+    }
+
+    if (!bit_and)
+    {
+        result++;
+        fprintf (logFile, "Error in BIT AND part 1\n");
+    }
+
+    bit_and = 1;
+    logics[LOOPCOUNT / 2] = 0;
+
+#pragma omp parallel
+    {
+        <ompts:orphan>
+            int i;
+#pragma omp for
+            for (i = 0; i < LOOPCOUNT; ++i)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                bit_and &= logics[i];
+            }
+        </ompts:orphan>
+    }
+
+    if (bit_and)
+    {
+        result++;
+        fprintf (logFile, "Error in BIT AND part 2\n");
+    }
+
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        logics[j] = 0;
+    }
+    bit_or = 0;
+
+#pragma omp parallel
+    {
+        <ompts:orphan>
+            int i;
+#pragma omp for
+            for (i = 0; i < LOOPCOUNT; ++i)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                bit_or |= logics[i];
+            }
+        </ompts:orphan>
+    }
+
+    if (bit_or)
+    {
+        result++;
+        fprintf (logFile, "Error in BIT OR part 1\n");
+    }
+    bit_or = 0;
+    logics[LOOPCOUNT / 2] = 1;
+
+#pragma omp parallel
+    {
+        <ompts:orphan>
+            int i;
+#pragma omp for
+            for (i = 0; i < LOOPCOUNT; ++i)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                bit_or |= logics[i];
+            }
+        </ompts:orphan>
+    }
+                                                                                   
+    if (!bit_or)
+    {
+        result++;
+        fprintf (logFile, "Error in BIT OR part 2\n");
+    }
+
+    for (j = 0; j < LOOPCOUNT; j++)
+    {
+        logics[j] = 0;
+    }
+    exclusiv_bit_or = 0;
+
+#pragma omp parallel
+    {
+        <ompts:orphan> 
+            int i;
+#pragma omp for
+            for (i = 0; i < LOOPCOUNT; ++i)
+            {
+                 <ompts:check>#pragma omp atomic</ompts:check>
+                 exclusiv_bit_or ^= logics[i];
+            }
+        </ompts:orphan>
+    }
+                                                                                   
+    if (exclusiv_bit_or) 
+    {
+        result++;
+        fprintf (logFile, "Error in EXCLUSIV BIT OR part 1\n");
+    }
+
+    exclusiv_bit_or = 0;
+    logics[LOOPCOUNT / 2] = 1;
+    
+#pragma omp parallel
+    {
+        <ompts:orphan> 
+            int i;
+#pragma omp for
+            for (i = 0; i < LOOPCOUNT; ++i)
+            {
+                 <ompts:check>#pragma omp atomic</ompts:check>
+                 exclusiv_bit_or ^= logics[i];
+            }
+        </ompts:orphan>
+    }
+                                                                                   
+    if (!exclusiv_bit_or) 
+    {
+        result++;
+        fprintf (logFile, "Error in EXCLUSIV BIT OR part 2\n");
+    }
+
+    x = 1;
+#pragma omp parallel
+    {
+        <ompts:orphan>
+            int i;
+#pragma omp for
+            for (i = 0; i < 10; ++i)
+            {
+                 <ompts:check>#pragma omp atomic</ompts:check>
+                 x <<= 1;
+            }
+        </ompts:orphan>
+    }
+
+    if ( x != 1024)
+    {
+        result++;
+        fprintf (logFile, "Error in <<\n");
+        x = 1024;
+    }
+
+#pragma omp parallel
+    {
+        <ompts:orphan>
+            int i;
+#pragma omp for
+            for (i = 0; i < 10; ++i)
+            {
+                <ompts:check>#pragma omp atomic</ompts:check>
+                x >>= 1;
+            }
+        </ompts:orphan>
+    }
+
+    if (x != 1)
+    {
+        result++;
+        fprintf (logFile, "Error in >>\n");
+    }
+
+    return (result == 0);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_barrier.c b/final/testsuite/c/omp_barrier.c
new file mode 100644
index 0000000..df669e2
--- /dev/null
+++ b/final/testsuite/c/omp_barrier.c
@@ -0,0 +1,41 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp barrier directive. The test creates several threads; one of them sleeps before setting a flag, and after the barrier the others do a little work that depends on the flag.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp barrier</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int <ompts:testcode:functionname>omp_barrier</ompts:testcode:functionname> (FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int result1;
+	int result2;
+    </ompts:orphan:vars>
+
+    result1 = 0;
+    result2 = 0;
+
+#pragma omp parallel
+    {
+    <ompts:orphan>
+	int rank;
+	rank = omp_get_thread_num ();
+	if (rank ==1) {
+        my_sleep(SLEEPTIME_LONG);
+        result2 = 3;
+	}
+<ompts:check>#pragma omp barrier</ompts:check>
+	if (rank == 2) {
+	    result1 = result2;
+	}
+    </ompts:orphan>
+    }
+    printf("result1=%d\n",result1);
+    return (result1 == 3);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_critical.c b/final/testsuite/c/omp_critical.c
new file mode 100644
index 0000000..cb3f264
--- /dev/null
+++ b/final/testsuite/c/omp_critical.c
@@ -0,0 +1,42 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp critical directive by counting up a variable in a parallelized loop within a critical section.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp critical</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int <ompts:testcode:functionname>omp_critical</ompts:testcode:functionname> (FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int sum;
+    </ompts:orphan:vars>
+    sum=0;
+    int known_sum;
+	  
+    <ompts:orphan>
+    #pragma omp parallel
+    {
+      int mysum=0;
+      int i;
+      
+      #pragma omp for
+	    for (i = 0; i < 1000; i++)
+	      mysum = mysum + i;
+
+    <ompts:check>#pragma omp critical</ompts:check>
+	    sum = mysum +sum;
+        
+    }	/* end of parallel */
+    </ompts:orphan>
+    
+    printf("sum=%d\n",sum);
+    known_sum = 999 * 1000 / 2;
+    return (known_sum == sum);
+
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_flush.c b/final/testsuite/c/omp_flush.c
new file mode 100644
index 0000000..3fabf8e
--- /dev/null
+++ b/final/testsuite/c/omp_flush.c
@@ -0,0 +1,50 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp flush directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp flush</ompts:directive>
+<ompts:dependences>omp barrier</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int <ompts:testcode:functionname>omp_flush</ompts:testcode:functionname> (FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int result1;
+	int result2;
+	int dummy;
+    </ompts:orphan:vars>
+
+	result1 = 0;
+	result2 = 0;
+
+#pragma omp parallel
+    {
+	int rank;
+	rank = omp_get_thread_num ();
+
+#pragma omp barrier
+	if (rank == 1) {
+	    result2 = 3;
+	    <ompts:orphan>
+		<ompts:check>#pragma omp flush (result2)</ompts:check>
+		dummy = result2;
+	    </ompts:orphan>
+	}
+
+	if (rank == 0) {
+	    <ompts:check>my_sleep(SLEEPTIME_LONG);</ompts:check>
+	    <ompts:orphan>
+		<ompts:check>#pragma omp flush (result2)</ompts:check>
+		result1 = result2;
+	    </ompts:orphan>
+	}
+    }	/* end of parallel */
+
+    return ((result1 == result2) && (result2 == dummy) && (result2 == 3));
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_for_collapse.c b/final/testsuite/c/omp_for_collapse.c
new file mode 100644
index 0000000..7216f40
--- /dev/null
+++ b/final/testsuite/c/omp_for_collapse.c
@@ -0,0 +1,56 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp for collapse clause, bound to two nested loops. Without the collapse clause, the first loop will not be ordered.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp for collapse</ompts:directive>
+<ompts:dependences>omp critical,omp for schedule</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+
+#include "omp_testsuite.h"
+
+
+/* Utility function to check that i is increasing monotonically 
+   with each call */
+static int check_i_islarger (int i)
+{
+    static int last_i;
+    int islarger;
+    if (i==1)
+      last_i=0;
+    islarger = ((i >= last_i)&&(i - last_i<=1));
+    last_i = i;
+    return (islarger);
+}
+
+int <ompts:testcode:functionname>omp_for_collapse</ompts:testcode:functionname> (FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int is_larger = 1;
+    </ompts:orphan:vars>
+
+    #pragma omp parallel
+    {
+	<ompts:orphan>
+	  int i,j;
+	  int my_islarger = 1;
+      #pragma omp for private(i,j) schedule(static,1) <ompts:check>collapse(2)</ompts:check> ordered
+	    for (i = 1; i < 100; i++){
+          <ompts:crosscheck>my_islarger = check_i_islarger(i)&& my_islarger;</ompts:crosscheck>
+          for (j =1; j <100; j++)
+          {
+            <ompts:check>
+		    #pragma omp ordered
+		      my_islarger = check_i_islarger(i)&&my_islarger;
+            </ompts:check>
+	      }	/* end of for */
+        }
+      #pragma omp critical
+		is_larger = is_larger && my_islarger;
+	</ompts:orphan>
+    }
+
+    return (is_larger);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_for_firstprivate.c b/final/testsuite/c/omp_for_firstprivate.c
new file mode 100644
index 0000000..2272014
--- /dev/null
+++ b/final/testsuite/c/omp_for_firstprivate.c
@@ -0,0 +1,55 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp for firstprivate clause by counting up a variable in a parallelized loop. Each thread has a firstprivate variable (1) and a variable (2) declared by for firstprivate. First it stores the result of its last iteration in variable (2). Then it stores the value of the variable (2) in its firstprivate variable (1). At the end all firstprivate variables (1) are added to a total sum in a critical section and compared with the correct result.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp for firstprivate</ompts:directive>
+<ompts:dependences>omp critical,omp parallel firstprivate</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+
+#include "omp_testsuite.h"
+
+int sum1;
+#pragma omp threadprivate(sum1)
+
+int <ompts:testcode:functionname>omp_for_firstprivate</ompts:testcode:functionname> (FILE * logFile)
+{
+    int sum;
+    <ompts:orphan:vars>
+	int sum0;
+    </ompts:orphan:vars>
+
+    int known_sum;
+    int threadsnum;
+
+    sum = 0;
+    sum0 = 12345;
+    sum1 = 0;
+
+#pragma omp parallel
+    {
+#pragma omp single
+        {
+            threadsnum=omp_get_num_threads();
+        }
+	/* sum0 = 0; */
+	<ompts:orphan>
+	int i;
+#pragma omp for <ompts:check>firstprivate(sum0)</ompts:check><ompts:crosscheck>private(sum0)</ompts:crosscheck>
+	for (i = 1; i <= LOOPCOUNT; i++)
+	{
+	    sum0 = sum0 + i;
+	    sum1 = sum0;
+	}	/* end of for */
+	</ompts:orphan>
+#pragma omp critical
+	{
+	    sum = sum + sum1;
+	}	/* end of critical */
+    }	/* end of parallel */    
+
+    known_sum = 12345* threadsnum+ (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+    return (known_sum == sum);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_for_lastprivate.c b/final/testsuite/c/omp_for_lastprivate.c
new file mode 100644
index 0000000..c2080a2
--- /dev/null
+++ b/final/testsuite/c/omp_for_lastprivate.c
@@ -0,0 +1,52 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp for lastprivate clause by counting up a variable in a parallelized loop. Each thread saves the next summand in a lastprivate variable i0. At the end i0 is compared to the value of the expected last summand.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp for lastprivate</ompts:directive>
+<ompts:dependences>omp critical,omp parallel firstprivate,omp schedule</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+
+#include "omp_testsuite.h"
+
+int sum0;
+#pragma omp threadprivate(sum0)
+
+int <ompts:testcode:functionname>omp_for_lastprivate</ompts:testcode:functionname> (FILE * logFile)
+{
+	int sum = 0;
+	int known_sum;
+	<ompts:orphan:vars>
+	    int i0;
+	</ompts:orphan:vars>
+
+	i0 = -1;
+
+#pragma omp parallel
+	{
+	    sum0 = 0;
+	    {	/* Begin of orphaned block */
+	    <ompts:orphan>
+		int i;
+#pragma omp for schedule(static,7) <ompts:check>lastprivate(i0)</ompts:check>
+		for (i = 1; i <= LOOPCOUNT; i++)
+		{
+		    sum0 = sum0 + i;
+		    i0 = i;
+		}	/* end of for */
+	    </ompts:orphan>
+	    }	/* end of orphaned block */
+
+#pragma omp critical
+	    {
+		sum = sum + sum0;
+	    }	/* end of critical */
+	}	/* end of parallel */    
+
+	known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+	fprintf(logFile," known_sum = %d , sum = %d \n",known_sum,sum);
+	fprintf(logFile," LOOPCOUNT = %d , i0 = %d \n",LOOPCOUNT,i0);
+	return ((known_sum == sum) && (i0 == LOOPCOUNT) );
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_for_nowait.c b/final/testsuite/c/omp_for_nowait.c
new file mode 100644
index 0000000..a3ec40b
--- /dev/null
+++ b/final/testsuite/c/omp_for_nowait.c
@@ -0,0 +1,57 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp for nowait directive. It fills an array with values and operates on these in the following.</ompts:testdescription>
+<ompts:ompversion>1.0</ompts:ompversion>
+<ompts:directive>omp for nowait</ompts:directive>
+<ompts:dependences>omp parallel for, omp flush</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int <ompts:testcode:functionname>omp_for_nowait</ompts:testcode:functionname> (FILE * logFile)
+{
+	<ompts:orphan:vars>
+		int result;
+		int count;
+	</ompts:orphan:vars>
+	int j;
+	int myarray[LOOPCOUNT];
+
+	result = 0;
+	count = 0;
+
+#pragma omp parallel 
+	{
+	<ompts:orphan>
+		int rank;
+		int i;
+
+		rank = omp_get_thread_num();
+
+#pragma omp for <ompts:check>nowait</ompts:check> 
+		for (i = 0; i < LOOPCOUNT; i++) {
+			if (i == 0) {
+				fprintf (logFile, "Thread nr %d entering for loop and going to sleep.\n", rank);
+				my_sleep(SLEEPTIME);
+				count = 1;
+#pragma omp flush(count)
+				fprintf (logFile, "Thread nr %d woke up and set count = 1.\n", rank);
+			}
+		}
+		
+		fprintf (logFile, "Thread nr %d exited first for loop and enters the second.\n", rank);
+#pragma omp for
+		for (i = 0; i < LOOPCOUNT; i++) 
+		{
+#pragma omp flush(count)
+			if (count == 0)
+				result = 1;
+		}
+	</ompts:orphan>
+	}
+	
+	return result;
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_for_ordered.c b/final/testsuite/c/omp_for_ordered.c
new file mode 100644
index 0000000..6297d04
--- /dev/null
+++ b/final/testsuite/c/omp_for_ordered.c
@@ -0,0 +1,60 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp ordered directive by counting up a variable in a parallelized loop and checking in each iteration whether the summand is larger than the previous one.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp for ordered</ompts:directive>
+<ompts:dependences>omp critical,omp for schedule</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+
+#include "omp_testsuite.h"
+
+static int last_i = 0;
+
+/* Utility function to check that i is increasing monotonically 
+   with each call */
+static int check_i_islarger (int i)
+{
+    int islarger;
+    islarger = (i > last_i);
+    last_i = i;
+    return (islarger);
+}
+
+int <ompts:testcode:functionname>omp_for_ordered</ompts:testcode:functionname> (FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int sum;
+	int is_larger = 1;
+    </ompts:orphan:vars>
+    int known_sum;
+
+    last_i = 0;
+    sum = 0;
+
+#pragma omp parallel
+    {
+	<ompts:orphan>
+	    int i;
+	    int my_islarger = 1;
+#pragma omp for schedule(static,1) ordered
+	    for (i = 1; i < 100; i++)
+	    {
+		<ompts:check>#pragma omp ordered</ompts:check>
+		{
+		    my_islarger = check_i_islarger(i) && my_islarger;
+		    sum = sum + i;
+		}	/* end of ordered */
+	    }	/* end of for */
+#pragma omp critical
+	    {
+		is_larger = is_larger && my_islarger;
+	    }	/* end of critical */
+	</ompts:orphan>
+    }
+
+    known_sum=(99 * 100) / 2;
+    return ((known_sum == sum) && is_larger);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_for_private.c b/final/testsuite/c/omp_for_private.c
new file mode 100644
index 0000000..8a65110
--- /dev/null
+++ b/final/testsuite/c/omp_for_private.c
@@ -0,0 +1,64 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp for private clause by counting up a variable in a parallelized loop. Each thread has a private variable (1) and a variable (2) declared by for private. First it stores the result of its last iteration in variable (2). Then this thread waits some time before it stores the value of the variable (2) in its private variable (1). At the beginning of the next iteration the value of (1) is assigned to (2). At the end all private variables (1) are added to a total sum in a critical section and compared with the correct result.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp for private</ompts:directive>
+<ompts:dependences>omp parallel,omp flush,omp critical,omp threadprivate</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/* Utility function do spend some time in a loop */
+static void do_some_work (){
+    int i;
+    double sum = 0;
+    for(i = 0; i < 1000; i++){
+	sum += sqrt ((double) i);
+    }
+}
+
+int sum1;
+#pragma omp threadprivate(sum1)
+
+int <ompts:testcode:functionname>omp_for_private</ompts:testcode:functionname> (FILE * logFile)
+{
+    int sum = 0;
+    <ompts:orphan:vars>
+	int sum0;
+    </ompts:orphan:vars>
+
+    int known_sum;
+
+    sum0 = 0;	/* setting (global) sum0 = 0 */
+
+#pragma omp parallel
+    {
+	sum1 = 0;	/* setting sum1 in each thread to 0 */
+
+	{	/* begin of orphaned block */
+	<ompts:orphan>
+	    int i;
+#pragma omp for <ompts:check>private(sum0)</ompts:check> schedule(static,1)
+	    for (i = 1; i <= LOOPCOUNT; i++)
+	    {
+		sum0 = sum1;
+#pragma omp flush
+		sum0 = sum0 + i;
+		do_some_work ();
+#pragma omp flush
+		sum1 = sum0;
+	    }	/* end of for */
+	</ompts:orphan>
+	}	/* end of orphaned block */
+
+#pragma omp critical
+	{
+	    sum = sum + sum1;
+	}	/*end of critical*/
+    }	/* end of parallel*/    
+
+    known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+    return (known_sum == sum);
+}                                
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_for_reduction.c b/final/testsuite/c/omp_for_reduction.c
new file mode 100644
index 0000000..6dd1917
--- /dev/null
+++ b/final/testsuite/c/omp_for_reduction.c
@@ -0,0 +1,429 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp for reduction directive with all its options.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp for reduction</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "omp_testsuite.h"
+
+
+int <ompts:testcode:functionname>omp_for_reduction</ompts:testcode:functionname> (FILE * logFile)
+{
+    <ompts:orphan:vars>
+	double dt;
+	int sum;
+	int diff;
+	int product = 1;
+	double dsum;
+	double dknown_sum;
+	double ddiff;
+	int logic_and;
+	int logic_or;
+	int bit_and;
+	int bit_or;
+	int exclusiv_bit_or;
+	int *logics;
+    </ompts:orphan:vars>
+
+#define DOUBLE_DIGITS 20	/* dt^DOUBLE_DIGITS */
+#define MAX_FACTOR 10
+#define KNOWN_PRODUCT 3628800	/* 10! */
+
+    int i;
+    int known_sum;
+    int known_product;
+    double rounding_error = 1.E-9;	/* overall rounding error to be ignored in the double tests */
+    double dpt;
+    int result = 0;
+    int logicsArray[LOOPCOUNT];
+
+    /* Variables for integer tests */
+    sum = 0;
+    product = 1;
+    known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+    /* Variables for double tests */
+    dt = 1. / 3.;	/* base of geometric series for + and - test */
+    dsum = 0.;
+    /* Variables for logic tests */
+    logics = logicsArray;
+    logic_and = 1;
+    logic_or = 0;
+    /* Variables for bit operator tests */
+    bit_and = 1;
+    bit_or = 0;
+    /* Variables for exclusive bit or */
+    exclusiv_bit_or = 0;
+
+
+/****************************************************************************/
+/** Tests for integers                                                     **/
+/****************************************************************************/
+
+
+/**** Testing integer addition ****/
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(+:sum)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	    for (j = 1; j <= LOOPCOUNT; j++)
+	    {
+		sum = sum + j;
+	    }
+	</ompts:orphan>
+    }
+
+    if (known_sum != sum) {
+	result++;
+	fprintf (logFile, "Error in sum with integers: Result was %d instead of %d.\n", sum, known_sum); 
+    }
+
+
+/**** Testing integer subtraction ****/
+
+    diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(-:diff)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	    for (j = 1; j <= LOOPCOUNT; j++)
+	    {
+		diff = diff - j;
+	    }
+	</ompts:orphan>
+    }
+
+    if (diff != 0) {
+	result++;
+	fprintf (logFile, "Error in difference with integers: Result was %d instead of 0.\n", diff);
+    }
+
+
+/**** Testing integer multiplication ****/
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(*:product)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	    for (j = 1; j <= MAX_FACTOR; j++)
+	    {
+		product *= j;
+	    }
+	</ompts:orphan>
+    }
+
+    known_product = KNOWN_PRODUCT;
+    if(known_product != product)
+    {
+	result++;
+	fprintf (logFile,"Error in Product with integers: Result was %d instead of %d\n",product,known_product);
+    }
+
+
+/****************************************************************************/
+/** Tests for doubles                                                      **/
+/****************************************************************************/
+
+
+/**** Testing double addition ****/
+
+    dsum = 0.;
+    dpt = 1.;
+
+    for (i = 0; i < DOUBLE_DIGITS; ++i)
+    {
+	dpt *= dt;
+    }
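+    /* Closed form of the geometric series:
+     * sum_{i=0}^{n-1} dt^i = (1 - dt^n) / (1 - dt), with dpt = dt^n. */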
+    dknown_sum = (1 - dpt) / (1 - dt);
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(+:dsum)</ompts:check>
+	    for (j = 0; j < DOUBLE_DIGITS; j++)
+	    {	
+		dsum += pow (dt, j);
+	    }
+	</ompts:orphan>
+    }
+
+    if (fabs (dsum - dknown_sum) > rounding_error) {
+	result++; 
+	fprintf (logFile, "\nError in sum with doubles: Result was %f instead of: %f (Difference: %E)\n", dsum, dknown_sum, dsum-dknown_sum);
+    }
+
+#if 0
+    dpt = 1.;
+    for (i = 0; i < DOUBLE_DIGITS; ++i)
+    {
+	dpt *= dt;
+    }
+#endif
+
+
+/**** Testing double subtraction ****/
+
+    ddiff = (1 - dpt) / (1 - dt);
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(-:ddiff)</ompts:check>
+	    for (j = 0; j < DOUBLE_DIGITS; ++j)
+	    {
+		ddiff -= pow (dt, j);
+	    }
+	</ompts:orphan>
+    }
+
+    if (fabs (ddiff) > rounding_error) {
+	result++;
+	fprintf (logFile, "Error in Difference with doubles: Result was %E instead of 0.0\n", ddiff);
+    }
+
+
+/****************************************************************************/
+/** Tests for logical values                                               **/
+/****************************************************************************/
+
+
+/**** Testing logic and ****/
+
+    for (i = 0; i < LOOPCOUNT; i++)
+    {
+	logics[i] = 1;
+    }
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(&&:logic_and)</ompts:check>
+	    for (j = 0; j < LOOPCOUNT; ++j)
+	    {
+		logic_and = (logic_and && logics[j]);
+	    }
+	</ompts:orphan>
+    }
+
+    if(!logic_and) {
+	result++;
+	fprintf (logFile, "Error in logic AND part 1\n");
+    }
+
+    logic_and = 1;
+    logics[LOOPCOUNT / 2] = 0;
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(&&:logic_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	    for (j = 0; j < LOOPCOUNT; ++j)
+	    {
+		logic_and = logic_and && logics[j];
+	    }
+	</ompts:orphan>
+    }
+
+    if(logic_and) {
+	result++;
+	fprintf (logFile, "Error in logic AND part 2\n");
+    }
+
+
+/**** Testing logic or ****/
+
+    for (i = 0; i < LOOPCOUNT; i++)
+    {
+	logics[i] = 0;
+    }
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(||:logic_or)  </ompts:check>
+	    for (j = 0; j < LOOPCOUNT; ++j)
+	    {
+		logic_or = logic_or || logics[j];
+	    }
+	</ompts:orphan>
+    }
+
+    if (logic_or) {
+	result++;
+	fprintf (logFile, "Error in logic OR part 1\n");
+    }
+
+    logic_or = 0;
+    logics[LOOPCOUNT / 2] = 1;
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(||:logic_or)</ompts:check>
+	    for (j = 0; j < LOOPCOUNT; ++j)
+	    {
+		logic_or = logic_or || logics[j];
+	    }
+	</ompts:orphan>
+    }
+
+    if(!logic_or) {
+	result++;
+	fprintf (logFile, "Error in logic OR part 2\n");
+    }
+
+
+/****************************************************************************/
+/** Tests for bit values                                                   **/
+/****************************************************************************/
+
+
+/**** Testing bit and ****/
+
+    for (i = 0; i < LOOPCOUNT; ++i)
+    {
+	logics[i] = 1;
+    }
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(&:bit_and)  </ompts:check>
+	    for (j = 0; j < LOOPCOUNT; ++j)
+	    {
+		bit_and = (bit_and & logics[j]);
+	    }
+	</ompts:orphan>
+    }
+
+    if (!bit_and) {
+	result++;
+	fprintf (logFile, "Error in BIT AND part 1\n");
+    }
+
+    bit_and = 1;
+    logics[LOOPCOUNT / 2] = 0;
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(&:bit_and)</ompts:check>
+	    for (j = 0; j < LOOPCOUNT; ++j)
+	    {
+		bit_and = bit_and & logics[j];
+	    }
+	</ompts:orphan>
+    }
+    if (bit_and) {
+	result++;
+	fprintf (logFile, "Error in BIT AND part 2\n");
+    }
+
+
+/**** Testing bit or ****/
+
+    for (i = 0; i < LOOPCOUNT; i++)
+    {
+	logics[i] = 0;
+    }
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(|:bit_or)</ompts:check>
+	    for (j = 0; j < LOOPCOUNT; ++j)
+	    {
+		bit_or = bit_or | logics[j];
+	    }
+	</ompts:orphan>
+    }
+
+    if (bit_or) {
+	result++;
+	fprintf (logFile, "Error in BIT OR part 1\n");
+    }
+
+    bit_or = 0;
+    logics[LOOPCOUNT / 2] = 1;
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(|:bit_or)</ompts:check>
+	    for (j = 0; j < LOOPCOUNT; ++j)
+	    {
+		bit_or = bit_or | logics[j];
+	    }
+	</ompts:orphan>
+    }
+    if (!bit_or) {
+	result++;
+	fprintf (logFile, "Error in BIT OR part 2\n");
+    }
+
+
+/**** Testing exclusive bit or ****/
+
+    for (i = 0; i < LOOPCOUNT; i++)
+    {
+	logics[i] = 0;
+    }
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(^:exclusiv_bit_or)</ompts:check>
+	    for (j = 0; j < LOOPCOUNT; ++j)
+	    {
+		exclusiv_bit_or = exclusiv_bit_or ^ logics[j];
+	    }
+	</ompts:orphan>
+    }
+    if (exclusiv_bit_or) {
+	result++;
+	fprintf (logFile, "Error in EXCLUSIVE BIT OR part 1\n");
+    }
+
+    exclusiv_bit_or = 0;
+    logics[LOOPCOUNT / 2] = 1;
+
+#pragma omp parallel 
+    {
+	<ompts:orphan>
+	    int j;
+#pragma omp for schedule(dynamic,1) <ompts:check>reduction(^:exclusiv_bit_or)</ompts:check>
+	    for (j = 0; j < LOOPCOUNT; ++j)
+	    {
+		exclusiv_bit_or = exclusiv_bit_or ^ logics[j];
+	    }
+	</ompts:orphan>
+    }
+    if (!exclusiv_bit_or) {
+	result++;
+	fprintf (logFile, "Error in EXCLUSIVE BIT OR part 2\n");
+    }
+
+    /*fprintf (logFile, "\nResult: %d\n", result);*/
+    /* Note: logics points at the stack array logicsArray and must not be
+     * freed. */
+    return (result == 0);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_for_schedule_auto.c b/final/testsuite/c/omp_for_schedule_auto.c
new file mode 100644
index 0000000..e61a1c4
--- /dev/null
+++ b/final/testsuite/c/omp_for_schedule_auto.c
@@ -0,0 +1,55 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the auto option of the omp for schedule directive.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp for auto</ompts:directive>
+<ompts:dependences>omp critical,omp parallel firstprivate</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+
+#include "omp_testsuite.h"
+
+int sum1;
+#pragma omp threadprivate(sum1)
+
+int <ompts:testcode:functionname>omp_for_auto</ompts:testcode:functionname> (FILE * logFile)
+{
+    int sum;
+    <ompts:orphan:vars>
+	int sum0;
+    </ompts:orphan:vars>
+
+    int known_sum;
+    int threadsnum;
+
+    sum = 0;
+    sum0 = 12345;
+    sum1 = 0;
+
+#pragma omp parallel
+    {
+#pragma omp single
+        {
+            threadsnum=omp_get_num_threads();
+        }
+	/* sum0 = 0; */
+	<ompts:orphan>
+	int i;
+#pragma omp for <ompts:check>firstprivate(sum0) schedule(auto)</ompts:check><ompts:crosscheck>private(sum0)</ompts:crosscheck>
+	for (i = 1; i <= LOOPCOUNT; i++)
+	{
+	    sum0 = sum0 + i;
+	    sum1 = sum0;
+	}	/* end of for */
+	</ompts:orphan>
+#pragma omp critical
+	{
+	    sum = sum + sum1;
+	}	/* end of critical */
+    }	/* end of parallel */    
+
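+    /* Every thread contributes its firstprivate start value 12345 once,
+     * plus the loop iterations it executed. */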
+    known_sum = 12345 * threadsnum + (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+    return (known_sum == sum);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_for_schedule_dynamic.c b/final/testsuite/c/omp_for_schedule_dynamic.c
new file mode 100644
index 0000000..d2cda56
--- /dev/null
+++ b/final/testsuite/c/omp_for_schedule_dynamic.c
@@ -0,0 +1,111 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the dynamic option of the omp for schedule directive</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp for schedule(dynamic)</ompts:directive>
+<ompts:dependences>omp flush,omp for nowait,omp critical,omp single</ompts:dependences>
+<ompts:testcode>
+
+/*
+* Test for dynamic scheduling with chunk size
+* Method: calculate how many times the iteration space is dispatched
+*         and check that each dispatch has the requested chunk size,
+*         unless it is the last one.
+* It is possible for two adjacent chunks to be assigned to the same thread.
+* Modified by Chunhua Liao
+*/
+#include <stdio.h>
+#include <omp.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+#define CFDMAX_SIZE 100
+const int chunk_size = 7;
+
+int <ompts:testcode:functionname>omp_for_schedule_dynamic</ompts:testcode:functionname> (FILE * logFile)
+{
+  int tid;
+<ompts:orphan:vars>  
+  int *tids;
+  int i;
+</ompts:orphan:vars>
+
+  int tidsArray[CFDMAX_SIZE];
+  int count = 0;
+  int tmp_count = 0; /*dispatch times*/
+  int *tmp;  /*store chunk size for each dispatch*/
+  int result = 0;
+  
+  tids = tidsArray;
+
+#pragma omp parallel private(tid) shared(tids)
+  {				/* begin of parallel */
+     <ompts:orphan>
+      int tid;
+
+    tid = omp_get_thread_num ();
+#pragma omp for <ompts:check>schedule(dynamic,chunk_size)</ompts:check>
+    for (i = 0; i < CFDMAX_SIZE; i++)
+      {
+	tids[i] = tid;
+      }
+     </ompts:orphan>
+  }				/* end of parallel */
+
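+  /* Count how often the owning thread changes along the iteration space;
+   * each change marks a boundary between two dispatches. */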
+  for (i = 0; i < CFDMAX_SIZE - 1; ++i)
+    {
+      if (tids[i] != tids[i + 1])
+	{
+	  count++;
+	}
+    }
+
+  tmp = (int *) malloc (sizeof (int) * (count + 1));
+  tmp[0] = 1;
+
+  for (i = 0; i < CFDMAX_SIZE - 1; ++i)
+    {
+      if (tmp_count > count)
+	{
+	  printf ("--------------------\nTest internal error: list too small!\n--------------------\n");	/* Error handling */
+	  break;
+	}
+      if (tids[i] != tids[i + 1])
+	{
+	  tmp_count++;
+	  tmp[tmp_count] = 1;
+	}
+      else
+	{
+	  tmp[tmp_count]++;
+	}
+    }
+/*
+printf("debug----\n");
+    for (i = 0; i < CFDMAX_SIZE; ++i)
+	printf("%d ",tids[i]);
+printf("debug----\n");
+*/
+/* Is dynamic scheduling working? */
+  for (i = 0; i < count; i++)
+    {
+      /* Two adjacent chunks may be assigned to the same thread, so an
+       * intermediate dispatch only has to be a multiple of the chunk
+       * size. */
+      if ((tmp[i] % chunk_size) != 0)
+	{
+	  result++;
+	  fprintf (logFile, "An intermediate dispatch has the wrong chunk size.\n");
+	  /*result += ((tmp[i] / chunk_size) - 1);*/
+	}
+    }
+  if ((tmp[count] % chunk_size) != (CFDMAX_SIZE % chunk_size))
+    {
+      result++;
+      fprintf (logFile, "The last dispatch has the wrong chunk size.\n");
+    }
+  /* for (i = 0; i < count + 1; ++i) printf ("%d\t:=\t%d\n", i + 1, tmp[i]); */
+  free (tmp);
+  return (result == 0);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_for_schedule_guided.c b/final/testsuite/c/omp_for_schedule_guided.c
new file mode 100644
index 0000000..83564a5
--- /dev/null
+++ b/final/testsuite/c/omp_for_schedule_guided.c
@@ -0,0 +1,225 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the guided option of the omp for schedule directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp for schedule(guided)</ompts:directive>
+<ompts:dependences>omp flush,omp for nowait,omp critical,omp single</ompts:dependences>
+<ompts:testcode>
+/* Test for guided scheduling
+ * Ensure that threads first get chunks in an interleaved fashion,
+ * then check that the chunk sizes decrease towards a stable value.
+ * Modified by Chunhua Liao
+ * For example, 100 iterations on 2 threads with chunk size 7
+ * (one line per dispatch, 0/1 is the thread id):
+ * 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  24
+ * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1              18
+ * 0 0 0 0 0 0 0 0 0 0 0 0 0 0                      14
+ * 1 1 1 1 1 1 1 1 1 1                              10
+ * 0 0 0 0 0 0 0 0                                   8
+ * 1 1 1 1 1 1 1                                     7
+ * 0 0 0 0 0 0 0                                     7
+ * 1 1 1 1 1 1 1                                     7
+ * 0 0 0 0 0                                         5
+*/
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+#define NUMBER_OF_THREADS 10
+#define CFSMAX_SIZE 1000
+#define MAX_TIME  0.005
+
+#ifdef SLEEPTIME
+#undef SLEEPTIME
+#endif
+#define SLEEPTIME 0.0001
+
+int <ompts:testcode:functionname>omp_for_schedule_guided</ompts:testcode:functionname> (FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int * tids;
+	int * chunksizes;
+	int notout;
+	int maxiter;
+    </ompts:orphan:vars>
+
+    int threads;
+    int i;
+    int result;
+
+    tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1));
+	maxiter = 0;
+    result = 1;
+    notout = 1;
+
+/* Testing if enough threads are available for this check. */
+#pragma omp parallel
+	{
+#pragma omp single
+	  {
+		threads = omp_get_num_threads ();
+	  } /* end of single */
+	} /* end of parallel */
+
+	if (threads < 2) {
+	  printf ("This test only works with at least two threads.\n");
+	  fprintf (logFile, "This test only works with at least two threads. Available were only %d thread(s).\n", threads);
+	  free (tids);
+	  return (0);
+	} /* end if */
+
+
+    /* Now the real parallel work:
+     * each thread will start immediately with the first chunk. */
+#pragma omp parallel shared(tids,maxiter)
+    {	/* begin of parallel */
+      <ompts:orphan>
+      double count;
+      int tid;
+      int j;
+
+      tid = omp_get_thread_num ();
+
+#pragma omp for nowait <ompts:check>schedule(guided)</ompts:check>
+      for(j = 0; j < CFSMAX_SIZE; ++j)
+      {
+	count = 0.;
+#pragma omp flush(maxiter)
+	if (j > maxiter)
+	{
+#pragma omp critical
+	  {
+	    maxiter = j;
+	  }	/* end of critical */ 
+	}
+	/*printf ("thread %d sleeping\n", tid);*/
+#pragma omp flush(maxiter,notout)	
+	while (notout && (count < MAX_TIME) && (maxiter == j))
+	{
+#pragma omp flush(maxiter,notout)
+	  my_sleep (SLEEPTIME);
+	  count += SLEEPTIME;
+#ifdef VERBOSE
+	  printf(".");
+#endif
+	}
+#ifdef VERBOSE
+	if (count > 0.) printf(" waited %lf s\n", count);
+#endif
+	/*printf ("thread %d awake\n", tid);*/
+	tids[j] = tid;
+#ifdef VERBOSE
+	printf("%d finished by %d\n",j,tid);
+#endif
+      }	/* end of for */
+
+      notout = 0;
+#pragma omp flush(maxiter,notout)
+      </ompts:orphan>
+    }	/* end of parallel */
+
+/*******************************************************
+ * evaluation of the values                            *
+ *******************************************************/
+	{
+	  int determined_chunksize = 1;
+	  int last_threadnr = tids[0];
+	  int global_chunknr = 0;
+	  int local_chunknr[NUMBER_OF_THREADS];
+	  int openwork = CFSMAX_SIZE;
+	  int expected_chunk_size;
+	  double c = 1;
+
+	  for (i = 0; i < NUMBER_OF_THREADS; i++)
+		local_chunknr[i] = 0;
+
+	  tids[CFSMAX_SIZE] = -1;
+
+      /*
+       * Determine the number of global chunks.
+       */
+	  /*fprintf(logFile,"# global_chunknr thread local_chunknr chunksize\n"); */
+	  for(i = 1; i <= CFSMAX_SIZE; ++i)
+	  {
+		if (last_threadnr==tids[i]) { 
+		  determined_chunksize++; 
+		}
+		else
+		{
+		  /* fprintf (logFile, "%d\t%d\t%d\t%d\n", global_chunknr,last_threadnr, local_chunknr[last_threadnr], m); */
+		  global_chunknr++;
+		  local_chunknr[last_threadnr]++;
+		  last_threadnr = tids[i];
+		  determined_chunksize = 1;
+		}
+	  }
+	  /* now allocate the memory for saving the sizes of the global chunks */
+	  chunksizes = (int*)malloc(global_chunknr * sizeof(int));
+
+      /*
+       * Evaluate the sizes of the global chunks.
+       */
+	  global_chunknr = 0;
+	  determined_chunksize = 1;
+	  last_threadnr = tids[0];	    
+	  for (i = 1; i <= CFSMAX_SIZE; ++i)
+	  {
+		/* If the thread number is the same as before, increase the
+		 * detected chunksize for this chunk; otherwise reset the
+		 * detected chunksize to one and save the number of the next
+		 * thread in last_threadnr.
+		 */
+		if (last_threadnr == tids[i]) { 
+		  determined_chunksize++; 
+		}
+		else {
+		  chunksizes[global_chunknr] = determined_chunksize;
+		  global_chunknr++;
+		  local_chunknr[last_threadnr]++;
+		  last_threadnr = tids[i];
+		  determined_chunksize = 1;
+		}
+	  }
+
+#ifdef VERBOSE
+	  fprintf (logFile, "found\texpected\tconstant\n");
+#endif
+
+	  /* identify the constant c for the exponential decrease of the chunksize */
+	  expected_chunk_size = openwork / threads;
+	  c = (double) chunksizes[0] / expected_chunk_size;
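+	  /* Guided scheduling hands out chunks of roughly
+	   * remaining_iterations / nthreads; c absorbs the implementation's
+	   * scaling factor so later chunks can be checked against the same
+	   * decay law. */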
+	  
+	  for (i = 0; i < global_chunknr; i++)
+	  {
+		/* calculate the new expected chunksize */
+		if (expected_chunk_size > 1) 
+		  expected_chunk_size = c * openwork / threads;
+		
+#ifdef VERBOSE
+		fprintf (logFile, "%8d\t%8d\t%lf\n", chunksizes[i], expected_chunk_size, c * chunksizes[i]/expected_chunk_size);
+#endif
+		
+		/* check if chunksize is inside the rounding errors */
+		if (abs (chunksizes[i] - expected_chunk_size) >= 2) {
+		  result = 0;
+#ifndef VERBOSE
+		  fprintf (logFile, "Chunksize differed from expected value: %d instead of %d\n", chunksizes[i], expected_chunk_size);
+		  return 0;
+#endif
+		} /* end if */
+
+#ifndef VERBOSE
+		if (expected_chunk_size - chunksizes[i] < 0 )
+		  fprintf (logFile, "Chunksize did not decrease: %d instead of %d\n", chunksizes[i],expected_chunk_size);
+#endif
+
+		/* calculating the remaining amount of work */
+		openwork -= chunksizes[i];
+	  }	
+	}
+    free (chunksizes);
+    free (tids);
+    return result;
+}
+</ompts:testcode>
+</ompts:test>
+
diff --git a/final/testsuite/c/omp_for_schedule_static.c b/final/testsuite/c/omp_for_schedule_static.c
new file mode 100644
index 0000000..d71d7d4
--- /dev/null
+++ b/final/testsuite/c/omp_for_schedule_static.c
@@ -0,0 +1,165 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the static option of the omp for schedule directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp for schedule(static)</ompts:directive>
+<ompts:dependences>omp for nowait,omp flush,omp critical,omp single</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+#define NUMBER_OF_THREADS 10
+#define CFSMAX_SIZE 1000
+#define MAX_TIME 0.01
+
+#ifdef SLEEPTIME
+#undef SLEEPTIME
+#endif
+#define SLEEPTIME 0.0005
+
+
+int <ompts:testcode:functionname>omp_for_schedule_static</ompts:testcode:functionname> (FILE * logFile)
+{
+  int threads;
+  int i,lasttid;
+  <ompts:orphan:vars>
+  int * tids;
+  int notout;
+  int maxiter;
+  int chunk_size;
+  </ompts:orphan:vars>
+  int counter = 0;
+  int tmp_count=1;
+  int lastthreadsstarttid = -1;
+  int result = 1;
+  chunk_size = 7;
+
+  tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1));
+  notout = 1;
+  maxiter = 0;
+
+#pragma omp parallel shared(tids,counter)
+  {	/* begin of parallel*/
+#pragma omp single
+    {
+      threads = omp_get_num_threads ();
+    }	/* end of single */
+  }	/* end of parallel */
+
+  if (threads < 2)
+  {
+    printf ("This test only works with at least two threads.\n");
+    fprintf (logFile,"This test only works with at least two threads.\n");
+    free (tids);
+    return 0;
+  }
+  else 
+  {
+    fprintf (logFile,"Using an internal count of %d\nUsing a specified chunksize of %d\n", CFSMAX_SIZE, chunk_size);
+    tids[CFSMAX_SIZE] = -1;	/* setting endflag */
+#pragma omp parallel shared(tids)
+    {	/* begin of parallel */
+      <ompts:orphan>
+	double count;
+      int tid;
+      int j;
+
+      tid = omp_get_thread_num ();
+
+#pragma omp for nowait <ompts:check>schedule(static,chunk_size)</ompts:check>
+      for(j = 0; j < CFSMAX_SIZE; ++j)
+      {
+	count = 0.;
+#pragma omp flush(maxiter)
+	if (j > maxiter)
+	{
+#pragma omp critical
+	  {
+	    maxiter = j;
+	  }	/* end of critical */ 
+	}
+	/*printf ("thread %d sleeping\n", tid);*/
+	while (notout && (count < MAX_TIME) && (maxiter == j))
+	{
+#pragma omp flush(maxiter,notout)
+	  my_sleep (SLEEPTIME);
+	  count += SLEEPTIME;
+#ifdef VERBOSE
+	  printf(".");
+#endif
+	}
+#ifdef VERBOSE
+	if (count > 0.) printf(" waited %lf s\n", count);
+#endif
+	/*printf ("thread %d awake\n", tid);*/
+	tids[j] = tid;
+#ifdef VERBOSE
+	printf("%d finished by %d\n",j,tid);
+#endif
+      }	/* end of for */
+
+      notout = 0;
+#pragma omp flush(maxiter,notout)
+      </ompts:orphan>
+    }	/* end of parallel */
+
+    /**** analysing the data in array tids ****/
+
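+    /* With schedule(static,chunk_size) chunks must be handed out
+     * round-robin: thread t executes chunks t, t+threads, t+2*threads, ...
+     * and every chunk except possibly the last one has exactly chunk_size
+     * iterations. */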
+    lasttid = tids[0];
+    tmp_count = 0; 
+
+    for (i = 0; i < CFSMAX_SIZE + 1; ++i)
+    {
+      /* If the work was done by the same thread, increase tmp_count by one. */
+      if (tids[i] == lasttid) {
+	tmp_count++;
+#ifdef VERBOSE
+	fprintf (logFile, "%d: %d \n", i, tids[i]);
+#endif
+	continue;
+      }
+
+      /* Check that the next thread has the right thread number. When
+       * thread number -1 is found, the end has been reached.
+       */
+      if (tids[i] == (lasttid + 1) % threads || tids[i] == -1) {
+	/* checking for the right chunk size */
+	if (tmp_count == chunk_size) {
+	  tmp_count = 1;
+	  lasttid = tids[i];
+#ifdef VERBOSE
+	  fprintf (logFile, "OK\n");
+#endif
+	}
+	/* If the chunk size was wrong, check if the end was reached */
+	else {
+	  if (tids[i] == -1) {
+	    if (i == CFSMAX_SIZE) {
+	      fprintf (logFile, "Last thread had chunk size %d\n", tmp_count);
+	      break;
+	    }
+	    else {
+	      fprintf (logFile, "ERROR: Last thread (thread with number -1) was found before the end.\n");
+	      result = 0;
+	    }
+	  }
+	  else {
+	    fprintf (logFile, "ERROR: chunk size was %d. (assigned was %d)\n", tmp_count, chunk_size);
+	    result = 0;
+	  }
+	}
+      }
+      else {
+	fprintf(logFile, "ERROR: Found thread with number %d (should be between 0 and %d).", tids[i], threads - 1);
+	result = 0;
+      }
+#ifdef VERBOSE
+      fprintf (logFile, "%d: %d \n", i, tids[i]);
+#endif
+    }
+  }
+
+  free (tids);
+  return result;
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_for_schedule_static_3.c b/final/testsuite/c/omp_for_schedule_static_3.c
new file mode 100644
index 0000000..928667f
--- /dev/null
+++ b/final/testsuite/c/omp_for_schedule_static_3.c
@@ -0,0 +1,212 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the static option of the omp for schedule directive, verifying that the chunk distribution across several loop regions is the same, as specified in the OpenMP standard version 3.0.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp for schedule(static)</ompts:directive>
+<ompts:dependences>omp for nowait,omp flush,omp critical,omp single</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+#define NUMBER_OF_THREADS 10
+#define CFSMAX_SIZE 1000
+#define MAX_TIME 0.01
+
+#ifdef SLEEPTIME
+#undef SLEEPTIME
+#endif
+#define SLEEPTIME 0.0005
+
+
+int <ompts:testcode:functionname>omp_for_schedule_static_3</ompts:testcode:functionname> (FILE * logFile)
+{
+  int threads;
+  int i,lasttid;
+  <ompts:orphan:vars>
+  int * tids;
+  int * tids2;
+  int notout;
+  int maxiter;
+  int chunk_size;
+  </ompts:orphan:vars>
+  int counter = 0;
+  int tmp_count=1;
+  int lastthreadsstarttid = -1;
+  int result = 1;
+  chunk_size = 7;
+
+  tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1));
+  notout = 1;
+  maxiter = 0;
+
+#pragma omp parallel shared(tids,counter)
+  {	/* begin of parallel*/
+#pragma omp single
+    {
+      threads = omp_get_num_threads ();
+    }	/* end of single */
+  }	/* end of parallel */
+
+  if (threads < 2)
+  {
+    printf ("This test only works with at least two threads.\n");
+    fprintf (logFile,"This test only works with at least two threads.\n");
+    free (tids);
+    return 0;
+  }
+  else 
+  {
+    fprintf (logFile,"Using an internal count of %d\nUsing a specified chunksize of %d\n", CFSMAX_SIZE, chunk_size);
+    tids[CFSMAX_SIZE] = -1;	/* setting endflag */
+#pragma omp parallel shared(tids)
+    {	/* begin of parallel */
+      <ompts:orphan>
+	double count;
+      int tid;
+      int j;
+
+      tid = omp_get_thread_num ();
+
+#pragma omp for nowait <ompts:check>schedule(static,chunk_size)</ompts:check>
+      for(j = 0; j < CFSMAX_SIZE; ++j)
+      {
+	count = 0.;
+#pragma omp flush(maxiter)
+	if (j > maxiter)
+	{
+#pragma omp critical
+	  {
+	    maxiter = j;
+	  }	/* end of critical */ 
+	}
+	/*printf ("thread %d sleeping\n", tid);*/
+	while (notout && (count < MAX_TIME) && (maxiter == j))
+	{
+#pragma omp flush(maxiter,notout)
+	  my_sleep (SLEEPTIME);
+	  count += SLEEPTIME;
+#ifdef VERBOSE
+	  printf(".");
+#endif
+	}
+#ifdef VERBOSE
+	if (count > 0.) printf(" waited %lf s\n", count);
+#endif
+	/*printf ("thread %d awake\n", tid);*/
+	tids[j] = tid;
+#ifdef VERBOSE
+	printf("%d finished by %d\n",j,tid);
+#endif
+      }	/* end of for */
+
+      notout = 0;
+#pragma omp flush(maxiter,notout)
+      </ompts:orphan>
+    }	/* end of parallel */
+
+    /**** analysing the data in array tids ****/
+
+    lasttid = tids[0];
+    tmp_count = 0; 
+
+    for (i = 0; i < CFSMAX_SIZE + 1; ++i)
+    {
+      /* If the work was done by the same thread, increase tmp_count by one. */
+      if (tids[i] == lasttid) {
+	tmp_count++;
+#ifdef VERBOSE
+	fprintf (logFile, "%d: %d \n", i, tids[i]);
+#endif
+	continue;
+      }
+
+      /* Check that the next thread has the right thread number. When
+       * thread number -1 is found, the end has been reached.
+       */
+      if (tids[i] == (lasttid + 1) % threads || tids[i] == -1) {
+	/* checking for the right chunk size */
+	if (tmp_count == chunk_size) {
+	  tmp_count = 1;
+	  lasttid = tids[i];
+#ifdef VERBOSE
+	  fprintf (logFile, "OK\n");
+#endif
+	}
+	/* If the chunk size was wrong, check if the end was reached */
+	else {
+	  if (tids[i] == -1) {
+	    if (i == CFSMAX_SIZE) {
+	      fprintf (logFile, "Last thread had chunk size %d\n", tmp_count);
+	      break;
+	    }
+	    else {
+	      fprintf (logFile, "ERROR: Last thread (thread with number -1) was found before the end.\n");
+	      result = 0;
+	    }
+	  }
+	  else {
+	    fprintf (logFile, "ERROR: chunk size was %d. (assigned was %d)\n", tmp_count, chunk_size);
+	    result = 0;
+	  }
+	}
+      }
+      else {
+	fprintf(logFile, "ERROR: Found thread with number %d (should be between 0 and %d).", tids[i], threads - 1);
+	result = 0;
+      }
+#ifdef VERBOSE
+      fprintf (logFile, "%d: %d \n", i, tids[i]);
+#endif
+    }
+  }
+
+  /* Now we check that several loop regions in one parallel region get the
+   * same logical assignment of chunks to threads. We use the nowait
+   * clause to increase the probability of provoking an error. */
+
+  /* First we release the old buffer and allocate some more memory. */
+  free (tids);
+  tids = (int *) malloc (sizeof (int) * LOOPCOUNT);
+  tids2 = (int *) malloc (sizeof (int) * LOOPCOUNT);
+
+#pragma omp parallel 
+  {
+      <ompts:orphan>
+      {
+          int n;
+#pragma omp for <ompts:check>schedule(static)</ompts:check> nowait
+          for (n = 0; n < LOOPCOUNT; n++)
+          {
+              if (LOOPCOUNT == n + 1 )
+                  my_sleep(SLEEPTIME);
+
+              tids[n] = omp_get_thread_num();
+          }
+      }
+      </ompts:orphan>
+      <ompts:orphan>
+      {
+          int m;
+#pragma omp for <ompts:check>schedule(static)</ompts:check> nowait
+          for (m = 1; m <= LOOPCOUNT; m++)
+          {
+              tids2[m-1] = omp_get_thread_num();
+          }
+      }
+      </ompts:orphan>
+  }
+
+  for (i = 0; i < LOOPCOUNT; i++)
+      if (tids[i] != tids2[i]) {
+          fprintf (logFile, "Chunk no. %d was assigned once to thread %d and later to thread %d.\n", i, tids[i],tids2[i]);
+          result = 0;
+      }
+
+  free (tids);
+  free (tids2);
+  return result;
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_get_num_threads.c b/final/testsuite/c/omp_get_num_threads.c
new file mode 100644
index 0000000..4b092ba
--- /dev/null
+++ b/final/testsuite/c/omp_get_num_threads.c
@@ -0,0 +1,39 @@
+<ompts:test>
+<ompts:testdescription>Test which checks that omp_get_num_threads returns the correct number of threads. Therefore it counts up a variable in a parallelized section and compares this value with the result of the omp_get_num_threads function.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_get_num_threads</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_get_num_threads</ompts:testcode:functionname> (FILE * logFile)
+{
+    /* checks that omp_get_num_threads is equal to the number of
+       threads */
+    <ompts:orphan:vars>
+	int nthreads_lib;
+    </ompts:orphan:vars>
+    int nthreads = 0;
+
+    nthreads_lib = -1;
+
+#pragma omp parallel 
+    {
+#pragma omp critical
+	{
+	    nthreads++;
+	}	/* end of critical */
+#pragma omp single
+	{ 
+<ompts:orphan>
+	    <ompts:check>nthreads_lib = omp_get_num_threads ();</ompts:check>
+</ompts:orphan>
+	}	/* end of single */
+    }	/* end of parallel */
+
+	fprintf (logFile, "Counted %d threads. get_num_threads returned %d.\n", nthreads, nthreads_lib);
+    return (nthreads == nthreads_lib);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_get_wtick.c b/final/testsuite/c/omp_get_wtick.c
new file mode 100644
index 0000000..e82b57f
--- /dev/null
+++ b/final/testsuite/c/omp_get_wtick.c
@@ -0,0 +1,24 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_get_wtick function.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_get_wtick</ompts:directive>
+<ompts:testcode>
+#include<stdio.h>
+
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_get_wtick</ompts:testcode:functionname>(FILE * logFile)
+{
+    <ompts:orphan:vars>
+	double tick;
+    </ompts:orphan:vars>
+    tick = -1.;
+
+    <ompts:orphan>
+	<ompts:check>tick = omp_get_wtick ();</ompts:check>
+    </ompts:orphan>
+    fprintf (logFile, "The timer tick resolution is %lf seconds.\n", tick);
+    return ((tick > 0.0) && (tick < 0.01));
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_get_wtime.c b/final/testsuite/c/omp_get_wtime.c
new file mode 100644
index 0000000..9f0226d
--- /dev/null
+++ b/final/testsuite/c/omp_get_wtime.c
@@ -0,0 +1,38 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_get_wtime function. It compares the requested duration of a sleep call with the elapsed time, measured as the difference between the timestamps taken before and after the call.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_get_wtime</ompts:directive>
+<ompts:testcode>
+#include<stdio.h>
+#include<stdlib.h>
+#include<unistd.h>
+
+
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int <ompts:testcode:functionname>omp_get_wtime</ompts:testcode:functionname>(FILE * logFile)
+{
+    <ompts:orphan:vars>
+	double start;
+	double end;
+    </ompts:orphan:vars>
+    double measured_time;
+    int wait_time = 1; 
+
+    start = 0;
+    end = 0;
+
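+    /* my_sleep blocks for wait_time seconds, so the two timestamps taken
+     * around it should differ by wait_time within a 1% tolerance. */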
+    <ompts:orphan>
+	<ompts:check>start = omp_get_wtime ();</ompts:check>
+    </ompts:orphan>
+	my_sleep (wait_time); 
+    <ompts:orphan>
+	<ompts:check>end = omp_get_wtime ();</ompts:check>
+    </ompts:orphan>
+	measured_time = end-start;
+    fprintf(logFile, "Work took %lf seconds.\n", measured_time);
+    return ((measured_time > 0.99 * wait_time) && (measured_time < 1.01 * wait_time)) ;
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_in_parallel.c b/final/testsuite/c/omp_in_parallel.c
new file mode 100644
index 0000000..09b6111
--- /dev/null
+++ b/final/testsuite/c/omp_in_parallel.c
@@ -0,0 +1,51 @@
+<ompts:test>
+<ompts:testdescription>Test which checks that omp_in_parallel returns false when called from a serial region and true when called within a parallel region.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_in_parallel</ompts:directive>
+<ompts:testcode>
+/*
+ * Checks that false is returned when called from serial region
+ * and true is returned when called within parallel region. 
+ */
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_in_parallel</ompts:testcode:functionname>(FILE * logFile){
+    <ompts:orphan:vars>
+	int serial;
+	int isparallel;
+    </ompts:orphan:vars>
+
+    serial = 1;
+    isparallel = 0;
+
+    <ompts:check>
+	<ompts:orphan>
+	    serial = omp_in_parallel ();
+	</ompts:orphan>
+
+#pragma omp parallel
+    {
+#pragma omp single
+	{
+	    <ompts:orphan>
+		isparallel = omp_in_parallel ();
+	    </ompts:orphan>
+	}
+    }
+    </ompts:check>
+
+    <ompts:crosscheck>
+#pragma omp parallel
+	{
+#pragma omp single
+	    {
+
+	    }
+	}
+    </ompts:crosscheck>
+
+	return (!(serial) && isparallel);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_lock.c b/final/testsuite/c/omp_lock.c
new file mode 100644
index 0000000..a529c07
--- /dev/null
+++ b/final/testsuite/c/omp_lock.c
@@ -0,0 +1,45 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_set_lock and the omp_unset_lock functions by counting the threads entering and exiting a single region with locks.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_lock</ompts:directive>
+<ompts:dependences>omp flush</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+omp_lock_t lck;
+    
+int <ompts:testcode:functionname>omp_lock</ompts:testcode:functionname>(FILE * logFile)
+{
+  int nr_threads_in_single = 0;
+  int result = 0;
+  int nr_iterations = 0;
+  int i;
+  omp_init_lock (&lck);
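+  /* With correct locking at most one thread can be between set and unset
+   * at any time, so nr_threads_in_single is 0 after every decrement and
+   * result stays 0. */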
+  
+#pragma omp parallel shared(lck)
+  {
+    #pragma omp for
+    for(i = 0; i < LOOPCOUNT; i++)
+      {
+	<ompts:orphan>
+	    <ompts:check>omp_set_lock (&lck);</ompts:check>
+	</ompts:orphan>
+#pragma omp flush
+	nr_threads_in_single++;
+#pragma omp flush           
+	nr_iterations++;
+	nr_threads_in_single--;
+	result = result + nr_threads_in_single;
+	<ompts:orphan>
+	    <ompts:check>omp_unset_lock(&lck);</ompts:check>
+	</ompts:orphan>
+      }
+  }
+  omp_destroy_lock (&lck);
+  
+  return ((result == 0) && (nr_iterations == LOOPCOUNT));
+  
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_master.c b/final/testsuite/c/omp_master.c
new file mode 100644
index 0000000..cdb2f1e
--- /dev/null
+++ b/final/testsuite/c/omp_master.c
@@ -0,0 +1,37 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp master directive by counting up a variable in an omp master section.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp master</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_master</ompts:testcode:functionname>(FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int nthreads;
+	int executing_thread;
+    </ompts:orphan:vars>
+
+    nthreads = 0;
+    executing_thread = -1;
+
+#pragma omp parallel
+    {
+	<ompts:orphan>
+	    <ompts:check>#pragma omp master </ompts:check>
+	    {
+#pragma omp critical
+		{
+		    nthreads++;
+		}
+		executing_thread = omp_get_thread_num ();
+
+	    } /* end of master*/
+	</ompts:orphan>
+    } /* end of parallel*/
+    return ((nthreads == 1) && (executing_thread == 0));
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_master_3.c b/final/testsuite/c/omp_master_3.c
new file mode 100644
index 0000000..8e98129
--- /dev/null
+++ b/final/testsuite/c/omp_master_3.c
@@ -0,0 +1,44 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp master directive by counting up a variable in an omp master section. It also checks that the master thread has the thread number 0, as specified in the OpenMP standard version 3.0.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp master</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_master_3</ompts:testcode:functionname>(FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int nthreads;
+	int executing_thread;
+        int tid_result = 0; /* counts up the number of wrong thread no. for
+                               the master thread. (Must be 0) */
+    </ompts:orphan:vars>
+
+    nthreads = 0;
+    executing_thread = -1;
+
+#pragma omp parallel
+    {
+	<ompts:orphan>
+	    <ompts:check>#pragma omp master </ompts:check>
+	    {
+                int tid = omp_get_thread_num();
+                if (tid != 0) {
+#pragma omp critical
+                    { tid_result++; }
+                }
+#pragma omp critical
+		{
+		    nthreads++;
+		}
+		executing_thread = omp_get_thread_num ();
+
+	    } /* end of master*/
+	</ompts:orphan>
+    } /* end of parallel*/
+    return ((nthreads == 1) && (executing_thread == 0) && (tid_result == 0));
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_nest_lock.c b/final/testsuite/c/omp_nest_lock.c
new file mode 100644
index 0000000..3ade389
--- /dev/null
+++ b/final/testsuite/c/omp_nest_lock.c
@@ -0,0 +1,45 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_set_nest_lock and the omp_unset_nest_lock function.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_nest_lock</ompts:directive>
+<ompts:dependences>omp flush</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+omp_nest_lock_t lck;
+
+int <ompts:testcode:functionname>omp_nest_lock</ompts:testcode:functionname>(FILE * logFile)
+{
+    int nr_threads_in_single = 0;
+    int result = 0;
+    int nr_iterations = 0;
+    int i;
+
+    omp_init_nest_lock (&lck);
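+    /* Same invariant as the plain lock test: the nestable lock must keep
+     * at most one thread between set and unset. */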
+
+#pragma omp parallel shared(lck)  
+    {
+#pragma omp for
+	for(i = 0; i < LOOPCOUNT; i++)
+	{
+	    <ompts:orphan>
+		<ompts:check>omp_set_nest_lock (&lck);</ompts:check>
+	    </ompts:orphan>
+#pragma omp flush
+		nr_threads_in_single++;
+#pragma omp flush           
+	    nr_iterations++;
+	    nr_threads_in_single--;
+	    result = result + nr_threads_in_single;
+	    <ompts:orphan>
+		<ompts:check>omp_unset_nest_lock (&lck);</ompts:check>
+	    </ompts:orphan>
+	}
+    }
+    omp_destroy_nest_lock (&lck);
+
+    return ((result == 0) && (nr_iterations == LOOPCOUNT));
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_nested.c b/final/testsuite/c/omp_nested.c
new file mode 100644
index 0000000..9657bc8
--- /dev/null
+++ b/final/testsuite/c/omp_nested.c
@@ -0,0 +1,42 @@
+<ompts:test>
+<ompts:testdescription>Test which checks support for nested parallelism using the omp_set_nested function.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_nested</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+/*
+ * Test if the compiler supports nested parallelism
+ * By Chunhua Liao, University of Houston
+ * Oct. 2005
+ */
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_nested</ompts:testcode:functionname>(FILE * logFile)
+{
+
+    <ompts:orphan:vars>
+        int counter = 0;
+    </ompts:orphan:vars>
+
+#ifdef _OPENMP
+    <ompts:check>omp_set_nested(1);</ompts:check>
+    <ompts:crosscheck>omp_set_nested(0);</ompts:crosscheck>
+#endif
+
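+/* Each outer thread increments counter once; every member of each inner
+ * team decrements it once. With nesting enabled the inner teams contain
+ * more than one thread in total, so counter ends up nonzero. */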
+#pragma omp parallel shared(counter)
+{
+<ompts:orphan>
+#pragma omp critical
+    counter ++;
+#pragma omp parallel
+    {
+#pragma omp critical
+        counter --;
+    }
+</ompts:orphan>
+}
+    return (counter != 0);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_copyin.c b/final/testsuite/c/omp_parallel_copyin.c
new file mode 100644
index 0000000..382bfa2
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_copyin.c
@@ -0,0 +1,47 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel copyin directive.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp parallel copyin</ompts:directive>
+<ompts:dependences>omp critical,omp threadprivate</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+static int sum1 = 789;
+#pragma omp threadprivate(sum1)
+
+int <ompts:testcode:functionname>omp_parallel_copyin</ompts:testcode:functionname>(FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int sum, num_threads;
+    </ompts:orphan:vars>
+    int known_sum;
+
+    sum = 0;
+    sum1 = 7;
+    num_threads = 0;
+
+#pragma omp parallel <ompts:check>copyin(sum1)</ompts:check>
+    {
+	/*printf("sum1=%d\n",sum1);*/
+	<ompts:orphan>
+	int i;
+#pragma omp for 
+	    for (i = 1; i < 1000; i++)
+	    {
+		sum1 = sum1 + i;
+	    } /*end of for*/
+#pragma omp critical
+	{
+	    sum = sum + sum1;
+            num_threads++;
+	} /*end of critical*/
+	</ompts:orphan>
+    } /* end of parallel*/    
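+    /* Each thread starts from the copied-in value 7, so 7 is added once
+     * per thread on top of sum(1..999). */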
+    known_sum = (999 * 1000) / 2 + 7 * num_threads;
+    return (known_sum == sum);
+
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_default.c b/final/testsuite/c/omp_parallel_default.c
new file mode 100644
index 0000000..6c87371
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_default.c
@@ -0,0 +1,44 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the default option of the parallel construct.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp parallel default</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_parallel_default</ompts:testcode:functionname> (FILE * logFile)
+{
+  <ompts:orphan:vars>
+  int i;
+  int sum;
+  int mysum;
+  </ompts:orphan:vars>
+  
+  int known_sum;
+  sum =0;
+  known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2 ;
+
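+  /* sum is shared via default(shared); each thread accumulates into its
+   * private mysum and merges it into sum inside the critical section. */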
+  <ompts:orphan>
+  #pragma omp parallel <ompts:check>default(shared)</ompts:check> private(i) private(mysum<ompts:crosscheck>,sum</ompts:crosscheck>)
+  {
+	mysum = 0;
+  #pragma omp for
+	for (i = 1; i <= LOOPCOUNT; i++)
+	{
+	  mysum = mysum + i;
+	} 
+#pragma omp critical
+	{
+	  sum = sum + mysum;
+	}   /* end of critical */
+  }   /* end of parallel */
+  </ompts:orphan>
+  if (known_sum != sum) {
+  	fprintf(logFile, "KNOWN_SUM = %d; SUM = %d\n", known_sum, sum);
+  }
+  return (known_sum == sum);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_firstprivate.c b/final/testsuite/c/omp_parallel_firstprivate.c
new file mode 100644
index 0000000..2415b4c
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_firstprivate.c
@@ -0,0 +1,48 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel firstprivate directive.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp parallel firstprivate</ompts:directive>
+<ompts:dependences>omp for omp critical</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_parallel_firstprivate</ompts:testcode:functionname>(FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int sum, num_threads,sum1;
+    </ompts:orphan:vars>
+    int known_sum;
+
+    sum = 0;
+    sum1=7;
+    num_threads = 0;
+
+
+#pragma omp parallel <ompts:check>firstprivate(sum1)</ompts:check><ompts:crosscheck>private(sum1)</ompts:crosscheck>
+    {
+
+	/*printf("sum1=%d\n",sum1);*/
+	<ompts:orphan>
+	int i;
+#pragma omp for 
+	    for (i = 1; i < 1000; i++)
+	    {
+		sum1 = sum1 + i;
+	    } /*end of for*/
+#pragma omp critical
+	{
+	    sum = sum + sum1;
+            num_threads++;
+	} /*end of critical*/
+	</ompts:orphan>
+    } /* end of parallel*/    
+    known_sum = (999 * 1000) / 2 + 7 * num_threads;
+    return (known_sum == sum);
+
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_for_firstprivate.c b/final/testsuite/c/omp_parallel_for_firstprivate.c
new file mode 100644
index 0000000..5cb4926
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_for_firstprivate.c
@@ -0,0 +1,36 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel for firstprivate directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel for firstprivate</ompts:directive>
+<ompts:dependences>omp parallel for reduction,omp parallel for private</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_parallel_for_firstprivate</ompts:testcode:functionname>(FILE * logFile)
+{
+    <ompts:orphan:vars>
+    int sum ;
+    int i2;
+    int i;
+    </ompts:orphan:vars>
+    
+    sum=0;
+    i2=3;
+    int known_sum;
+
+    #pragma omp parallel for reduction(+:sum) private(i) <ompts:check>firstprivate(i2)</ompts:check><ompts:crosscheck>private(i2)</ompts:crosscheck>
+    <ompts:orphan>
+    for (i = 1; i <= LOOPCOUNT; i++)
+    {
+	  sum = sum + (i + i2);
+    } /*end of for*/
+    </ompts:orphan>
+    
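+    /* i2 keeps its firstprivate initial value 3 in every iteration, so it
+     * is added LOOPCOUNT times on top of sum(1..LOOPCOUNT). */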
+    known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2 + i2 * LOOPCOUNT;
+    
+    return (known_sum == sum);
+
+} /* end of check_parallel_for_fistprivate */
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_for_if.c b/final/testsuite/c/omp_parallel_for_if.c
new file mode 100644
index 0000000..52886ba
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_for_if.c
@@ -0,0 +1,38 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel for if directive. Needs at least two threads.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel for if</ompts:directive>
+<ompts:dependences></ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_parallel_for_if</ompts:testcode:functionname>(FILE * logFile){
+    int known_sum;
+    <ompts:orphan:vars>
+    int num_threads;
+    int sum, sum2;
+    int i;
+    int control;
+    </ompts:orphan:vars>
+    control = 0;
+    num_threads=0;
+    sum = 0;
+    sum2 = 0;
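+    /* control stays 0, so the if clause is false and the loop must run
+     * serially on a single thread (num_threads must end up as 1). */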
+
+#pragma omp parallel for private(i) <ompts:check>if (control==1)</ompts:check>
+    <ompts:orphan>
+    for (i=0; i <= LOOPCOUNT; i++)
+    {
+        num_threads = omp_get_num_threads();
+	sum = sum + i;
+    } /*end of for*/
+
+    </ompts:orphan>
+    known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+    fprintf (logFile, "Number of threads determined by omp_get_num_threads: %d\n", num_threads);
+    return (known_sum == sum && num_threads == 1);
+} /* end of check_parallel_for_private */
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_for_lastprivate.c b/final/testsuite/c/omp_parallel_for_lastprivate.c
new file mode 100644
index 0000000..909deba
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_for_lastprivate.c
@@ -0,0 +1,34 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel for lastprivate directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel for lastprivate</ompts:directive>
+<ompts:dependences>omp parallel for reduction,omp parallel for private</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_parallel_for_lastprivate</ompts:testcode:functionname>(FILE * logFile){
+    <ompts:orphan:vars>
+    int sum;
+    int i;
+    int i0;
+    </ompts:orphan:vars>
+
+    sum =0;
+    i0 = -1;
+    int known_sum;
+
+#pragma omp parallel for reduction(+:sum) schedule(static,7) private(i) <ompts:check>lastprivate(i0)</ompts:check><ompts:crosscheck>private(i0)</ompts:crosscheck>
+    <ompts:orphan>
+    for (i = 1; i <= LOOPCOUNT; i++)
+    {
+	sum = sum + i;
+	i0 = i;
+    } /*end of for*/
+    /* end of parallel*/    
+    </ompts:orphan>
+    known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+    return ((known_sum == sum) && (i0 == LOOPCOUNT));
+} /* end of check_parallel_for_lastprivate */
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_for_ordered.c b/final/testsuite/c/omp_parallel_for_ordered.c
new file mode 100644
index 0000000..b1a4bc9
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_for_ordered.c
@@ -0,0 +1,62 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel for ordered directive</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel for ordered</ompts:directive>
+<ompts:dependences>omp parallel schedule(static)</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+static int last_i = 0;
+int i;
+
+#pragma omp threadprivate(i)
+
+/* Variable ii is used to avoid problems with a threadprivate variable used as a loop
+ * index. See test omp_threadprivate_for.
+ */
+static int ii;
+#pragma omp threadprivate(ii)
+
+/*! 
+  Utility function: returns true if the passed argument is larger than 
+  the argument of the last call of this function.
+  */
+static int check_i_islarger2 (int i){
+	int islarger;
+	islarger = (i > last_i);
+	last_i = i;
+	return (islarger);
+}
+
+int <ompts:testcode:functionname>omp_parallel_for_ordered</ompts:testcode:functionname>(FILE * logFile){
+	<ompts:orphan:vars>
+	int sum;
+	int is_larger;
+	</ompts:orphan:vars>
+
+	int known_sum;
+	int i;
+	
+	sum = 0;
+	is_larger = 1;
+	last_i = 0;
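+	/* The ordered clause forces the ordered region to run in iteration
+	 * order, so each call of check_i_islarger2 must see a larger ii. */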
+#pragma omp parallel for schedule(static,1) private(i) <ompts:check>ordered</ompts:check>
+	for (i = 1; i < 100; i++)
+	{
+		ii = i;
+	<ompts:orphan>
+<ompts:check>#pragma omp ordered</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+			is_larger = check_i_islarger2 (ii) && is_larger;
+			sum  = sum + ii;
+		}
+	</ompts:orphan>
+	}
+	known_sum = (99 * 100) / 2;
+	fprintf (logFile," known_sum = %d , sum = %d \n", known_sum, sum);
+	fprintf (logFile," is_larger = %d\n", is_larger);
+	return (known_sum == sum) && is_larger;
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_for_private.c b/final/testsuite/c/omp_parallel_for_private.c
new file mode 100644
index 0000000..40280cc
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_for_private.c
@@ -0,0 +1,48 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel for private directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel for private</ompts:directive>
+<ompts:dependences>omp parallel for reduction,omp flush</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/*! Utility function to spend some time in a loop */
+static void do_some_work (void){
+    int i;
+    double sum = 0;
+    for(i = 0; i < 1000; i++){
+	sum += sqrt ((double) i);
+    }
+}
+
+int <ompts:testcode:functionname>omp_parallel_for_private</ompts:testcode:functionname>(FILE * logFile){
+    <ompts:orphan:vars>
+    int sum;
+    int i;
+    int i2;
+    </ompts:orphan:vars>
+    
+    sum =0;
+    i2=0;
+    int known_sum;
+  
+#pragma omp parallel for reduction(+:sum) schedule(static,1) private(i) <ompts:check>private(i2)</ompts:check>
+    <ompts:orphan>
+    
+    for (i=1;i<=LOOPCOUNT;i++)
+    {
+	i2 = i;
+#pragma omp flush
+	do_some_work ();
+#pragma omp flush
+	sum = sum + i2;
+    } /*end of for*/
+    </ompts:orphan>
+
+    known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+    return (known_sum == sum);
+} /* end of check_parallel_for_private */
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_for_reduction.c b/final/testsuite/c/omp_parallel_for_reduction.c
new file mode 100644
index 0000000..6c991f7
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_for_reduction.c
@@ -0,0 +1,280 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel for reduction directive with all its options.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel for reduction</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+
+int <ompts:testcode:functionname>omp_parallel_for_reduction</ompts:testcode:functionname>(FILE * logFile){
+	<ompts:orphan:vars>
+    int sum;
+	int known_sum;
+	double dsum;
+	double dknown_sum;
+	double dt=0.5;				/* base of the geometric series for the + and - tests */
+	double rounding_error= 1.E-9;
+#define DOUBLE_DIGITS 20		/* dt^DOUBLE_DIGITS */
+	int diff;
+	double ddiff;
+	int product;
+	int known_product;
+#define MAX_FACTOR 10
+#define KNOWN_PRODUCT 3628800	/* 10! */
+	int logic_and;
+	int logic_or;
+	int bit_and;
+	int bit_or;
+	int exclusiv_bit_or;
+	int logics[LOOPCOUNT];
+	int i;
+	double dpt;
+	int result;
+    </ompts:orphan:vars>
+
+    sum =0;
+    dsum=0;
+	dt = 1./3.;
+    result = 0;
+    product = 1;
+	logic_and=1;
+	logic_or=0;
+	bit_and=1;
+	bit_or=0;
+	exclusiv_bit_or=0;
+
+	known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2;
+<ompts:orphan>
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(+:sum)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for (i=1;i<=LOOPCOUNT;i++)
+	{
+		sum=sum+i;
+	}
+
+	if(known_sum!=sum)
+	{
+		result++;
+		fprintf(logFile,"Error in sum with integers: Result was %d instead of %d\n",sum,known_sum); 
+	}
+
+	diff = (LOOPCOUNT*(LOOPCOUNT+1))/2;
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(-:diff)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for (i=1;i<=LOOPCOUNT;++i)
+	{
+		diff=diff-i;
+	}
+
+	if(diff != 0)
+	{
+		result++;
+		fprintf(logFile,"Error in difference with integers: Result was %d instead of 0.\n",diff);
+	}
+
+	/* Tests for doubles */
+	dsum=0;
+	dpt=1;
+	for (i=0;i<DOUBLE_DIGITS;++i)
+	{
+		dpt*=dt;
+	}
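+	/* Closed form of the geometric series:
+	 * sum_{i=0}^{n-1} dt^i = (1 - dt^n) / (1 - dt), with dpt = dt^n. */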
+	dknown_sum = (1-dpt)/(1-dt);
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(+:dsum)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for (i=0;i<DOUBLE_DIGITS;++i)
+	{
+		dsum += pow(dt,i);
+	}
+
+	if( fabs(dsum-dknown_sum) > rounding_error )
+	{
+		result++; 
+		fprintf(logFile,"Error in sum with doubles: Result was %f instead of %f (Difference: %E)\n",dsum,dknown_sum, dsum-dknown_sum);
+	}
+
+	dpt=1;
+
+	for (i=0;i<DOUBLE_DIGITS;++i)
+	{
+		dpt*=dt;
+	}
+	fprintf(logFile,"\n");
+	ddiff = (1-dpt)/(1-dt);
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(-:ddiff)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for (i=0;i<DOUBLE_DIGITS;++i)
+	{
+		ddiff -= pow(dt,i);
+	}
+	if( fabs(ddiff) > rounding_error)
+	{
+		result++;
+		fprintf(logFile,"Error in Difference with doubles: Result was %E instead of 0.0\n",ddiff);
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(*:product)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=1;i<=MAX_FACTOR;i++)
+	{
+		product *= i;
+	}
+
+	known_product = KNOWN_PRODUCT;
+	if(known_product != product)
+	{
+		result++;
+		fprintf(logFile,"Error in Product with integers: Result was %d instead of %d\n\n",product,known_product);
+	}
+
+	for(i=0;i<LOOPCOUNT;i++)
+	{
+		logics[i]=1;
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(&&:logic_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		logic_and = (logic_and && logics[i]);
+	}
+	if(!logic_and)
+	{
+		result++;
+		fprintf(logFile,"Error in logic AND part 1.\n");
+	}
+
+	logic_and = 1;
+	logics[LOOPCOUNT/2]=0;
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(&&:logic_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		logic_and = logic_and && logics[i];
+	}
+	if(logic_and)
+	{
+		result++;
+		fprintf(logFile,"Error in logic AND part 2.\n");
+	}
+
+	for(i=0;i<LOOPCOUNT;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(||:logic_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		logic_or = logic_or || logics[i];
+	}
+	if(logic_or)
+	{
+		result++;
+		fprintf(logFile,"Error in logic OR part 1.\n");
+	}
+	logic_or = 0;
+	logics[LOOPCOUNT/2]=1;
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(||:logic_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		logic_or = logic_or || logics[i];
+	}
+	if(!logic_or)
+	{
+		result++;
+		fprintf(logFile,"Error in logic OR part 2.\n");
+	}
+
+
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		logics[i]=1;
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(&:bit_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		bit_and = (bit_and & logics[i]);
+	}
+	if(!bit_and)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT AND part 1.\n");
+	}
+
+	bit_and = 1;
+	logics[LOOPCOUNT/2]=0;
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(&:bit_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		bit_and = bit_and & logics[i];
+	}
+	if(bit_and)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT AND part 2.\n");
+	}
+
+	for(i=0;i<LOOPCOUNT;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(|:bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		bit_or = bit_or | logics[i];
+	}
+	if(bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT OR part 1\n");
+	}
+	bit_or = 0;
+	logics[LOOPCOUNT/2]=1;
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(|:bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		bit_or = bit_or | logics[i];
+	}
+	if(!bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT OR part 2\n");
+	}
+
+	for(i=0;i<LOOPCOUNT;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(^:exclusiv_bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+	}
+	if(exclusiv_bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in EXCLUSIV BIT OR part 1\n");
+	}
+
+	exclusiv_bit_or = 0;
+	logics[LOOPCOUNT/2]=1;
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(^:exclusiv_bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+	}
+	if(!exclusiv_bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in EXCLUSIV BIT OR part 2\n");
+	}
+</ompts:orphan>
+	/*printf("\nResult:%d\n",result);*/
+	return (result==0);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_if.c b/final/testsuite/c/omp_parallel_if.c
new file mode 100644
index 0000000..d3f45b1
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_if.c
@@ -0,0 +1,40 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the if option of the parallel construct.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp parallel if</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_parallel_if</ompts:testcode:functionname> (FILE * logFile)
+{
+<ompts:orphan:vars>
+  int i;
+  int sum;
+  int known_sum;
+  int mysum;
+  int control=1;
+</ompts:orphan:vars>
+  sum =0;
+  known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2 ;
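+  /* control is 1, so the if clause evaluates to false and the parallel region
+   * is expected to execute with a team of a single thread */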
+#pragma omp parallel private(i) <ompts:check>if(control==0)</ompts:check>
+  {
+	<ompts:orphan>
+    mysum = 0;
+	for (i = 1; i <= LOOPCOUNT; i++)
+	{
+	  mysum = mysum + i;
+	} 
+#pragma omp critical
+	{
+	  sum = sum + mysum;
+	}   /* end of critical */
+  </ompts:orphan>
+  }   /* end of parallel */
+
+  return (known_sum == sum);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_num_threads.c b/final/testsuite/c/omp_parallel_num_threads.c
new file mode 100644
index 0000000..897fed2
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_num_threads.c
@@ -0,0 +1,46 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel num_threads directive by counting the threads in a parallel region which was started with an explicitly stated number of threads.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp parallel num_threads</ompts:directive>
+<ompts:dependences>omp master,omp parallel reduction,omp atomic</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_parallel_num_threads</ompts:testcode:functionname>(FILE * logFile){
+    <ompts:orphan:vars>
+	int failed;
+	int threads;
+	int nthreads;
+    </ompts:orphan:vars>
+
+    int max_threads = 0;
+
+    failed = 0;
+
+    /* first we check how many threads are available */
+#pragma omp parallel
+    {
+#pragma omp master
+	max_threads = omp_get_num_threads ();
+    }
+
+    /* we increase the number of threads from one to maximum:*/
+    for (threads = 1; threads <= max_threads; threads++)
+    {
+	nthreads = 0;
+
+	<ompts:orphan>
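+	/* failed counts threads that see a team size different from the requested one;
+	 * nthreads independently counts the team members via atomic increments */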
+#pragma omp parallel reduction(+:failed) <ompts:check>num_threads(threads)</ompts:check>
+	    {
+		failed = failed + !(threads == omp_get_num_threads ());
+#pragma omp atomic
+	    nthreads += 1;
+	    }
+	</ompts:orphan>
+	failed = failed + !(nthreads == threads);
+    }
+    return (!failed);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_private.c b/final/testsuite/c/omp_parallel_private.c
new file mode 100644
index 0000000..fdccc09
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_private.c
@@ -0,0 +1,50 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel private directive.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp parallel private</ompts:directive>
+<ompts:dependences>omp for omp critical</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+//static int sum1 = 789;
+
+int <ompts:testcode:functionname>omp_parallel_private</ompts:testcode:functionname>(FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int sum, num_threads,sum1;
+    </ompts:orphan:vars>
+    int known_sum;
+
+    sum = 0;
+    <ompts:crosscheck> sum1=0; </ompts:crosscheck>
+    num_threads = 0;
+
+
+#pragma omp parallel <ompts:check>private(sum1)</ompts:check>
+    {
+	<ompts:check>
+	sum1 = 7;
+	</ompts:check>
+	/*printf("sum1=%d\n",sum1);*/
+	<ompts:orphan>
+	int i;
+#pragma omp for 
+	    for (i = 1; i < 1000; i++)
+	    {
+		sum1 = sum1 + i;
+	    } /*end of for*/
+#pragma omp critical
+	{
+	    sum = sum + sum1;
+            num_threads++;
+	} /*end of critical*/
+	</ompts:orphan>
+    } /* end of parallel*/    
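+    /* each thread's private sum1 starts at 7, so 7 enters the total once per thread */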
+    known_sum = (999 * 1000) / 2 + 7 * num_threads;
+    return (known_sum == sum);
+
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_reduction.c b/final/testsuite/c/omp_parallel_reduction.c
new file mode 100644
index 0000000..96f8394
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_reduction.c
@@ -0,0 +1,278 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel reduction directive with all its options.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp parallel reduction</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+
+int <ompts:testcode:functionname>omp_parallel_reduction</ompts:testcode:functionname>(FILE * logFile){
+	<ompts:orphan:vars>
+    int sum;
+	int known_sum;
+	double dsum;
+	double dknown_sum;
+	double dt=0.5;				/* base of geometric row for + and - test*/
+	double rounding_error= 1.E-9;
+#define DOUBLE_DIGITS 20		/* dt^DOUBLE_DIGITS */
+	int diff;
+	double ddiff;
+	int product;
+	int known_product;
+#define MAX_FACTOR 10
+#define KNOWN_PRODUCT 3628800	/* 10! */
+	int logic_and;
+	int logic_or;
+	int bit_and;
+	int bit_or;
+	int exclusiv_bit_or;
+	int logics[LOOPCOUNT];
+	int i;
+	double dpt;
+	int result;
+</ompts:orphan:vars>
+    sum =0;
+    dsum=0;
+    product=1;
+	logic_and=1;
+	logic_or=0;
+	bit_and=1;
+	bit_or=0;
+	exclusiv_bit_or=0;
+    result=0;
+	dt = 1./3.;
+	known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2;
+<ompts:orphan>
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(+:sum)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for (i=1;i<=LOOPCOUNT;i++)
+	{
+		sum=sum+i;
+	}
+
+	if(known_sum!=sum)
+	{
+		result++;
+		fprintf(logFile,"Error in sum with integers: Result was %d instead of %d\n",sum,known_sum); 
+	}
+
+	diff = (LOOPCOUNT*(LOOPCOUNT+1))/2;
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(-:diff)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for (i=1;i<=LOOPCOUNT;++i)
+	{
+		diff=diff-i;
+	}
+
+	if(diff != 0)
+	{
+		result++;
+		fprintf(logFile,"Error in difference with integers: Result was %d instead of 0.\n",diff);
+	}
+
+	/* Tests for doubles */
+	dsum=0;
+	dpt=1;
+	for (i=0;i<DOUBLE_DIGITS;++i)
+	{
+		dpt*=dt;
+	}
+	dknown_sum = (1-dpt)/(1-dt);
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(+:dsum)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for (i=0;i<DOUBLE_DIGITS;++i)
+	{
+		dsum += pow(dt,i);
+	}
+
+	if( fabs(dsum-dknown_sum) > rounding_error )
+	{
+		result++; 
+		fprintf(logFile,"Error in sum with doubles: Result was %f instead of %f (Difference: %E)\n",dsum,dknown_sum, dsum-dknown_sum);
+	}
+
+	dpt=1;
+
+	for (i=0;i<DOUBLE_DIGITS;++i)
+	{
+		dpt*=dt;
+	}
+	fprintf(logFile,"\n");
+	ddiff = (1-dpt)/(1-dt);
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(-:ddiff)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for (i=0;i<DOUBLE_DIGITS;++i)
+	{
+		ddiff -= pow(dt,i);
+	}
+	if( fabs(ddiff) > rounding_error)
+	{
+		result++;
+		fprintf(logFile,"Error in Difference with doubles: Result was %E instead of 0.0\n",ddiff);
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(*:product)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=1;i<=MAX_FACTOR;i++)
+	{
+		product *= i;
+	}
+
+	known_product = KNOWN_PRODUCT;
+	if(known_product != product)
+	{
+		result++;
+		fprintf(logFile,"Error in Product with integers: Result was %d instead of %d\n\n",product,known_product);
+	}
+
+	for(i=0;i<LOOPCOUNT;i++)
+	{
+		logics[i]=1;
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(&&:logic_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		logic_and = (logic_and && logics[i]);
+	}
+	if(!logic_and)
+	{
+		result++;
+		fprintf(logFile,"Error in logic AND part 1.\n");
+	}
+
+	logic_and = 1;
+	logics[LOOPCOUNT/2]=0;
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(&&:logic_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		logic_and = logic_and && logics[i];
+	}
+	if(logic_and)
+	{
+		result++;
+		fprintf(logFile,"Error in logic AND part 2.\n");
+	}
+
+	for(i=0;i<LOOPCOUNT;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(||:logic_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		logic_or = logic_or || logics[i];
+	}
+	if(logic_or)
+	{
+		result++;
+		fprintf(logFile,"Error in logic OR part 1.\n");
+	}
+	logic_or = 0;
+	logics[LOOPCOUNT/2]=1;
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(||:logic_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		logic_or = logic_or || logics[i];
+	}
+	if(!logic_or)
+	{
+		result++;
+		fprintf(logFile,"Error in logic OR part 2.\n");
+	}
+
+
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		logics[i]=1;
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(&:bit_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		bit_and = (bit_and & logics[i]);
+	}
+	if(!bit_and)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT AND part 1.\n");
+	}
+
+	bit_and = 1;
+	logics[LOOPCOUNT/2]=0;
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(&:bit_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		bit_and = bit_and & logics[i];
+	}
+	if(bit_and)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT AND part 2.\n");
+	}
+
+	for(i=0;i<LOOPCOUNT;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(|:bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		bit_or = bit_or | logics[i];
+	}
+	if(bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT OR part 1\n");
+	}
+	bit_or = 0;
+	logics[LOOPCOUNT/2]=1;
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(|:bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		bit_or = bit_or | logics[i];
+	}
+	if(!bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT OR part 2\n");
+	}
+
+	for(i=0;i<LOOPCOUNT;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(^:exclusiv_bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+	}
+	if(exclusiv_bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in EXCLUSIV BIT OR part 1\n");
+	}
+
+	exclusiv_bit_or = 0;
+	logics[LOOPCOUNT/2]=1;
+
+#pragma omp parallel for schedule(dynamic,1) private(i) <ompts:check>reduction(^:exclusiv_bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	for(i=0;i<LOOPCOUNT;++i)
+	{
+		exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+	}
+	if(!exclusiv_bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in EXCLUSIV BIT OR part 2\n");
+	}
+    </ompts:orphan>
+	/*printf("\nResult:%d\n",result);*/
+	return (result==0);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_sections_firstprivate.c b/final/testsuite/c/omp_parallel_sections_firstprivate.c
new file mode 100644
index 0000000..2933839
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_sections_firstprivate.c
@@ -0,0 +1,49 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel sections firstprivate directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel sections firstprivate</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_parallel_sections_firstprivate</ompts:testcode:functionname>(FILE * logFile){
+  <ompts:orphan:vars>
+  int sum;
+  int sum0;
+  </ompts:orphan:vars>
+  int known_sum;
+  sum =7;
+  sum0=11;
+
+<ompts:orphan>
+#pragma omp parallel sections <ompts:check>firstprivate(sum0)</ompts:check><ompts:crosscheck>private(sum0)</ompts:crosscheck>
+  {
+#pragma omp section 
+    {
+#pragma omp critical
+      {
+	sum= sum+sum0;
+      }                         /*end of critical */
+    }    
+#pragma omp section
+    {
+#pragma omp critical
+      {
+	sum= sum+sum0;
+      }                         /*end of critical */
+    }
+#pragma omp section
+    {
+#pragma omp critical
+      {
+	sum= sum+sum0;
+      }                         /*end of critical */
+    }               
+    }      /*end of parallel sections*/
+</ompts:orphan>
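+/* each of the three sections adds its firstprivate copy of sum0, initialized to 11, to sum, which starts at 7 */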
+known_sum=11*3+7;
+return (known_sum==sum); 
+}                              /* end of check_section_firstprivate*/
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_sections_lastprivate.c b/final/testsuite/c/omp_parallel_sections_lastprivate.c
new file mode 100644
index 0000000..55ae5f0
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_sections_lastprivate.c
@@ -0,0 +1,70 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel sections lastprivate directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel sections lastprivate</ompts:directive>
+<ompts:dependences>omp critical,omp parallel sections private</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_parallel_sections_lastprivate</ompts:testcode:functionname>(FILE * logFile){
+  <ompts:orphan:vars>
+  int sum;
+  int sum0;
+  int i;
+  int i0;
+  </ompts:orphan:vars>
+  int known_sum;
+  sum =0;
+  sum0 = 0;
+  i0 = -1;
+  
+  <ompts:orphan>
+#pragma omp parallel sections private(i,sum0) <ompts:check>lastprivate(i0)</ompts:check><ompts:crosscheck>private(i0)</ompts:crosscheck>
+    {
+#pragma omp section  
+      {
+	sum0=0;
+	for (i=1;i<400;i++)
+	  {
+	    sum0=sum0+i;
+	    i0=i;
+	  }
+#pragma omp critical
+	{
+	  sum= sum+sum0;
+	}                         /*end of critical*/
+      }/* end of section */
+#pragma omp section 
+      {
+	sum0=0;
+	for(i=400;i<700;i++)
+	  {
+	    sum0=sum0+i;                       /*end of for*/
+	    i0=i;
+	  }
+#pragma omp critical
+	{
+	  sum= sum+sum0;
+	}                         /*end of critical*/
+      }
+#pragma omp section 
+      {
+	sum0=0;
+	for(i=700;i<1000;i++)
+	  {
+	    sum0=sum0+i;
+      i0=i;
+	  }
+#pragma omp critical
+	{
+	  sum= sum+sum0;
+	}                         /*end of critical*/
+      }
+    }/* end of parallel sections*/
+  </ompts:orphan> 
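+  /* lastprivate(i0) keeps the value assigned in the lexically last section, whose loop ends with i0 = 999 */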
+  known_sum=(999*1000)/2;
+  return ((known_sum==sum) && (i0==999) );
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_sections_private.c b/final/testsuite/c/omp_parallel_sections_private.c
new file mode 100644
index 0000000..c5fe414
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_sections_private.c
@@ -0,0 +1,65 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel sections private directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel sections private</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_parallel_sections_private</ompts:testcode:functionname>(FILE * logFile){
+  <ompts:orphan:vars>
+  int sum;
+  int sum0;
+  int i;
+  </ompts:orphan:vars>
+  int known_sum;
+  sum = 7;
+  sum0=0;
+
+<ompts:orphan>
+#pragma omp parallel sections private(<ompts:check>sum0,</ompts:check> i)
+  {
+#pragma omp section 
+    {
+      <ompts:check>
+      sum0=0;
+      </ompts:check>
+      for (i=1;i<400;i++)
+	sum0=sum0+i;
+#pragma omp critical
+      {
+	sum= sum+sum0;
+      }                         /*end of critical */
+    }    
+#pragma omp section
+    {
+      <ompts:check>
+      sum0=0;
+      </ompts:check>
+      for(i=400;i<700;i++)
+	sum0=sum0+i;
+#pragma omp critical
+      {
+	sum= sum+sum0;
+      }                         /*end of critical */
+    }
+#pragma omp section
+    {
+      <ompts:check>
+      sum0=0;
+      </ompts:check>
+      for(i=700;i<1000;i++)
+	sum0=sum0+i;
+#pragma omp critical
+      {
+	sum= sum+sum0;
+      }                         /*end of critical */
+    }               
+  }        /*end of parallel sections*/
+</ompts:orphan>
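+/* iterations 1 through 999 are summed across the three sections; sum itself starts at 7 */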
+known_sum=(999*1000)/2+7;
+return (known_sum==sum); 
+}                              /* end of check_section_private*/
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_sections_reduction.c b/final/testsuite/c/omp_parallel_sections_reduction.c
new file mode 100644
index 0000000..cadea6a
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_sections_reduction.c
@@ -0,0 +1,568 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel sections reduction directive with all its options.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel sections reduction</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+
+int <ompts:testcode:functionname>omp_parallel_sections_reduction</ompts:testcode:functionname>(FILE * logFile){
+	<ompts:orphan:vars>
+    int sum;
+	int known_sum;	
+	double dpt;
+    double dsum;
+	double dknown_sum;
+	double dt=0.5;				/* base of geometric row for + and - test*/
+	double rounding_error= 1.E-5;
+	int diff;
+	double ddiff;
+	int product;
+	int known_product;
+	int logic_and;
+	int bit_and;
+	int logic_or;
+	int bit_or;
+	int exclusiv_bit_or;
+	int logics[1000];
+	int i;
+	int result;
+    </ompts:orphan:vars>
+    
+    sum = 7;
+    dsum=0;
+    product =1;
+    dpt = 1;
+	logic_and=1;
+	bit_and=1;
+	logic_or=0;
+	bit_or=0;
+	exclusiv_bit_or=0;
+    result =0;
+	/*  int my_islarger;*/
+	/*int is_larger=1;*/
+	known_sum = (999*1000)/2+7;
+
+    <ompts:orphan>
+#pragma omp parallel sections private(i) <ompts:check>reduction(+:sum)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for (i=1;i<300;i++)
+			{
+				sum=sum+i;
+			}
+		}
+#pragma omp section
+		{
+			for (i=300;i<700;i++)
+			{
+				sum=sum+i;
+			}
+		}
+#pragma omp section
+		{
+			for (i=700;i<1000;i++)
+			{
+				sum=sum+i;
+			}
+		}
+	}
+
+	if(known_sum!=sum)
+	{
+		result++;
+		fprintf(logFile,"Error in sum with integers: Result was %d instead of %d.\n",sum, known_sum);
+	}
+
+	diff = (999*1000)/2;
+#pragma omp parallel sections private(i) <ompts:check>reduction(-:diff)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for (i=1;i<300;i++)
+			{
+				diff=diff-i;
+			}
+		}
+#pragma omp section
+		{
+			for (i=300;i<700;i++)
+			{
+				diff=diff-i;
+			}
+		}
+#pragma omp section
+		{
+			for (i=700;i<1000;i++)
+			{
+				diff=diff-i;
+			}
+		}
+	}
+
+
+	if(diff != 0)
+	{
+		result++;
+		fprintf(logFile,"Error in Difference with integers: Result was %d instead of 0.\n",diff);
+	}
+	for (i=0;i<20;++i)
+	{
+		dpt*=dt;
+	}
+	dknown_sum = (1-dpt)/(1-dt);
+#pragma omp parallel sections private(i) <ompts:check>reduction(+:dsum)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for (i=0;i<6;++i)
+			{
+				dsum += pow(dt,i);
+			}
+		}
+#pragma omp section
+		{
+			for (i=6;i<12;++i)
+			{
+				dsum += pow(dt,i);
+			}
+		}
+#pragma omp section
+		{
+			for (i=12;i<20;++i)
+			{
+				dsum += pow(dt,i);
+			}
+		}
+	}
+
+
+	if( fabs(dsum-dknown_sum) > rounding_error )
+	{
+		result++; 
+		fprintf(logFile,"Error in sum with doubles: Result was %f instead of %f (Difference: %E)\n",dsum,dknown_sum, dsum-dknown_sum);
+	}
+
+	dpt=1;
+
+	for (i=0;i<20;++i)
+	{
+		dpt*=dt;
+	}
+	fprintf(logFile,"\n");
+	ddiff = (1-dpt)/(1-dt);
+#pragma omp parallel sections private(i) <ompts:check>reduction(-:ddiff)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for (i=0;i<6;++i)
+			{
+				ddiff -= pow(dt,i);
+			}
+		}
+#pragma omp section
+		{
+			for (i=6;i<12;++i)
+			{
+				ddiff -= pow(dt,i);
+			}
+		}
+#pragma omp section
+		{
+			for (i=12;i<20;++i)
+			{
+				ddiff -= pow(dt,i);
+			}
+		}
+	}
+
+	if( fabs(ddiff) > rounding_error)
+	{
+		result++;
+		fprintf(logFile,"Error in Difference with doubles: Result was %E instead of 0.0\n",ddiff);
+	}
+
+	known_product = 3628800;
+#pragma omp parallel sections private(i) <ompts:check>reduction(*:product)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{	
+			for(i=1;i<3;i++)
+			{
+				product *= i;
+			}
+		}
+#pragma omp section
+		{
+			for(i=3;i<7;i++)
+			{
+				product *= i;
+			}
+		}
+#pragma omp section
+		{
+			for(i=7;i<11;i++)
+			{
+				product *= i;
+			}
+		}
+	}
+
+
+	if(known_product != product)
+	{
+		result++;
+		fprintf(logFile,"Error in Product with integers: Result was %d instead of %d\n",product,known_product);
+	}
+
+	for(i=0;i<1000;i++)
+	{
+		logics[i]=1;
+	}
+
+#pragma omp parallel sections private(i) <ompts:check>reduction(&&:logic_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for (i=1;i<300;i++)
+			{
+				logic_and = (logic_and && logics[i]);
+			}
+		}
+#pragma omp section
+		{
+			for (i=300;i<700;i++)
+			{
+				logic_and = (logic_and && logics[i]);
+			}
+		}
+#pragma omp section
+		{
+			for (i=700;i<1000;i++)
+			{
+				logic_and = (logic_and && logics[i]);
+			}
+		}
+	}
+
+	if(!logic_and)
+	{
+		result++;
+		fprintf(logFile,"Error in logic AND part 1\n");
+	}
+
+	logic_and = 1;
+	logics[501] = 0;
+
+#pragma omp parallel sections private(i) <ompts:check>reduction(&&:logic_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for (i=1;i<300;i++)
+			{
+				logic_and = (logic_and && logics[i]);
+			}
+		}
+#pragma omp section
+		{
+			for (i=300;i<700;i++)
+			{
+				logic_and = (logic_and && logics[i]);
+			}
+		}
+#pragma omp section
+		{
+			for (i=700;i<1000;i++)
+			{
+				logic_and = (logic_and && logics[i]);
+			}
+		}
+	}
+
+	if(logic_and)
+	{
+		result++;
+		fprintf(logFile,"Error in logic AND part 2\n");
+	}
+
+	for(i=0;i<1000;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel sections private(i) <ompts:check>reduction(||:logic_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for (i=1;i<300;i++)
+			{
+				logic_or = (logic_or || logics[i]);
+			}
+		}
+#pragma omp section
+		{
+			for (i=300;i<700;i++)
+			{
+				logic_or = (logic_or || logics[i]);
+			}
+		}
+#pragma omp section
+		{
+			for (i=700;i<1000;i++)
+			{
+				logic_or = (logic_or || logics[i]);
+			}
+		}
+	}
+
+	if(logic_or)
+	{
+		result++;
+		fprintf(logFile,"Error in logic OR part 1\n");
+	}
+
+	logic_or = 0;
+	logics[501]=1;
+
+#pragma omp parallel sections private(i) <ompts:check>reduction(||:logic_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for (i=1;i<300;i++)
+			{
+				logic_or = (logic_or || logics[i]);
+			}
+		}
+#pragma omp section
+		{
+			for (i=300;i<700;i++)
+			{
+				logic_or = (logic_or || logics[i]);
+			}
+		}
+#pragma omp section
+		{
+			for (i=700;i<1000;i++)
+			{
+				logic_or = (logic_or || logics[i]);
+			}
+		}
+	}
+
+	if(!logic_or)
+	{
+		result++;
+		fprintf(logFile,"Error in logic OR part 2\n");
+	}
+
+	for(i=0;i<1000;++i)
+	{
+		logics[i]=1;
+	}
+
+#pragma omp parallel sections private(i) <ompts:check>reduction(&:bit_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{	
+			for(i=0;i<300;++i)
+			{
+				bit_and = (bit_and & logics[i]);
+			}
+		}
+#pragma omp section
+		{	
+			for(i=300;i<700;++i)
+			{
+				bit_and = (bit_and & logics[i]);
+			}
+		}
+#pragma omp section
+		{	
+			for(i=700;i<1000;++i)
+			{
+				bit_and = (bit_and & logics[i]);
+			}
+		}
+	}
+	if(!bit_and)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT AND part 1\n");
+	}
+
+	bit_and = 1;
+	logics[501]=0;
+
+#pragma omp parallel sections private(i) <ompts:check>reduction(&:bit_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for(i=0;i<300;++i)
+			{
+				bit_and = bit_and & logics[i];
+			}
+		}
+#pragma omp section
+		{
+			for(i=300;i<700;++i)
+			{
+				bit_and = bit_and & logics[i];
+			}
+		}
+#pragma omp section
+		{
+			for(i=700;i<1000;++i)
+			{
+				bit_and = bit_and & logics[i];
+			}
+		}
+	}
+	if(bit_and)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT AND part 2\n");
+	}
+
+	for(i=0;i<1000;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel sections private(i) <ompts:check>reduction(|:bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for(i=0;i<300;++i)
+			{
+				bit_or = bit_or | logics[i];
+			}
+		}
+#pragma omp section
+		{
+			for(i=300;i<700;++i)
+			{
+				bit_or = bit_or | logics[i];
+			}
+		}
+#pragma omp section
+		{
+			for(i=700;i<1000;++i)
+			{
+				bit_or = bit_or | logics[i];
+			}
+		}
+	}
+	if(bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT OR part 1\n");
+	}
+	bit_or = 0;
+	logics[501]=1;
+
+#pragma omp parallel sections private(i) <ompts:check>reduction(|:bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for(i=0;i<300;++i)
+			{
+				bit_or = bit_or | logics[i];
+			}
+		}
+#pragma omp section
+		{
+			for(i=300;i<700;++i)
+			{
+				bit_or = bit_or | logics[i];
+			}
+		}
+#pragma omp section
+		{
+			for(i=700;i<1000;++i)
+			{
+				bit_or = bit_or | logics[i];
+			}
+		}
+	}
+	if(!bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT OR part 2\n");
+	}
+
+	for(i=0;i<1000;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel sections private(i) <ompts:check>reduction(^:exclusiv_bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{	
+			for(i=0;i<300;++i)
+			{
+				exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+			}
+		}
+#pragma omp section
+		{	
+			for(i=300;i<700;++i)
+			{
+				exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+			}
+		}
+#pragma omp section
+		{	
+			for(i=700;i<1000;++i)
+			{
+				exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+			}
+		}
+	}
+	if(exclusiv_bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in EXCLUSIV BIT OR part 1\n");
+	}
+
+	exclusiv_bit_or = 0;
+	logics[501]=1;
+
+#pragma omp parallel sections private(i) <ompts:check>reduction(^:exclusiv_bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+	{
+#pragma omp section
+		{
+			for(i=0;i<300;++i)
+			{
+				exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+			}
+		}
+#pragma omp section
+		{
+			for(i=300;i<700;++i)
+			{
+				exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+			}
+		}
+#pragma omp section
+		{
+			for(i=700;i<1000;++i)
+			{
+				exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+			}
+		}
+	}
+	if(!exclusiv_bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in EXCLUSIV BIT OR part 2\n");
+	}
+</ompts:orphan>
+	/*printf("\nResult:%d\n",result);*/
+	return (result==0);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_parallel_shared.c b/final/testsuite/c/omp_parallel_shared.c
new file mode 100644
index 0000000..1f28e38
--- /dev/null
+++ b/final/testsuite/c/omp_parallel_shared.c
@@ -0,0 +1,43 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the shared option of the parallel construct.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp parallel shared</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_parallel_shared</ompts:testcode:functionname> (FILE * logFile)
+{
+  <ompts:orphan:vars>
+  int i;
+  int sum;
+  </ompts:orphan:vars>
+  sum = 0;
+  int known_sum;
+  known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2 ;
+
+#pragma omp parallel private(i) <ompts:check>shared(sum)</ompts:check> <ompts:crosscheck>firstprivate(sum)</ompts:crosscheck>
+  {
+    <ompts:orphan>
+    int mysum = 0;
+#pragma omp for
+	for (i = 1; i <= LOOPCOUNT; i++)
+	{
+	  mysum = mysum + i;
+	} 
+#pragma omp critical
+	{
+	  sum = sum + mysum;
+	}   /* end of critical */
+</ompts:orphan>
+
+  }   /* end of parallel */
+  if (known_sum != sum) {
+  	fprintf(logFile, "KNOWN_SUM = %d; SUM = %d\n", known_sum, sum);
+  }
+  return (known_sum == sum);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_section_firstprivate.c b/final/testsuite/c/omp_section_firstprivate.c
new file mode 100644
index 0000000..a035d10
--- /dev/null
+++ b/final/testsuite/c/omp_section_firstprivate.c
@@ -0,0 +1,52 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp section firstprivate directive by adding a variable which is initialized before the parallel region.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp section firstprivate</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+
+int <ompts:testcode:functionname>omp_section_firstprivate</ompts:testcode:functionname>(FILE * logFile){
+	<ompts:orphan:vars>
+	    int sum;
+	    int sum0;
+	</ompts:orphan:vars>
+	int known_sum;
+
+	sum0 = 11;
+	sum = 7;
+#pragma omp parallel
+	{
+<ompts:orphan>
+#pragma omp  sections <ompts:check>firstprivate(sum0)</ompts:check><ompts:crosscheck>private(sum0)</ompts:crosscheck>
+		{
+#pragma omp section 
+			{
+#pragma omp critical
+				{
+					sum = sum + sum0;
+				} /*end of critical */
+			}    
+#pragma omp section
+			{
+#pragma omp critical
+				{
+					sum = sum + sum0;
+				} /*end of critical */
+			}
+#pragma omp section
+			{
+#pragma omp critical
+				{
+					sum = sum + sum0;
+				} /*end of critical */
+			}               
+		} /*end of sections*/
+</ompts:orphan>
+	} /* end of parallel */
+	known_sum = 11 * 3 + 7;
+	return (known_sum == sum); 
+} /* end of check_section_firstprivate*/
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_section_lastprivate.c b/final/testsuite/c/omp_section_lastprivate.c
new file mode 100644
index 0000000..56ae0df
--- /dev/null
+++ b/final/testsuite/c/omp_section_lastprivate.c
@@ -0,0 +1,73 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp section lastprivate directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp section lastprivate</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+
+int <ompts:testcode:functionname>omp_section_lastprivate</ompts:testcode:functionname>(FILE * logFile){
+    <ompts:orphan:vars>
+	int i0 = -1;
+	int sum = 0;
+        int i;
+        int sum0 = 0;
+    </ompts:orphan:vars>
+    int known_sum;
+
+    i0 = -1;
+    sum = 0;
+
+#pragma omp parallel
+    {
+	<ompts:orphan>
+#pragma omp sections <ompts:check>lastprivate(i0)</ompts:check><ompts:crosscheck>private(i0)</ompts:crosscheck> private(i,sum0)
+	{
+#pragma omp section  
+	    {
+		sum0 = 0;
+		for (i = 1; i < 400; i++)
+		{
+		    sum0 = sum0 + i;
+		    i0 = i;
+		}
+#pragma omp critical
+		{
+		    sum = sum + sum0;
+		} /*end of critical*/
+	    } /* end of section */
+#pragma omp section 
+	    {
+		sum0 = 0;
+		for(i = 400; i < 700; i++)
+		{
+		    sum0 = sum0 + i;
+		    i0 = i;
+		}
+#pragma omp critical
+		{
+		    sum = sum + sum0;
+		} /*end of critical*/
+	    }
+#pragma omp section 
+	    {
+		sum0 = 0;
+		for(i = 700; i < 1000; i++)
+		{
+		    sum0 = sum0 + i;
+		    i0 = i;
+		}
+#pragma omp critical
+		{
+		    sum = sum + sum0;
+		} /*end of critical*/
+	    }
+	} /* end of sections*/
+	</ompts:orphan>
+    } /* end of parallel*/    
+    known_sum = (999 * 1000) / 2;
+    return ((known_sum == sum) && (i0 == 999) );
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_section_private.c b/final/testsuite/c/omp_section_private.c
new file mode 100644
index 0000000..399a2f8
--- /dev/null
+++ b/final/testsuite/c/omp_section_private.c
@@ -0,0 +1,69 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp section private directive by incrementing a variable in a loop that is split across several sections.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp section private</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_section_private</ompts:testcode:functionname>(FILE * logFile){
+    <ompts:orphan:vars>
+	int sum;
+	int sum0;
+    int i;
+    </ompts:orphan:vars>
+    int known_sum;
+
+    sum = 7;
+    sum0 = 0;
+
+#pragma omp parallel
+    {
+	<ompts:orphan>
+#pragma omp  sections <ompts:check>private(sum0,i)</ompts:check><ompts:crosscheck>private(i)</ompts:crosscheck>
+	{
+#pragma omp section 
+	    {
+		<ompts:check>
+        sum0 = 0;
+        </ompts:check>
+		for (i = 1; i < 400; i++)
+		    sum0 = sum0 + i;
+#pragma omp critical
+		{
+		    sum = sum + sum0;
+		} /*end of critical */
+	    }    
+#pragma omp section
+	    {
+          <ompts:check>
+		sum0 = 0;
+          </ompts:check>
+		for (i = 400; i < 700; i++)
+		    sum0 = sum0 + i;
+#pragma omp critical
+		{
+		    sum = sum + sum0;
+		} /*end of critical */
+	    }
+#pragma omp section
+	    {
+          <ompts:check>
+		sum0 = 0;
+          </ompts:check>
+		for (i = 700; i < 1000; i++)
+		    sum0 = sum0 + i;
+#pragma omp critical
+		{
+		    sum = sum + sum0;
+		} /*end of critical */
+	    }               
+	} /*end of sections*/
+	</ompts:orphan>
+    } /* end of parallel */
+    known_sum = (999 * 1000) / 2 + 7;
+    return (known_sum == sum); 
+} /* end of check_section_private*/
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_sections_nowait.c b/final/testsuite/c/omp_sections_nowait.c
new file mode 100644
index 0000000..b91a375
--- /dev/null
+++ b/final/testsuite/c/omp_sections_nowait.c
@@ -0,0 +1,66 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp sections nowait directive. One thread sleeps in a section of the first sections construct before setting a flag; the test passes if another thread reaches the second sections construct while the flag is still unset, i.e. it did not wait at the end of the first construct.</ompts:testdescription>
+<ompts:ompversion>1.0</ompts:ompversion>
+<ompts:directive>omp sections nowait</ompts:directive>
+<ompts:dependences>omp parallel sections, omp flush</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int <ompts:testcode:functionname>omp_sections_nowait</ompts:testcode:functionname> (FILE * logFile)
+{
+	<ompts:orphan:vars>
+		int result;
+		int count;
+	</ompts:orphan:vars>
+	int j;
+
+	result = 0;
+	count = 0;
+
+#pragma omp parallel 
+	{
+	<ompts:orphan>
+	int rank;
+
+	rank = omp_get_thread_num ();
+	
+#pragma omp sections <ompts:check>nowait</ompts:check>
+		{
+#pragma omp section
+			{
+				fprintf (logFile, "Thread nr %d enters the first section and goes to sleep.\n", rank);
+				my_sleep(SLEEPTIME);
+				count = 1;
+				fprintf (logFile, "Thread nr %d woke up and set count to 1.\n", rank);
+#pragma omp flush(count)
+			}
+#pragma omp section
+			{
+				fprintf (logFile, "Thread nr %d executed work in the first section.\n", rank);
+			}
+		}
+/* Begin of second sections environment */
+#pragma omp sections
+		{
+#pragma omp section
+			{
+				fprintf (logFile, "Thread nr %d executed work in the second section.\n", rank);
+			}
+#pragma omp section
+			{
+				fprintf (logFile, "Thread nr %d executed work in the second section and checks the value of count.\n", rank);
+				if (count == 0)
+					result = 1;
+				fprintf (logFile, "count was %d.\n", count);
+			}
+		}
+	</ompts:orphan>
+	}
+	
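+	/* result is 1 (pass) only if a thread reached the second sections construct
+	 * while count was still 0, i.e. the nowait let it skip the implicit barrier */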
+	return result;
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_sections_reduction.c b/final/testsuite/c/omp_sections_reduction.c
new file mode 100644
index 0000000..7e7829a
--- /dev/null
+++ b/final/testsuite/c/omp_sections_reduction.c
@@ -0,0 +1,613 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp sections reduction directive with all its options.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp sections reduction</ompts:directive>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+
+int <ompts:testcode:functionname>omp_sections_reduction</ompts:testcode:functionname>(FILE * logFile){
+	<ompts:orphan:vars>
+    int sum;
+	int known_sum;
+	double dpt,dsum;
+	double dknown_sum;
+	double dt=0.5;				/* base of geometric row for + and - test*/
+	double rounding_error= 1.E-9;
+	int diff;
+	double ddiff;
+	int product;
+	int known_product;
+	int logic_and;
+	int bit_and;
+	int logic_or;
+	int bit_or;
+	int exclusiv_bit_or;
+	int logics[1000];
+	int i;
+	int result;
+    </ompts:orphan:vars>
+	/*  int my_islarger;*/
+	/*int is_larger=1;*/
+
+    sum =7;
+    dpt =1;
+    dsum=0;
+    product =1;
+	logic_and=1;
+	bit_and=1;
+	logic_or=0;
+	bit_or=0;
+	exclusiv_bit_or=0;
+    result = 0;
+	dt = 1./3.;
+	known_sum = (999*1000)/2+7;
+<ompts:orphan>
+#pragma omp parallel
+	{
+
+#pragma omp sections private(i) <ompts:check>reduction(+:sum)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for (i=1;i<300;i++)
+				{
+					sum=sum+i;
+				}
+			}
+#pragma omp section
+			{
+				for (i=300;i<700;i++)
+				{
+					sum=sum+i;
+				}
+			}
+#pragma omp section
+			{
+				for (i=700;i<1000;i++)
+				{
+					sum=sum+i;
+				}
+			}
+		}
+	}
+
+	if(known_sum!=sum)
+	{
+		++result;
+		fprintf(logFile,"Error in sum with integers: Result was %d instead of %d\n", sum,known_sum);
+	}
+
+	diff = (999*1000)/2;
+#pragma omp parallel
+	{
+#pragma omp sections private(i) <ompts:check>reduction(-:diff)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for (i=1;i<300;i++)
+				{
+					diff=diff-i;
+				}
+			}
+#pragma omp section
+			{
+				for (i=300;i<700;i++)
+				{
+					diff=diff-i;
+				}
+			}
+#pragma omp section
+			{
+				for (i=700;i<1000;i++)
+				{
+					diff=diff-i;
+				}
+			}
+		}
+	}
+
+	if(diff != 0)
+	{
+		result++;
+		fprintf(logFile,"Error in Difference with integers: Result was %d instead of 0.\n",diff);
+	}
+
+	for (i=0;i<20;++i)
+	{
+		dpt*=dt;
+	}
+	dknown_sum = (1-dpt)/(1-dt);
+#pragma omp parallel
+	{
+#pragma omp sections private(i) <ompts:check>reduction(+:dsum)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for (i=0;i<6;++i)
+				{
+					dsum += pow(dt,i);
+				}
+			}
+#pragma omp section
+			{
+				for (i=6;i<12;++i)
+				{
+					dsum += pow(dt,i);
+				}
+			}
+#pragma omp section
+			{
+				for (i=12;i<20;++i)
+				{
+					dsum += pow(dt,i);
+				}
+			}
+		}
+	}
+
+	if( fabs(dsum-dknown_sum) > rounding_error )
+	{
+		result++; 
+		fprintf(logFile,"Error in sum with doubles: Result was %f instead of %f (Difference: %E)\n",dsum,dknown_sum, dsum-dknown_sum);
+	}
+
+	dpt=1;
+
+	for (i=0;i<20;++i)
+	{
+		dpt*=dt;
+	}
+	fprintf(logFile,"\n");
+	ddiff = (1-dpt)/(1-dt);
+#pragma omp parallel
+	{
+#pragma omp sections private(i) <ompts:check>reduction(-:ddiff)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for (i=0;i<6;++i)
+				{
+					ddiff -= pow(dt,i);
+				}
+			}
+#pragma omp section
+			{
+				for (i=6;i<12;++i)
+				{
+					ddiff -= pow(dt,i);
+				}
+			}
+#pragma omp section
+			{
+				for (i=12;i<20;++i)
+				{
+					ddiff -= pow(dt,i);
+				}
+			}
+		}
+	}
+
+	if( fabs(ddiff) > rounding_error)
+	{
+		result++;
+		fprintf(logFile,"Error in Difference with doubles: Result was %E instead of 0.0\n",ddiff);
+	}
+
+
+	known_product = 3628800;
+#pragma omp parallel
+	{
+#pragma omp sections private(i) <ompts:check>reduction(*:product)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{	
+				for(i=1;i<3;i++)
+				{
+					product *= i;
+				}
+			}
+#pragma omp section
+			{
+				for(i=3;i<7;i++)
+				{
+					product *= i;
+				}
+			}
+#pragma omp section
+			{
+				for(i=7;i<11;i++)
+				{
+					product *= i;
+				}
+			}
+		}
+	}
+
+	if(known_product != product)
+	{
+		result++;
+		fprintf(logFile,"Error in Product with integers: Result was %d instead of %d\n",product,known_product);
+	}
+
+	for(i=0;i<1000;i++)
+	{
+		logics[i]=1;
+	}
+
+#pragma omp parallel
+	{
+#pragma omp sections private(i) <ompts:check>reduction(&&:logic_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for (i=1;i<300;i++)
+				{
+					logic_and = (logic_and && logics[i]);
+				}
+			}
+#pragma omp section
+			{
+				for (i=300;i<700;i++)
+				{
+					logic_and = (logic_and && logics[i]);
+				}
+			}
+#pragma omp section
+			{
+				for (i=700;i<1000;i++)
+				{
+					logic_and = (logic_and && logics[i]);
+				}
+			}
+		}
+	}
+
+	if(!logic_and)
+	{
+		result++;
+		fprintf(logFile,"Error in logic AND part 1\n");
+	}
+
+	logic_and = 1;
+	logics[501] = 0;
+
+#pragma omp parallel
+	{
+#pragma omp sections private(i) <ompts:check>reduction(&&:logic_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for (i=1;i<300;i++)
+				{
+					logic_and = (logic_and && logics[i]);
+				}
+			}
+#pragma omp section
+			{
+				for (i=300;i<700;i++)
+				{
+					logic_and = (logic_and && logics[i]);
+				}
+			}
+#pragma omp section
+			{
+				for (i=700;i<1000;i++)
+				{
+					logic_and = (logic_and && logics[i]);
+				}
+			}
+		}
+	}
+
+	if(logic_and)
+	{
+		result++;
+		fprintf(logFile,"Error in logic AND part 2\n");
+	}
+
+	for(i=0;i<1000;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel 
+	{
+#pragma omp sections private(i) <ompts:check>reduction(||:logic_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for (i=1;i<300;i++)
+				{
+					logic_or = (logic_or || logics[i]);
+				}
+			}
+#pragma omp section
+			{
+				for (i=300;i<700;i++)
+				{
+					logic_or = (logic_or || logics[i]);
+				}
+			}
+#pragma omp section
+			{
+				for (i=700;i<1000;i++)
+				{
+					logic_or = (logic_or || logics[i]);
+				}
+			}
+		}
+	}
+
+	if(logic_or)
+	{
+		result++;
+		fprintf(logFile,"\nError in logic OR part 1\n");
+	}
+
+	logic_or = 0;
+	logics[501]=1;
+
+#pragma omp parallel 
+	{
+#pragma omp sections private(i) <ompts:check>reduction(||:logic_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for (i=1;i<300;i++)
+				{
+					logic_or = (logic_or || logics[i]);
+				}
+			}
+#pragma omp section
+			{
+				for (i=300;i<700;i++)
+				{
+					logic_or = (logic_or || logics[i]);
+				}
+			}
+#pragma omp section
+			{
+				for (i=700;i<1000;i++)
+				{
+					logic_or = (logic_or || logics[i]);
+				}
+			}
+		}
+	}
+
+	if(!logic_or)
+	{
+		result++;
+		fprintf(logFile,"Error in logic OR part 2\n");
+	}
+
+
+	for(i=0;i<1000;++i)
+	{
+		logics[i]=1;
+	}
+
+#pragma omp parallel 
+	{
+#pragma omp sections private(i) <ompts:check>reduction(&:bit_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{	
+				for(i=0;i<300;++i)
+				{
+					bit_and = (bit_and & logics[i]);
+				}
+			}
+#pragma omp section
+			{	
+				for(i=300;i<700;++i)
+				{
+					bit_and = (bit_and & logics[i]);
+				}
+			}
+#pragma omp section
+			{	
+				for(i=700;i<1000;++i)
+				{
+					bit_and = (bit_and & logics[i]);
+				}
+			}
+		}
+	}
+	if(!bit_and)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT AND part 1\n");
+	}
+
+	bit_and = 1;
+	logics[501]=0;
+
+#pragma omp parallel 
+	{
+#pragma omp sections private(i) <ompts:check>reduction(&:bit_and)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for(i=0;i<300;++i)
+				{
+					bit_and = bit_and & logics[i];
+				}
+			}
+#pragma omp section
+			{
+				for(i=300;i<700;++i)
+				{
+					bit_and = bit_and & logics[i];
+				}
+			}
+#pragma omp section
+			{
+				for(i=700;i<1000;++i)
+				{
+					bit_and = bit_and & logics[i];
+				}
+			}
+		}
+	}
+	if(bit_and)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT AND part 2\n");
+	}
+
+	for(i=0;i<1000;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel 
+	{
+#pragma omp sections private(i) <ompts:check>reduction(|:bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for(i=0;i<300;++i)
+				{
+					bit_or = bit_or | logics[i];
+				}
+			}
+#pragma omp section
+			{
+				for(i=300;i<700;++i)
+				{
+					bit_or = bit_or | logics[i];
+				}
+			}
+#pragma omp section
+			{
+				for(i=700;i<1000;++i)
+				{
+					bit_or = bit_or | logics[i];
+				}
+			}
+		}
+	}
+	if(bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT OR part 1\n");
+	}
+	bit_or = 0;
+	logics[501]=1;
+
+#pragma omp parallel 
+	{
+#pragma omp sections private(i) <ompts:check>reduction(|:bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for(i=0;i<300;++i)
+				{
+					bit_or = bit_or | logics[i];
+				}
+			}
+#pragma omp section
+			{
+				for(i=300;i<700;++i)
+				{
+					bit_or = bit_or | logics[i];
+				}
+			}
+#pragma omp section
+			{
+				for(i=700;i<1000;++i)
+				{
+					bit_or = bit_or | logics[i];
+				}
+			}
+		}
+	}
+	if(!bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in BIT OR part 2\n");
+	}
+
+	for(i=0;i<1000;i++)
+	{
+		logics[i]=0;
+	}
+
+#pragma omp parallel 
+	{
+#pragma omp sections private(i) <ompts:check>reduction(^:exclusiv_bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{	
+				for(i=0;i<300;++i)
+				{
+					exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+				}
+			}
+#pragma omp section
+			{	
+				for(i=300;i<700;++i)
+				{
+					exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+				}
+			}
+#pragma omp section
+			{	
+				for(i=700;i<1000;++i)
+				{
+					exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+				}
+			}
+		}
+	}
+	if(exclusiv_bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in EXCLUSIV BIT OR part 1\n");
+	}
+
+	exclusiv_bit_or = 0;
+	logics[501]=1;
+
+#pragma omp parallel 
+	{
+#pragma omp sections private(i) <ompts:check>reduction(^:exclusiv_bit_or)</ompts:check><ompts:crosscheck></ompts:crosscheck>
+		{
+#pragma omp section
+			{
+				for(i=0;i<300;++i)
+				{
+					exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+				}
+			}
+#pragma omp section
+			{
+				for(i=300;i<700;++i)
+				{
+					exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+				}
+			}
+#pragma omp section
+			{
+				for(i=700;i<1000;++i)
+				{
+					exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+				}
+			}
+		}
+	}
+	if(!exclusiv_bit_or)
+	{
+		result++;
+		fprintf(logFile,"Error in EXCLUSIV BIT OR part 2\n");
+	}
+</ompts:orphan>
+	/*printf("\nResult:%d\n",result);*/
+	return (result==0);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_single.c b/final/testsuite/c/omp_single.c
new file mode 100644
index 0000000..032b0d2
--- /dev/null
+++ b/final/testsuite/c/omp_single.c
@@ -0,0 +1,43 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp single directive by counting how often the code inside an omp single region is executed.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp single</ompts:directive>
+<ompts:dependences>omp parallel private,omp flush</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_single</ompts:testcode:functionname>(FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int nr_threads_in_single;
+	int result;
+	int nr_iterations;
+	int i;
+    </ompts:orphan:vars>
+
+    nr_threads_in_single = 0;
+    result = 0;
+    nr_iterations = 0;
+
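+    /* if only one thread at a time executes the single region, nr_threads_in_single
+     * is back at 0 after the decrement, so result stays 0 */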
+#pragma omp parallel private(i)
+    {
+	for (i = 0; i < LOOPCOUNT; i++)
+	{
+	    <ompts:orphan>
+		<ompts:check>#pragma omp single </ompts:check>
+		{  
+#pragma omp flush
+		    nr_threads_in_single++;
+#pragma omp flush                         
+		    nr_iterations++;
+		    nr_threads_in_single--;
+		    result = result + nr_threads_in_single;
+		} /* end of single */    
+	    </ompts:orphan>
+	} /* end of for  */
+    } /* end of parallel */
+    return ((result == 0) && (nr_iterations == LOOPCOUNT));
+} /* end of check_single*/
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_single_copyprivate.c b/final/testsuite/c/omp_single_copyprivate.c
new file mode 100644
index 0000000..bc7cd54
--- /dev/null
+++ b/final/testsuite/c/omp_single_copyprivate.c
@@ -0,0 +1,50 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp single copyprivate directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp single copyprivate</ompts:directive>
+<ompts:dependences>omp parallel,omp critical</ompts:dependences>
+<ompts:testcode>
+#include "omp_testsuite.h"
+
+int j;
+#pragma omp threadprivate(j)
+
+int <ompts:testcode:functionname>omp_single_copyprivate</ompts:testcode:functionname>(FILE * logFile)                                   
+{
+    <ompts:orphan:vars>
+	int result;
+	int nr_iterations;
+    </ompts:orphan:vars>
+
+    result = 0;
+    nr_iterations = 0;
+#pragma omp parallel
+    {
+	<ompts:orphan>
+	    int i;
+            for (i = 0; i < LOOPCOUNT; i++)
+	    {
+		/*
+		   int thread;
+		   thread = omp_get_thread_num ();
+		 */
+#pragma omp single <ompts:check>copyprivate(j)</ompts:check>
+		{
+		    nr_iterations++;
+		    j = i;
+		    /*printf ("thread %d assigns, j = %d, i = %d\n", thread, j, i);*/
+		}
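+		/* copyprivate broadcasts j from the thread that executed the single region
+		   to the threadprivate copies of all other threads, so j == i everywhere */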
+		/*	#pragma omp barrier*/
+#pragma omp critical
+		{
+		    /*printf ("thread = %d, j = %d, i = %d\n", thread, j, i);*/
+		    result = result + j - i;
+		}
+#pragma omp barrier
+	    } /* end of for */
+	</ompts:orphan>
+    } /* end of parallel */
+    return ((result == 0) && (nr_iterations == LOOPCOUNT));
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_single_nowait.c b/final/testsuite/c/omp_single_nowait.c
new file mode 100644
index 0000000..9b7a250
--- /dev/null
+++ b/final/testsuite/c/omp_single_nowait.c
@@ -0,0 +1,60 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp single nowait directive by counting how often the single region is executed.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp single nowait</ompts:directive>
+<ompts:dependences>omp critical,omp atomic</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int my_iterations;
+#pragma omp threadprivate(my_iterations)
+
+int <ompts:testcode:functionname>omp_single_nowait</ompts:testcode:functionname>(FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int nr_iterations;
+    </ompts:orphan:vars>
+
+    int total_iterations = 0;
+    int i;
+
+    nr_iterations = 0;
+    my_iterations = 0;
+
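+/* the first parallel phase counts single executions in the shared nr_iterations via atomic;
+ * the second phase counts in the threadprivate my_iterations and accumulates the per-thread
+ * counts in a critical section */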
+#pragma omp parallel private(i)
+    {
+	for (i = 0; i < LOOPCOUNT; i++)
+	{
+	    <ompts:orphan>
+		<ompts:check>#pragma omp single nowait</ompts:check>
+		{
+#pragma omp atomic  
+		    nr_iterations++;
+		} /* end of single*/    
+	    </ompts:orphan>
+	} /* end of for  */
+    } /* end of parallel */
+
+#pragma omp parallel private(i) 
+    {
+	my_iterations = 0;
+	for (i = 0; i < LOOPCOUNT; i++)
+	{
+	    <ompts:orphan>
+		<ompts:check>#pragma omp single nowait</ompts:check>
+		{
+		    my_iterations++;
+		} /* end of single*/    
+	    </ompts:orphan>
+	} /* end of for  */
+#pragma omp critical
+	{
+	    total_iterations += my_iterations;
+	}
+
+    } /* end of parallel */
+    return ((nr_iterations == LOOPCOUNT) && (total_iterations == LOOPCOUNT));
+} /* end of check_single_nowait*/
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_single_private.c b/final/testsuite/c/omp_single_private.c
new file mode 100644
index 0000000..a7502c7
--- /dev/null
+++ b/final/testsuite/c/omp_single_private.c
@@ -0,0 +1,56 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp single private directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp single private</ompts:directive>
+<ompts:dependences>omp critical,omp flush,omp single nowait</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int myit = 0;
+#pragma omp threadprivate(myit)
+int myresult = 0;
+#pragma omp threadprivate(myresult)
+
+int <ompts:testcode:functionname>omp_single_private</ompts:testcode:functionname>(FILE * logFile)
+{
+    <ompts:orphan:vars>
+	int nr_threads_in_single;
+	int result;
+	int nr_iterations;
+    </ompts:orphan:vars>
+    int i;
+
+    myit = 0;
+    nr_threads_in_single = 0;
+    nr_iterations = 0;
+    result = 0;
+
+#pragma omp parallel private(i)
+    {
+	myresult = 0;
+	myit = 0;
+	for (i = 0; i < LOOPCOUNT; i++)
+	{
+	<ompts:orphan>
+#pragma omp single <ompts:check>private(nr_threads_in_single) </ompts:check>nowait
+	    {  
+		nr_threads_in_single = 0;
+#pragma omp flush
+		nr_threads_in_single++;
+#pragma omp flush                         
+		myit++;
+		myresult = myresult + nr_threads_in_single;
+	    } /* end of single */    
+	</ompts:orphan>
+	} /* end of for */
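+	/* only the private copies of nr_threads_in_single were modified inside the
+	 * single regions, so the shared copy read here is still 0 if private works */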
+#pragma omp critical
+	{
+            result += nr_threads_in_single;
+	    nr_iterations += myit;
+	}
+    } /* end of parallel */
+    return ((result == 0) && (nr_iterations == LOOPCOUNT));
+} /* end of check_single private */ 
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_task.c b/final/testsuite/c/omp_task.c
new file mode 100644
index 0000000..ae89790
--- /dev/null
+++ b/final/testsuite/c/omp_task.c
@@ -0,0 +1,52 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp task directive. The idea of the test is to generate a set of tasks in a single region. We pause the generated tasks immediately so that other threads get scheduled to the newly created tasks.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task</ompts:directive>
+<ompts:dependences>omp single</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+
+int <ompts:testcode:functionname>omp_task</ompts:testcode:functionname>(FILE * logFile){
+    <ompts:orphan:vars>
+    int tids[NUM_TASKS];
+    int i;
+    </ompts:orphan:vars>
+
+#pragma omp parallel 
+{
+#pragma omp single
+    {
+        for (i = 0; i < NUM_TASKS; i++) {
+            <ompts:orphan>
+            /* First we have to store the value of the loop index in a new variable
+             * which will be private for each task because otherwise it will be overwritten
+             * if the execution of the task takes longer than the time which is needed to 
+             * enter the next step of the loop!
+             */
+            int myi;
+            myi = i;
+
+<ompts:check>#pragma omp task</ompts:check>
+            {
+                my_sleep (SLEEPTIME);
+
+                tids[myi] = omp_get_thread_num();
+            } /* end of omp task */
+            </ompts:orphan>
+        } /* end of for */
+    } /* end of single */
+} /*end of parallel */
+
+/* Now we check whether more than one thread executed the tasks. */
+    for (i = 1; i < NUM_TASKS; i++) {
+        if (tids[0] != tids[i])
+            return 1;
+    }
+    return 0;
+} /* end of omp_task */
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_task_final.c b/final/testsuite/c/omp_task_final.c
new file mode 100644
index 0000000..4e00c5f
--- /dev/null
+++ b/final/testsuite/c/omp_task_final.c
@@ -0,0 +1,52 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp task final directive. The idea of the test is to generate a set of tasks in a single region and pause them immediately. Tasks generated with an active final clause are expected to be executed by the same thread.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task final</ompts:directive>
+<ompts:dependences>omp single</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+
+int <ompts:testcode:functionname>omp_task_final</ompts:testcode:functionname>(FILE * logFile){
+    <ompts:orphan:vars>
+    int tids[NUM_TASKS];
+    int i;
+    </ompts:orphan:vars>
+    int error = 0;
+#pragma omp parallel 
+{
+#pragma omp single
+    {
+        for (i = 0; i < NUM_TASKS; i++) {
+            <ompts:orphan>
+            /* First we have to store the value of the loop index in a new variable
+             * which will be private for each task because otherwise it will be overwritten
+             * if the execution of the task takes longer than the time which is needed to 
+             * enter the next step of the loop!
+             */
+            int myi;
+            myi = i;
+
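+            /* Tasks with i >= 10 carry final(.true.): any task created inside
+             * them would be included, i.e. executed immediately. The test
+             * expects all of these final tasks to run on the same thread. */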
+            #pragma omp task <ompts:check>final(i>=10)</ompts:check>
+            {
+                my_sleep (SLEEPTIME);
+
+                tids[myi] = omp_get_thread_num();
+            } /* end of omp task */
+            </ompts:orphan>
+        } /* end of for */
+    } /* end of single */
+} /*end of parallel */
+
+/* Now we check that all final tasks were executed by the same thread. */
+    for (i = 10; i < NUM_TASKS; i++) {
+        if (tids[10] != tids[i])
+            error++;
+    }
+    return (error==0);
+} /* end of omp_task_final */
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_task_firstprivate.c b/final/testsuite/c/omp_task_firstprivate.c
new file mode 100644
index 0000000..7aa0746
--- /dev/null
+++ b/final/testsuite/c/omp_task_firstprivate.c
@@ -0,0 +1,51 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the firstprivate clause of the task directive. We create a set of tasks in a single region. We define a variable named sum with a non-zero initial value which gets declared firstprivate for each task. Each task then calculates a sum using this private variable. Before each calculation step we introduce a flush directive so that a wrongly shared variable would likely be corrupted. At the end we check whether the calculated sum was correct.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task firstprivate</ompts:directive>
+<ompts:dependences>omp single,omp flush,omp critical</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_task_firstprivate</ompts:testcode:functionname> (FILE * logFile)
+{
+    int i;
+    <ompts:orphan:vars>
+    int sum = 1234;
+    int known_sum;
+    int result = 0; /* counts the wrong sums from tasks */
+    </ompts:orphan:vars>
+
+    known_sum = 1234 + (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+
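+    /* Every task starts from its own firstprivate copy of sum (initial value
+     * 1234) and adds the integers 0..LOOPCOUNT on top of it. */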
+#pragma omp parallel
+    {
+#pragma omp single
+        {
+            for (i = 0; i < NUM_TASKS; i++)
+            {
+                <ompts:orphan>
+#pragma omp task <ompts:check>firstprivate(sum)</ompts:check>
+                {
+                    int j;
+                    for (j = 0; j <= LOOPCOUNT; j++) {
+#pragma omp flush
+                        sum += j;
+                    }
+
+                    /* check if calculated sum was right */
+                    if (sum != known_sum) {
+#pragma omp critical 
+                      { result++; }
+                    }
+                } /* end of omp task */
+                </ompts:orphan>
+            }	/* end of for */
+        } /* end of single */
+    }	/* end of parallel*/
+
+    return (result == 0);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_task_if.c b/final/testsuite/c/omp_task_if.c
new file mode 100644
index 0000000..c07ab2f
--- /dev/null
+++ b/final/testsuite/c/omp_task_if.c
@@ -0,0 +1,43 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the if clause of the omp task directive. The idea of the test is to generate a task in a single region and pause it immediately. The parent thread then sets a counter variable which the paused task evaluates when it is woken up.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task if</ompts:directive>
+<ompts:dependences>omp single,omp flush</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+
+int <ompts:testcode:functionname>omp_task_if</ompts:testcode:functionname>(FILE * logFile){
+    <ompts:orphan:vars>
+    int condition_false;
+    int count;
+    int result;
+    </ompts:orphan:vars>
+    count=0;
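+    /* A condition that is false at run time but opaque to the compiler, so
+     * the if clause cannot be folded away: the task becomes undeferred and
+     * runs before the parent sets count. */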
+    condition_false = (logFile == NULL);
+#pragma omp parallel 
+{
+#pragma omp single
+    {
+        <ompts:orphan>
+#pragma omp task <ompts:check>if (condition_false)</ompts:check> shared(count, result)
+        {
+            my_sleep (SLEEPTIME_LONG);
+//#pragma omp flush (count)
+            result = (0 == count);
+        } /* end of omp task */
+        </ompts:orphan>
+
+        count = 1;
+//#pragma omp flush (count)
+
+    } /* end of single */
+} /*end of parallel */
+
+    return result;
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_task_imp_firstprivate.c b/final/testsuite/c/omp_task_imp_firstprivate.c
new file mode 100644
index 0000000..bb8f0ef
--- /dev/null
+++ b/final/testsuite/c/omp_task_imp_firstprivate.c
@@ -0,0 +1,51 @@
+<ompts:test>
+<ompts:testdescription>Test to see if implied firstprivate works properly.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task</ompts:directive>
+<ompts:dependences>omp single</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_task_imp_firstprivate</ompts:testcode:functionname> (FILE * logFile)
+{
+    int i=5;
+    int k = 0;
+    int result = 0;
+    int task_result = 1;
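+    /* i is firstprivate on the parallel region, so inside each task it is
+     * implicitly firstprivate as well: the increments stay local to the task
+     * and i must still be 5 after the taskwait. */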
+   #pragma omp parallel firstprivate(i)
+    {
+      #pragma omp single
+      {
+     
+        
+          for (k = 0; k < NUM_TASKS; k++)
+          {
+              #pragma omp task shared(result, task_result<ompts:crosscheck>, i</ompts:crosscheck>)
+              {
+                  int j;
+                  /* check that i still has its initial value */
+                  if (i != 5)
+                      task_result = 0;
+
+                  /* i should be firstprivate implicitly, so these increments
+                   * must not be visible outside the task */
+                  for (j = 0; j < NUM_TASKS; j++)
+                      i++;
+              }
+          }
+
+          #pragma omp taskwait
+          result = (task_result && i == 5);
+      }
+                
+    }
+    
+    return result;
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_task_private.c b/final/testsuite/c/omp_task_private.c
new file mode 100644
index 0000000..59e26dd
--- /dev/null
+++ b/final/testsuite/c/omp_task_private.c
@@ -0,0 +1,53 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the private clause of the task directive. We create a set of tasks in a single region. We define a variable named sum which gets declared private for each task. Each task then calculates a sum using this private variable. Before each calculation step we introduce a flush directive so that a wrongly shared variable would likely be corrupted. At the end we check whether the calculated sum was correct.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task private</ompts:directive>
+<ompts:dependences>omp single,omp flush,omp critical</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_task_private</ompts:testcode:functionname> (FILE * logFile)
+{
+    int i;
+    <ompts:orphan:vars>
+    int known_sum;
+    int sum = 0;
+    int result = 0; /* counts the wrong sums from tasks */
+    </ompts:orphan:vars>
+
+    known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+
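+    /* Without the private clause the tasks would race on a shared sum and
+     * the per-task totals would almost certainly come out wrong. */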
+#pragma omp parallel
+    {
+#pragma omp single
+        {
+            for (i = 0; i < NUM_TASKS; i++)
+            {
+                <ompts:orphan>
+#pragma omp task <ompts:check>private(sum)</ompts:check> shared(result, known_sum)
+                {
+                    int j;
+		    //if sum is private, initialize to 0
+		    <ompts:check>sum = 0;</ompts:check>
+                    for (j = 0; j <= LOOPCOUNT; j++) {
+#pragma omp flush
+                        sum += j;
+                    }
+                    /* check if calculated sum was right */
+                    if (sum != known_sum) {
+#pragma omp critical 
+                        result++;
+                    }
+                } /* end of omp task */
+                </ompts:orphan>
+            }	/* end of for */
+        } /* end of single */
+    }	/* end of parallel*/
+
+    return (result == 0);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_task_shared.c b/final/testsuite/c/omp_task_shared.c
new file mode 100644
index 0000000..74c2109
--- /dev/null
+++ b/final/testsuite/c/omp_task_shared.c
@@ -0,0 +1,47 @@
+<ompts:test>
+<ompts:testdescription> Test to see if implied shared works correctly</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task</ompts:directive>
+<ompts:dependences>omp single, omp task firstprivate</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int <ompts:testcode:functionname>omp_task_imp_shared</ompts:testcode:functionname> (FILE * logFile)
+{
+   <ompts:orphan:vars>
+    int i;
+   </ompts:orphan:vars>
+    i=0;
+    int k = 0;
+    int result = 0;
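+    /* i is shared in the enclosing parallel region, so it should be
+     * implicitly shared inside the task as well: all NUM_TASKS atomic
+     * increments must land on the same copy. */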
+
+    #pragma omp parallel
+    {
+        #pragma omp single
+        for (k = 0; k < NUM_TASKS; k++)
+        {
+            <ompts:orphan>
+            #pragma omp task <ompts:crosscheck> firstprivate(i) </ompts:crosscheck> <ompts:check> shared(i)</ompts:check>
+            {
+                #pragma omp atomic
+                i++;
+                /* this increment should act on the shared i implicitly */
+            }
+            </ompts:orphan>
+        }
+    }
+
+    result = i;
+    return (result == NUM_TASKS);
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_task_untied.c b/final/testsuite/c/omp_task_untied.c
new file mode 100644
index 0000000..3d6ebe2
--- /dev/null
+++ b/final/testsuite/c/omp_task_untied.c
@@ -0,0 +1,69 @@
+<ompts:test>
+<ompts:testdescription>Test for the untied clause. First we generate a set of tasks and pause them immediately. Then we resume half of them and check whether they are rescheduled by different threads.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task untied</ompts:directive>
+<ompts:dependences>omp taskwait</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int <ompts:testcode:functionname>omp_task_untied</ompts:testcode:functionname>(FILE * logFile){
+
+  <ompts:orphan:vars>
+  int i;
+  int count;
+  int start_tid[NUM_TASKS];
+  int current_tid[NUM_TASKS];
+  </ompts:orphan:vars>
+  count = 0;
+  
+  /*initialization*/
+  for (i=0; i< NUM_TASKS; i++){
+    start_tid[i]=0;
+    current_tid[i]=0;
+  }
+  
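+  /* An untied task that is suspended (here at the taskwait) may be resumed
+   * by a different thread; the test passes if at least one task changed
+   * threads, i.e. count < NUM_TASKS. */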
+  #pragma omp parallel firstprivate(i)
+  {
+    #pragma omp single
+    {
+      for (i = 0; i < NUM_TASKS; i++) {
+        <ompts:orphan>
+        int myi = i;
+        #pragma omp task <ompts:check>untied</ompts:check>
+        {
+          my_sleep(SLEEPTIME);
+          start_tid[myi] = omp_get_thread_num();
+          current_tid[myi] = omp_get_thread_num();
+          
+          #pragma omp taskwait
+          
+          <ompts:check>if((start_tid[myi] %2) !=0){</ompts:check>
+            my_sleep(SLEEPTIME);
+            current_tid[myi] = omp_get_thread_num();
+          <ompts:check>
+          } /* end of if */ 
+          else {
+            current_tid[myi] = omp_get_thread_num();
+          }
+          </ompts:check>
+
+        } /*end of omp task */
+        </ompts:orphan>
+      } /* end of for */
+    } /* end of single */
+  } /* end of parallel */
+
+  for (i=0;i<NUM_TASKS; i++)
+  {
+    printf("start_tid[%d]=%d, current_tid[%d]=%d\n",i, start_tid[i], i , current_tid[i]);
+    if (current_tid[i] == start_tid[i])
+      count++;
+  }
+  return (count<NUM_TASKS);
+}
+</ompts:testcode>
+
+</ompts:test>
diff --git a/final/testsuite/c/omp_taskwait.c b/final/testsuite/c/omp_taskwait.c
new file mode 100644
index 0000000..88761f2
--- /dev/null
+++ b/final/testsuite/c/omp_taskwait.c
@@ -0,0 +1,75 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp taskwait directive. First we generate a set of tasks which set the elements of an array to a specific value. Then we perform a taskwait and check if all tasks have finished, meaning all array elements contain the right value. Then we generate a second set of tasks setting the array elements to another value. After the parallel region we check if all tasks of the second set finished and were executed after the tasks of the first set.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp taskwait</ompts:directive>
+<ompts:dependences>omp single,omp task</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+
+int <ompts:testcode:functionname>omp_taskwait</ompts:testcode:functionname>(FILE * logFile){
+    int result1 = 0;     /* Stores number of not finished tasks after the taskwait */
+    int result2 = 0;     /* Stores number of wrong array elements at the end */
+
+    int array[NUM_TASKS];
+    int i;
+
+    /* fill array */
+    for (i = 0; i < NUM_TASKS; i++) 
+        array[i] = 0;
+
+#pragma omp parallel 
+    {
+#pragma omp single
+        {
+            for (i = 0; i < NUM_TASKS; i++) {
+                /* First we have to store the value of the loop index in a new variable
+                 * which will be private for each task because otherwise it will be overwritten
+                 * if the execution of the task takes longer than the time which is needed to 
+                 * enter the next step of the loop!
+                 */
+                int myi;
+                myi = i;
+#pragma omp task
+                {
+                    my_sleep (SLEEPTIME);
+                    array[myi] = 1;
+                } /* end of omp task */
+            } /* end of for */
+
+<ompts:orphan>
+<ompts:check>#pragma omp taskwait</ompts:check>
+</ompts:orphan>
+
+            /* check if all tasks were finished */
+            for (i = 0; i < NUM_TASKS; i++) 
+                if (array[i] != 1)
+                    result1++;
+
+            /* generate some more tasks which now shall overwrite
+             * the values in the array */
+            for (i = 0; i < NUM_TASKS; i++) {
+                int myi;
+                myi = i;
+#pragma omp task
+                {
+                    array[myi] = 2;
+                } /* end of omp task */
+            } /* end of for */
+
+        } /* end of single */
+    } /*end of parallel */
+
+    /* final check, if all array elements contain the right values: */
+    for (i = 0; i < NUM_TASKS; i++) {
+        if (array[i] != 2)
+            result2++;
+    }
+
+    return ((result1 == 0) && (result2 == 0));
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_taskyield.c b/final/testsuite/c/omp_taskyield.c
new file mode 100644
index 0000000..1d1271e
--- /dev/null
+++ b/final/testsuite/c/omp_taskyield.c
@@ -0,0 +1,59 @@
+<ompts:test>
+<ompts:testdescription>Test for the taskyield directive. First we generate a set of tasks and pause them immediately. Then we resume half of them and check whether they are rescheduled by different threads.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp taskyield</ompts:directive>
+<ompts:dependences>omp taskwait</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int <ompts:testcode:functionname>omp_taskyield</ompts:testcode:functionname>(FILE * logFile){
+
+  <ompts:orphan:vars>
+  int i;
+  int count = 0;
+  int start_tid[NUM_TASKS];
+  int current_tid[NUM_TASKS];
+  </ompts:orphan:vars>
+  for (i=0; i< NUM_TASKS; i++){
+    start_tid[i]=0;
+    current_tid[i]=0;
+  }
+  
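+  /* taskyield is a scheduling point at which an untied task may migrate to
+   * another thread; the test passes if at least one task is resumed by a
+   * different thread (count < NUM_TASKS). */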
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (i = 0; i < NUM_TASKS; i++) {
+        <ompts:orphan>
+        int myi = i;
+        <ompts:check>#pragma omp task untied</ompts:check>
+        {
+          my_sleep(SLEEPTIME);
+          start_tid[myi] = omp_get_thread_num();
+          
+          #pragma omp taskyield
+          
+          if((start_tid[myi] %2) ==0){
+            my_sleep(SLEEPTIME);
+            current_tid[myi] = omp_get_thread_num();
+          } /*end of if*/
+        } /* end of omp task */
+        </ompts:orphan>
+      } /* end of for */
+    } /* end of single */
+  } /* end of parallel */
+
+  for (i=0;i<NUM_TASKS; i++)
+  {
+    //printf("start_tid[%d]=%d, current_tid[%d]=%d\n",i, start_tid[i], i , current_tid[i]);
+    if (current_tid[i] == start_tid[i])
+      count++;
+  }
+  return (count<NUM_TASKS);
+}
+</ompts:testcode>
+
+</ompts:test>
diff --git a/final/testsuite/c/omp_test_lock.c b/final/testsuite/c/omp_test_lock.c
new file mode 100644
index 0000000..e889dfb
--- /dev/null
+++ b/final/testsuite/c/omp_test_lock.c
@@ -0,0 +1,47 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_test_lock function. The test counts the threads entering and exiting a region that only one thread at a time may execute, built by spinning on omp_test_lock.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_test_lock</ompts:directive>
+<ompts:dependences>omp flush</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+omp_lock_t lck;
+
+int <ompts:testcode:functionname>omp_test_lock</ompts:testcode:functionname>(FILE * logFile)
+{
+    int nr_threads_in_single = 0;
+    int result = 0;
+    int nr_iterations = 0;
+    int i;
+
+    omp_init_lock (&lck);
+
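+    /* Spin until omp_test_lock acquires the lock; the code between the
+     * acquisition and omp_unset_lock is then executed by one thread at a
+     * time, so nr_threads_in_single never exceeds 1 and result stays 0. */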
+#pragma omp parallel shared(lck)  
+    {
+
+#pragma omp for
+	for (i = 0; i < LOOPCOUNT; i++)
+	{
+	    /*omp_set_lock(&lck);*/
+	    <ompts:orphan>
+		<ompts:check>while (!omp_test_lock (&lck))
+		{};</ompts:check>
+	    </ompts:orphan>
+#pragma omp flush
+	    nr_threads_in_single++;
+#pragma omp flush           
+	    nr_iterations++;
+	    nr_threads_in_single--;
+	    result = result + nr_threads_in_single;
+	    <ompts:check>omp_unset_lock (&lck);</ompts:check>
+	}
+    }
+    omp_destroy_lock(&lck);
+
+    return ((result == 0) && (nr_iterations == LOOPCOUNT));
+
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_test_nest_lock.c b/final/testsuite/c/omp_test_nest_lock.c
new file mode 100644
index 0000000..a8d6a45
--- /dev/null
+++ b/final/testsuite/c/omp_test_nest_lock.c
@@ -0,0 +1,48 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_test_nest_lock function.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_test_nest_lock</ompts:directive>
+<ompts:dependences>omp flush</ompts:dependences>
+<ompts:testcode>
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+
+static omp_nest_lock_t lck;
+
+int <ompts:testcode:functionname>omp_test_nest_lock</ompts:testcode:functionname>(FILE * logFile)
+{
+    
+    int nr_threads_in_single = 0;
+    int result = 0;
+    int nr_iterations = 0;
+    int i;
+
+    omp_init_nest_lock (&lck);
+
+#pragma omp parallel shared(lck) 
+    {
+
+#pragma omp for
+	for (i = 0; i < LOOPCOUNT; i++)
+	{
+	    /*omp_set_lock(&lck);*/
+<ompts:orphan>
+	    <ompts:check>while(!omp_test_nest_lock (&lck))
+	    {};</ompts:check>
+</ompts:orphan>
+#pragma omp flush
+	    nr_threads_in_single++;
+#pragma omp flush           
+	    nr_iterations++;
+	    nr_threads_in_single--;
+	    result = result + nr_threads_in_single;
+	    <ompts:check>omp_unset_nest_lock (&lck);</ompts:check>
+	}
+    }
+    omp_destroy_nest_lock (&lck);
+
+    return ((result == 0) && (nr_iterations == LOOPCOUNT));
+}
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_threadprivate.c b/final/testsuite/c/omp_threadprivate.c
new file mode 100644
index 0000000..8e4d7aa
--- /dev/null
+++ b/final/testsuite/c/omp_threadprivate.c
@@ -0,0 +1,97 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp threadprivate directive by filling an array with random numbers in a parallel region. Each thread generates one number of the array and saves it in a temporary threadprivate variable. In a second parallel region the test verifies that the temporary variable still contains the former value by comparing it with the one in the array.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp threadprivate</ompts:directive>
+<ompts:dependences>omp critical,omp_set_dynamic,omp_get_num_threads</ompts:dependences>
+<ompts:testcode>
+/*
+ * Threadprivate is tested in 2 ways:
+ * 1. The global variable declared as threadprivate should have
+ *    local copy for each thread. Otherwise race condition and 
+ *    wrong result.
+ * 2. If the value of local copy is retained for the two adjacent
+ *    parallel regions
+ */
+#include "omp_testsuite.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+static int sum0=0;
+static int myvalue = 0;
+
+<ompts:check>#pragma omp threadprivate(sum0)</ompts:check>
+<ompts:check>#pragma omp threadprivate(myvalue)</ompts:check>
+
+
+int <ompts:testcode:functionname>omp_threadprivate</ompts:testcode:functionname>(FILE * logFile)
+{
+	int sum = 0;
+	int known_sum;
+	int i; 
+	int iter;
+	int *data;
+	int size;
+	int failed = 0;
+	int my_random;
+	omp_set_dynamic(0);
+
+    #pragma omp parallel private(i) 
+    {
+	  sum0 = 0;
+      #pragma omp for 
+	    for (i = 1; i <= LOOPCOUNT; i++)
+		{
+			sum0 = sum0 + i;
+		} /*end of for*/
+      #pragma omp critical
+	  {
+	      sum = sum + sum0;
+	  } /*end of critical */
+	} /* end of parallel */    
+	known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+	if (known_sum != sum ) {
+		fprintf (logFile, " known_sum = %d, sum = %d\n", known_sum, sum);
+	}
+
+	/* the next parallel region is just used to get the number of threads*/
+	omp_set_dynamic(0);
+    #pragma omp parallel
+	{
+      #pragma omp master
+	  {
+			size=omp_get_num_threads();
+			data=(int*) malloc(size*sizeof(int));
+	  }
+	}/* end parallel*/
+
+
+	srand(45);
+	for (iter = 0; iter < 100; iter++){
+		my_random = rand();	/* random number generator is called inside serial region*/
+
+	/* the first parallel region is used to initialize myvalue and the array with my_random+rank */
+    #pragma omp parallel
+	{
+	    int rank;
+		rank = omp_get_thread_num ();
+		myvalue = data[rank] = my_random + rank;
+	}
+
+	/* the second parallel region verifies that the value of "myvalue" is retained */
+    #pragma omp parallel reduction(+:failed)
+	{
+	    int rank;
+		rank = omp_get_thread_num ();
+		failed = failed + (myvalue != data[rank]);
+		if(myvalue != data[rank]){
+		  fprintf (logFile, " myvalue = %d, data[rank]= %d\n", myvalue, data[rank]);
+		}
+	}
+  }
+  free (data);
+
+	return (known_sum == sum) && !failed;
+
+} /* end of check_threadprivate*/
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/c/omp_threadprivate_for.c b/final/testsuite/c/omp_threadprivate_for.c
new file mode 100644
index 0000000..034359d
--- /dev/null
+++ b/final/testsuite/c/omp_threadprivate_for.c
@@ -0,0 +1,44 @@
+<ompts:test>
+<ompts:testdescription>Test which checks if a variable declared as threadprivate can be used as a loop index.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp threadprivate</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+#include "omp_testsuite.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+static int i;
+<ompts:check>#pragma omp threadprivate(i)</ompts:check>
+
+int <ompts:testcode:functionname>omp_threadprivate_for</ompts:testcode:functionname>(FILE * logFile)
+{
+		int known_sum;
+		int sum;
+		known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+		sum = 0;
+
+#pragma omp parallel
+	{
+		int sum0 = 0;
+#pragma omp for
+		for (i = 1; i <= LOOPCOUNT; i++)
+		{
+			sum0 = sum0 + i;
+		} /*end of for*/
+#pragma omp critical
+		{
+			sum = sum + sum0;
+		} /*end of critical */
+	} /* end of parallel */    
+	
+	if (known_sum != sum ) {
+		fprintf (logFile, " known_sum = %d, sum = %d\n", known_sum, sum);
+	}
+
+	return (known_sum == sum);
+
+} /* end of check_threadprivate*/
+</ompts:testcode>
+</ompts:test>
+
diff --git a/final/testsuite/common_utility.f b/final/testsuite/common_utility.f
new file mode 100644
index 0000000..f6d1078
--- /dev/null
+++ b/final/testsuite/common_utility.f
@@ -0,0 +1,57 @@
+      subroutine print_result(s,crossfailed,M,name)
+      implicit none
+      character (len=*)::name
+      real cert
+      integer M,crossfailed
+      integer s
+      character (len=11) :: c
+      character (len=18) :: c2
+      cert=100.0*crossfailed/M
+!      print *, "cert", cert, "cross ", crossfailed
+      c="% certainty"
+      c2=" ... verified with "
+      if(s.eq.1) then
+         write (*,"(A, A, F7.2, A)") name, c2, cert, c
+      else
+         write (*,"(A,A)") name," ... FAILED"
+      endif
+      end
+
+      subroutine do_test(test_func,cross_test_func,name,N,failed,
+     x  num_tests,crosschecked)
+      implicit none
+      integer succeed
+      integer crossfail
+      integer failed
+      integer, external::test_func
+      integer, external::cross_test_func
+      character (len=*)::name
+      integer fail
+      integer N,i
+      integer num_tests,crosschecked
+      num_tests=num_tests+1
+      succeed=1
+      crossfail=0
+      fail=0
+      do i=1,N
+         if(test_func().eq.0) then
+            succeed=0
+            fail=fail+1
+            exit
+         end if
+         if(cross_test_func().eq.0) then
+!            print *, crossfail
+            crossfail=crossfail+1
+         end if
+      enddo
+      
+      if (fail .ne. 0) then
+         failed=failed+1
+      else
+         if(crossfail .ne. 0) then
+            crosschecked=crosschecked+1
+         end if
+      endif
+      call print_result(succeed,crossfail,N,name)
+      end
diff --git a/final/testsuite/fortran/OMP1_TEST b/final/testsuite/fortran/OMP1_TEST
new file mode 100644
index 0000000..a164ce0
--- /dev/null
+++ b/final/testsuite/fortran/OMP1_TEST
@@ -0,0 +1,36 @@
+has_openmp
+omp_nested
+omp_get_num_threads
+omp_in_parallel
+do_ordered
+do_reduction
+do_private
+do_firstprivate
+do_lastprivate
+section_reduction
+section_private
+section_firstprivate
+section_lastprivate
+single
+single_private
+single_nowait
+par_do_ordered
+par_do_reduction
+par_do_private
+par_do_firstprivate
+par_do_lastprivate
+par_section_reduction
+par_section_private
+par_section_firstprivate
+par_section_lastprivate
+omp_master_thread
+omp_critical
+omp_atomic
+omp_barrier
+omp_flush
+omp_threadprivate
+omp_copyin
+omp_lock
+omp_testlock
+omp_nest_lock
+omp_nest_testlock
diff --git a/final/testsuite/fortran/OMP2_TEST b/final/testsuite/fortran/OMP2_TEST
new file mode 100644
index 0000000..2e47665
--- /dev/null
+++ b/final/testsuite/fortran/OMP2_TEST
@@ -0,0 +1,5 @@
+omp_num_threads
+omp_workshare
+omp_time
+omp_ticks_time
+single_copyprivate
diff --git a/final/testsuite/fortran/common_utility.f b/final/testsuite/fortran/common_utility.f
new file mode 100644
index 0000000..f6d1078
--- /dev/null
+++ b/final/testsuite/fortran/common_utility.f
@@ -0,0 +1,57 @@
+      subroutine print_result(s,crossfailed,M,name)
+      implicit none
+      character (len=*)::name
+      real cert
+      integer M,crossfailed
+      integer s
+      character (len=11) :: c
+      character (len=18) :: c2
+      cert=100.0*crossfailed/M
+!      print *, "cert", cert, "cross ", crossfailed
+      c="% certainty"
+      c2=" ... verified with "
+      if(s.eq.1) then
+         write (*,"(A, A, F7.2, A)") name, c2, cert, c
+      else
+         write (*,"(A,A)") name," ... FAILED"
+      endif
+      end
+
+      subroutine do_test(test_func,cross_test_func,name,N,failed,
+     x  num_tests,crosschecked)
+      implicit none
+      integer succeed
+      integer crossfail
+      integer failed
+      integer, external::test_func
+      integer, external::cross_test_func
+      character (len=*)::name
+      integer fail
+      integer N,i
+      integer num_tests,crosschecked
+      num_tests=num_tests+1
+      succeed=1
+      crossfail=0
+      fail=0
+      do i=1,N
+         if(test_func().eq.0) then
+            succeed=0
+            fail=fail+1
+            exit
+         end if
+         if(cross_test_func().eq.0) then
+!            print *, crossfail
+            crossfail=crossfail+1
+         end if
+      enddo
+      
+      if (fail .ne. 0) then
+         failed=failed+1
+      else
+         if(crossfail .ne. 0) then
+            crosschecked=crosschecked+1
+         end if
+      endif
+      call print_result(succeed,crossfail,N,name)
+      end
diff --git a/final/testsuite/fortran/do_collapse.f b/final/testsuite/fortran/do_collapse.f
new file mode 100644
index 0000000..1817250
--- /dev/null
+++ b/final/testsuite/fortran/do_collapse.f
@@ -0,0 +1,67 @@
+<ompts:test>
+<ompts:testdescription>Test with the omp do collapse clause. It binds two loops together. Without the collapse clause, the iterations of the first loop will not be ordered.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp do collapse</ompts:directive>
+<ompts:dependences>omp critical,omp do schedule</ompts:dependences>
+<ompts:testcode>
+      LOGICAL FUNCTION check_is_larger(i)
+        implicit none
+        INTEGER :: i
+        INTEGER, save :: last_i
+        LOGICAL :: is_larger
+
+        if (i .eq. 1) last_i = 0
+
+        is_larger = (i .ge. last_i) .and. ((i-last_i) .le. 1)
+        last_i = i
+
+        check_is_larger = is_larger
+
+      END FUNCTION check_is_larger
+
+      INTEGER FUNCTION <ompts:testcode:functionname>do_collapse</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER i, j
+<ompts:orphan:vars>
+        LOGICAL check_is_larger
+        LOGICAL my_is_larger
+        LOGICAL is_larger
+        COMMON /orphvars/ is_larger
+</ompts:orphan:vars>
+
+        INCLUDE "omp_testsuite.f"
+
+        is_larger = .true.
+
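+!... With collapse(2) the i and j loops form a single iteration space, so
+!... the ordered construct must see i grow monotonically over all threads.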
+!$omp parallel private(my_is_larger)
+<ompts:orphan>
+        my_is_larger = .true.
+!$omp do private(i,j) schedule(static,1) <ompts:check>collapse(2)</ompts:check>
+!$omp+   ordered
+        DO i=1,100
+          <ompts:crosscheck>
+          my_is_larger = check_is_larger(i) .and. my_is_larger
+          </ompts:crosscheck>
+          DO j=1,100
+          <ompts:check>
+!$omp ordered
+            my_is_larger = check_is_larger(i) .and. my_is_larger
+!$omp end ordered
+          </ompts:check>
+          END DO
+        END DO
+!$omp end do
+!$omp critical
+        is_larger = is_larger .and. my_is_larger
+!$omp end critical
+</ompts:orphan>
+!$omp end parallel
+
+      if (is_larger) then
+        <testfunctionname></testfunctionname> = 1
+      else
+        <testfunctionname></testfunctionname> = 0
+      end if
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/do_firstprivate.f b/final/testsuite/fortran/do_firstprivate.f
new file mode 100644
index 0000000..0792361
--- /dev/null
+++ b/final/testsuite/fortran/do_firstprivate.f
@@ -0,0 +1,56 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp do firstprivate clause by counting up a variable in a parallelized loop. Each thread has a firstprivate variable (1) and a variable (2) declared by do firstprivate. First it stores the result of its last iteration in variable (2). Then it stores the value of variable (2) in its firstprivate variable (1). At the end all firstprivate variables (1) are added to a total sum in a critical section and compared with the correct result.</ompts:testdescription>
+<ompts:version>2.0</ompts:version>
+<ompts:directive>omp do firstprivate</ompts:directive>
+<ompts:dependences>omp parallel private, omp critical</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>do_firstprivate</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum, known_sum
+        INTEGER numthreads
+        INTEGER omp_get_num_threads
+<ompts:orphan:vars>
+        INTEGER sum0, sum1, i
+        COMMON /orphvars/ sum0, sum1, i
+</ompts:orphan:vars>
+  
+        INCLUDE "omp_testsuite.f"
+  
+        sum = 0
+        sum0 = 12345
+        sum1 = 0
+  
+  
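+!... Each thread's firstprivate copy of sum0 starts at 12345 and accumulates
+!... its share of 1..LOOPCOUNT, hence
+!... known_sum = 12345*numthreads + LOOPCOUNT*(LOOPCOUNT+1)/2 below.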
+!$omp parallel firstprivate(sum1)
+!$omp single
+        numthreads = omp_get_num_threads()
+!$omp end single
+
+
+<ompts:orphan>
+!$omp do <ompts:check>firstprivate(sum0)</ompts:check><ompts:crosscheck>private (sum0)</ompts:crosscheck>
+        DO i=1,LOOPCOUNT
+          sum0 = sum0 + i
+          sum1 = sum0
+        END DO
+!$omp end do
+</ompts:orphan>
+
+
+!$omp critical
+        WRITE (1,*) sum0
+        sum = sum + sum1
+!$omp end critical
+!$omp end parallel
+
+
+        known_sum=12345*numthreads+ (LOOPCOUNT*(LOOPCOUNT+1))/2
+        IF ( known_sum .EQ. sum ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          WRITE (1,*) "Found sum was", sum, "instead of", known_sum
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/do_lastprivate.f b/final/testsuite/fortran/do_lastprivate.f
new file mode 100644
index 0000000..4202e50
--- /dev/null
+++ b/final/testsuite/fortran/do_lastprivate.f
@@ -0,0 +1,39 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp do lastprivate clause by counting up a variable in a parallelized loop. Each thread saves the next summand in a lastprivate variable i0. At the end i0 is compared to the value of the expected last summand.</ompts:testdescription>
+<ompts:version>2.0</ompts:version>
+<ompts:directive>omp do private</ompts:directive>
+<ompts:dependences>omp parallel firstprivate, omp critical</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>do_lastprivate</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum,known_sum
+<ompts:orphan:vars>
+        INTEGER sum0,i0,i
+        COMMON /orphvars/ sum0,i0,i
+</ompts:orphan:vars>
+        INCLUDE "omp_testsuite.f"
+        sum = 0
+        sum0 = 0
+        i0 = -1
+!$omp parallel firstprivate(sum0)
+<ompts:orphan>
+!$omp do schedule(static,7) <ompts:check>lastprivate(i0)</ompts:check>
+        DO i=1, LOOPCOUNT
+          sum0 = sum0 + i
+          i0 = i
+        END DO
+<ompts:check>!$omp end do</ompts:check>
+</ompts:orphan>
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp end parallel
+      known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2
+        IF ( known_sum .EQ. sum .AND. i0 .EQ. LOOPCOUNT ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/do_ordered.f b/final/testsuite/fortran/do_ordered.f
new file mode 100644
index 0000000..b087f81
--- /dev/null
+++ b/final/testsuite/fortran/do_ordered.f
@@ -0,0 +1,73 @@
+<ompts:test>
+<ompts:directive>do ordered</ompts:directive>
+<ompts:version>2.0</ompts:version>
+<ompts:dependences>parallel private, critical</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION i_islarger(i)
+        IMPLICIT NONE
+        INTEGER i, islarger
+        INTEGER last_i
+        COMMON /mycom/ last_i
+        IF ( i .GT. last_i) THEN
+          islarger = 1
+        ELSE
+          islarger = 0
+        END IF
+        last_i = i
+        i_islarger = islarger
+      END
+
+      INTEGER FUNCTION <ompts:testcode:functionname>do_ordered</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER known_sum, is_larger
+        INTEGER last_i
+        INTEGER i_islarger
+        COMMON /mycom/ last_i
+
+<ompts:orphan:parms> i </ompts:orphan:parms>
+
+<ompts:orphan:vars>
+        INTEGER sum, i, my_islarger
+        COMMON /orphvars/ my_islarger, sum
+</ompts:orphan:vars>
+
+        sum = 0
+        is_larger = 1
+        last_i = 0
+!$omp parallel private(my_islarger)
+        my_islarger = 1
+!$omp do schedule(static,1) ordered
+        DO i=1, 99
+<ompts:orphan>
+<ompts:check>
+!$omp ordered
+</ompts:check>
+          IF (i_islarger(i) .EQ. 1 .AND. my_islarger .EQ. 1) THEN
+            my_islarger = 1
+          ELSE
+            my_islarger = 0
+          END IF
+          sum = sum + i
+<ompts:check>
+!$omp end ordered
+</ompts:check>
+</ompts:orphan>
+        END DO
+!$omp end do
+!$omp critical
+        IF (is_larger .EQ. 1 .AND. my_islarger .EQ. 1 ) THEN
+          is_larger = 1
+        ELSE
+          is_larger = 0
+        END IF
+!$omp end critical
+!$omp end parallel
+        known_sum = (99*100)/2
+        IF ( known_sum .EQ. sum .AND. is_larger .EQ. 1) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/do_private.f b/final/testsuite/fortran/do_private.f
new file mode 100644
index 0000000..2e587cf
--- /dev/null
+++ b/final/testsuite/fortran/do_private.f
@@ -0,0 +1,67 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp do private clause by counting up a variable in a parallelized loop. Each thread has a private variable (1) and a variable (2) declared by do private. First it stores the result of its last iteration in variable (2). Then this thread waits some time before it stores the value of variable (2) in its private variable (1). At the beginning of the next iteration the value of (1) is assigned to (2). At the end all private variables (1) are added to a total sum in a critical section and compared with the correct result.</ompts:testdescription>
+<ompts:version>2.0</ompts:version>
+<ompts:directive>omp do private</ompts:directive>
+<ompts:dependences>omp parallel private, omp flush, omp critical</ompts:dependences>
+<ompts:testcode>
+      SUBROUTINE do_some_work()
+        IMPLICIT NONE
+        INTEGER i
+        INTRINSIC sqrt
+        DOUBLE PRECISION sum
+
+        INCLUDE "omp_testsuite.f"
+        sum=0.0
+        DO i=0, LOOPCOUNT-1
+          sum = sum + sqrt(REAL(i))
+        ENDDO
+
+      END
+
+      INTEGER FUNCTION <ompts:testcode:functionname>do_private</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum, known_sum
+<ompts:orphan:vars>
+        INTEGER sum0, sum1, i
+        COMMON /orphvars/ sum0, sum1, i
+</ompts:orphan:vars>        
+
+        INCLUDE "omp_testsuite.f"
+
+        sum  = 0
+        sum0 = 0
+        sum1 = 0
+
+!$omp parallel private(sum1)
+        sum0 = 0
+        sum1 = 0
+
+<ompts:orphan>
+!$omp do <ompts:check>private(sum0)</ompts:check> schedule(static,1)
+        DO i=1, LOOPCOUNT
+          sum0 = sum1
+!$omp flush
+          sum0 = sum0 + i
+          CALL do_some_work()
+!$omp flush
+!          print *, sum0
+          sum1 = sum0
+        END DO
+!$omp end do
+</ompts:orphan>
+
+!$omp critical
+        sum = sum + sum1
+!$omp end critical
+!$omp end parallel
+
+        known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2
+!        print *, "sum:", sum, "known_sum", known_sum
+        IF ( known_sum .EQ. sum) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/do_reduction.f b/final/testsuite/fortran/do_reduction.f
new file mode 100644
index 0000000..2b25a45
--- /dev/null
+++ b/final/testsuite/fortran/do_reduction.f
@@ -0,0 +1,514 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp do reduction directive with all its options.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp do reduction</ompts:directive>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>do_reduction</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum2, known_sum
+        INTEGER known_product
+        DOUBLE PRECISION rounding_error, dpt
+        INTEGER double_DIGITS
+        DOUBLE PRECISION dknown_sum
+        INTEGER result
+        INCLUDE "omp_testsuite.f"
+        PARAMETER (known_product=3628800)
+        PARAMETER (rounding_error=1.E-6)
+<ompts:orphan:vars>
+        INTEGER sum,diff,product,i
+        DOUBLE PRECISION dsum,ddiff,dt
+        LOGICAL logic_and, logic_or, logic_eqv,logic_neqv
+        LOGICAL logics(LOOPCOUNT)
+        INTEGER bit_and, bit_or
+        INTEGER exclusiv_bit_or
+        INTEGER min_value, max_value
+        INTEGER int_array(LOOPCOUNT)
+        DOUBLE PRECISION d_array(LOOPCOUNT)
+        DOUBLE PRECISION dmin, dmax
+        COMMON /orphvars/ sum,product,diff,i,dsum,ddiff,dt,logic_and,
+     &    logic_or,logic_eqv,logic_neqv,logics,bit_and,bit_or,int_array,
+     &    exclusiv_bit_or,min_value,dmin,dmax,d_array,max_value
+        INTEGER MAX_FACTOR
+        PARAMETER (double_DIGITS=20,MAX_FACTOR=10)
+</ompts:orphan:vars>
+
+        dt = 1./3.
+        known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2
+        product = 1
+        sum2 = 0
+        sum = 0
+        dsum = 0.
+        result =0 
+        logic_and = .true.
+        logic_or = .false.
+        bit_and = 1
+        bit_or = 0
+        exclusiv_bit_or = 0
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic, 1) <ompts:check>reduction(+:sum)</ompts:check>
+        DO i = 1, LOOPCOUNT
+          sum = sum + i
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (known_sum .NE. sum) THEN
+          result = result + 1
+          WRITE(1,*) "Error in sum with integers: Result was ",
+     &    sum,"instead of ", known_sum
+        END IF
+
+
+        diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic, 1) <ompts:check>reduction (-: diff)</ompts:check>
+        DO i = 1, LOOPCOUNT
+          diff = diff - i
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+  
+        IF ( diff .NE. 0 ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in difference with integers: Result was ",
+     &    sum,"instead of 0."
+        END IF
+
+!... Test for doubles
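+!... dsum accumulates the geometric series dt**0 + ... + dt**(DIGITS-1),
+!... whose closed form is (1 - dt**DIGITS) / (1 - dt) = dknown_sum.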
+        dsum =0.
+        dpt = 1
+
+        DO i=1, DOUBLE_DIGITS
+          dpt= dpt * dt
+        END DO
+        dknown_sum = (1-dpt)/(1-dt)
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(+:dsum)</ompts:check>
+        DO i=0,DOUBLE_DIGITS-1
+          dsum = dsum + dt**i
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+ 
+        IF(dsum .NE. dknown_sum .AND. 
+     &     ABS(dsum - dknown_sum) .GT. rounding_error ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in sum with doubles: Result was ",
+     &       dsum,"instead of ",dknown_sum,"(Difference: ",
+     &       dsum - dknown_sum,")"
+        END IF
+        dpt = 1
+
+
+      
+        DO i=1, DOUBLE_DIGITS
+           dpt = dpt*dt
+        END DO
+        ddiff = ( 1-dpt)/(1-dt)
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(-:ddiff)</ompts:check>
+        DO i=0, DOUBLE_DIGITS-1
+          ddiff = ddiff - dt**i
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( ABS(ddiff) .GT. rounding_error ) THEN
+           result = result + 1
+           WRITE(1,*) "Error in Difference with doubles: Result was ",
+     &       ddiff,"instead of 0.0"
+        END IF
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(*:product)</ompts:check>
+        DO i=1,MAX_FACTOR
+           product = product * i
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (known_product .NE. product) THEN
+           result = result + 1
+           write(1,*) "Error in Product with integers: Result was ",
+     &       product," instead of",known_product 
+        END IF
+
+        DO i=1,LOOPCOUNT
+          logics(i) = .TRUE.
+        END DO
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(.AND.:logic_and)</ompts:check>
+        DO i=1,LOOPCOUNT
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (.NOT. logic_and) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic AND part 1"
+        END IF
+
+
+        logic_and = .TRUE.
+        logics(LOOPCOUNT/2) = .FALSE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(.AND.:logic_and)</ompts:check>
+        DO i=1,LOOPCOUNT
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (logic_and) THEN
+           result = result + 1
+           WRITE(1,*) "Error in logic AND pass 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+         logics(i) = .FALSE.
+        END DO
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(.OR.:logic_or)</ompts:check>
+        DO i = 1, LOOPCOUNT
+           logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (logic_or) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic OR part 1"
+        END IF
+
+        logic_or = .FALSE.
+        logics(LOOPCOUNT/2) = .TRUE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(.OR.:logic_or)</ompts:check>
+        DO i=1,LOOPCOUNT
+           logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( .NOT. logic_or ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic OR part 2"
+        END IF
+
+!... Test logic EQV, unique in Fortran
+        DO i=1, LOOPCOUNT
+         logics(i) = .TRUE.
+        END DO
+
+        logic_eqv = .TRUE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(.EQV.:logic_eqv)</ompts:check>
+        DO i = 1, LOOPCOUNT
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (.NOT. logic_eqv) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic EQV part 1"
+        END IF
+
+        logic_eqv = .TRUE.
+        logics(LOOPCOUNT/2) = .FALSE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(.EQV.:logic_eqv)</ompts:check>
+        DO i=1,LOOPCOUNT
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( logic_eqv ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic EQV part 2"
+        END IF
+
+!... Test logic NEQV, which is unique in Fortran
+        DO i=1, LOOPCOUNT
+         logics(i) = .FALSE.
+        END DO
+
+        logic_neqv = .FALSE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(.NEQV.:logic_neqv)</ompts:check>
+        DO i = 1, LOOPCOUNT
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (logic_neqv) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic NEQV part 1"
+        END IF
+
+        logic_neqv = .FALSE.
+        logics(LOOPCOUNT/2) = .TRUE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(.neqv.:logic_neqv)</ompts:check>
+        DO i=1,LOOPCOUNT
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( .NOT. logic_neqv ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic NEQV part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+           int_array(i) = 1
+        END DO
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(IAND:bit_and)</ompts:check>
+        DO i=1, LOOPCOUNT
+!... iand(I,J): Returns value resulting from boolean AND of 
+!... pair of bits in each of I and J. 
+          bit_and = IAND(bit_and,int_array(i))
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( bit_and .LT. 1 ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in IAND part 1"
+        END IF
+
+        bit_and = 1
+        int_array(LOOPCOUNT/2) = 0
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(IAND:bit_and)</ompts:check>
+        DO i=1, LOOPCOUNT
+          bit_and = IAND ( bit_and, int_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF( bit_and .GE. 1) THEN
+          result = result + 1
+          WRITE(1,*) "Error in IAND part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 0
+        END DO
+
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(IOR:bit_or)</ompts:check>
+        DO i=1, LOOPCOUNT
+!... Ior(I,J): Returns value resulting from boolean OR of 
+!... pair of bits in each of I and J. 
+          bit_or = IOR(bit_or, int_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( bit_or .GE. 1) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ior part 1"
+        END IF
+
+
+        bit_or = 0
+        int_array(LOOPCOUNT/2) = 1
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(IOR:bit_or)</ompts:check>
+        DO i=1, LOOPCOUNT
+          bit_or = IOR(bit_or, int_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( bit_or .LE. 0) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ior part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 0
+        END DO
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(IEOR:exclusiv_bit_or)</ompts:check>
+        DO i = 1, LOOPCOUNT
+          exclusiv_bit_or = ieor(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( exclusiv_bit_or .ge. 1) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ieor part 1"
+        END IF
+
+        exclusiv_bit_or = 0
+        int_array(LOOPCOUNT/2) = 1
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(IEOR:exclusiv_bit_or)</ompts:check>
+        DO i = 1, LOOPCOUNT
+          exclusiv_bit_or = IEOR(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( exclusiv_bit_or .LE. 0) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ieor part 2"
+        END IF
+
+        DO i=1,LOOPCOUNT
+          int_array(i) = 10 - i
+        END DO
+
+        min_value = 65535
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(MIN:min_value)</ompts:check>
+        DO i = 1, LOOPCOUNT
+          min_value = MIN(min_value,int_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( min_value .GT. (10-LOOPCOUNT) )THEN
+          result = result + 1
+          WRITE(1,*) "Error in integer MIN"
+        END IF
+
+
+        DO i=1,LOOPCOUNT
+           int_array(i) = i
+        END DO
+
+        max_value = -32768
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(MAX:max_value)</ompts:check>
+        DO i = 1, LOOPCOUNT
+          max_value = MAX(max_value,int_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( max_value .LT. LOOPCOUNT )THEN
+          result = result + 1
+          WRITE(1,*) "Error in integer MAX"
+        END IF
+
+!... test double min, max
+        DO i=1,LOOPCOUNT
+           d_array(i) = 10 - i*dt
+        END DO
+
+        dmin = 2**10
+        dt = 0.5
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(MIN:dmin)</ompts:check>
+        DO i = 1, LOOPCOUNT
+            dmin= MIN(dmin,d_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( dmin .GT. (10-dt) )THEN
+          result = result + 1
+          WRITE(1,*) "Error in double MIN"
+        END IF
+
+
+        DO i=1,LOOPCOUNT
+           d_array(i) = i * dt
+        END DO
+
+        dmax= - (2**10)
+
+!$omp parallel
+<ompts:orphan>
+!$omp do schedule(dynamic,1) <ompts:check>reduction(MAX:dmax)</ompts:check>
+        DO i = 1, LOOPCOUNT
+            dmax= MAX(dmax,d_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( dmax .LT. LOOPCOUNT*dt )THEN
+          result = result + 1
+          WRITE(1,*) "Error in double MAX"
+        END IF
+
+        IF ( result .EQ. 0 ) THEN
+          <testfunctionname></testfunctionname> =  1
+        ELSE
+          <testfunctionname></testfunctionname> =  0
+        END IF
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/do_schedule_dynamic.f b/final/testsuite/fortran/do_schedule_dynamic.f
new file mode 100644
index 0000000..18f8ba7
--- /dev/null
+++ b/final/testsuite/fortran/do_schedule_dynamic.f
@@ -0,0 +1,90 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the dynamic option of the omp do schedule directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp do schedule(dynamic)</ompts:directive>
+<ompts:dependences>omp flush,omp do nowait,omp critical,omp single</ompts:dependences>
+<ompts:testcode>
+
+
+      INTEGER FUNCTION <ompts:testcode:functionname>do_schedule_dynamic</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        CHARACTER*30 logfile
+        INTEGER omp_get_thread_num,omp_get_num_threads
+        INTEGER threads
+        INTEGER count, tmp_count
+        INTEGER,ALLOCATABLE:: tmp(:)
+        INTEGER ii
+        INTEGER result
+
+<ompts:orphan:vars>
+        INTEGER CFDMAX_SIZE
+        PARAMETER (CFDMAX_SIZE = 1000)
+        INTEGER i,tids(0:CFDMAX_SIZE-1),tid,chunk_size
+        COMMON /orphvars/ i,tids,tid,chunk_size
+</ompts:orphan:vars>
+
+        chunk_size = 7
+        count = 0
+        tmp_count = 0
+        result = 0
+        ii = 0
+
+!$omp parallel private(tid)
+        tid = omp_get_thread_num()
+<ompts:orphan>
+!$omp do <ompts:check>schedule(dynamic,chunk_size)</ompts:check>
+        DO i=0, CFDMAX_SIZE-1
+          tids(i) = tid
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        DO i=0, CFDMAX_SIZE - 2
+          IF ( tids(i) .ne. tids(i+1) ) THEN
+            count = count + 1
+          END IF
+        END DO
+ 
+        ALLOCATE( tmp(0:count) )
+        tmp(0) = 1
+ 
+        DO i = 0, CFDMAX_SIZE - 2
+          IF ( tmp_count .GT. count ) THEN
+            WRITE(*,*) "--------------------"
+            WRITE(*,*) "Testinternal Error: List too small!!!"
+            WRITE(*,*) "--------------------"
+            GOTO 10
+          END IF
+          IF ( tids(i) .NE. tids(i+1) ) THEN
+            tmp_count = tmp_count + 1
+            tmp(tmp_count) = 1
+          ELSE
+            tmp(tmp_count) = tmp(tmp_count) + 1
+          END IF
+        END DO
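+
+! ... Worked example (not executed): with chunk_size = 7 and two
+! ... threads, a possible tids pattern is
+! ...   0 0 0 0 0 0 0  1 1 1 1 1 1 1  0 0 0 0 0 0 0  ...
+! ... so tmp holds run lengths 7, 7, 7, ..., each a multiple of
+! ... chunk_size; several adjacent chunks may merge into one longer
+! ... run when the same thread wins consecutive dispatches.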
+
+!... is the dynamic schedule working?
+
+ 10     DO i=0, count -1
+          IF ( MOD(tmp(i),chunk_size) .ne. 0 ) THEN
+! ... two adjacent chunks may be assigned to the same thread,
+! ... so each run length must be a multiple of chunk_size
+            result = result + 1
+            WRITE(1,*) "Intermediate dispatch has wrong chunk size."
+          END IF
+        END DO
+
+        IF ( MOD(tmp(count), chunk_size) .NE. 
+     &     MOD (CFDMAX_SIZE, chunk_size) ) THEN
+          result = result + 1
+          WRITE(1,*) "the last dispatch has wrong chunksize."
+        END IF
+         
+        IF ( result .eq. 0) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/do_schedule_guided.f b/final/testsuite/fortran/do_schedule_guided.f
new file mode 100644
index 0000000..5d0e7ed
--- /dev/null
+++ b/final/testsuite/fortran/do_schedule_guided.f
@@ -0,0 +1,174 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the guided option of the omp do schedule directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp do schedule(guided)</ompts:directive>
+<ompts:dependences>omp flush,omp do nowait,omp critical,omp single</ompts:dependences>
+<ompts:testcode>
+  ! TODO:
+  ! C. Niethammer:
+  !       Find a check to detect whether the test actually ran as
+  !       schedule(static), since that schedule can also pass this
+  !       test when the work is divided into thread-count chunks.
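+  ! Background (sketch): with schedule(guided), each dispatched chunk
+  ! is proportional to the number of unassigned iterations divided by
+  ! the number of threads. E.g., for 150 iterations on 2 threads one
+  ! plausible (implementation-dependent) series of chunk sizes is
+  !   75, 38, 19, 9, 5, 2, 1, 1
+  ! i.e. roughly a geometric decay down to the minimum chunk size.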
+      INTEGER FUNCTION <ompts:testcode:functionname>do_schedule_guided</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER omp_get_thread_num,omp_get_num_threads
+        CHARACTER*20 logfile
+        INTEGER threads
+        INTEGER tmp_count
+        INTEGER, allocatable :: tmp(:)
+        INTEGER ii, flag
+        INTEGER result
+        INTEGER expected
+        INTEGER openwork
+        DOUBLE PRECISION c
+
+                <ompts:orphan:vars>
+        INTEGER i
+        INTEGER tid
+        INTEGER count
+
+        INTEGER DELAY 
+        INTEGER MAX_TIME
+        INTEGER CFSMAX_SIZE
+
+! ... choose a small iteration space to keep sync. overhead small
+        PARAMETER (DELAY = 1)
+        PARAMETER (MAX_TIME = 5)
+        PARAMETER (CFSMAX_SIZE = 150)
+
+        INTEGER notout
+        INTEGER maxiter
+        INTEGER tids(0:CFSMAX_SIZE-1)
+
+        COMMON /orphvars/ notout,maxiter,tids
+                </ompts:orphan:vars>
+
+        result = 0
+        notout = 1
+        maxiter = 0
+        count = 0
+        tmp_count = 0
+        openwork = CFSMAX_SIZE
+<ompts:check>
+
+! Determine the number of available threads
+!$omp parallel 
+!$omp single
+        threads = omp_get_num_threads()
+!$omp end single
+!$omp end parallel
+        IF ( threads .LT. 2) THEN
+          PRINT *,"This test only works with at least two threads"
+          WRITE(1,*) "This test only works with at least two threads"
+          <testfunctionname></testfunctionname> = 0
+          STOP
+        END IF
+
+! ... Now the real parallel work:
+! ... Each thread will start immediately with the first chunk.
+    
+!$omp parallel private(tid,count) shared(tids,maxiter)
+        tid = omp_get_thread_num()
+        <ompts:orphan>
+!$omp do schedule(guided)
+        DO i = 0 , CFSMAX_SIZE-1
+          count = 0
+!$omp flush(maxiter)
+          IF ( i .GT. maxiter ) THEN                 
+!$omp critical
+            maxiter = i
+!$omp end critical
+          END IF
+
+!..         If it is not our turn, we wait until
+!           a) another thread has executed an iteration with a
+!              higher iteration count, OR
+!           b) we are at the end of the loop (the first thread
+!              finished and set notout = 0), OR
+!           c) the timeout has expired.
+
+!$omp flush(maxiter,notout)
+          IF ( notout .GE. 1 .AND. count .LT. MAX_TIME
+     &         .AND. maxiter .EQ. i ) THEN
+              DO WHILE ( notout .GE. 1 .AND. count .LT. MAX_TIME
+     &          .AND. maxiter .EQ. i )
+                CALL sleep(DELAY)
+                count = count + DELAY
+              END DO         
+          END IF
+           tids(i) = tid
+        END DO
+!$omp end do nowait
+        </ompts:orphan>
+
+        notout = 0
+!$omp flush(notout)
+
+!$omp end parallel 
+
+!*******************************************************!
+! evaluation of the values
+!*******************************************************!
+        count = 0
+
+        DO i=0, CFSMAX_SIZE - 2
+          IF ( tids(i) .NE. tids(i+1) ) THEN
+            count = count + 1
+          END IF
+        END DO
+
+        ALLOCATE( tmp(0:count)  )
+        tmp_count = 0
+        tmp(0) = 1
+! ... calculate the chunksize for each dispatch
+
+        DO i=0, CFSMAX_SIZE - 2
+          IF ( tids(i) .EQ. tids(i+1) ) THEN
+           tmp(tmp_count) = tmp(tmp_count) + 1
+          ELSE
+            tmp_count = tmp_count + 1
+            tmp(tmp_count) = 1
+          END IF
+        END DO
+
+! ... Check that the chunk sizes decrease down to the specified
+! ... size; ignore the last dispatch, which may be a smaller
+! ... remainder.
+
+! Determine the constant 
+        expected = openwork / threads
+        c = real(tmp(0)) / real(expected)
+        WRITE(1,*) "Found constant to be ", c
+
+        DO i = 0, count - 2
+          WRITE(1,*) "open:", openwork, "size:", tmp(i)
+          IF (expected .GT. 1) THEN
+            expected = c * openwork / threads
+          END IF
+
+          IF (abs(tmp(i) - expected) .GE. 2 ) THEN
+            result = 1
+            WRITE(1,*) "Chunksize differed from expected ",
+     &         "value: ",tmp(i), "instead ", expected
+          END IF
+
+          IF (i .GT. 0 .AND. (tmp(i-1) - tmp(i)) .LT. 0) THEN
+            WRITE(1,*) "Chunksize did not decrease: ", tmp(i),
+     &         "instead",tmp(i-1)
+          END IF
+
+          openwork = openwork - tmp(i)  
+        END DO
+
+        IF ( result .EQ. 0 ) THEN
+           <testfunctionname></testfunctionname> = 1 
+        ELSE
+           <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:check>
+<ompts:crosscheck>
+      <testfunctionname></testfunctionname> = 0
+      END
+</ompts:crosscheck>
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/do_schedule_static.f b/final/testsuite/fortran/do_schedule_static.f
new file mode 100644
index 0000000..7ab02f8
--- /dev/null
+++ b/final/testsuite/fortran/do_schedule_static.f
@@ -0,0 +1,70 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the static option of the omp do schedule directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp do schedule(static)</ompts:directive>
+<ompts:dependences>omp do nowait,omp flush,omp critical,omp single</ompts:dependences>
+<ompts:testcode>
+
+      INTEGER FUNCTION <ompts:testcode:functionname>do_schedule_static</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER omp_get_thread_num,omp_get_num_threads
+        CHARACTER*30 logfile
+        INTEGER threads
+        INTEGER count
+        INTEGER ii
+        INTEGER result
+<ompts:orphan:vars>
+        INTEGER CFSMAX_SIZE
+        PARAMETER (CFSMAX_SIZE = 1000)
+        INTEGER i,tids(0:CFSMAX_SIZE-1), tid, chunk_size
+        COMMON /orphvars/ i,tid,tids,chunk_size
+</ompts:orphan:vars>
+
+        chunk_size = 7
+        result = 0
+        ii = 0
+
+!$omp parallel 
+!$omp single
+        threads = omp_get_num_threads()
+!$omp end single
+!$omp end parallel
+
+        IF ( threads .LT. 2) THEN
+          PRINT *,"This test only works with at least two threads"
+          WRITE(1,*) "This test only works with at least two threads"
+          <testfunctionname></testfunctionname> = 0
+          STOP
+        ELSE
+          WRITE(1,*) "Using an internal count of ",CFSMAX_SIZE
+          WRITE(1,*) "Using a specified chunksize of ",chunk_size
+    
+!$omp parallel private(tid) shared(tids)
+          tid = omp_get_thread_num()
+<ompts:orphan>
+!$omp do <ompts:check>schedule(static,chunk_size)</ompts:check>
+          DO i = 0 ,CFSMAX_SIZE -1
+            tids(i) = tid
+          END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+          DO i = 0, CFSMAX_SIZE-1
+!... round-robin for static chunk
+            ii = mod( i/chunk_size,threads)
+            IF (tids(i) .NE. ii ) THEN
+              result = result + 1
+              WRITE(1,*)"Iteration ",i,"should be assigned to ",
+     &           ii,"instead of ",tids(i)
+            END IF
+          END DO
+          IF ( result .EQ. 0 )THEN
+            <testfunctionname></testfunctionname> = 1
+          ELSE
+            <testfunctionname></testfunctionname> = 0
+          END IF
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/has_openmp.f b/final/testsuite/fortran/has_openmp.f
new file mode 100644
index 0000000..5585ef2
--- /dev/null
+++ b/final/testsuite/fortran/has_openmp.f
@@ -0,0 +1,25 @@
+<ompts:test>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:testdescription>Test which checks whether conditional compilation is supported.
+Yi Wen, 05032004: Do we want to write two versions of has_openmp? Both the
+C23456789 sentinel and the #ifdef format are supposed to work, but at least
+Sun's compiler cannot deal with the second format (#ifdef).</ompts:testdescription>
+
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>has_openmp</ompts:testcode:functionname>()
+        <testfunctionname></testfunctionname> = 0
+
+<ompts:check>
+!version 1.
+!C23456789 
+!$        <testfunctionname></testfunctionname> = 1
+
+! version 2.
+!#ifdef _OPENMP
+        <testfunctionname></testfunctionname> = 1
+!#endif
+</ompts:check>
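+
+! Note on the two variants above: version 1 uses the conditional
+! compilation sentinel "!$", whose lines are compiled only when
+! OpenMP compilation is enabled; version 2 would guard a plain
+! assignment with the C preprocessor's #ifdef _OPENMP (here the
+! guards are commented out, leaving the assignment unconditional).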
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_atomic.f b/final/testsuite/fortran/omp_atomic.f
new file mode 100644
index 0000000..ee74450
--- /dev/null
+++ b/final/testsuite/fortran/omp_atomic.f
@@ -0,0 +1,588 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp atomic directive by counting up a variable in a parallelized loop with an atomic directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp atomic</ompts:directive>
+<ompts:testcode>
+!********************************************************************
+! Functions: omp_atomic
+! change "character*20" into "character (LEN=20)::"
+! get rid of the "tab" key by Zhenying Liu, on Oct. 16, 2005.
+!********************************************************************
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_atomic</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INCLUDE "omp_testsuite.f"
+        INTEGER sum2, known_sum
+        INTEGER known_product, int_const
+        DOUBLE PRECISION rounding_error, dpt
+        INTEGER double_DIGITS
+        DOUBLE PRECISION dknown_sum
+        INTEGER result
+        PARAMETER (int_const=10,known_product=3628800)
+        PARAMETER (rounding_error=1.E-2)
+<ompts:orphan:vars>
+        INTEGER sum,i,diff,product
+        DOUBLE PRECISION dsum,dt,ddiff
+        LOGICAL logic_and, logic_or, logic_eqv,logic_neqv
+        INTEGER bit_and, bit_or
+        INTEGER exclusiv_bit_or
+        INTEGER min_value, max_value
+        DOUBLE PRECISION dmin, dmax
+        LOGICAL logics(LOOPCOUNT)
+        INTEGER int_array(LOOPCOUNT)
+        DOUBLE PRECISION d_array(LOOPCOUNT)
+        COMMON /orphvars/ sum,product,diff,i,dsum,ddiff,dt,logic_and,
+     &    logic_or,logic_eqv,logic_neqv,logics,bit_and,bit_or,int_array,
+     &    exclusiv_bit_or,min_value,dmin,dmax,d_array,max_value
+        INTEGER MAX_FACTOR
+        PARAMETER (double_DIGITS=20,MAX_FACTOR=10)
+</ompts:orphan:vars>
+
+        dt = 1./3.
+        known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2
+        product = 1
+        sum2 = 0
+        sum = 0
+        dsum = 0.
+        result =0 
+        logic_and = .true.
+        logic_or = .false.
+        bit_and = 1
+        bit_or = 0
+        exclusiv_bit_or = 0
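+
+!... Reminder (OpenMP 2.0 Fortran): an atomic update must have one
+!... of the forms
+!...   x = x op expr      with op in +, *, -, /, .AND., .OR.,
+!...                      .EQV., .NEQV.
+!...   x = intr(x, expr)  with intr in MAX, MIN, IAND, IOR, IEOR
+!... all updates below are written to match one of these forms.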
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i =1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+          sum = sum + i
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (known_sum .NE. sum) THEN
+             result = result + 1
+        WRITE(1,*) "Error in sum with integers: Result was ",
+     &   sum,"instead of ", known_sum
+        END IF
+
+        diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2
+
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i =1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+          diff = diff - i
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+  
+        IF ( diff .NE. 0 ) THEN
+          result = result + 1
+        WRITE(1,*) "Error in difference with integers: Result was ",
+     &   diff,"instead of 0."
+        END IF
+
+!... Test for doubles
+        dsum = 0.
+        dpt = 1
+
+        DO i=1, DOUBLE_DIGITS
+          dpt= dpt * dt
+        END DO
+        dknown_sum = (1-dpt)/(1-dt)
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i=0,DOUBLE_DIGITS-1
+<ompts:check>
+!$omp atomic
+</ompts:check>
+              dsum = dsum + dt**i
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+ 
+        IF (dsum .NE. dknown_sum .AND. 
+     &     ABS(dsum - dknown_sum) .GT. rounding_error ) THEN
+           result = result + 1
+           WRITE(1,*) "Error in sum with doubles: Result was ",
+     &       dsum,"instead of ",dknown_sum,"(Difference: ",
+     &       dsum - dknown_sum,")"
+        END IF
+        dpt = 1
+
+      
+        DO i=1, DOUBLE_DIGITS
+           dpt = dpt*dt
+        END DO
+
+        ddiff = ( 1-dpt)/(1-dt)
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i=0, DOUBLE_DIGITS-1
+<ompts:check>
+!$omp atomic
+</ompts:check>
+          ddiff = ddiff - dt**i
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( ABS(ddiff) .GT. rounding_error ) THEN
+           result = result + 1
+           WRITE(1,*) "Error in Difference with doubles: Result was ",
+     &       ddiff,"instead of 0.0"
+        END IF
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i=1,MAX_FACTOR
+<ompts:check>
+!$omp atomic
+</ompts:check>
+           product = product * i
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (known_product .NE. product) THEN
+           result = result + 1
+           WRITE(1,*) "Error in Product with integers: Result was ",
+     &       product," instead of",known_product 
+        END IF
+
+        DO i=1,LOOPCOUNT
+          logics(i) = .TRUE.
+        END DO
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i=1,LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (.NOT. logic_and) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic AND part 1"
+        END IF
+
+
+        logic_and = .TRUE.
+        logics(LOOPCOUNT/2) = .FALSE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do
+        DO i=1,LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (logic_and) THEN
+           result = result + 1
+           WRITE(1,*) "Error in logic AND pass 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+         logics(i) = .FALSE.
+        END DO
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i = 1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+           logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (logic_or) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic OR part 1"
+        END IF
+
+        logic_or = .FALSE.
+        logics(LOOPCOUNT/2) = .TRUE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i=1,LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+           logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( .NOT. logic_or ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic OR part 2"
+        END IF
+
+!... Test logic EQV, unique in Fortran
+        DO i=1, LOOPCOUNT
+         logics(i) = .TRUE.
+        END DO
+
+        logic_eqv = .TRUE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i = 1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (.NOT. logic_eqv) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic EQV part 1"
+        END IF
+
+        logic_eqv = .TRUE.
+        logics(LOOPCOUNT/2) = .FALSE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i=1,LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( logic_eqv ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic EQV part 2"
+        END IF
+
+!... Test logic NEQV, which is unique in Fortran
+        DO i=1, LOOPCOUNT
+         logics(i) = .FALSE.
+        END DO
+
+        logic_neqv = .FALSE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i = 1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF (logic_neqv) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic NEQV part 1"
+        END IF
+
+        logic_neqv = .FALSE.
+        logics(LOOPCOUNT/2) = .TRUE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i=1,LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( .NOT. logic_neqv ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic NEQV part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 1
+        END DO
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i=1, LOOPCOUNT
+!... IAND(I,J): returns the value resulting from a bitwise AND of
+!... the corresponding bits of I and J.
+<ompts:check>
+!$omp atomic
+</ompts:check>
+          bit_and = IAND(bit_and,int_array(i))
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( bit_and .LT. 1 ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in IAND part 1"
+        END IF
+
+        bit_and = 1
+        int_array(LOOPCOUNT/2) = 0
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i=1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+          bit_and = IAND ( bit_and, int_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF( bit_and .GE. 1) THEN
+          result = result + 1
+          WRITE(1,*) "Error in IAND part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 0
+        END DO
+
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i=1, LOOPCOUNT
+!... IOR(I,J): returns the value resulting from a bitwise OR of
+!... the corresponding bits of I and J.
+<ompts:check>
+!$omp atomic
+</ompts:check>
+          bit_or = Ior(bit_or, int_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( bit_or .GE. 1) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ior part 1"
+        END IF
+
+
+        bit_or = 0
+        int_array(LOOPCOUNT/2) = 1
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i=1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+          bit_or = IOR(bit_or, int_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( bit_or .le. 0) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ior part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 0
+        END DO
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i = 1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+            exclusiv_bit_or = IEOR(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( exclusiv_bit_or .GE. 1) THEN
+           result = result + 1
+           WRITE(1,*) "Error in Ieor part 1"
+        END IF
+
+        exclusiv_bit_or = 0
+        int_array(LOOPCOUNT/2) = 1
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i = 1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+            exclusiv_bit_or = ieor(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( exclusiv_bit_or .LE. 0) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ieor part 2"
+        END IF
+
+        DO i=1,LOOPCOUNT
+           int_array(i) = 10 - i
+        END DO
+
+        min_value = 65535
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i = 1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+            min_value = min(min_value,int_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( min_value .GT. (10-LOOPCOUNT) )THEN
+          result = result + 1
+          WRITE(1,*) "Error in integer MIN"
+        END IF
+
+
+        DO i=1,LOOPCOUNT
+           int_array(i) = i
+        END DO
+
+        max_value = -32768
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i = 1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+            max_value = max(max_value,int_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( max_value .LT. LOOPCOUNT )THEN
+          result = result + 1
+          WRITE(1,*) "Error in integer MAX"
+        END IF
+
+!... test double min, max
+        dt = 0.5
+        DO i=1,LOOPCOUNT
+           d_array(i) = 10 - i*dt
+        END DO
+
+        dmin = 2**10
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i = 1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+            dmin= MIN(dmin,d_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( dmin .GT. (10-dt) )THEN
+          result = result + 1
+          WRITE(1,*) "Error in double MIN"
+        END IF
+
+
+        DO i=1,LOOPCOUNT
+          d_array(i) = i * dt
+        END DO
+
+        dmax= - (2**10)
+
+!$omp parallel
+<ompts:orphan>
+!$omp do 
+        DO i = 1, LOOPCOUNT
+<ompts:check>
+!$omp atomic
+</ompts:check>
+          dmax= max(dmax,d_array(i) )
+        END DO
+!$omp end do
+</ompts:orphan>
+!$omp end parallel
+
+        IF ( dmax .LT. LOOPCOUNT*dt )THEN
+          result = result + 1
+          WRITE(1,*) "Error in double MAX"
+        END IF
+
+        IF ( result .EQ. 0 ) THEN
+          <testfunctionname></testfunctionname>=  1
+        ELSE
+          <testfunctionname></testfunctionname>=  0
+        END IF
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_barrier.f b/final/testsuite/fortran/omp_barrier.f
new file mode 100644
index 0000000..5125994
--- /dev/null
+++ b/final/testsuite/fortran/omp_barrier.f
@@ -0,0 +1,50 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp barrier directive. The test creates several threads and sends one of them to sleep before it sets a flag. After the barrier the other threads do some little work depending on the flag.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp barrier</ompts:directive>
+<ompts:testcode>
+
+      SUBROUTINE do_some_work3()
+        REAL i
+        INTRINSIC sqrt
+        DOUBLE PRECISION sum
+        INCLUDE "omp_testsuite.f"
+        sum = 0.0
+        i = 0.0
+        DO WHILE (i < LOOPCOUNT-1)
+          sum = sum + sqrt(i)
+          i = i + 1
+        END DO
+      END
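+
+! Sketch of the test logic below: thread 1 sleeps and then sets
+! result2 = 3; the barrier forces thread 0 to wait, so when thread 0
+! copies result2 into result1 it must already observe the value 3.
+! Without the barrier, thread 0 would most likely read the old 0.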
+
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_barrier</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sleeptime
+        INTEGER omp_get_thread_num
+        INTEGER result1, result2, rank
+        result1 = 0
+        result2 = 0
+        sleeptime = 1
+!$omp parallel private(rank)
+        rank = omp_get_thread_num()
+!        PRINT *, "rank", rank
+        IF ( rank .EQ. 1 ) THEN
+          CALL sleep(sleeptime)
+          result2 = 3
+        END IF
+        <ompts:orphan>
+        <ompts:check>
+!$omp barrier
+        </ompts:check>
+        </ompts:orphan>
+        IF ( rank .EQ. 0 ) THEN
+          result1 = result2
+        END IF
+!$omp end parallel
+        IF ( result1 .EQ. 3 ) THEN
+           <testfunctionname></testfunctionname> = 1
+        ELSE
+           <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_copyin.f b/final/testsuite/fortran/omp_copyin.f
new file mode 100644
index 0000000..714f149
--- /dev/null
+++ b/final/testsuite/fortran/omp_copyin.f
@@ -0,0 +1,47 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel copyin directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel copyin</ompts:directive>
+<ompts:dependences>omp critical,omp threadprivate</ompts:dependences>
+<ompts:testcode>
+! Changelog:
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_copyin</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER known_sum
+		<ompts:orphan:vars>
+!        INTEGER, SAVE :: sum1
+!        sum1 is SAVEd implicitly by omp threadprivate,
+!        see spec 2.5 Chap. 2.8.2
+        INTEGER sum1
+        COMMON /csum1/ sum1
+        INTEGER sum, i, threads
+        COMMON /orphvars/ sum, i, threads
+!   C. Niethammer 30.11.06: moved threadprivate statement into the orphaned
+!      function
+!$omp threadprivate(/csum1/)
+		</ompts:orphan:vars>
+
+        sum = 0
+        sum1 = 7
+        threads = 0
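+
+! With copyin(sum1), each thread's threadprivate copy of sum1 starts
+! with the master's value 7, so every thread contributes 7 plus its
+! share of the loop; hence the expected total below is
+!   999*1000/2 + 7*threads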
+		<ompts:orphan>
+!$omp parallel <ompts:check>copyin(sum1)</ompts:check>
+!        print *,"sum1",sum1
+!$omp do
+        DO i=1, 999
+          sum1 = sum1 + i
+        END DO
+!$omp critical
+        sum = sum + sum1
+        threads = threads + 1
+!$omp end critical
+!$omp end parallel
+		</ompts:orphan>
+        known_sum = 999*1000/2 + 7*threads
+        IF ( known_sum .EQ. sum ) THEN
+           <testfunctionname></testfunctionname> = 1
+        ELSE
+           <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_critical.f b/final/testsuite/fortran/omp_critical.f
new file mode 100644
index 0000000..892dd2b
--- /dev/null
+++ b/final/testsuite/fortran/omp_critical.f
@@ -0,0 +1,59 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp critical directive by counting up a variable in a parallelized region within a critical section.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp critical</ompts:directive>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_critical</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER known_sum
+        <ompts:orphan:vars>
+        INTEGER i,j,myi,myj, sum
+        COMMON /orphvars/ sum, myi, myj
+        </ompts:orphan:vars>
+        sum = 0
+        myi = 0
+        myj = 500
+!$omp parallel
+!$omp sections
+
+!$omp section
+        DO i = 0 , 499
+                <ompts:orphan>
+                <ompts:check>
+!$omp critical
+                </ompts:check>
+           sum = sum + myi
+           myi = myi + 1
+                <ompts:check>
+!$omp end critical
+                </ompts:check>
+                </ompts:orphan>
+        END DO
+
+!$omp section
+        DO j = 500 , 999
+                <ompts:orphan>
+                <ompts:check>
+!$omp critical
+                </ompts:check>
+           sum = sum + myj
+           myj = myj + 1
+                <ompts:check>
+!$omp end critical
+                </ompts:check>
+                </ompts:orphan>
+        END DO
+!$omp end sections
+!$omp end parallel
+        known_sum = 999*1000/2
+        IF ( known_sum .EQ. sum ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          WRITE (1,*) "Found sum was", sum, "instead", known_sum
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_flush.f b/final/testsuite/fortran/omp_flush.f
new file mode 100644
index 0000000..30daf50
--- /dev/null
+++ b/final/testsuite/fortran/omp_flush.f
@@ -0,0 +1,49 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp flush directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp flush</ompts:directive>
+<ompts:dependences>omp barrier</ompts:dependences>
+<ompts:testcode>
+        INTEGER FUNCTION <ompts:testcode:functionname>omp_flush</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER result1, dummy, rank
+        INTEGER omp_get_thread_num
+        <ompts:orphan:vars>
+        INTEGER result2
+        COMMON /orphvars/ result2
+        </ompts:orphan:vars>
+        result1=0
+        result2=0
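+
+! Sketch: thread 1 writes result2 = 3 and flushes it; thread 0 sleeps
+! for a second (giving thread 1 time to finish), flushes, and should
+! then read the updated value. The flushes make result2 visible
+! across threads without a full barrier at that point.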
+!$omp parallel private(rank)
+        rank = omp_get_thread_num()
+!$omp barrier
+        IF ( rank .EQ. 1 ) THEN
+          result2 = 3
+          <ompts:orphan>
+          <ompts:check>
+!$omp flush(result2)
+          </ompts:check>
+          </ompts:orphan>
+          dummy = result2
+        END IF
+        IF ( rank .EQ. 0 ) THEN
+          call sleep(1)
+          <ompts:orphan>
+          <ompts:check>
+!$omp flush(result2)
+          </ompts:check>
+          </ompts:orphan>
+          result1 = result2
+        END IF
+!$omp end parallel
+
+!        PRINT *,"1:", result1, "2:", result2, "dummy", dummy
+        IF ( (result1 .EQ. result2) .AND. (result2 .EQ. dummy) .AND.
+     &       (result2 .EQ. 3) ) THEN
+           <testfunctionname></testfunctionname> = 1
+        ELSE
+           <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_get_num_threads.f b/final/testsuite/fortran/omp_get_num_threads.f
new file mode 100644
index 0000000..467ad9f
--- /dev/null
+++ b/final/testsuite/fortran/omp_get_num_threads.f
@@ -0,0 +1,37 @@
+<ompts:test>
+<ompts:testdescription>Test which checks that omp_get_num_threads returns the correct number of threads. Therefore it counts up a variable in a parallelized section and compares this value with the result of the omp_get_num_threads function.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_get_num_threads</ompts:directive>
+<ompts:dependences>omp critical,omp single</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_get_num_threads</ompts:testcode:functionname>()
+        INTEGER nthreads
+        INTEGER omp_get_num_threads
+		<ompts:orphan:vars>
+        INTEGER nthreads_lib
+        COMMON /orphvars/ nthreads_lib
+		</ompts:orphan:vars>
+        nthreads=0
+        nthreads_lib=-1
+
+!$omp parallel shared(nthreads,nthreads_lib)
+!$omp critical
+        nthreads = nthreads + 1
+!$omp end critical
+!$omp single
+		<ompts:orphan>
+		<ompts:check>
+        nthreads_lib=omp_get_num_threads()
+		</ompts:check>
+		</ompts:orphan>
+!$omp end single
+!$omp end parallel
+        IF (nthreads .EQ. nthreads_lib) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_get_wticks.f b/final/testsuite/fortran/omp_get_wticks.f
new file mode 100644
index 0000000..762d03d
--- /dev/null
+++ b/final/testsuite/fortran/omp_get_wticks.f
@@ -0,0 +1,28 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_get_wtick function.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_get_wtick</ompts:directive>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_get_wticks</ompts:testcode:functionname>()
+        IMPLICIT NONE
+<ompts:orphan:vars>
+        DOUBLE PRECISION tick
+        COMMON /orphvars/ tick
+        include "omp_lib.h"
+</ompts:orphan:vars>
+!        DOUBLE PRECISION omp_get_wtick
+        tick = 1
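+
+! omp_get_wtick() returns the resolution of the omp_get_wtime()
+! timer, i.e. the number of seconds between successive clock ticks;
+! the check below assumes a sane resolution below 10 ms.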
+                <ompts:orphan>
+<ompts:check>
+        tick=omp_get_wtick()
+</ompts:check>
+                </ompts:orphan>
+        WRITE(1,*) "work took",tick,"sec. time."
+        IF(tick .GT. 0. .AND. tick .LT. 0.01) THEN
+          <testfunctionname></testfunctionname>=1
+        ELSE
+          <testfunctionname></testfunctionname>=0
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_in_parallel.f b/final/testsuite/fortran/omp_in_parallel.f
new file mode 100644
index 0000000..45edff6
--- /dev/null
+++ b/final/testsuite/fortran/omp_in_parallel.f
@@ -0,0 +1,41 @@
+<ompts:test>
+<ompts:testdescription>Test which checks that omp_in_parallel returns false when called from a serial region and true when called within a parallel region.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_in_parallel</ompts:directive>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_in_parallel</ompts:testcode:functionname>()
+!   checks that false is returned when called from serial region
+!   and true is returned when called within parallel region
+        LOGICAL omp_in_parallel
+		<ompts:orphan:vars>
+!        LOGICAL omp_in_parallel
+        LOGICAL serial, parallel
+        COMMON /orphvars/ serial, parallel
+		</ompts:orphan:vars>
+        serial=.TRUE.
+        parallel=.FALSE.
+
+		<ompts:orphan>
+		<ompts:check>
+        serial=omp_in_parallel()
+		</ompts:check>
+		</ompts:orphan>
+
+!$omp parallel
+!$omp single
+		<ompts:orphan>
+		<ompts:check>
+        parallel=omp_in_parallel()
+		</ompts:check>
+		</ompts:orphan>
+!$omp end single
+!$omp end parallel
+
+        IF ( (.NOT. serial) .AND. (parallel) ) THEN
+          <testfunctionname></testfunctionname>=1
+        ELSE
+          <testfunctionname></testfunctionname>=0
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_lock.f b/final/testsuite/fortran/omp_lock.f
new file mode 100644
index 0000000..29026c5
--- /dev/null
+++ b/final/testsuite/fortran/omp_lock.f
@@ -0,0 +1,58 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_set_lock and the omp_unset_lock functions by counting the threads entering and exiting a region protected by a lock.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp_lock</ompts:directive>
+<ompts:dependences>omp flush</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_lock</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER result
+        INTEGER nr_threads_in_single
+        INTEGER nr_iterations
+        INTEGER i
+!lock variable
+                <ompts:orphan:vars>
+        INCLUDE "omp_lib.h"
+        INTEGER (KIND=OMP_LOCK_KIND) :: lock
+        COMMON /orphvars/ lock
+                </ompts:orphan:vars>
+        INCLUDE "omp_testsuite.f"
+
+!result is:
+!  0 -- if the test fails
+!  1 -- if the test succeeds
+        CALL omp_init_lock(lock)
+        nr_iterations=0
+        nr_threads_in_single=0
+        result=0
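+
+! Invariant checked below: while the lock is held,
+! nr_threads_in_single goes 0 -> 1 -> 0 without interference, so the
+! value added to result after the decrement is always 0. Any nonzero
+! result means two threads were inside the locked region at once.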
+!$omp parallel shared(lock,nr_threads_in_single,nr_iterations,result)
+!$omp do
+        DO i=1,LOOPCOUNT
+                  <ompts:orphan>
+                  <ompts:check>
+          CALL omp_set_lock(lock)
+                  </ompts:check>
+                  </ompts:orphan>
+!$omp flush
+          nr_threads_in_single=nr_threads_in_single+1
+!$omp flush
+          nr_iterations=nr_iterations+1
+          nr_threads_in_single=nr_threads_in_single-1
+          result=result+nr_threads_in_single
+                  <ompts:orphan>
+                  <ompts:check>
+          CALL omp_unset_lock(lock)
+                  </ompts:check>
+                  </ompts:orphan>
+        END DO
+!$omp end do
+!$omp end parallel
+        CALL omp_destroy_lock(lock)
+        IF(result.EQ.0 .AND. nr_iterations .EQ. LOOPCOUNT) THEN
+              <testfunctionname></testfunctionname>=1
+        ELSE
+              <testfunctionname></testfunctionname>=0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_master.f b/final/testsuite/fortran/omp_master.f
new file mode 100644
index 0000000..65e1f3d
--- /dev/null
+++ b/final/testsuite/fortran/omp_master.f
@@ -0,0 +1,39 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp master directive by counting up a variable in an omp master section.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp master</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_master</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER omp_get_thread_num
+		<ompts:orphan:vars>
+        INTEGER nthreads, executing_thread
+        COMMON /orphvars/ nthreads, executing_thread
+		</ompts:orphan:vars>
+        nthreads=0
+        executing_thread=-1
+
+!$omp parallel
+		<ompts:orphan>
+		<ompts:check>
+!$omp master
+		</ompts:check>
+!$omp critical
+        nthreads = nthreads + 1
+!$omp end critical
+        executing_thread=omp_get_thread_num()
+		<ompts:check>
+!$omp end master
+		</ompts:check>
+		</ompts:orphan>
+!$omp end parallel
+
+        IF ( (nthreads .EQ. 1) .AND. (executing_thread .EQ. 0) ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_master_3.f b/final/testsuite/fortran/omp_master_3.f
new file mode 100644
index 0000000..a222c48
--- /dev/null
+++ b/final/testsuite/fortran/omp_master_3.f
@@ -0,0 +1,49 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp master directive by counting up a variable in an omp master section. It also checks that the master thread has the thread number 0 as specified in the OpenMP standard version 3.0.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp master</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_master_3</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER omp_get_thread_num
+        <ompts:orphan:vars>
+        INTEGER nthreads, executing_thread
+        INTEGER tid_result ! counts up the number of wrong thread no.
+                           ! for the master thread
+        COMMON /orphvars/ nthreads, executing_thread, tid_result
+        </ompts:orphan:vars>
+        tid_result = 0
+        nthreads=0
+        executing_thread=-1
+
+!$omp parallel
+        <ompts:orphan>
+        <ompts:check>
+!$omp master
+        </ompts:check>
+        if (omp_get_thread_num() .ne. 0) then
+!$omp critical
+            tid_result = tid_result + 1
+!$omp end critical
+        end if
+!$omp critical
+        nthreads = nthreads + 1
+!$omp end critical
+        executing_thread=omp_get_thread_num()
+        <ompts:check>
+!$omp end master
+        </ompts:check>
+        </ompts:orphan>
+!$omp end parallel
+
+        IF ( (nthreads .EQ. 1) .AND. (executing_thread .EQ. 0) .AND.
+     &       (tid_result .EQ. 0) ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
+
diff --git a/final/testsuite/fortran/omp_nest_lock.f b/final/testsuite/fortran/omp_nest_lock.f
new file mode 100644
index 0000000..9b9e3aa
--- /dev/null
+++ b/final/testsuite/fortran/omp_nest_lock.f
@@ -0,0 +1,59 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_set_nest_lock and the omp_unset_nest_lock function.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_nest_lock</ompts:directive>
+<ompts:dependences>omp flush</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_nest_lock</ompts:testcode:functionname>()
+        IMPLICIT NONE
+!result is:
+!  0 -- if the test fails
+!  1 -- if the test succeeds
+        INTEGER result
+        INTEGER nr_threads_in_single
+        INTEGER nr_iterations
+        INTEGER i
+                <ompts:orphan:vars>
+        INCLUDE "omp_lib.h"
+        INTEGER (KIND=OMP_NEST_LOCK_KIND) :: lock
+        COMMON /orphvars/ lock
+                </ompts:orphan:vars>
+        INCLUDE "omp_testsuite.f"
+
+        nr_iterations=0
+        nr_threads_in_single=0
+        CALL omp_init_nest_lock(lock)
+        result=0
+
+!$omp parallel shared(lock,nr_threads_in_single,nr_iterations,result)
+!$omp do
+      DO i=1,LOOPCOUNT
+                <ompts:orphan>
+                <ompts:check>
+        CALL omp_set_nest_lock(lock)
+                </ompts:check>
+                </ompts:orphan>
+!$omp flush
+        nr_threads_in_single=nr_threads_in_single+1
+!$omp flush
+        nr_iterations=nr_iterations+1
+        nr_threads_in_single=nr_threads_in_single-1
+        result=result+nr_threads_in_single
+                <ompts:orphan>
+                <ompts:check>
+        CALL omp_unset_nest_lock(lock)
+                </ompts:check>
+                </ompts:orphan>
+      END DO
+!$omp end do
+!$omp end parallel
+      CALL omp_destroy_nest_lock(lock)
+!               PRINT *, result, nr_iterations
+        IF(result.EQ.0 .AND. nr_iterations .EQ. LOOPCOUNT) THEN
+          <testfunctionname></testfunctionname>=1
+        ELSE
+          <testfunctionname></testfunctionname>=0
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_nested.f b/final/testsuite/fortran/omp_nested.f
new file mode 100644
index 0000000..5435151
--- /dev/null
+++ b/final/testsuite/fortran/omp_nested.f
@@ -0,0 +1,53 @@
+<ompts:test>
+<ompts:testdescription>Test if the compiler supports nested parallelism.</ompts:testdescription>
+<ompts:ompversion>2.5</ompts:ompversion>
+<ompts:directive>nestedtest</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_nested</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INCLUDE "omp_testsuite.f"
+<ompts:orphan:vars>
+        INTEGER counter
+        COMMON /orphvars/ counter
+</ompts:orphan:vars>
+
+        counter =0
+        
+        <ompts:check>
+!$      CALL OMP_SET_NESTED(.TRUE.)
+!#ifdef _OPENMP
+!       CALL OMP_SET_NESTED(.TRUE.) 
+!#endif
+        </ompts:check>
+        <ompts:crosscheck>
+!$      CALL OMP_SET_NESTED(.FALSE.)
+!#ifdef _OPENMP
+!       CALL OMP_SET_NESTED(.FALSE.)
+!#endif
+        </ompts:crosscheck>
+
+!$omp parallel
+        <ompts:orphan>
+!$omp critical
+          counter = counter + 1
+!$omp end critical
+
+!$omp parallel
+!$omp critical
+          counter = counter - 1
+!$omp end critical
+!$omp end parallel
+        </ompts:orphan>
+!$omp end parallel
+        
+        IF (counter .EQ. 0 ) THEN
+           WRITE (1,*) "Counter was 0"
+           <testfunctionname></testfunctionname> = 0
+        ELSE
+           WRITE (1,*) "Counter was", counter
+           <testfunctionname></testfunctionname> = 1
+        END IF 
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_num_threads.f b/final/testsuite/fortran/omp_num_threads.f
new file mode 100644
index 0000000..a1ede73
--- /dev/null
+++ b/final/testsuite/fortran/omp_num_threads.f
@@ -0,0 +1,60 @@
+<ompts:test>
+<ompts:testdescription>Test which checks that omp_get_num_threads returns the correct number of threads. Therefore it counts up a variable in a parallelized section and compares this value with the result of the omp_get_num_threads function.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_get_num_threads</ompts:directive>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_num_threads</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER i, max_threads
+        INTEGER omp_get_num_threads
+<ompts:orphan:vars>
+        INTEGER failed,threads,nthreads,tmp
+        COMMON /orphvars/ failed,threads,nthreads
+</ompts:orphan:vars>
+
+        failed = 0
+        max_threads = 0
+         
+!$omp parallel
+!$omp master
+        max_threads = OMP_GET_NUM_THREADS()       
+!$omp end master
+!$omp end parallel
+!         print *, "max threads:",max_threads
+
+! Yi Wen added OMP_SET_DYNAMIC here to make sure the num_threads
+! clause works; thanks to Dr. Yin Ma of Absoft. It should not be
+! called before the test loop, because it allows dynamic adjustment
+! of the number of threads at runtime instead of using the
+! max_threads value determined above.
+
+        !CALL OMP_SET_DYNAMIC(.TRUE.)
+        DO threads = 1, max_threads
+          nthreads = 0
+           <ompts:orphan>
+!$omp parallel num_threads(threads) reduction(+:failed)
+!          print *, threads, omp_get_num_threads()
+          tmp = omp_get_num_threads()
+          IF ( threads .NE. tmp ) THEN
+            failed = failed + 1
+            WRITE (1,*) "Error: found ", tmp, " instead of ",
+     &          threads, " threads"
+          END IF
+!$omp atomic
+          nthreads = nthreads + 1
+!$omp end parallel
+          </ompts:orphan>
+!            print *, threads, nthreads
+          <ompts:check>IF ( nthreads .NE. threads ) THEN</ompts:check>
+          <ompts:crosscheck>IF ( nthreads .EQ. threads ) THEN</ompts:crosscheck>
+            failed = failed + 1
+          END IF
+        END DO
+
+        IF(failed .NE. 0) THEN
+          <testfunctionname></testfunctionname> = 0
+        ELSE
+          <testfunctionname></testfunctionname> = 1
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_single.f b/final/testsuite/fortran/omp_single.f
new file mode 100644
index 0000000..e8821ca
--- /dev/null
+++ b/final/testsuite/fortran/omp_single.f
@@ -0,0 +1,40 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp single directive by counting how often the code inside an omp single region is executed.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp single</ompts:directive>
+<ompts:dependences>omp parallel private,omp flush</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_single</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER i
+		<ompts:orphan:vars>
+        INTEGER nr_threads_in_single,nr_iterations,result
+        COMMON /orphvars/ nr_threads_in_single,nr_iterations,result
+		</ompts:orphan:vars>
+        INCLUDE "omp_testsuite.f"
+        nr_threads_in_single=0
+        result=0
+        nr_iterations=0
+!$omp parallel
+        DO i=0, LOOPCOUNT-1
+		<ompts:orphan>
+!$omp single
+!$omp flush
+          nr_threads_in_single = nr_threads_in_single + 1
+!$omp flush
+          nr_iterations = nr_iterations + 1
+          <ompts:check>nr_threads_in_single = nr_threads_in_single - 1</ompts:check>
+          <ompts:crosscheck>nr_threads_in_single = nr_threads_in_single + 1</ompts:crosscheck>
+          result = result + nr_threads_in_single
+!$omp end single
+		</ompts:orphan>
+        END DO
+!$omp end parallel
+        IF ( result .EQ. 0 .AND. nr_iterations .EQ. LOOPCOUNT ) THEN
+           <testfunctionname></testfunctionname> = 1
+        ELSE
+           <testfunctionname></testfunctionname> = 0
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_task.f b/final/testsuite/fortran/omp_task.f
new file mode 100644
index 0000000..273d7b0
--- /dev/null
+++ b/final/testsuite/fortran/omp_task.f
@@ -0,0 +1,50 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp task directive. The idea of the test is to generate a set of tasks in a single region. We pause the generated tasks so that other threads get scheduled to the newly created tasks.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task</ompts:directive>
+<ompts:dependences>omp single</ompts:dependences>
+<ompts:testcode>
+      INCLUDE "omp_my_sleep.f"
+
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_task</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INCLUDE "omp_testsuite.f"
+        <ompts:orphan:vars>
+        INTEGER omp_get_num_threads, omp_get_thread_num
+        EXTERNAL my_sleep
+        INTEGER myj
+        INTEGER i,j
+        INTEGER tids(NUM_TASKS)
+        COMMON /orphvars/ j,tids
+        </ompts:orphan:vars>
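+
+! Note: myj is private to each thread (and implicitly firstprivate
+! to the generated task), so every task captures the index value
+! current at task creation, while the shared j keeps changing in
+! the loop.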
+!$omp parallel private(myj) shared(j)
+!$omp single
+        do i=1, NUM_TASKS
+        j = i
+        <ompts:orphan>
+        myj = j
+        <ompts:check>
+!$omp task
+        </ompts:check>
+          call my_sleep(SLEEPTIME)
+          tids(myj) = omp_get_thread_num()
+        <ompts:check>
+!$omp end task
+        </ompts:check>
+        </ompts:orphan>
+        end do
+!$omp end single
+!$omp end parallel
+
+        <testfunctionname></testfunctionname> = 0
+
+        ! check if more than one thread executed the tasks.
+        do i=1, NUM_TASKS
+          if (tids(1) .ne. tids(i)) then
+               <testfunctionname></testfunctionname> = 1
+          end if
+        end do
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_task_firstprivate.f b/final/testsuite/fortran/omp_task_firstprivate.f
new file mode 100644
index 0000000..ca7759e
--- /dev/null
+++ b/final/testsuite/fortran/omp_task_firstprivate.f
@@ -0,0 +1,54 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the firstprivate clause of the task directive. We create a set of tasks in a single region. We define a variable named my_sum with a non-zero value which gets declared firstprivate for each task. Each task then calculates a sum using this private variable. Before each calculation step we introduce a flush command so that an incorrectly shared variable would likely be corrupted. At the end we check whether the calculated sum was right.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task firstprivate</ompts:directive>
+<ompts:dependences>omp single,omp critical</ompts:dependences>
+<ompts:testcode>
+      INCLUDE "omp_my_sleep.f"
+
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_task_firstprivate</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INCLUDE "omp_testsuite.f"
+        INTEGER j,i
+        <ompts:orphan:vars>
+        external my_sleep
+        INTEGER my_sum
+        INTEGER known_sum
+        INTEGER rslt
+        COMMON /orphvars/ my_sum, known_sum, rslt
+        </ompts:orphan:vars>
+
+        my_sum = 1234
+        rslt = 0
+        known_sum = 1234 + (LOOPCOUNT * (LOOPCOUNT + 1)) / 2
+
+!$omp parallel private(j)
+!$omp single
+        do i=1, NUM_TASKS
+        <ompts:orphan>
+!$omp task <ompts:check>firstprivate(my_sum)</ompts:check>
+          do j = 0, LOOPCOUNT
+!$omp flush
+            my_sum = my_sum + j
+          end do
+
+          ! check if calculated my_sum was right
+          if (my_sum .ne. known_sum) then
+!$omp critical
+            rslt = rslt + 1
+!$omp end critical
+          end if
+!$omp end task
+        </ompts:orphan>
+        end do
+!$omp end single
+!$omp end parallel
+
+        if (rslt .eq. 0) then
+            <testfunctionname></testfunctionname> = 1
+        else
+            <testfunctionname></testfunctionname> = 0
+        end if
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_task_if.f b/final/testsuite/fortran/omp_task_if.f
new file mode 100644
index 0000000..149f457
--- /dev/null
+++ b/final/testsuite/fortran/omp_task_if.f
@@ -0,0 +1,45 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the if clause of the omp task directive. The idea of the test is to generate a task in a single region and pause it immediately. The parent thread then sets a counter variable which the paused task evaluates when it is woken up.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task if</ompts:directive>
+<ompts:dependences>omp single,omp flush</ompts:dependences>
+<ompts:testcode>
+      INCLUDE "omp_my_sleep.f"
+
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_task_if</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INCLUDE "omp_testsuite.f"
+        <ompts:orphan:vars>
+        external my_sleep
+        INTEGER dummy
+        LOGICAL condition_false
+        INTEGER cnt
+        INTEGER rslt
+        COMMON /orphvars/ condition_false, cnt, rslt
+        </ompts:orphan:vars>
+
+        cnt = 0
+        dummy = 0
+        condition_false = (dummy .eq. 314159)
+
+!$omp parallel
+!$omp single
+        <ompts:orphan>
+!$omp task <ompts:check>if (condition_false)</ompts:check> shared(cnt,rslt)
+          call my_sleep(SLEEPTIME_LONG)
+!$omp flush
+          if (cnt .eq. 0) then
+              rslt = 1
+          else
+              rslt = 0
+          end if
+!$omp end task
+        </ompts:orphan>
+        cnt = 1
+!$omp end single
+!$omp end parallel
+
+        <testfunctionname></testfunctionname> = rslt
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_task_private.f b/final/testsuite/fortran/omp_task_private.f
new file mode 100644
index 0000000..5e44bc5
--- /dev/null
+++ b/final/testsuite/fortran/omp_task_private.f
@@ -0,0 +1,55 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the private clause of the task directive. We create a set of tasks in a single region. We define a variable named my_sum which gets declared private for each task. Each task then calculates a sum using this private variable. At the end we check whether the calculated sum was right.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task private</ompts:directive>
+<ompts:dependences>omp single,omp critical</ompts:dependences>
+<ompts:testcode>
+      INCLUDE "omp_my_sleep.f"
+
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_task_private</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INCLUDE "omp_testsuite.f"
+        INTEGER j,i
+        <ompts:orphan:vars>
+        external my_sleep
+        INTEGER my_sum
+        INTEGER known_sum
+        INTEGER rslt
+        COMMON /orphvars/ my_sum, known_sum, rslt
+        </ompts:orphan:vars>
+
+        my_sum = 0
+        rslt = 0
+        known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2
+
+!$omp parallel private(j)
+!$omp single
+        do i=1, NUM_TASKS
+        <ompts:orphan>
+!$omp task <ompts:check>private(my_sum)</ompts:check> shared(rslt, known_sum)
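+          ! each task resets its own private copy; without the clause
+          ! the tasks would race on a single shared my_sum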
+          <ompts:check>my_sum = 0</ompts:check>
+          do j = 0, LOOPCOUNT
+            my_sum = my_sum + j
+          end do
+
+          ! check whether the calculated my_sum is correct
+          if (my_sum .ne. known_sum) then
+!$omp critical
+            rslt = rslt + 1
+!$omp end critical
+          end if
+!$omp end task
+        </ompts:orphan>
+        end do
+!$omp end single
+!$omp end parallel
+
+        if (rslt .eq. 0) then
+            <testfunctionname></testfunctionname> = 1
+        else
+            <testfunctionname></testfunctionname> = 0
+        end if
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_task_shared.f b/final/testsuite/fortran/omp_task_shared.f
new file mode 100644
index 0000000..58564b3
--- /dev/null
+++ b/final/testsuite/fortran/omp_task_shared.f
@@ -0,0 +1,49 @@
+<ompts:test>
+<ompts:testdescription>Test which checks whether the implied shared attribute of task variables works correctly.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task</ompts:directive>
+<ompts:dependences>omp single, omp task firstprivate</ompts:dependences>
+<ompts:testcode>
+      INCLUDE "omp_my_sleep.f"
+
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_task_shared</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INCLUDE "omp_testsuite.f"
+        <ompts:orphan:vars>
+        external my_sleep
+        INTEGER i
+        COMMON /orphvars/ i
+        </ompts:orphan:vars>
+        INTEGER rslt
+        INTEGER k
+
+        i = 0
+        k = 0
+        rslt = 0
+
+!$omp parallel private(k) shared(i)
+!$omp single
+        do k=1, NUM_TASKS
+        <ompts:orphan>
+!$omp task <ompts:crosscheck>firstprivate(i)</ompts:crosscheck>
+!$omp+     <ompts:check>shared(i)</ompts:check>
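+          ! every task increments the same shared instance of i; with
+          ! firstprivate each task would update its own copy instead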
+!$omp atomic
+            i = i + 1
+!$omp end task
+        </ompts:orphan>
+        end do
+!$omp end single
+!$omp end parallel
+
+        rslt = i
+        if (rslt .eq. NUM_TASKS) then
+            <testfunctionname></testfunctionname> = 1
+        else
+            <testfunctionname></testfunctionname> = 0
+        end if
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_task_untied.f b/final/testsuite/fortran/omp_task_untied.f
new file mode 100644
index 0000000..e58e935
--- /dev/null
+++ b/final/testsuite/fortran/omp_task_untied.f
@@ -0,0 +1,65 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the untied clause of the task directive. First we generate a set of tasks and pause them immediately. Then we resume half of them and check whether at least one untied task is continued by a different thread than the one that started it.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp task untied</ompts:directive>
+<ompts:dependences>omp taskwait</ompts:dependences>
+<ompts:testcode>
+      INCLUDE "omp_my_sleep.f"
+
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_task_untied</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INCLUDE "omp_testsuite.f"
+        <ompts:orphan:vars>
+        EXTERNAL my_sleep
+        INTEGER omp_get_num_threads, omp_get_thread_num
+        INTEGER myj
+        INTEGER i,j
+        INTEGER cnt
+        INTEGER start_tid(NUM_TASKS)
+        INTEGER current_tid(NUM_TASKS)
+        COMMON /orphvars/ j, cnt, start_tid, current_tid
+        </ompts:orphan:vars>
+
+        cnt = 0
+        do i = 1, NUM_TASKS
+          start_tid(i) = 0
+          current_tid(i) = 0
+        end do
+
+!$omp parallel private(myj) shared(j)
+!$omp single
+        do i=1, NUM_TASKS
+        j = i
+        <ompts:orphan>
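+        ! take a private snapshot of the shared index j so that each
+        ! task works on its own slot even after the loop has moved on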
+        myj = j
+!$omp task <ompts:check>untied</ompts:check>
+          call my_sleep(SLEEPTIME)
+          start_tid(myj) = omp_get_thread_num()
+!$omp taskwait
+      <ompts:check>if (MOD(start_tid(myj),2) .ne. 0) then</ompts:check>
+        call my_sleep(SLEEPTIME)
+        current_tid(myj) = omp_get_thread_num()
+      <ompts:check>
+       else
+        current_tid(myj) = omp_get_thread_num()
+       end if</ompts:check>
+!$omp end task
+        </ompts:orphan>
+        end do
+!$omp end single
+!$omp end parallel
+
+        <testfunctionname></testfunctionname> = 0
+
+        ! check if at least one untied task switched threads
+        do i=1, NUM_TASKS
+          if (current_tid(i) .ne. start_tid(i)) then
+               <testfunctionname></testfunctionname> = 1
+          end if
+        end do
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_taskwait.f b/final/testsuite/fortran/omp_taskwait.f
new file mode 100644
index 0000000..2aa94c4
--- /dev/null
+++ b/final/testsuite/fortran/omp_taskwait.f
@@ -0,0 +1,83 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp taskwait directive. First we generate a set of tasks, which set the elements of an array to a specific value. Then we perform a taskwait and check whether all tasks have finished, i.e. all array elements contain the correct value. Then we generate a second set of tasks setting the array elements to another value. After the parallel region we check whether all tasks of the second set finished and were executed after the tasks of the first set.</ompts:testdescription>
+<ompts:ompversion>3.0</ompts:ompversion>
+<ompts:directive>omp taskwait</ompts:directive>
+<ompts:dependences>omp single,omp task</ompts:dependences>
+<ompts:testcode>
+      INCLUDE "omp_my_sleep.f"
+
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_taskwait</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INCLUDE "omp_testsuite.f"
+        INTEGER result1, result2
+        INTEGER array(NUM_TASKS)
+        INTEGER i, myi
+        <ompts:orphan:vars>
+        external my_sleep
+        </ompts:orphan:vars>
+
+        result1 = 0
+        result2 = 0
+
+        ! fill array
+        do i = 1, NUM_TASKS
+          array(i) = 0
+        end do
+
+!$omp parallel shared(i) private(myi)
+!$omp single
+        do i=1, NUM_TASKS
+         ! First we store the value of the loop index in a new variable,
+         ! private to each task; otherwise it would be overwritten if
+         ! executing a task takes longer than the loop needs to advance
+         ! to its next iteration.
+
+         myi = i
+
+!$omp task
+          call my_sleep(SLEEPTIME)
+          array(myi) = 1
+!$omp end task
+        end do
+
+        <ompts:orphan>
+        <ompts:check>
+!$omp taskwait
+        </ompts:check>
+        </ompts:orphan>
+
+        ! check if all tasks were finished
+        do i=1, NUM_TASKS
+          if (array(i) .ne. 1) then
+              result1 = result1 + 1
+          end if
+        end do
+
+        ! generate some more tasks which now shall overwrite the values
+        ! in the array
+        do i=1, NUM_TASKS
+          myi = i
+!$omp task
+          array(myi) = 2
+!$omp end task
+        end do
+
+!$omp end single
+!$omp end parallel
+
+        ! final check, if all array elements contain the right values
+        do i=1, NUM_TASKS
+          if (array(i) .ne. 2) then
+            result2 = result2 + 1
+          end if
+        end do
+
+        if ( (result1 .eq. 0) .and. (result2 .eq. 0) ) then
+            <testfunctionname></testfunctionname> = 1
+        else
+            <testfunctionname></testfunctionname> = 0
+        end if
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_test_nest_lock.f b/final/testsuite/fortran/omp_test_nest_lock.f
new file mode 100644
index 0000000..8396d24
--- /dev/null
+++ b/final/testsuite/fortran/omp_test_nest_lock.f
@@ -0,0 +1,61 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_test_nest_lock function.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_test_nest_lock</ompts:directive>
+<ompts:dependences>omp flush</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_test_nest_lock</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER result
+!result is:
+!      0 -- if the test fails
+!      1 -- if the test succeeds
+        INTEGER nr_threads_in_single
+        INTEGER nr_iterations
+        INTEGER i
+            <ompts:orphan:vars>
+        include "omp_lib.h"
+        INTEGER (KIND=OMP_NEST_LOCK_KIND) :: lock
+        COMMON /orphvars/ lock
+            </ompts:orphan:vars>
+        INCLUDE "omp_testsuite.f"
+
+        nr_iterations=0
+        nr_threads_in_single=0
+        CALL OMP_INIT_NEST_LOCK(lock)
+        result=0
+
+!$omp parallel shared(lock,nr_threads_in_single,nr_iterations,result)
+!$omp do
+        DO i=1,LOOPCOUNT
+                  <ompts:orphan>
+                  <ompts:check>
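+          ! spin until this thread has acquired the nested lock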
+          DO WHILE(OMP_TEST_NEST_LOCK(lock) .EQ. 0)
+          END DO
+                  </ompts:check>
+                  </ompts:orphan>
+!$omp flush
+          nr_threads_in_single=nr_threads_in_single+1
+!$omp flush
+          nr_iterations=nr_iterations+1
+          nr_threads_in_single=nr_threads_in_single-1
+          result=result+nr_threads_in_single
+                  <ompts:orphan>
+                  <ompts:check>
+          CALL OMP_UNSET_NEST_LOCK(lock)
+                  </ompts:check>
+                  </ompts:orphan>
+        END DO
+!$omp end do
+!$omp end parallel
+        CALL OMP_DESTROY_NEST_LOCK(lock)
+!               print *, result, nr_iterations
+        IF(result.EQ.0 .AND. nr_iterations .EQ. LOOPCOUNT) THEN
+              <testfunctionname></testfunctionname>=1
+        ELSE
+              <testfunctionname></testfunctionname>=0
+        ENDIF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_testlock.f b/final/testsuite/fortran/omp_testlock.f
new file mode 100644
index 0000000..3b1304d
--- /dev/null
+++ b/final/testsuite/fortran/omp_testlock.f
@@ -0,0 +1,58 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_test_lock function. The test counts the threads entering and exiting a region of mutual exclusion which is built with a test lock acquired in a spin loop.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_test_lock</ompts:directive>
+<ompts:dependences>omp flush</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_testlock</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER result
+        INTEGER nr_threads_in_single
+        INTEGER nr_iterations
+        INTEGER i
+              <ompts:orphan:vars>
+        include "omp_lib.h"
+        INTEGER (KIND=OMP_LOCK_KIND)::lock
+        COMMON /orphvars/ lock
+              </ompts:orphan:vars>
+        INCLUDE "omp_testsuite.f"
+
+        nr_iterations=0
+        nr_threads_in_single=0
+        CALL OMP_INIT_LOCK(lock)
+        result=0
+
+!$omp parallel shared(lock,nr_threads_in_single,nr_iterations,result)
+!$omp do
+        DO i=1,LOOPCOUNT
+                  <ompts:orphan>
+                  <ompts:check>
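+          ! spin until this thread has acquired the lock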
+          DO WHILE (.NOT. OMP_TEST_LOCK(lock))
+          END DO
+                  </ompts:check>
+                  </ompts:orphan>
+!$omp flush
+          nr_threads_in_single=nr_threads_in_single+1
+!$omp flush
+          nr_iterations=nr_iterations+1
+          nr_threads_in_single=nr_threads_in_single-1
+          result=result+nr_threads_in_single
+                  <ompts:orphan>
+                  <ompts:check>
+          CALL OMP_UNSET_LOCK(lock)
+                  </ompts:check>
+                  </ompts:orphan>
+        END DO
+!$omp end do
+!$omp end parallel
+        CALL OMP_DESTROY_LOCK(lock)
+!               print *, result, nr_iterations
+        IF(result.EQ.0 .AND. nr_iterations .EQ. LOOPCOUNT) THEN
+          <testfunctionname></testfunctionname>=1
+        ELSE
+          <testfunctionname></testfunctionname>=0
+        ENDIF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_testsuite.f b/final/testsuite/fortran/omp_testsuite.f
new file mode 100644
index 0000000..42d4821
--- /dev/null
+++ b/final/testsuite/fortran/omp_testsuite.f
@@ -0,0 +1,8 @@
+	integer LOOPCOUNT
+	parameter (LOOPCOUNT=1000)
+! Assumed values for the task tests, which reference these constants
+! via this include; adjust as needed.
+	integer NUM_TASKS
+	parameter (NUM_TASKS=5)
+	integer SLEEPTIME, SLEEPTIME_LONG
+	parameter (SLEEPTIME=1, SLEEPTIME_LONG=5)
diff --git a/final/testsuite/fortran/omp_threadprivate.f b/final/testsuite/fortran/omp_threadprivate.f
new file mode 100644
index 0000000..c920498
--- /dev/null
+++ b/final/testsuite/fortran/omp_threadprivate.f
@@ -0,0 +1,93 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp threadprivate directive by filling an array with random numbers in a parallelised region. Each thread generates one number of the array and saves it in a temporary threadprivate variable. In a second parallelised region the test verifies that the temporary variable still contains the former value by comparing it with the one in the array.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp threadprivate</ompts:directive>
+<ompts:dependences>omp critical,omp_set_dynamic,omp_get_num_threads,omp master</ompts:dependences>
+<ompts:testcode>
+!Yi Wen modified this function from his own understanding of the
+!semantics of the C version at 05042004.
+!The understanding is that sum0 and myvalue can be local static
+!variables of the chk_omp_threadprivate function; there is no need to
+!use a common block.
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_threadprivate</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum, known_sum, i , iter, rank,size, failed
+        INTEGER omp_get_num_threads, omp_get_thread_num
+        REAL my_random
+        REAL, ALLOCATABLE:: data(:)
+        INTEGER random_size
+        INTRINSIC random_number
+        INTRINSIC random_seed
+        EXTERNAL omp_set_dynamic
+
+!Yi Wen modified at 05042004 : add "save"
+        INTEGER, SAVE:: sum0
+        REAL, SAVE::myvalue
+!Yi Wen commented two common blocks
+!	common/csum0/ sum0
+!	common/cmyvalue/ myvalue
+!!!!!!!!!!$omp threadprivate(/csum0/,/cmyvalue/)
+		<ompts:check>
+!$omp threadprivate(sum0,myvalue)
+		</ompts:check>
+        INCLUDE "omp_testsuite.f"
+
+        sum = 0
+        failed = 0
+        sum0=0
+        myvalue=0
+        random_size=45
+        CALL omp_set_dynamic(.FALSE.)
+!$omp parallel
+        sum0 = 0
+!$omp do
+        DO i=1, LOOPCOUNT
+          sum0 = sum0 + i
+        END DO
+!$omp end do
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp end parallel
+        known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2
+        IF ( known_sum .NE. sum ) THEN
+          PRINT *, ' known_sum =', known_sum, ', sum =',sum
+        END IF
+
+        CALL omp_set_dynamic(.FALSE.)
+
+!$omp parallel
+!$omp master
+        size = omp_get_num_threads()
+        ALLOCATE ( data(size) )
+!$omp end master
+!$omp end parallel
+        CALL RANDOM_SEED(SIZE=random_size)
+        DO iter = 0, 99
+          CALL RANDOM_NUMBER(HARVEST=my_random)
+!$omp parallel private(rank)
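+! each thread stores its value both in its threadprivate copy and in
+! the shared array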
+          rank = omp_get_thread_num()+1
+          myvalue = my_random + rank
+          data(rank) = myvalue
+!$omp end parallel
+!$omp parallel private(rank)
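+! the threadprivate copy must still hold the value this thread wrote
+! in the previous parallel region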
+          rank = omp_get_thread_num()+1
+          IF ( myvalue .NE. data(rank) ) THEN
+            failed = failed + 1
+            PRINT *, ' myvalue =',myvalue,' data(rank)=', data(rank)
+          END IF
+!$omp end parallel
+        END DO
+        DEALLOCATE( data)
+        IF ( (known_sum .EQ. sum) .AND. (failed .EQ. 0) ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_workshare.f b/final/testsuite/fortran/omp_workshare.f
new file mode 100644
index 0000000..a8faa27
--- /dev/null
+++ b/final/testsuite/fortran/omp_workshare.f
@@ -0,0 +1,142 @@
+<ompts:test>
+<ompts:testdescription>Test which checks if WORKSHARE is present.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp workshare</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+!********************************************************************
+! Function: omp_workshare
+! 
+! by Chunhua Liao, University of Houston
+! Oct. 2005 - First version
+! 
+! The idea for the test is that if WORKSHARE is not present,
+! the array assignment in PARALLEL region will be executed by each 
+! thread and then wrongfully repeated several times.
+!
+! TODO: Do we need tests for WHERE and FORALL?
+! A simple test for WHERE and FORALL was added by Zhenying Liu
+!********************************************************************
+        INTEGER FUNCTION <ompts:testcode:functionname>omp_workshare</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER result,i
+        INTEGER scalar02,scalar12,scalar22,scalar32,count
+        REAL, DIMENSION(1000)::FF
+<ompts:orphan:vars>
+        INTEGER scalar0,scalar1,scalar2,scalar3
+        INTEGER, DIMENSION(1000)::AA,BB,CC
+        REAL, DIMENSION(1000)::DD
+        COMMON /orphvars/ scalar0,scalar1,scalar2,scalar3,
+     &      AA,BB,CC,DD
+</ompts:orphan:vars>
+
+        result=0
+        scalar0=0
+        scalar02=0
+        scalar1=0
+        scalar12=0
+        scalar2=0
+        scalar22=0
+        scalar3=0
+        scalar32=0
+ 
+        count = 0
+
+        AA=0
+        BB=0
+
+        do i=1,1000
+          CC(i) = i
+          FF(i) = 1.0/i
+        end do
+
+!$OMP PARALLEL
+<ompts:orphan>
+<ompts:check>!$OMP   WORKSHARE</ompts:check>
+
+! test if work is divided or not for array assignment
+        AA=AA+1
+
+! test if scalar assignment is treated as a single unit of work
+        scalar0=scalar0+1 
+
+! test if atomic is treated as a single unit of work
+!$OMP ATOMIC
+        scalar1=scalar1+1 
+! test if critical is treated as a single unit of work
+!$OMP CRITICAL
+        scalar2=scalar2+1
+!$OMP END CRITICAL
+
+! test if PARALLEL is treated as a single unit of work
+!$OMP PARALLEL
+        scalar3=scalar3+1
+!$OMP END PARALLEL
+
+        WHERE ( CC .ne. 0 ) DD = 1.0/CC
+
+        FORALL (I=1:1000) CC(i) = -i
+
+<ompts:check>!$OMP   END WORKSHARE</ompts:check>
+</ompts:orphan>
+!$OMP END PARALLEL
+
+!sequential equivalent statements for comparison 
+       BB=BB+1
+       scalar02=scalar02+1
+       scalar12=scalar12+1
+       scalar22=scalar22+1
+       scalar32=scalar32+1
+
+!      write (1,*) "ck:sum of AA is",SUM(AA)," sum of BB is ",sum(BB)
+       if (SUM(AA)/=SUM(BB)) then
+            write(1,*) "Array assignment has some problem"
+            result=result +1
+       endif
+       if (scalar0/=scalar02) then
+          write(1,*) "Scalar assignment has some problem"
+          result = result +1
+       endif
+       if (scalar1/=scalar12) then
+          write(1,*) "Atomic inside WORKSHARE has some problem"
+         result = result +1
+       endif
+       if (scalar2/=scalar22) then
+          write(1,*) "CRITICAL inside WORKSHARE has some problem"
+         result = result +1
+       endif
+       if (scalar3/=scalar32) then
+           write(1,*) "PARALLEL inside WORKSHARE has some problem"
+           result = result +1
+       endif
+       do i=1,1000
+         if ( abs( DD(i)- FF(i)) .gt. 1.0E-4 ) then
+	    count = count + 1
+         end if
+       end do
+       if ( count .ne. 0 ) then
+           result = result + 1
+           write(1,*) "WHERE has some problem"
+       end if
+
+       count = 0
+       do i=1,1000
+         if ( CC(i) .ne. -i ) then
+            count = count + 1
+         end if
+       end do
+       if ( count .ne. 0 ) then
+           result = result + 1
+           write(1,*) "FORALL has some problem"
+       end if
+
+
+!if anything is wrong, set return value to 0
+       if (result==0) then
+          <testfunctionname></testfunctionname> = 1
+       else
+          <testfunctionname></testfunctionname> = 0
+       end if
+       end
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_workshare_default.f b/final/testsuite/fortran/omp_workshare_default.f
new file mode 100644
index 0000000..f774c43
--- /dev/null
+++ b/final/testsuite/fortran/omp_workshare_default.f
@@ -0,0 +1,38 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the default(private) clause of the parallel directive by summing into a variable which is private by default and combining the partial sums in a critical section.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp workshare default</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_workshare_default</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum
+        INTEGER known_sum
+        INTEGER mysum
+        INTEGER i
+
+        known_sum = 999*1000/2
+        sum = 0
+
+!$omp parallel default(private) shared(sum)
+        mysum = 0
+!$omp do
+        DO i = 1, 999
+           mysum = mysum + i
+        END DO
+!$omp end do 
+
+!$omp critical
+        sum = sum + mysum
+!$omp end critical
+
+!$omp end parallel
+
+        IF ( (known_sum .EQ. sum) ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/omp_wtime.f b/final/testsuite/fortran/omp_wtime.f
new file mode 100644
index 0000000..557334f
--- /dev/null
+++ b/final/testsuite/fortran/omp_wtime.f
@@ -0,0 +1,45 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp_get_wtime function. It compares the time passed to a sleep function with the elapsed time obtained by measuring the difference between omp_get_wtime calls before and after the sleep.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp_get_wtime</ompts:directive>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>omp_wtime</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        <ompts:orphan:vars>
+        DOUBLE PRECISION start
+        DOUBLE PRECISION endtime
+        COMMON start, endtime
+        include "omp_lib.h"
+        </ompts:orphan:vars>
+        INTEGER wait_time
+        DOUBLE PRECISION measured_time
+        INTEGER fileunit
+        wait_time=1
+
+        start = 0
+        endtime = 0
+
+                <ompts:orphan>
+                <ompts:check>
+        start=omp_get_wtime()
+                </ompts:check>
+                </ompts:orphan>
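+        ! sleep is a widely supported (but non-standard) routine that
+        ! suspends execution for wait_time seconds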
+        CALL sleep(wait_time)
+                <ompts:orphan>
+                <ompts:check>
+        endtime=omp_get_wtime()
+                </ompts:check>
+                </ompts:orphan>
+        measured_time=endtime-start
+        WRITE(1,*) "work took",measured_time,"sec. time."
+        IF(measured_time.GT.0.99*wait_time .AND.
+     & measured_time .LT. 1.01*wait_time) THEN
+              <testfunctionname></testfunctionname>=1
+        ELSE
+              <testfunctionname></testfunctionname>=0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/par_do_firstprivate.f b/final/testsuite/fortran/par_do_firstprivate.f
new file mode 100644
index 0000000..79ae527
--- /dev/null
+++ b/final/testsuite/fortran/par_do_firstprivate.f
@@ -0,0 +1,26 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel do firstprivate directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel do firstprivate</ompts:directive>
+<ompts:dependences>par do reduction,par do private</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>par_do_firstprivate</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum,known_sum, i2, i
+        INCLUDE "omp_testsuite.f"
+        sum = 0
+        i2 = 3
+!$omp parallel do <ompts:check>firstprivate(i2)</ompts:check><ompts:crosscheck>private(i2)</ompts:crosscheck> reduction(+:sum)
+        DO i=1, LOOPCOUNT
+          sum = sum + ( i+ i2)
+        END DO
+!$omp end parallel do
+        known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2+3*LOOPCOUNT
+        IF ( known_sum .EQ. sum ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/par_do_if.f b/final/testsuite/fortran/par_do_if.f
new file mode 100644
index 0000000..0b7fc9a
--- /dev/null
+++ b/final/testsuite/fortran/par_do_if.f
@@ -0,0 +1,32 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel do if directive. Needs at least two threads.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel do if</ompts:directive>
+<ompts:dependences></ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>par_do_if</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER omp_get_num_threads
+        INTEGER sum,known_sum, i, num_threads
+        INTEGER control
+        INCLUDE "omp_testsuite.f"
+        sum = 0
+
+        control = 0
+!$omp parallel do <ompts:check>if (control == 1)</ompts:check>
+        DO i=1, LOOPCOUNT
+          sum = sum + i
+          num_threads = omp_get_num_threads ()
+        END DO
+!$omp end parallel do
+        WRITE (1,*) "Number of threads determined by:"\
+                    "omg_get_num_threasd:", num_threads
+        known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2
+        IF ( known_sum .EQ. sum .AND. num_threads .EQ. 1) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/par_do_lastprivate.f b/final/testsuite/fortran/par_do_lastprivate.f
new file mode 100644
index 0000000..180210c
--- /dev/null
+++ b/final/testsuite/fortran/par_do_lastprivate.f
@@ -0,0 +1,28 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel do lastprivate directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel do lastprivate</ompts:directive>
+<ompts:dependences>par do reduction, par do private</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>par_do_lastprivate</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum, known_sum, i , i0
+        INCLUDE "omp_testsuite.f"
+        sum = 0
+        i0 = -1
+
+!$omp parallel do reduction(+:sum) schedule(static,7) <ompts:check>lastprivate(i0)</ompts:check><ompts:crosscheck>private(i0)</ompts:crosscheck>
+        DO i=1, LOOPCOUNT
+          sum = sum + i
+          i0 = i
+        END DO
+!$omp end parallel do
+        known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2
+        IF ( (known_sum .EQ. sum) .AND. (i0 .EQ. LOOPCOUNT) ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END   
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/par_do_ordered.f b/final/testsuite/fortran/par_do_ordered.f
new file mode 100644
index 0000000..33d4cd0
--- /dev/null
+++ b/final/testsuite/fortran/par_do_ordered.f
@@ -0,0 +1,67 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel do ordered directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel do ordered</ompts:directive>
+<ompts:dependences>par schedule stat</ompts:dependences>
+<ompts:testcode>
+! **********************************************************
+! Helper function i_islarger2
+! **********************************************************
+      INTEGER FUNCTION i_islarger2(i)
+        IMPLICIT NONE
+        INTEGER i
+        INTEGER last_i,islarger
+        COMMON /com/ last_i
+        INCLUDE "omp_testsuite.f"
+!        print *, "last_i",last_i, "i", i
+! last_i is a global variable
+        IF ( i .GT. last_i ) THEN
+          islarger = 1
+        ELSE
+          islarger = 0
+        END IF
+        last_i = i
+        i_islarger2 = islarger
+      END FUNCTION
+
+      INTEGER FUNCTION <ompts:testcode:functionname>par_do_ordered</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        COMMON /com/ last_i
+        INTEGER known_sum,i, last_i
+<ompts:orphan:vars>
+        INTEGER is_larger,sum,i_islarger2
+        COMMON /orphvars/ is_larger,sum,i
+</ompts:orphan:vars>
+        
+        sum=0
+        is_larger=1
+        last_i=0
+!$omp parallel do schedule(static, 1) ordered
+        DO i=1, 99
+                <ompts:orphan>
+		<ompts:check>
+!$omp ordered
+		</ompts:check>
+        IF( i_islarger2(i) .EQ. 1 .AND. is_larger .EQ. 1 ) THEN  
+          is_larger = 1
+        ELSE
+          is_larger = 0
+        END IF
+        sum = sum + i
+		<ompts:check>
+!$omp end ordered
+		</ompts:check>
+                </ompts:orphan>
+        END DO
+!$omp end parallel do
+        known_sum = (99*100)/2
+!Yi Wen; Sun compiler will fail sometimes
+!        print *, "sum", sum, "ks", known_sum, "la", is_larger
+        IF ( known_sum .EQ. sum .AND. is_larger .EQ. 1 ) THEN
+           <testfunctionname></testfunctionname> = 1
+        ELSE
+           <testfunctionname></testfunctionname> = 0
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/par_do_private.f b/final/testsuite/fortran/par_do_private.f
new file mode 100644
index 0000000..dd39ab9
--- /dev/null
+++ b/final/testsuite/fortran/par_do_private.f
@@ -0,0 +1,48 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel do private directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel do private</ompts:directive>
+<ompts:dependences>par do reduction,omp flush</ompts:dependences>
+<ompts:testcode>
+      SUBROUTINE do_some_work2()
+        IMPLICIT NONE
+        REAL i
+        DOUBLE PRECISION sum
+        INTRINSIC sqrt
+        INCLUDE "omp_testsuite.f"
+        sum = 0.0
+        i = 0
+        DO WHILE (i < LOOPCOUNT)
+           sum = sum + sqrt(i)
+           i = i + 1
+        END DO
+      END
+
+!********************************************************************
+
+      INTEGER FUNCTION <ompts:testcode:functionname>par_do_private</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum,known_sum, i, i2, i3
+        INCLUDE "omp_testsuite.f"
+        sum = 0
+
+!$omp parallel do reduction(+:sum) <ompts:check>private(i2)</ompts:check> schedule(static,1)
+        DO i=1, LOOPCOUNT
+          i2 = i
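+          ! with private(i2) each thread keeps its own copy; the
+          ! flushes would otherwise expose i2 to other threads' writes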
+!$omp flush
+          CALL do_some_work2()
+!$omp flush
+          sum = sum + i2
+        END DO
+!$omp end parallel do
+        known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2
+        IF ( known_sum .EQ. sum ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/par_do_reduction.f b/final/testsuite/fortran/par_do_reduction.f
new file mode 100644
index 0000000..9a3f455
--- /dev/null
+++ b/final/testsuite/fortran/par_do_reduction.f
@@ -0,0 +1,415 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel do reduction directive with all its options.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel do reduction</ompts:directive>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>par_do_reduction</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum, sum2, known_sum, i, i2,diff
+        INTEGER product,known_product,int_const
+        INTEGER MAX_FACTOR
+        DOUBLE PRECISION dsum,dknown_sum,dt,dpt
+        DOUBLE PRECISION rounding_error, ddiff
+        INTEGER double_DIGITS
+        LOGICAL logic_and, logic_or, logic_eqv,logic_neqv
+        INTEGER bit_and, bit_or
+        INTEGER exclusiv_bit_or
+        INTEGER min_value, max_value
+        DOUBLE PRECISION dmin, dmax
+        INTEGER result
+        INCLUDE "omp_testsuite.f"
+        LOGICAL logics(LOOPCOUNT)
+        INTEGER int_array(LOOPCOUNT)
+        DOUBLE PRECISION d_array(LOOPCOUNT)
+        PARAMETER (int_const=10,known_product=3628800)
+        PARAMETER (double_DIGITS=20,MAX_FACTOR=10)
+        PARAMETER (rounding_error=1.E-6)
+
+        dt = 1./3.
+        known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2
+        product = 1
+        sum2 = 0
+        sum = 0
+        dsum = 0.
+        result =0 
+        logic_and = .true.
+        logic_or = .false.
+        bit_and = 1
+        bit_or = 0
+        exclusiv_bit_or = 0
+!$omp parallel do schedule(dynamic, 1) <ompts:check>reduction(+:sum)</ompts:check>
+        DO i =1, LOOPCOUNT
+          sum = sum + i
+        END DO
+!$omp end parallel do
+
+        IF (known_sum .NE. sum) THEN
+          result = result + 1
+          WRITE(1,*) "Error in sum with integers: Result was ",
+     &       sum,"instead of ", known_sum
+        END IF
+
+        diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2
+
+
+
+!$omp parallel do schedule(dynamic, 1) <ompts:check>reduction (-: diff)</ompts:check>
+        DO i =1, LOOPCOUNT
+          diff = diff - i
+        END DO
+!$omp end parallel do
+  
+        IF ( diff .NE. 0 ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in difference with integers: Result was ",
+     &       sum,"instead of 0."
+        END IF
+
+!... Test for doubles
+        dsum =0.
+        dpt = 1
+
+        DO i=1, DOUBLE_DIGITS
+          dpt= dpt * dt
+        END DO
+        dknown_sum = (1-dpt)/(1-dt)
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(+:dsum)</ompts:check>
+        DO i=0,DOUBLE_DIGITS-1
+              dsum = dsum + dt**i
+        END DO
+!$omp end parallel do
+
+ 
+        IF(dsum .NE. dknown_sum .AND. 
+     &    abs(dsum - dknown_sum) .GT. rounding_error ) THEN
+          result = result + 1
+          write(1,*) "Error in sum with doubles: Result was ",
+     &      dsum,"instead of ",dknown_sum,"(Difference: ",
+     &      dsum - dknown_sum,")"
+        END IF
+        dpt = 1
+
+        DO i=1, DOUBLE_DIGITS
+          dpt = dpt*dt
+        END DO
+        ddiff = ( 1-dpt)/(1-dt)
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(-:ddiff)</ompts:check>
+        DO i=0, DOUBLE_DIGITS-1
+          ddiff = ddiff - dt**i
+        END DO
+!$omp end parallel do
+
+        IF ( ABS(ddiff) .GT. rounding_error ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Difference with doubles: Result was ",
+     &       ddiff,"instead of 0.0"
+        END IF
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(*:product)</ompts:check>
+        DO i=1,MAX_FACTOR
+           product = product * i
+        END DO
+!$omp end parallel do
+
+        IF (known_product .NE. product) THEN
+           result = result + 1
+           WRITE(1,*) "Error in Product with integers: Result was ",
+     &       product," instead of",known_product 
+        END IF
+
+        DO i=1,LOOPCOUNT
+          logics(i) = .TRUE.
+        END DO
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(.AND.:logic_and)</ompts:check>
+        DO i=1,LOOPCOUNT
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp end parallel do
+
+        IF (.NOT. logic_and) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic AND part 1"
+        END IF
+
+
+        logic_and = .TRUE.
+        logics(LOOPCOUNT/2) = .FALSE.
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(.AND.:logic_and)</ompts:check>
+        DO i=1,LOOPCOUNT
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp end parallel do
+
+        IF (logic_and) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic AND pass 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+         logics(i) = .FALSE.
+        END DO
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(.OR.:logic_or)</ompts:check>
+        DO i = 1, LOOPCOUNT
+           logic_or = logic_or .or. logics(i)
+        END DO
+!$omp end parallel do
+
+        IF (logic_or) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic OR part 1"
+        END IF
+
+        logic_or = .FALSE.
+        logics(LOOPCOUNT/2) = .TRUE.
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(.OR.:logic_or)</ompts:check>
+        DO i=1,LOOPCOUNT
+          logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp end parallel do
+
+        IF ( .NOT. logic_or ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic OR part 2"
+        END IF
+
+!... Test logic EQV, unique in Fortran
+        DO i=1, LOOPCOUNT
+         logics(i) = .TRUE.
+        END DO
+
+        logic_eqv = .TRUE.
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(.EQV.:logic_eqv)</ompts:check>
+        DO i = 1, LOOPCOUNT
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp end parallel do
+
+        IF (.NOT. logic_eqv) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic EQV part 1"
+        END IF
+
+        logic_eqv = .TRUE.
+        logics(LOOPCOUNT/2) = .FALSE.
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(.EQV.:logic_eqv)</ompts:check>
+        DO i=1,LOOPCOUNT
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp end parallel do
+
+        IF ( logic_eqv ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic EQV part 2"
+        END IF
+
+!... Test logic NEQV, which is unique in Fortran
+        DO i=1, LOOPCOUNT
+         logics(i) = .FALSE.
+        END DO
+
+        logic_neqv = .FALSE.
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(.NEQV.:logic_neqv)</ompts:check>
+        DO i = 1, LOOPCOUNT
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp end parallel do
+
+        IF (logic_neqv) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic NEQV part 1"
+        END IF
+
+        logic_neqv = .FALSE.
+        logics(LOOPCOUNT/2) = .TRUE.
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(.NEQV.:logic_neqv)</ompts:check>
+        DO i=1,LOOPCOUNT
+           logic_neqv = logic_neqv .neqv. logics(i)
+        END DO
+!$omp end parallel do
+
+        IF ( .NOT. logic_neqv ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic NEQV part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+           int_array(i) = 1
+        END DO
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(IAND:bit_and)</ompts:check>
+        DO i=1, LOOPCOUNT
+!... iand(I,J): Returns value resulting from boolean AND of 
+!... pair of bits in each of I and J. 
+          bit_and = IAND(bit_and,int_array(i))
+        END DO
+!$omp end parallel do
+
+        IF ( bit_and .LT. 1 ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in IAND part 1"
+        END IF
+
+        bit_and = 1
+        int_array(LOOPCOUNT/2) = 0
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(IAND:bit_and)</ompts:check>
+        DO i=1, LOOPCOUNT
+          bit_and = IAND ( bit_and, int_array(i) )
+        END DO
+!$omp end parallel do
+
+        IF( bit_and .GE. 1) THEN
+          result = result + 1
+          WRITE(1,*) "Error in IAND part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 0
+        END DO
+
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(IOR:bit_or)</ompts:check>
+        DO i=1, LOOPCOUNT
+!... Ior(I,J): Returns value resulting from boolean OR of 
+!... pair of bits in each of I and J. 
+          bit_or = IOR(bit_or, int_array(i) )
+        END DO
+!$omp end parallel do
+
+        IF ( bit_or .GE. 1) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ior part 1"
+        END IF
+
+
+        bit_or = 0
+        int_array(LOOPCOUNT/2) = 1
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(IOR:bit_or)</ompts:check>
+        DO i=1, LOOPCOUNT
+          bit_or = IOR(bit_or, int_array(i) )
+        END DO
+!$omp end parallel do
+
+        IF ( bit_or .LE. 0) then
+          result = result + 1
+          WRITE(1,*) "Error in Ior part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 0
+        END DO
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(IEOR:exclusiv_bit_or)</ompts:check>
+        DO i = 1, LOOPCOUNT
+          exclusiv_bit_or = ieor(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp end parallel do
+
+        IF ( exclusiv_bit_or .GE. 1) THEN
+           result = result + 1
+           WRITE(1,*) "Error in Ieor part 1"
+        END IF
+
+        exclusiv_bit_or = 0
+        int_array(LOOPCOUNT/2) = 1
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(IEOR:exclusiv_bit_or)</ompts:check>
+        DO i = 1, LOOPCOUNT
+            exclusiv_bit_or = IEOR(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp end parallel do
+
+        IF ( exclusiv_bit_or .le. 0) then
+          result = result + 1
+          WRITE(1,*) "Error in Ieor part 2"
+        END IF
+
+        DO i=1,LOOPCOUNT
+           int_array(i) = 10 - i
+        END DO
+
+        min_value = 65535
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(MIN:min_value)</ompts:check>
+        DO i = 1, LOOPCOUNT
+            min_value = MIN(min_value,int_array(i) )
+        END DO
+!$omp end parallel do
+
+        IF ( min_value .GT. (10-LOOPCOUNT) )THEN
+          result = result + 1
+          WRITE(1,*) "Error in integer MIN"
+        END IF
+
+
+        DO i=1,LOOPCOUNT
+           int_array(i) = i
+        END DO
+
+        max_value = -32768
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(MAX:max_value)</ompts:check>
+        DO i = 1, LOOPCOUNT
+            max_value = MAX(max_value,int_array(i) )
+        END DO
+!$omp end parallel do
+
+        IF ( max_value .LT. LOOPCOUNT )THEN
+          result = result + 1
+          WRITE(1,*) "Error in integer MAX"
+        END IF
+
+!... test double min, max
+        DO i=1,LOOPCOUNT
+          d_array(i) = 10 - i*dt
+        END DO
+
+        dmin = 2**10
+        dt = 0.5
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(MIN:dmin)</ompts:check>
+        DO i = 1, LOOPCOUNT
+          dmin= MIN(dmin,d_array(i) )
+        END DO
+!$omp end parallel do
+
+        IF ( dmin .GT. (10-dt) )THEN
+          result = result + 1
+          WRITE(1,*) "Error in double MIN"
+        END IF
+
+
+        DO i=1,LOOPCOUNT
+           d_array(i) = i * dt
+        END DO
+
+        dmax= - (2**10)
+
+!$omp parallel do schedule(dynamic,1) <ompts:check>reduction(max:dmax)</ompts:check>
+        DO i = 1, LOOPCOUNT
+          dmax= max(dmax,d_array(i) )
+        END DO
+!$omp end parallel do
+
+        IF ( dmax .LT. LOOPCOUNT*dt )THEN
+          result = result + 1
+          WRITE(1,*) "Error in double MAX"
+        END IF
+
+        IF ( result .EQ. 0 ) THEN
+          <testfunctionname></testfunctionname> =  1
+        ELSE
+          <testfunctionname></testfunctionname> =  0
+        END IF
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/par_section_firstprivate.f b/final/testsuite/fortran/par_section_firstprivate.f
new file mode 100644
index 0000000..b32b85c
--- /dev/null
+++ b/final/testsuite/fortran/par_section_firstprivate.f
@@ -0,0 +1,34 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel sections firstprivate clause.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel sections firstprivate</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>par_section_firstprivate</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum, sum0, known_sum
+        sum = 7
+        sum0 = 11
+!$omp parallel sections <ompts:check>firstprivate(sum0)</ompts:check><ompts:crosscheck>private(sum0)</ompts:crosscheck>  
+!$omp section
+!$omp critical 
+        sum = sum + sum0
+!$omp end critical
+<ompts:crosscheck>!$omp section</ompts:crosscheck>
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+<ompts:crosscheck>!$omp section</ompts:crosscheck>
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp end parallel sections
+        known_sum = 11*3 + 7
+        IF ( known_sum .EQ. sum ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END 
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/par_section_lastprivate.f b/final/testsuite/fortran/par_section_lastprivate.f
new file mode 100644
index 0000000..d9c721d
--- /dev/null
+++ b/final/testsuite/fortran/par_section_lastprivate.f
@@ -0,0 +1,51 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel sections lastprivate directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel sections lastprivate</ompts:directive>
+<ompts:dependences>omp critical,omp parallel sections private</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>par_section_lastprivate</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum, sum0, known_sum, i ,i0
+        sum = 0
+        sum0 = 0
+        i0 = -1
+!$omp parallel sections <ompts:check>lastprivate(i0)</ompts:check><ompts:crosscheck>private(i0)</ompts:crosscheck> private(i,sum0)
+!$omp section
+        sum0 = 0
+        DO i=1, 399
+          sum0 = sum0 + i
+          i0=i
+        END DO
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp section
+        sum0 = 0
+        DO i=400, 699
+          sum0 = sum0 + i
+          i0 = i
+        END DO
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp section
+        sum0 = 0
+        DO i=700, 999
+          sum0 = sum0 + i
+          i0 = i
+        END DO
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp end parallel sections
+        known_sum = (999*1000)/2
+!        print *, "sum", sum, "ks", known_sum, i0
+        IF ( known_sum .EQ. sum .AND. i0 .EQ. 999 ) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/par_section_private.f b/final/testsuite/fortran/par_section_private.f
new file mode 100644
index 0000000..fd0ac0e
--- /dev/null
+++ b/final/testsuite/fortran/par_section_private.f
@@ -0,0 +1,86 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the parallel section private clause.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel section private</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>par_section_private</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum, sum0, known_sum, i
+        sum = 7
+        sum0 = 0
+!$omp parallel sections <ompts:check>private(sum0,i)</ompts:check><ompts:crosscheck>private(i)</ompts:crosscheck>
+!$omp section
+        sum0 = 0
+        DO i=1, 399
+          sum0 = sum0 + i
+        END DO
+!$omp critical
+          sum = sum + sum0
+!$omp end critical
+!$omp section
+        sum0 = 0
+        DO i=400, 699
+          sum0 = sum0 + i
+        END DO
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp section
+        sum0 = 0
+        DO i=700, 999
+          sum0 = sum0 + i
+        END DO
+!$omp critical
+          sum = sum + sum0
+!$omp end critical
+!$omp end parallel sections
+        known_sum = (999*1000)/2+7
+        IF ( known_sum .eq. sum ) then
+           <testfunctionname></testfunctionname> = 1
+        ELSE
+           <testfunctionname></testfunctionname> = 0
+        END IF
+      END 
+</ompts:testcode>
+</ompts:test>
+
+        integer function crschk_par_section_private()
+        implicit none
+        integer sum, sum0, known_sum, i
+        sum = 7
+        sum0 = 0
+!$omp parallel sections private(i)
+!$omp section
+        sum0 = 0
+        do i=1, 399
+          sum0 = sum0 + i
+        end do
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp section
+        sum0 = 0
+        do i=400, 699
+          sum0 = sum0 + i
+        end do
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp section
+        sum0 = 0
+        do i=700, 999
+          sum0 = sum0 + i
+        end do
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp end parallel sections
+        known_sum = (999*1000)/2+7
+        if ( known_sum .eq. sum ) then
+          crschk_par_section_private = 1
+        else
+          crschk_par_section_private = 0
+        end if
+        end
+
diff --git a/final/testsuite/fortran/par_section_reduct.f b/final/testsuite/fortran/par_section_reduct.f
new file mode 100644
index 0000000..adde304
--- /dev/null
+++ b/final/testsuite/fortran/par_section_reduct.f
@@ -0,0 +1,633 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp parallel sections reduction directive with all its options.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel sections reduction</ompts:directive>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>par_section_reduct</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum, sum2, known_sum, i, i2,diff
+        INTEGER product,known_product,int_const
+        INTEGER MAX_FACTOR
+        DOUBLE PRECISION dsum,dknown_sum,dt,dpt
+        DOUBLE PRECISION rounding_error, ddiff
+        INTEGER DOUBLE_DIGITS
+        LOGICAL logic_and, logic_or, logic_eqv,logic_neqv
+        INTEGER bit_and, bit_or
+        INTEGER exclusiv_bit_or
+        INTEGER min_value, max_value
+        DOUBLE PRECISION dmin, dmax
+        INTEGER result
+        INCLUDE "omp_testsuite.f"
+        LOGICAL logics(LOOPCOUNT)
+        INTEGER int_array(LOOPCOUNT)
+        DOUBLE PRECISION d_array(LOOPCOUNT)
+        PARAMETER (int_const=10,known_product=3628800)
+        PARAMETER (DOUBLE_DIGITS=20,MAX_FACTOR=10)
+        PARAMETER (rounding_error=1.E-6)
+
+        INTEGER cut1, cut2, cut3, cut4
+
+        dt = 1./3.
+        known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2
+        product = 1
+        sum2 = 0
+        sum = 0
+        dsum = 0.
+        result =0 
+        logic_and = .TRUE.
+        logic_or = .FALSE.
+        bit_and = 1
+        bit_or = 0
+        exclusiv_bit_or = 0
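+! cut1..cut4 split the range 1..LOOPCOUNT into three consecutive
+! chunks, one per section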
+        cut1 = NINT(LOOPCOUNT / 3.)
+        cut2 = cut1 + 1
+        cut3 = 2 * cut1
+        cut4 = cut3 + 1
+
+!$omp parallel sections private(i) <ompts:check>reduction(+:sum)</ompts:check>
+!$omp section
+        DO i =1, cut1
+          sum = sum + i
+        END DO
+!$omp section
+        DO i =cut2, cut3
+          sum = sum + i
+        END DO
+!$omp section
+        DO i =cut4, LOOPCOUNT
+          sum = sum + i
+        END DO
+!$omp end parallel sections
+
+        IF (known_sum .NE. sum) THEN
+          result = result + 1
+          WRITE(1,*) "Error in sum with integers: Result was ",
+     &       sum,"instead of ", known_sum
+        END IF
+
+        diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2
+
+
+!$omp parallel sections <ompts:check>reduction (-: diff)</ompts:check>
+!$omp section
+        DO i =1, cut1
+          diff = diff - i
+        END DO
+!$omp section
+        DO i =cut2, cut3
+          diff = diff - i
+        END DO
+!$omp section
+        DO i =cut4, LOOPCOUNT
+          diff = diff - i
+        END DO
+!$omp end parallel sections
+  
+        IF ( diff .NE. 0 ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in difference with integers: Result was ",
+     &       sum,"instead of 0."
+        END IF
+
+!... Test for doubles
+        dsum =0.
+        dpt = 1
+
+        DO i=1, DOUBLE_DIGITS
+          dpt= dpt * dt
+        END DO
+        dknown_sum = (1-dpt)/(1-dt)
+!$omp parallel sections <ompts:check>reduction(+:dsum)</ompts:check>
+!$omp section
+        DO i=0,6
+           dsum = dsum + dt**i
+        END DO
+!$omp section
+        DO i=7,12
+           dsum = dsum + dt**i
+        END DO
+!$omp section
+        DO i=13,DOUBLE_DIGITS-1
+           dsum = dsum + dt**i
+        END DO
+!$omp end parallel sections
+
+ 
+        IF(dsum .NE. dknown_sum .AND. 
+     &     ABS(dsum - dknown_sum) .GT. rounding_error ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in sum with doubles: Result was ",
+     &       dsum,"instead of ",dknown_sum,"(Difference: ",
+     &       dsum - dknown_sum,")"
+        END IF
+        dpt = 1
+
+
+      
+        DO i=1, DOUBLE_DIGITS
+           dpt = dpt*dt
+        END DO
+        ddiff = ( 1-dpt)/(1-dt)
+!$omp parallel sections <ompts:check>reduction(-:ddiff)</ompts:check>
+!$omp section
+        DO i=0, 6
+          ddiff = ddiff - dt**i
+        END DO
+!$omp section
+        DO i=7, 12
+          ddiff = ddiff - dt**i
+        END DO
+!$omp section
+        DO i=13, DOUBLE_DIGITS-1
+          ddiff = ddiff - dt**i
+        END DO
+!$omp end parallel sections
+
+        IF ( ABS(ddiff) .GT. rounding_error ) THEN
+           result = result + 1
+           WRITE(1,*) "Error in Difference with doubles: Result was ",
+     &       ddiff,"instead of 0.0"
+        END IF
+
+!$omp parallel sections <ompts:check>reduction(*:product)</ompts:check>
+!$omp section
+        DO i=1,3
+           product = product * i
+        END DO
+!$omp section
+        DO i=4,6
+           product = product * i
+        END DO
+!$omp section
+        DO i=7,10
+           product = product * i
+        END DO
+!$omp end parallel sections
+
+        IF (known_product .NE. product) THEN
+           result = result + 1
+           WRITE(1,*) "Error in Product with integers: Result was ",
+     &       product," instead of",known_product 
+         END IF
+
+        DO i=1,LOOPCOUNT
+          logics(i) = .TRUE.
+        END DO
+
+!$omp parallel sections <ompts:check>reduction(.AND.:logic_and)</ompts:check>
+!$omp section
+        DO i=1,cut1
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp section
+        DO i=cut2,cut3
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp section
+        DO i=cut4,LOOPCOUNT
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp end parallel sections
+
+        if (.not. logic_and) then
+          result = result + 1
+          write(1,*) "Error in logic AND part 1"
+        end if
+
+
+        logic_and = .TRUE.
+        logics(LOOPCOUNT/2) = .FALSE.
+
+!$omp parallel sections <ompts:check>reduction(.AND.:logic_and)</ompts:check>
+!$omp section
+        DO i=1,cut1
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp section
+        DO i=cut2,cut3
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp section
+        DO i=cut4,LOOPCOUNT
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp end parallel sections
+
+        IF (logic_and) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic AND pass 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          logics(i) = .FALSE.
+        END DO
+
+!$omp parallel sections <ompts:check>reduction(.OR.:logic_or)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+          logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp section
+        DO i = cut2, cut3
+          logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+          logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp end parallel sections
+
+        IF (logic_or) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic OR part 1"
+        END IF
+
+        logic_or = .FALSE.
+        logics(LOOPCOUNT/2) = .TRUE.
+
+!$omp parallel sections <ompts:check>reduction(.OR.:logic_or)</ompts:check>
+!$omp section
+        DO i=1,cut1
+          logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp section
+        DO i=cut2,cut3
+          logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp section
+        DO i=cut4,LOOPCOUNT
+          logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp end parallel sections
+
+        IF ( .NOT. logic_or ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic OR part 2"
+        END IF
+
+!... Test logic EQV, unique in Fortran
+        DO i=1, LOOPCOUNT
+          logics(i) = .TRUE.
+        END DO
+
+        logic_eqv = .TRUE.
+
+!$omp parallel sections <ompts:check>reduction(.EQV.:logic_eqv)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp section
+        DO i = cut2, cut3
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp end parallel sections
+
+        IF (.NOT. logic_eqv) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic EQV part 1"
+        END IF
+
+        logic_eqv = .TRUE.
+        logics(LOOPCOUNT/2) = .FALSE.
+
+!$omp parallel sections <ompts:check>reduction(.EQV.:logic_eqv)</ompts:check>
+!$omp section
+        DO i=1,cut1
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp section
+        DO i=cut2,cut3
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp section
+        DO i=cut4,LOOPCOUNT
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp end parallel sections
+
+        IF ( logic_eqv ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic EQV part 2"
+        END IF
+
+!... Test logic NEQV, which is unique in Fortran
+        DO i=1, LOOPCOUNT
+          logics(i) = .FALSE.
+        END DO
+
+        logic_neqv = .FALSE.
+
+!$omp parallel sections <ompts:check>reduction(.NEQV.:logic_neqv)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+          logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp section
+        DO i = cut2, cut3
+          logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+          logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp end parallel sections
+
+        IF (logic_neqv) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic NEQV part 1"
+        END IF
+
+        logic_neqv = .FALSE.
+        logics(LOOPCOUNT/2) = .TRUE.
+
+!$omp parallel sections <ompts:check>reduction(.NEQV.:logic_neqv)</ompts:check>
+!$omp section
+        DO i=1,cut1
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp section
+        DO i=cut2,cut3
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp section
+        DO i=cut4,LOOPCOUNT
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp end parallel sections
+
+        IF ( .NOT. logic_neqv ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic NEQV part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 1
+        END DO
+!$omp parallel sections <ompts:check>reduction(IAND:bit_and)</ompts:check>
+!... iand(I,J): Returns value resulting from boolean AND of
+!... pair of bits in each of I and J.
+!$omp section
+        DO i=1, cut1
+          bit_and = IAND(bit_and,int_array(i))
+        END DO
+!$omp section
+        DO i=cut2, cut3
+          bit_and = IAND(bit_and,int_array(i))
+        END DO
+!$omp section
+        DO i=cut4, LOOPCOUNT
+          bit_and = IAND(bit_and,int_array(i))
+        END DO
+!$omp end parallel sections
+
+        IF ( bit_and .LT. 1 ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in IAND part 1"
+        END IF
+
+        bit_and = 1
+        int_array(LOOPCOUNT/2) = 0
+
+!$omp parallel sections <ompts:check>reduction(IAND:bit_and)</ompts:check>
+!$omp section
+        DO i=1, cut1
+          bit_and = IAND ( bit_and, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut2, cut3
+          bit_and = IAND ( bit_and, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut4, LOOPCOUNT
+          bit_and = IAND ( bit_and, int_array(i) )
+        END DO
+!$omp end parallel sections
+
+        IF( bit_and .GE. 1) THEN
+          result = result + 1
+          WRITE(1,*) "Error in IAND part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 0
+        END DO
+
+
+!$omp parallel sections <ompts:check>reduction(IOR:bit_or)</ompts:check>
+!... Ior(I,J): Returns value resulting from boolean OR of
+!... pair of bits in each of I and J.
+!$omp section
+        DO i=1, cut1
+          bit_or = IOR(bit_or, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut2, cut3
+          bit_or = IOR(bit_or, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut4, LOOPCOUNT
+          bit_or = IOR(bit_or, int_array(i) )
+        END DO
+!$omp end parallel sections
+
+        IF ( bit_or .GE. 1) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ior part 1"
+        END IF
+
+
+        bit_or = 0
+        int_array(LOOPCOUNT/2) = 1
+!$omp parallel sections <ompts:check>reduction(IOR:bit_or)</ompts:check>
+!$omp section
+        DO i=1, cut1
+          bit_or = IOR(bit_or, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut2, cut3
+          bit_or = IOR(bit_or, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut4, LOOPCOUNT
+          bit_or = IOR(bit_or, int_array(i) )
+        END DO
+!$omp end parallel sections
+
+        IF ( bit_or .LE. 0) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ior part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 0
+        END DO
+
+!$omp parallel sections <ompts:check>reduction(IEOR:exclusiv_bit_or)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+          exclusiv_bit_or = IEOR(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp section
+        DO i = cut2, cut3
+          exclusiv_bit_or = IEOR(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+          exclusiv_bit_or = IEOR(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp end parallel sections
+
+        IF ( exclusiv_bit_or .GE. 1) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ieor part 1"
+        END IF
+
+        exclusiv_bit_or = 0
+        int_array(LOOPCOUNT/2) = 1
+
+!$omp parallel sections <ompts:check>reduction(IEOR:exclusiv_bit_or)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+          exclusiv_bit_or = IEOR(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp section
+        DO i = cut2, cut3
+          exclusiv_bit_or = IEOR(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+          exclusiv_bit_or = IEOR(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp end parallel sections
+
+        IF ( exclusiv_bit_or .LE. 0) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ieor part 2"
+        END IF
+
+        DO i=1,LOOPCOUNT
+          int_array(i) = 10 - i
+        END DO
+
+        min_value = 65535
+
+!$omp parallel sections <ompts:check>reduction(MIN:min_value)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+          min_value = MIN(min_value,int_array(i) )
+        END DO
+!$omp section
+        DO i = cut2, cut3
+          min_value = MIN(min_value,int_array(i) )
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+          min_value = MIN(min_value,int_array(i) )
+        END DO
+!$omp end parallel sections
+
+        IF ( min_value .GT. (10-LOOPCOUNT) )THEN
+          result = result + 1
+          WRITE(1,*) "Error in integer MIN"
+        END IF
+
+
+        DO i=1,LOOPCOUNT
+          int_array(i) = i
+        END DO
+
+        max_value = -32768
+
+!$omp parallel sections <ompts:check>reduction(MAX:max_value)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+          max_value = MAX(max_value,int_array(i) )
+        END DO
+!$omp section
+        DO i = cut2, cut3
+          max_value = MAX(max_value,int_array(i) )
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+          max_value = MAX(max_value,int_array(i) )
+        END DO
+!$omp end parallel sections
+
+        IF ( max_value .LT. LOOPCOUNT )THEN
+          result = result + 1
+          WRITE(1,*) "Error in integer MAX"
+        END IF
+
+!... test double min, max
+        DO i=1,LOOPCOUNT
+          d_array(i) = 10 - i*dt
+        END DO
+
+        dmin = 2**10
+        dt = 0.5
+
+!$omp parallel sections <ompts:check>reduction(MIN:dmin)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+          dmin= MIN(dmin,d_array(i) )
+        END DO
+!$omp section
+        DO i = cut2, cut3
+          dmin= MIN(dmin,d_array(i) )
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+          dmin= MIN(dmin,d_array(i) )
+        END DO
+!$omp end parallel sections
+
+        IF ( dmin .GT. (10-dt) )THEN
+          result = result + 1
+          WRITE(1,*) "Error in double MIN"
+        END IF
+
+
+        DO i=1,LOOPCOUNT
+          d_array(i) = i * dt
+        END DO
+
+        dmax= - (2**10)
+
+!$omp parallel sections <ompts:check>reduction(MAX:dmax)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+          dmax= MAX(dmax,d_array(i) )
+        END DO
+!$omp section
+        DO i = cut2, cut3
+          dmax= MAX(dmax,d_array(i) )
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+          dmax= MAX(dmax,d_array(i) )
+        END DO
+!$omp end parallel sections
+
+        IF ( dmax .LT. LOOPCOUNT*dt ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in double MAX"
+        END IF
+
+        IF ( result .EQ. 0 ) THEN
+          <testfunctionname></testfunctionname> =  1
+        ELSE
+          <testfunctionname></testfunctionname> =  0
+        END IF
+
+        CLOSE(2)
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/section_firstprivate.f b/final/testsuite/fortran/section_firstprivate.f
new file mode 100644
index 0000000..630058a
--- /dev/null
+++ b/final/testsuite/fortran/section_firstprivate.f
@@ -0,0 +1,41 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp section firstprivate directive by adding a variable which is defined before the parallel region.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp firstprivate</ompts:directive>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>section_firstprivate</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER known_sum
+<ompts:orphan:vars>
+        INTEGER sum,sum0
+        COMMON /orphvars/ sum,sum0
+</ompts:orphan:vars>
+        sum = 7
+        sum0 = 11
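+!... With firstprivate(sum0) every section starts from the initialized
+!... value 11, so the three sections add 3*11 to the initial sum of 7.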
+!$omp parallel
+        <ompts:orphan>
+!$omp sections <ompts:check>firstprivate(sum0)</ompts:check><ompts:crosscheck>private(sum0)</ompts:crosscheck>
+!$omp section
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp section
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp section
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp end sections
+        </ompts:orphan>
+!$omp end parallel
+        known_sum = 11*3+7
+        IF ( known_sum .EQ. sum) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/section_lastprivate.f b/final/testsuite/fortran/section_lastprivate.f
new file mode 100644
index 0000000..5ba7148
--- /dev/null
+++ b/final/testsuite/fortran/section_lastprivate.f
@@ -0,0 +1,62 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp section lastprivate directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp section lastprivate</ompts:directive>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>section_lastprivate</ompts:testcode:functionname>()
+        INTEGER known_sum
+
+        <ompts:orphan:vars>
+        INTEGER i, i0, sum, sum0
+        COMMON /orphvars/ i,i0,sum
+        </ompts:orphan:vars>
+
+        sum = 0
+        sum0 = 0
+        i0 = -1
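+!... With lastprivate(i0) the value assigned in the lexically last
+!... section (where i runs up to 999) must survive the sections region.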
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>lastprivate(i0)</ompts:check><ompts:crosscheck>private(i0)</ompts:crosscheck> private(i,sum0)
+!$omp section
+        sum0 = 0
+        DO i=1, 399
+          sum0 = sum0 + i
+          i0 = i
+        END DO
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp section
+        sum0 = 0
+        DO i=400, 699
+          sum0 = sum0 + i
+          i0 = i
+        END DO
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp section
+        sum0 = 0
+        DO i=700, 999
+          sum0 = sum0 + i
+          i0 = i
+        END DO
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp end sections
+</ompts:orphan>
+!$omp end parallel
+
+        known_sum = (999*1000)/2
+        IF ( known_sum .EQ. sum .AND. i0 .EQ. 999 ) THEN
+           <testfunctionname></testfunctionname> = 1
+        ELSE
+           <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
+
+
diff --git a/final/testsuite/fortran/section_private.f b/final/testsuite/fortran/section_private.f
new file mode 100644
index 0000000..28b8090
--- /dev/null
+++ b/final/testsuite/fortran/section_private.f
@@ -0,0 +1,55 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the parallel section private clause.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp parallel section private</ompts:directive>
+<ompts:dependences>omp critical</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>section_private</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER known_sum
+<ompts:orphan:vars>
+        INTEGER sum,sum0,i
+        COMMON /orphvars/ sum,sum0,i
+</ompts:orphan:vars>
+
+        sum = 7
+        sum0 = 0
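+!... private(sum0,i) gives each section its own partial sum; the
+!... partial sums are merged into the shared sum under omp critical.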
+!$omp parallel
+        <ompts:orphan>
+!$omp sections <ompts:check>private(sum0,i)</ompts:check><ompts:crosscheck>private(i)</ompts:crosscheck>
+!$omp section
+        sum0 = 0
+        DO i=1, 399
+          sum0 = sum0 + i
+        END DO
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp section
+        sum0 = 0
+        DO i=400, 699
+          sum0 = sum0 + i
+        END DO
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp section
+        sum0 = 0
+        DO i=700, 999
+          sum0 = sum0 + i
+        END DO
+!$omp critical
+        sum = sum + sum0
+!$omp end critical
+!$omp end sections
+        </ompts:orphan>
+!$omp end parallel
+        known_sum = (999*1000)/2+7
+        IF ( known_sum .EQ. sum) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/section_reduction.f b/final/testsuite/fortran/section_reduction.f
new file mode 100644
index 0000000..d7e70f9
--- /dev/null
+++ b/final/testsuite/fortran/section_reduction.f
@@ -0,0 +1,743 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp sections reduction directive with all its options.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp sections reduction</ompts:directive>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>section_reduction</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER sum2, known_sum, i2
+        INTEGER known_product,int_const
+        INTEGER MAX_FACTOR
+        DOUBLE PRECISION dknown_sum,dpt
+        INTEGER result
+        INCLUDE "omp_testsuite.f"
+        PARAMETER (int_const=10,known_product=3628800)
+
+        <ompts:orphan:vars>
+        INTEGER i,dummy
+        INTEGER sum, diff
+        DOUBLE PRECISION dt
+        DOUBLE PRECISION dsum, ddiff
+        INTEGER product
+        LOGICAL logics(LOOPCOUNT)
+        INTEGER int_array(LOOPCOUNT)
+        LOGICAL logic_and, logic_or, logic_eqv,logic_neqv
+        INTEGER bit_and, bit_or
+        INTEGER exclusiv_bit_or
+        INTEGER min_value, max_value
+        DOUBLE PRECISION d_array(LOOPCOUNT)
+        DOUBLE PRECISION dmin, dmax
+
+        INTEGER DOUBLE_DIGITS
+        INTEGER cut1, cut2, cut3, cut4
+        PARAMETER (DOUBLE_DIGITS=20,MAX_FACTOR=10)
+        DOUBLE PRECISION rounding_error
+        PARAMETER (rounding_error=1.E-6)
+
+        COMMON /orphvars/ i,sum,diff,product,dt,dsum,ddiff,logic_and,
+     &    logic_or,logic_eqv,logic_neqv,logics,int_array,bit_and,bit_or,
+     &    exclusiv_bit_or,min_value,dmin,dmax,d_array,max_value
+        
+        cut1 = NINT(LOOPCOUNT / 3.3)
+        cut2 = cut1 + 1
+        cut3 = cut1 * 2
+        cut4 = cut3 + 1
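+!... cut1..cut4 split 1..LOOPCOUNT into three contiguous chunks, one
+!... per omp section, so the partial results cover the whole range.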
+
+        </ompts:orphan:vars>
+
+        dt = 1./3.
+        known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2
+        product = 1
+        sum2 = 0
+        sum = 0
+        dsum = 0.
+        result = 0
+        logic_and = .true.
+        logic_or = .false.
+        bit_and = 1
+        bit_or = 0
+        exclusiv_bit_or = 0
+        cut1 = NINT(LOOPCOUNT / 3.3)
+        cut2 = cut1 + 1
+        cut3 = cut1 * 2
+        cut4 = cut3 + 1
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections private(i) <ompts:check>reduction(+:sum)</ompts:check>
+!$omp section
+        DO i =1, cut1
+          sum = sum + i
+        END DO
+!$omp section
+        DO i =cut2, cut3
+          sum = sum + i
+        END DO
+!$omp section
+        DO i =cut4, LOOPCOUNT
+          sum = sum + i
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF (known_sum .NE. sum) THEN
+          result = result + 1
+        WRITE(1,*) "Error in sum with integers: Result was ",
+     &    sum,"instead of ", known_sum
+        END IF
+
+        diff = known_sum
+
+
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction (-: diff)</ompts:check>
+!$omp section
+        DO i =1, cut1
+          diff = diff - i
+        END DO
+!$omp section
+        DO i =cut2, cut3
+          diff = diff - i
+        END DO
+!$omp section
+        DO i =cut4, LOOPCOUNT
+          diff = diff - i
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( diff .NE. 0 ) THEN
+          result = result + 1
+        WRITE(1,*) "Error in difference with integers: Result was ",
+     &    diff,"instead of 0."
+        END IF
+
+!**********************************************************************!
+!   Test for doubles
+!**********************************************************************!
+        dsum = 0.
+        dpt = 1
+
+        DO i=1, DOUBLE_DIGITS
+          dpt= dpt * dt
+        END DO
+        dknown_sum = (1-dpt)/(1-dt)
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(+:dsum)</ompts:check>
+!$omp section
+        DO i=0,6
+              dsum = dsum + dt**i
+        END DO
+!$omp section
+        DO i=7,12
+              dsum = dsum + dt**i
+        END DO
+!$omp section
+        DO i=13,DOUBLE_DIGITS-1
+              dsum = dsum + dt**i
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+ 
+        IF (dsum .NE. dknown_sum .AND. 
+     &    abs(dsum - dknown_sum) .GT. rounding_error ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in sum with DOubles: Result was ",
+     &      dsum,"instead of ",dknown_sum,"(DIFference: ",
+     &      dsum - dknown_sum,")"
+        END IF
+
+        dpt = 1
+        DO i=1, DOUBLE_DIGITS
+          dpt = dpt*dt
+        END DO
+
+        ddiff = (1-dpt)/(1-dt)
+!$omp parallel
+!$omp sections <ompts:check>reduction(-:ddiff)</ompts:check>
+!$omp section
+        DO i=0, 6
+          ddiff = ddiff - dt**i
+        END DO
+!$omp section
+        DO i=7, 12
+          ddiff = ddiff - dt**i
+        END DO
+!$omp section
+        DO i=13, DOUBLE_DIGITS-1
+          ddiff = ddiff - dt**i
+        END DO
+!$omp END sections
+!$omp END parallel
+
+        IF ( abs(ddiff) .GT. rounding_error ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in difference with doubles: Result was ",
+     &      ddiff,"instead of 0.0"
+        END IF
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(*:product)</ompts:check>
+!$omp section
+        DO i=1,3
+          product = product * i
+        END DO
+!$omp section
+        DO i=4,6
+          product = product * i
+        END DO
+!$omp section
+        DO i=7,10
+          product = product * i
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF (known_product .NE. product) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Product with integers: Result was ",
+     &      product," instead of",known_product 
+        END IF
+
+        DO i=1,LOOPCOUNT
+          logics(i) = .TRUE.
+        END DO
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(.and.:logic_and)</ompts:check>
+!$omp section
+        DO i=1,cut1
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp section
+        DO i=cut2,cut3
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp section
+        DO i=cut4,LOOPCOUNT
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF (.NOT. logic_and) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic AND part 1"
+        END IF
+
+
+        logic_and = .TRUE.
+        logics(LOOPCOUNT/2) = .FALSE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(.and.:logic_and)</ompts:check>
+!$omp section
+        DO i=1,cut1
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp section
+        DO i=cut2,cut3
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp section
+        DO i=cut4,LOOPCOUNT
+          logic_and = logic_and .AND. logics(i)
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF (logic_and) THEN
+           result = result + 1
+           WRITE(1,*) "Error in logic AND pass 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+         logics(i) = .FALSE.
+        END DO
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(.or.:logic_or)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+           logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp section
+        DO i = cut2, cut3
+           logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+           logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF (logic_or) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic OR part 1"
+        END IF
+
+        logic_or = .FALSE.
+        logics(LOOPCOUNT/2) = .TRUE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(.or.:logic_or)</ompts:check>
+!$omp section
+        DO i=1,cut1
+           logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp section
+        DO i=cut2,cut3
+           logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp section
+        DO i=cut4,LOOPCOUNT
+           logic_or = logic_or .OR. logics(i)
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( .NOT. logic_or ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic OR part 2"
+        END IF
+
+!... Test logic EQV, unique in Fortran
+        DO i=1, LOOPCOUNT
+         logics(i) = .TRUE.
+        END DO
+
+        logic_eqv = .TRUE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(.eqv.:logic_eqv)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp section
+        DO i = cut2, cut3
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF (.NOT. logic_eqv) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic EQV part 1"
+        END IF
+
+        logic_eqv = .TRUE.
+        logics(LOOPCOUNT/2) = .FALSE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(.eqv.:logic_eqv)</ompts:check>
+!$omp section
+        DO i=1,cut1
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp section
+        DO i=cut2,cut3
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp section
+        DO i=cut4,LOOPCOUNT
+           logic_eqv = logic_eqv .EQV. logics(i)
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( logic_eqv ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic EQV part 2"
+        END IF
+
+!... Test logic NEQV, which is unique in Fortran
+        DO i=1, LOOPCOUNT
+         logics(i) = .false.
+        END DO
+
+        logic_neqv = .false.
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(.neqv.:logic_neqv)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp section
+        DO i = cut2, cut3
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF (logic_neqv) THEN
+          result = result + 1
+          WRITE(1,*) "Error in logic NEQV part 1"
+        END IF
+
+        logic_neqv = .FALSE.
+        logics(LOOPCOUNT/2) = .TRUE.
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(.neqv.:logic_neqv)</ompts:check>
+!$omp section
+        DO i=1,cut1
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp section
+        DO i=cut2,cut3
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp section
+        DO i=cut4,LOOPCOUNT
+           logic_neqv = logic_neqv .NEQV. logics(i)
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( .NOT. logic_neqv ) THEN
+          result = result + 1
+          write(1,*) "Error in logic NEQV part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+           int_array(i) = 1
+        END DO
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(iand:bit_and)</ompts:check>
+!... iand(I,J): Returns value resulting from boolean AND of
+!... pair of bits in each of I and J.
+!$omp section
+        DO i=1, cut1
+         bit_and = iand(bit_and,int_array(i))
+        END DO
+!$omp section
+        DO i=cut2, cut3
+         bit_and = iand(bit_and,int_array(i))
+        END DO
+!$omp section
+        DO i=cut4, LOOPCOUNT
+         bit_and = iand(bit_and,int_array(i))
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( bit_and .LT. 1 ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in IAND part 1"
+        END IF
+
+        bit_and = 1
+        int_array(LOOPCOUNT/2) = 0
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(iand:bit_and)</ompts:check>
+!$omp section
+        DO i=1, cut1
+          bit_and = iand ( bit_and, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut2, cut3
+          bit_and = iand ( bit_and, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut4, LOOPCOUNT
+          bit_and = iand ( bit_and, int_array(i) )
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF( bit_and .GE. 1) THEN
+           result = result + 1
+          WRITE(1,*) "Error in IAND part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 0
+        END DO
+
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(ior:bit_or)</ompts:check>
+!... Ior(I,J): Returns value resulting from boolean OR of
+!... pair of bits in each of I and J.
+!$omp section
+        DO i=1, cut1
+          bit_or = ior(bit_or, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut2, cut3
+          bit_or = ior(bit_or, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut4, LOOPCOUNT
+          bit_or = ior(bit_or, int_array(i) )
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( bit_or .GE. 1) THEN
+           result = result + 1
+          WRITE(1,*) "Error in Ior part 1"
+        END IF
+
+
+        bit_or = 0
+        int_array(LOOPCOUNT/2) = 1
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(ior:bit_or)</ompts:check>
+!$omp section
+        DO i=1, cut1
+          bit_or = Ior(bit_or, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut2, cut3
+          bit_or = Ior(bit_or, int_array(i) )
+        END DO
+!$omp section
+        DO i=cut4, LOOPCOUNT
+          bit_or = Ior(bit_or, int_array(i) )
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( bit_or .LE. 0) THEN
+           result = result + 1
+          WRITE(1,*) "Error in Ior part 2"
+        END IF
+
+        DO i=1, LOOPCOUNT
+          int_array(i) = 0
+        END DO
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(ieor:exclusiv_bit_or)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+            exclusiv_bit_or = ieor(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp section
+        DO i = cut2, cut3
+            exclusiv_bit_or = ieor(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+            exclusiv_bit_or = ieor(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( exclusiv_bit_or .GE. 1) THEN
+           result = result + 1
+           WRITE(1,*) "Error in Ieor part 1"
+        END IF
+
+        exclusiv_bit_or = 0
+        int_array(LOOPCOUNT/2) = 1
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(ieor:exclusiv_bit_or)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+            exclusiv_bit_or = ieor(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp section
+        DO i = cut2, cut3
+            exclusiv_bit_or = ieor(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+            exclusiv_bit_or = ieor(exclusiv_bit_or, int_array(i))
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( exclusiv_bit_or .LE. 0) THEN
+          result = result + 1
+          WRITE(1,*) "Error in Ieor part 2"
+        END IF
+
+        DO i=1,LOOPCOUNT
+           int_array(i) = 10 - i
+        END DO
+
+        min_value = 65535
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(min:min_value)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+            min_value = min(min_value,int_array(i) )
+        END DO
+!$omp section
+        DO i = cut2, cut3
+            min_value = min(min_value,int_array(i) )
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+            min_value = min(min_value,int_array(i) )
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( min_value .GT. (10-LOOPCOUNT) ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in integer MIN"
+        END IF
+
+
+        DO i=1,LOOPCOUNT
+           int_array(i) = i
+        END DO
+
+        max_value = -32768
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(max:max_value)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+            max_value = max(max_value,int_array(i) )
+        END DO
+!$omp section
+        DO i = cut2, cut3
+            max_value = max(max_value,int_array(i) )
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+            max_value = max(max_value,int_array(i) )
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( max_value .LT. LOOPCOUNT ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in integer MAX"
+        END IF
+
+!... test double min, max
+        DO i=1,LOOPCOUNT
+           d_array(i) = 10 - i*dt
+        END DO
+
+        dmin = 2**10
+        dt = 0.5
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(min:dmin)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+            dmin= min(dmin,d_array(i) )
+        END DO
+!$omp section
+        DO i = cut2, cut3
+            dmin= min(dmin,d_array(i) )
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+            dmin= min(dmin,d_array(i) )
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( dmin .GT. (10-dt) ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in DOuble MIN"
+        END IF
+
+
+        DO i=1,LOOPCOUNT
+           d_array(i) = i * dt
+        END DO
+
+        dmax= - (2**10)
+
+!$omp parallel
+<ompts:orphan>
+!$omp sections <ompts:check>reduction(max:dmax)</ompts:check>
+!$omp section
+        DO i = 1, cut1
+            dmax= max(dmax,d_array(i) )
+        END DO
+!$omp section
+        DO i = cut2, cut3
+            dmax= max(dmax,d_array(i) )
+        END DO
+!$omp section
+        DO i = cut4, LOOPCOUNT
+            dmax= max(dmax,d_array(i) )
+        END DO
+!$omp END sections
+</ompts:orphan>
+!$omp END parallel
+
+        IF ( dmax .LT. LOOPCOUNT*dt ) THEN
+          result = result + 1
+          WRITE(1,*) "Error in DOuble MAX"
+        END IF
+
+        IF ( result .EQ. 0 ) THEN
+           <testfunctionname></testfunctionname> =  1
+        ELSE
+           <testfunctionname></testfunctionname> =  0
+        END IF
+
+        CLOSE(2)
+
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/single_copyprivate.f b/final/testsuite/fortran/single_copyprivate.f
new file mode 100644
index 0000000..0878a66
--- /dev/null
+++ b/final/testsuite/fortran/single_copyprivate.f
@@ -0,0 +1,41 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp single copyprivate directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp single copyprivate</ompts:directive>
+<ompts:dependences>omp parallel,omp critical</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>single_copyprivate</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER omp_get_thread_num
+        INCLUDE "omp_testsuite.f"
+<ompts:orphan:vars>
+        INTEGER i,j,thread,nr_iterations,result
+        COMMON /orphvars/ nr_iterations,result
+</ompts:orphan:vars>
+
+        result=0
+        nr_iterations=0
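+!... In every iteration one thread executes the SINGLE block and sets
+!... j=i; copyprivate(j) broadcasts that j to the other threads, so
+!... result accumulates j-i = 0 on all of them.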
+
+!$omp parallel private(i,j,thread)
+        <ompts:orphan>
+        DO i=0,LOOPCOUNT-1
+          thread=OMP_GET_THREAD_NUM()
+!$omp single 
+          nr_iterations=nr_iterations+1
+          j=i
+!$omp end single <ompts:check>copyprivate(j)</ompts:check>
+!$omp critical
+          result=result+j-i
+!$omp end critical
+        END DO
+        </ompts:orphan>
+!$omp end parallel
+        IF(result .EQ. 0 .AND. 
+     &     nr_iterations .EQ. LOOPCOUNT) THEN
+          <testfunctionname></testfunctionname>=1
+        ELSE
+          <testfunctionname></testfunctionname>=0
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/single_nowait.f b/final/testsuite/fortran/single_nowait.f
new file mode 100644
index 0000000..f944af5
--- /dev/null
+++ b/final/testsuite/fortran/single_nowait.f
@@ -0,0 +1,50 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp single nowait directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp single nowait</ompts:directive>
+<ompts:dependences>omp critical,omp atomic</ompts:dependences>
+<ompts:testcode>
+      INTEGER FUNCTION <ompts:testcode:functionname>single_nowait</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER result, total_iterations, my_iterations,i
+        INCLUDE "omp_testsuite.f"
+<ompts:orphan:vars>
+        INTEGER nr_iterations
+        COMMON /orphvars/ nr_iterations
+</ompts:orphan:vars>
+
+        result=0
+        nr_iterations=0
+        total_iterations=0
+        my_iterations=0
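+!... The single constructs end with nowait, so no implied barrier is
+!... required; the test only checks that each block is executed by
+!... exactly one thread per iteration.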
+
+!$omp parallel private(i)
+        DO i=0, LOOPCOUNT -1
+        <ompts:orphan>
+<ompts:check>!$omp single</ompts:check>
+!$omp atomic
+          nr_iterations = nr_iterations + 1
+<ompts:check>!$omp end single nowait</ompts:check>
+        </ompts:orphan>
+        END DO
+!$omp end parallel
+!$omp parallel private(i,my_iterations)
+        my_iterations = 0
+        DO i=0, LOOPCOUNT -1
+<ompts:check>!$omp single</ompts:check>
+          my_iterations = my_iterations + 1
+<ompts:check>!$omp end single nowait</ompts:check>
+        END DO
+!$omp critical
+        total_iterations = total_iterations + my_iterations
+!$omp end critical
+!$omp end parallel
+        IF ( nr_iterations .EQ. LOOPCOUNT .AND.
+     &     total_iterations .EQ. LOOPCOUNT ) THEN
+            <testfunctionname></testfunctionname> = 1
+        ELSE
+            <testfunctionname></testfunctionname> = 0
+        END IF
+      END FUNCTION
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/single_private.f b/final/testsuite/fortran/single_private.f
new file mode 100644
index 0000000..7ad4a6a
--- /dev/null
+++ b/final/testsuite/fortran/single_private.f
@@ -0,0 +1,51 @@
+<ompts:test>
+<ompts:testdescription>Test which checks the omp single private directive.</ompts:testdescription>
+<ompts:ompversion>2.0</ompts:ompversion>
+<ompts:directive>omp single private</ompts:directive>
+<ompts:dependences>omp critical,omp flush,omp single nowait</ompts:dependences>
+<ompts:testcode>
+        INTEGER FUNCTION <ompts:testcode:functionname>single_private</ompts:testcode:functionname>()
+        IMPLICIT NONE
+        INTEGER nr_iterations, i
+<ompts:orphan:vars>
+        INTEGER result
+        INTEGER nr_threads_in_single, myresult, myit
+        COMMON /orphvars/ result,nr_iterations
+</ompts:orphan:vars>
+        INCLUDE "omp_testsuite.f"
+        nr_threads_in_single=0
+        result=0
+        myresult=0
+        myit=0
+        nr_iterations=0
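+!... With private(nr_threads_in_single) the increments inside the
+!... SINGLE block touch a per-thread copy only, so the shared copy
+!... read after the loop stays 0 and result remains 0.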
+!$omp parallel private(i, myresult, myit)
+<ompts:orphan>
+        myresult = 0
+        myit = 0
+        nr_threads_in_single=0
+!$omp barrier
+        DO i=0, LOOPCOUNT -1
+!$omp single <ompts:check>private(nr_threads_in_single)</ompts:check>
+          nr_threads_in_single = 0
+!$omp flush
+          nr_threads_in_single = nr_threads_in_single + 1
+!$omp flush
+          myit = myit + 1
+          myresult = myresult + nr_threads_in_single
+!$omp end single nowait
+        END DO
+!$omp critical
+        result = result + nr_threads_in_single
+        nr_iterations = nr_iterations + myit
+!$omp end critical
+</ompts:orphan>
+!$omp end parallel
+        WRITE(1,*) "result is",result,"nr_it is",nr_iterations
+        IF ( result .EQ. 0 .AND. nr_iterations .EQ. LOOPCOUNT) THEN
+          <testfunctionname></testfunctionname> = 1
+        ELSE
+          <testfunctionname></testfunctionname> = 0
+        END IF
+      END
+</ompts:testcode>
+</ompts:test>
diff --git a/final/testsuite/fortran/testlist-f.txt b/final/testsuite/fortran/testlist-f.txt
new file mode 100644
index 0000000..41766e0
--- /dev/null
+++ b/final/testsuite/fortran/testlist-f.txt
@@ -0,0 +1,45 @@
+do_firstprivate
+do_lastprivate
+do_ordered
+do_private
+do_reduction
+#do_schedule_dynamic
+#do_schedule_guided
+#do_schedule_static
+has_openmp
+#omp_atomic
+omp_barrier
+omp_copyin
+omp_critical
+#omp_flush
+omp_get_num_threads
+omp_in_parallel
+omp_lock
+omp_master
+omp_nest_lock
+omp_nest_testlock
+omp_nested
+omp_num_threads
+omp_testlock
+omp_testsuite
+omp_threadprivate
+#omp_ticks_time
+#omp_workshare
+#omp_wtime
+#par_dofirstprivate
+#par_do_lastprivate
+#par_do_ordered
+#par_do_private
+#par_do_reduction
+#par_section_firstprivate
+#par_section_lastprivate
+#par_section_private
+#par_section_reduction
+#section_firstprivate
+#section_lastprivate
+#section_private
+#section_reduction
+#single
+#single_copyprivate
+#single_nowait
+#single_private
diff --git a/final/testsuite/omp_my_sleep.f b/final/testsuite/omp_my_sleep.f
new file mode 100644
index 0000000..e3cb04b
--- /dev/null
+++ b/final/testsuite/omp_my_sleep.f
@@ -0,0 +1,26 @@
+! Utility functions to have a sleep function with better resolution and
+! which only stops one thread.
+
+      subroutine my_sleep(sleeptime)
+        implicit none
+        double precision :: sleeptime
+        integer :: u
+        integer :: t(8)
+        integer :: ms1, ms2
+        integer :: cnt
+
+        u = sleeptime * 1000
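+        ! u now holds the requested sleep time in milliseconds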
+
+        call date_and_time(values=t)
+
+        ! calculate start time in ms
+        ms1 = t(8) + t(7)*1000 + t(6)*60000 + t(5)*3600000
+
+        ms2 = ms1
+        cnt = 0
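+        ! busy-wait until u ms of wall-clock time have elapsed; note
+        ! that the millisecond counters wrap at midnight, so a sleep
+        ! across a day boundary would misbehave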
+        do while ( (ms2 - ms1) < u)
+            call date_and_time(values=t)
+            ms2 = t(8) + t(7)*1000 + t(6)*60000 + t(5)*3600000
+            cnt = cnt+1
+        end do
+      end subroutine
diff --git a/final/testsuite/omp_my_sleep.h b/final/testsuite/omp_my_sleep.h
new file mode 100644
index 0000000..8390a5b
--- /dev/null
+++ b/final/testsuite/omp_my_sleep.h
@@ -0,0 +1,35 @@
+#ifndef MY_SLEEP_H
+#define MY_SLEEP_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <sys/times.h> 
+#include <sys/time.h>
+#include <time.h>
+#include <errno.h>
+
+/*! Utility function to have a sleep function with better resolution and which only stops one thread. */
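+/* The function busy-waits on gettimeofday() until sleeptime seconds
+   have elapsed, so only the calling thread is delayed. */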
+
+static void my_sleep(double sleeptime){
+  struct timeval tv;
+  struct timezone tzp;
+  double start;
+  double real;
+  if(gettimeofday(&tv,&tzp)!=0) {
+    perror("get_time: ");
+    exit(-1);
+  }
+  start = (double)tv.tv_sec + ((double)tv.tv_usec/1000000.0);
+  real=start;
+  while( (real-start)<sleeptime){
+    if(gettimeofday(&tv,&tzp)!=0) {
+      perror("get_time: ");
+      exit(-1);
+    }
+    real = (double)tv.tv_sec + ((double)tv.tv_usec/1000000.0);
+  }
+}
+
+#endif
diff --git a/final/testsuite/omp_testsuite.f b/final/testsuite/omp_testsuite.f
new file mode 100644
index 0000000..e135313
--- /dev/null
+++ b/final/testsuite/omp_testsuite.f
@@ -0,0 +1,14 @@
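+! Common parameters for the Fortran tests; the test templates pull
+! this file in via INCLUDE "omp_testsuite.f".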
+      INTEGER N !The number of times each test is run
+      PARAMETER (N=1)
+      CHARACTER*8 OMPTS_VERSION
+      PARAMETER (OMPTS_VERSION="3.0.a")
+      INTEGER LOOPCOUNT !The number of times each loop is run
+      PARAMETER (LOOPCOUNT=1000)
+      DOUBLE PRECISION SLEEPTIME
+      PARAMETER (SLEEPTIME = .01)
+      DOUBLE PRECISION SLEEPTIME_LONG
+      PARAMETER (SLEEPTIME_LONG = 0.5)
+      INTEGER NUM_TASKS
+      PARAMETER (NUM_TASKS=25)
+      INTEGER MAX_TASKS_PER_THREAD
+      PARAMETER (MAX_TASKS_PER_THREAD=5)
diff --git a/final/testsuite/omp_testsuite.h b/final/testsuite/omp_testsuite.h
new file mode 100644
index 0000000..a808d92
--- /dev/null
+++ b/final/testsuite/omp_testsuite.h
@@ -0,0 +1,164 @@
+/* Global headerfile of the OpenMP Testsuite */
+
+/* This file was created with the ompts_makeHeader.pl script using the following options: */
+/* -f=ompts-c.conf -t=c  */
+
+
+#ifndef OMP_TESTSUITE_H
+#define OMP_TESTSUITE_H
+
+#include <stdio.h>
+#include <omp.h>
+
+/* Version info                                           */
+/**********************************************************/
+#define OMPTS_VERSION "3.0a"
+
+/* General                                                */
+/**********************************************************/
+#define LOOPCOUNT 	1000
+#define REPETITIONS 	  20
+/* following times are in seconds */
+#define SLEEPTIME	 0.01
+#define SLEEPTIME_LONG	 0.5
+
+/* Definitions for tasks                                  */
+/**********************************************************/
+#define NUM_TASKS              25
+#define MAX_TASKS_PER_THREAD    5
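+
+/* Test and crosstest prototypes, one pair per test template    */
+/**********************************************************/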
+int test_omp_parallel_for_ordered(FILE * logfile);  /* Test for omp parallel for ordered */
+int crosstest_omp_parallel_for_ordered(FILE * logfile);  /* Crosstest for omp parallel for ordered */
+int test_omp_task_imp_firstprivate(FILE * logfile);  /* Test for omp task */
+int crosstest_omp_task_imp_firstprivate(FILE * logfile);  /* Crosstest for omp task */
+int test_omp_taskwait(FILE * logfile);  /* Test for omp taskwait */
+int crosstest_omp_taskwait(FILE * logfile);  /* Crosstest for omp taskwait */
+int test_omp_barrier(FILE * logfile);  /* Test for omp barrier */
+int crosstest_omp_barrier(FILE * logfile);  /* Crosstest for omp barrier */
+int test_omp_parallel_for_if(FILE * logfile);  /* Test for omp parallel for if */
+int crosstest_omp_parallel_for_if(FILE * logfile);  /* Crosstest for omp parallel for if */
+int test_omp_atomic(FILE * logfile);  /* Test for omp atomic */
+int crosstest_omp_atomic(FILE * logfile);  /* Crosstest for omp atomic */
+int test_omp_get_num_threads(FILE * logfile);  /* Test for omp_get_num_threads */
+int crosstest_omp_get_num_threads(FILE * logfile);  /* Crosstest for omp_get_num_threads */
+int test_omp_section_private(FILE * logfile);  /* Test for omp section private */
+int crosstest_omp_section_private(FILE * logfile);  /* Crosstest for omp section private */
+int test_omp_parallel_if(FILE * logfile);  /* Test for omp parallel if */
+int crosstest_omp_parallel_if(FILE * logfile);  /* Crosstest for omp parallel if */
+int test_omp_lock(FILE * logfile);  /* Test for omp_lock */
+int crosstest_omp_lock(FILE * logfile);  /* Crosstest for omp_lock */
+int test_omp_parallel_shared(FILE * logfile);  /* Test for omp parallel shared */
+int crosstest_omp_parallel_shared(FILE * logfile);  /* Crosstest for omp parallel shared */
+int test_omp_task_imp_shared(FILE * logfile);  /* Test for omp task */
+int crosstest_omp_task_imp_shared(FILE * logfile);  /* Crosstest for omp task */
+int test_omp_task_private(FILE * logfile);  /* Test for omp task private */
+int crosstest_omp_task_private(FILE * logfile);  /* Crosstest for omp task private */
+int test_omp_section_lastprivate(FILE * logfile);  /* Test for omp section lastprivate */
+int crosstest_omp_section_lastprivate(FILE * logfile);  /* Crosstest for omp section lastprivate */
+int test_omp_parallel_firstprivate(FILE * logfile);  /* Test for omp parallel firstprivate */
+int crosstest_omp_parallel_firstprivate(FILE * logfile);  /* Crosstest for omp parallel firstprivate */
+int test_omp_for_auto(FILE * logfile);  /* Test for omp for auto */
+int crosstest_omp_for_auto(FILE * logfile);  /* Crosstest for omp for auto */
+int test_omp_for_schedule_static(FILE * logfile);  /* Test for omp for schedule(static) */
+int crosstest_omp_for_schedule_static(FILE * logfile);  /* Crosstest for omp for schedule(static) */
+int test_omp_threadprivate_for(FILE * logfile);  /* Test for omp threadprivate */
+int crosstest_omp_threadprivate_for(FILE * logfile);  /* Crosstest for omp threadprivate */
+int test_omp_task_untied(FILE * logfile);  /* Test for omp task untied */
+int crosstest_omp_task_untied(FILE * logfile);  /* Crosstest for omp task untied */
+int test_omp_parallel_private(FILE * logfile);  /* Test for omp parallel private */
+int crosstest_omp_parallel_private(FILE * logfile);  /* Crosstest for omp parallel private */
+int test_omp_single_nowait(FILE * logfile);  /* Test for omp single nowait */
+int crosstest_omp_single_nowait(FILE * logfile);  /* Crosstest for omp single nowait */
+int test_omp_critical(FILE * logfile);  /* Test for omp critical */
+int crosstest_omp_critical(FILE * logfile);  /* Crosstest for omp critical */
+int test_omp_get_wtick(FILE * logfile);  /* Test for omp_get_wtick */
+int crosstest_omp_get_wtick(FILE * logfile);  /* Crosstest for omp_get_wtick */
+int test_omp_single(FILE * logfile);  /* Test for omp single */
+int crosstest_omp_single(FILE * logfile);  /* Crosstest for omp single */
+int test_omp_parallel_sections_reduction(FILE * logfile);  /* Test for omp parallel sections reduction */
+int crosstest_omp_parallel_sections_reduction(FILE * logfile);  /* Crosstest for omp parallel sections reduction */
+int test_omp_taskyield(FILE * logfile);  /* Test for omp taskyield */
+int crosstest_omp_taskyield(FILE * logfile);  /* Crosstest for omp taskyield */
+int test_has_openmp(FILE * logfile);  /* Test for _OPENMP */
+int crosstest_has_openmp(FILE * logfile);  /* Crosstest for _OPENMP */
+int test_omp_parallel_for_lastprivate(FILE * logfile);  /* Test for omp parallel for lastprivate */
+int crosstest_omp_parallel_for_lastprivate(FILE * logfile);  /* Crosstest for omp parallel for lastprivate */
+int test_omp_parallel_sections_lastprivate(FILE * logfile);  /* Test for omp parallel sections lastprivate */
+int crosstest_omp_parallel_sections_lastprivate(FILE * logfile);  /* Crosstest for omp parallel sections lastprivate */
+int test_omp_for_lastprivate(FILE * logfile);  /* Test for omp for lastprivate */
+int crosstest_omp_for_lastprivate(FILE * logfile);  /* Crosstest for omp for lastprivate */
+int test_omp_parallel_sections_firstprivate(FILE * logfile);  /* Test for omp parallel sections firstprivate */
+int crosstest_omp_parallel_sections_firstprivate(FILE * logfile);  /* Crosstest for omp parallel sections firstprivate */
+int test_omp_parallel_for_reduction(FILE * logfile);  /* Test for omp parallel for reduction */
+int crosstest_omp_parallel_for_reduction(FILE * logfile);  /* Crosstest for omp parallel for reduction */
+int test_omp_test_lock(FILE * logfile);  /* Test for omp_test_lock */
+int crosstest_omp_test_lock(FILE * logfile);  /* Crosstest for omp_test_lock */
+int test_omp_parallel_for_firstprivate(FILE * logfile);  /* Test for omp parallel for firstprivate */
+int crosstest_omp_parallel_for_firstprivate(FILE * logfile);  /* Crosstest for omp parallel for firstprivate */
+int test_omp_parallel_sections_private(FILE * logfile);  /* Test for omp parallel sections private */
+int crosstest_omp_parallel_sections_private(FILE * logfile);  /* Crosstest for omp parallel sections private */
+int test_omp_parallel_num_threads(FILE * logfile);  /* Test for omp parallel num_threads */
+int crosstest_omp_parallel_num_threads(FILE * logfile);  /* Crosstest for omp parallel num_threads */
+int test_omp_for_reduction(FILE * logfile);  /* Test for omp for reduction */
+int crosstest_omp_for_reduction(FILE * logfile);  /* Crosstest for omp for reduction */
+int test_omp_sections_nowait(FILE * logfile);  /* Test for omp parallel sections nowait */
+int crosstest_omp_sections_nowait(FILE * logfile);  /* Crosstest for omp parallel sections nowait */
+int test_omp_parallel_reduction(FILE * logfile);  /* Test for omp parallel reduction */
+int crosstest_omp_parallel_reduction(FILE * logfile);  /* Crosstest for omp parallel reduction */
+int test_omp_nested(FILE * logfile);  /* Test for omp_nested */
+int crosstest_omp_nested(FILE * logfile);  /* Crosstest for omp_nested */
+int test_omp_threadprivate(FILE * logfile);  /* Test for omp threadprivate */
+int crosstest_omp_threadprivate(FILE * logfile);  /* Crosstest for omp threadprivate */
+int test_omp_sections_reduction(FILE * logfile);  /* Test for omp sections reduction */
+int crosstest_omp_sections_reduction(FILE * logfile);  /* Crosstest for omp sections reduction */
+int test_omp_for_schedule_guided(FILE * logfile);  /* Test for omp for schedule(guided) */
+int crosstest_omp_for_schedule_guided(FILE * logfile);  /* Crosstest for omp for schedule(guided) */
+int test_omp_task_final(FILE * logfile);  /* Test for omp task final */
+int crosstest_omp_task_final(FILE * logfile);  /* Crosstest for omp task final */
+int test_omp_parallel_for_private(FILE * logfile);  /* Test for omp parallel for private */
+int crosstest_omp_parallel_for_private(FILE * logfile);  /* Crosstest for omp parallel for private */
+int test_omp_flush(FILE * logfile);  /* Test for omp flush */
+int crosstest_omp_flush(FILE * logfile);  /* Crosstest for omp flush */
+int test_omp_for_private(FILE * logfile);  /* Test for omp for private */
+int crosstest_omp_for_private(FILE * logfile);  /* Crosstest for omp for private */
+int test_omp_for_ordered(FILE * logfile);  /* Test for omp for ordered */
+int crosstest_omp_for_ordered(FILE * logfile);  /* Crosstest for omp for ordered */
+int test_omp_single_copyprivate(FILE * logfile);  /* Test for omp single copyprivate */
+int crosstest_omp_single_copyprivate(FILE * logfile);  /* Crosstest for omp single copyprivate */
+int test_omp_task_if(FILE * logfile);  /* Test for omp task if */
+int crosstest_omp_task_if(FILE * logfile);  /* Crosstest for omp task if */
+int test_omp_section_firstprivate(FILE * logfile);  /* Test for omp firstprivate */
+int crosstest_omp_section_firstprivate(FILE * logfile);  /* Crosstest for omp firstprivate */
+int test_omp_for_schedule_static_3(FILE * logfile);  /* Test for omp for schedule(static) */
+int crosstest_omp_for_schedule_static_3(FILE * logfile);  /* Crosstest for omp for schedule(static) */
+int test_omp_task_firstprivate(FILE * logfile);  /* Test for omp task firstprivate */
+int crosstest_omp_task_firstprivate(FILE * logfile);  /* Crosstest for omp task firstprivate */
+int test_omp_for_collapse(FILE * logfile);  /* Test for omp for collapse */
+int crosstest_omp_for_collapse(FILE * logfile);  /* Crosstest for omp for collapse */
+int test_omp_in_parallel(FILE * logfile);  /* Test for omp_in_parallel */
+int crosstest_omp_in_parallel(FILE * logfile);  /* Crosstest for omp_in_parallel */
+int test_omp_for_schedule_dynamic(FILE * logfile);  /* Test for omp for schedule(dynamic) */
+int crosstest_omp_for_schedule_dynamic(FILE * logfile);  /* Crosstest for omp for schedule(dynamic) */
+int test_omp_for_firstprivate(FILE * logfile);  /* Test for omp for firstprivate */
+int crosstest_omp_for_firstprivate(FILE * logfile);  /* Crosstest for omp for firstprivate */
+int test_omp_master(FILE * logfile);  /* Test for omp master */
+int crosstest_omp_master(FILE * logfile);  /* Crosstest for omp master */
+int test_omp_single_private(FILE * logfile);  /* Test for omp single private */
+int crosstest_omp_single_private(FILE * logfile);  /* Crosstest for omp single private */
+int test_omp_task(FILE * logfile);  /* Test for omp task */
+int crosstest_omp_task(FILE * logfile);  /* Crosstest for omp task */
+int test_omp_parallel_default(FILE * logfile);  /* Test for omp parallel default */
+int crosstest_omp_parallel_default(FILE * logfile);  /* Crosstest for omp parallel default */
+int test_omp_for_nowait(FILE * logfile);  /* Test for omp parallel for nowait */
+int crosstest_omp_for_nowait(FILE * logfile);  /* Crosstest for omp parallel for nowait */
+int test_omp_test_nest_lock(FILE * logfile);  /* Test for omp_test_nest_lock */
+int crosstest_omp_test_nest_lock(FILE * logfile);  /* Crosstest for omp_test_nest_lock */
+int test_omp_nest_lock(FILE * logfile);  /* Test for omp_nest_lock */
+int crosstest_omp_nest_lock(FILE * logfile);  /* Crosstest for omp_nest_lock */
+int test_omp_parallel_copyin(FILE * logfile);  /* Test for omp parallel copyin */
+int crosstest_omp_parallel_copyin(FILE * logfile);  /* Crosstest for omp parallel copyin */
+int test_omp_master_3(FILE * logfile);  /* Test for omp master */
+int crosstest_omp_master_3(FILE * logfile);  /* Crosstest for omp master */
+int test_omp_get_wtime(FILE * logfile);  /* Test for omp_get_wtime */
+int crosstest_omp_get_wtime(FILE * logfile);  /* Crosstest for omp_get_wtime */
+
+#endif
diff --git a/final/testsuite/ompts-c.conf b/final/testsuite/ompts-c.conf
new file mode 100644
index 0000000..f10b950
--- /dev/null
+++ b/final/testsuite/ompts-c.conf
@@ -0,0 +1,19 @@
+#include <stdio.h>
+#include <omp.h>
+
+/* Version info                                           */
+/**********************************************************/
+#define OMPTS_VERSION "3.0a"
+
+/* General                                                */
+/**********************************************************/
+#define LOOPCOUNT 	1000
+#define REPETITIONS 	  20
+/* following times are in seconds */
+#define SLEEPTIME	 0.01
+#define SLEEPTIME_LONG	 0.5
+
+/* Definitions for tasks                                  */
+/**********************************************************/
+#define NUM_TASKS              25
+#define MAX_TASKS_PER_THREAD    5
diff --git a/final/testsuite/ompts-f.conf b/final/testsuite/ompts-f.conf
new file mode 100644
index 0000000..525af30
--- /dev/null
+++ b/final/testsuite/ompts-f.conf
@@ -0,0 +1 @@
+!Empty config file
diff --git a/final/testsuite/ompts.conf b/final/testsuite/ompts.conf
new file mode 100755
index 0000000..33135b5
--- /dev/null
+++ b/final/testsuite/ompts.conf
@@ -0,0 +1,26 @@
+<config>
+<globalconfig>
+<logfile>ompts.log</logfile>
+<singletesttimeout>120</singletesttimeout>
+<displayerrors>1</displayerrors>
+<displaywarnings>0</displaywarnings>
+<envsetthreadscommand>OMP_NUM_THREADS=%n; export OMP_NUM_THREADS;</envsetthreadscommand>
+<resultsfile>results.txt</resultsfile>
+<numthreads>8</numthreads>
+</globalconfig>
+
+<languages>
+<language>
+<languagename>c</languagename>
+<dir>c</dir>
+<fileextension>c</fileextension>
+<templateparsername>template_parser_c.pl</templateparsername>
+</language>
+<language>
+<languagename>fortran</languagename>
+<dir>fortran</dir>
+<fileextension>f</fileextension>
+<templateparsername>template_parser_fortran.pl</templateparsername>
+</language>
+</languages>
+</config>
diff --git a/final/testsuite/ompts_makeHeader.pl b/final/testsuite/ompts_makeHeader.pl
new file mode 100755
index 0000000..7212cec
--- /dev/null
+++ b/final/testsuite/ompts_makeHeader.pl
@@ -0,0 +1,101 @@
+#!/usr/bin/perl -w
+
+# ompts_makeHeader [options] -f=NAME -t=DIR
+#
+# Creates the header file for the OpenMP testsuite out of the template files
+# which are in the default or explicitly specified directory and the settings
+# in the given config file.
+# 
+# ATTENTION:
+#	At the moment it builds only a C header file!
+#
+# -f=FILENAME: Using file FILENAME as configfile
+# -t=DIR:	Directory holding the template files
+# 
+# options
+# -i=FILENAME: Include another header file. The files to be included must be
+# 	specified after setting this option. (Not implemented yet.)
+# -o=FILENAME: output file name (default is "omp_testsuite.h")
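+#
+# Example invocation (a sketch; the config file and template directory names
+# are taken from this testsuite's layout and may differ in other setups):
+#   ./ompts_makeHeader.pl -f=ompts-c.conf -t=c -o=omp_testsuite.h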
+
+$headerfile = "\/\* Global headerfile of the OpenMP Testsuite \*\/\n\n\/\* This file was created with the ompts_makeHeader.pl script using the following options:\ *\/\n\/\* ";
+if(@ARGV > 0)
+{
+	foreach $opt (@ARGV)
+	{
+		$headerfile .= "$opt ";
+	}
+}
+else
+{
+	$headerfile .= "No options were specified";
+}
+$headerfile .=" \*\/\n\n\n";
+
+use Getopt::Long;
+GetOptions("-o=s" => \$outfile, "-f=s" =>\$configfile, "-t=s" => \$templatedir, "-i=s" => \$include);
+
+$include = "";	# -i is not implemented yet, so any given value is ignored
+
+
+# Getting and verifying the necessary options:
+if(!$configfile) {	
+	die "Config file name is missing.";
+}	
+else {
+    if (!(-e $configfile)) {
+	  die "Could not find config file $configfile.";
+	}
+}
+	
+if(!$templatedir) {	
+	die "Template directory name is missing.";
+}	
+
+if(!$outfile){
+	$outfile = "omp_testsuite.h";	# setting default value for the headerfile
+}
+
+
+
+#@includefiles;					# list holding extra include files specified by the user 
+
+
+# generating the head of the include guard:
+$headerfile .= "\#ifndef OMP_TESTSUITE_H\n\#define OMP_TESTSUITE_H\n\n";
+
+# inserting general settings out of the configfile:
+open(OMPTS_CONF,$configfile) or die "Could not open the global config file $configfile.";
+while(<OMPTS_CONF>){
+	$headerfile .= $_;
+}
+close(OMPTS_CONF);
+
+# searching the tests:
+opendir TEMPLATEDIR, $templatedir or die "Could not open dir $templatedir.";
+@templates = grep /(.*)\.c/, readdir TEMPLATEDIR;
+closedir TEMPLATEDIR;
+
+# inserting the function declarations:
+foreach $template (@templates){
+	$source = "";
+	open(TEMPLATE,$templatedir."/".$template) or die "Could not open the following sourcefile: ".$templatedir."/".$template;
+	while(<TEMPLATE>){
+		$source .= $_;
+	}
+	close(TEMPLATE);
+	$source =~ /\<ompts\:testcode\:functionname\>(.*)\<\/ompts\:testcode\:functionname\>/;
+	$functionname = $1."(FILE \* logfile);";
+	$source =~ /\<ompts\:directive\>(.*)\<\/ompts\:directive\>/;
+	$directive = $1;
+	$headerfile .= "int test_".$functionname."  /* Test for ".$directive." */\n";
+	$headerfile .= "int crosstest_".$functionname."  /* Crosstest for ".$directive." */\n";
+}
+
+# inserting the end of the include guard:
+$headerfile .= "\n#endif";
+
+# creating the header file:
+open(OUTFILE,">".$outfile) or die "Could not create the header file ($outfile)";
+print OUTFILE $headerfile."\n";
+close(OUTFILE);
+
diff --git a/final/testsuite/ompts_parser.pl b/final/testsuite/ompts_parser.pl
new file mode 100755
index 0000000..98d7271
--- /dev/null
+++ b/final/testsuite/ompts_parser.pl
@@ -0,0 +1,167 @@
+#!/usr/bin/env perl
+
+# ompts_parser [option] SOURCEFILE
+# 
+# Creates the tests and the crosstests for the OpenMP testsuite out of the template files given to the program.
+# 
+# Options:
+# -test: 	make test
+# -crosstest: 	make crosstest
+# -orphan	if possible generate tests using orphan regions (not implemented yet)
+# -lang=LANG	preprocessing for language LANG, where LANG is one of the following languages:
+#		c, fortran
+# -o=FILENAME	outputfile (only when one templatefile is specified)
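+#
+# Example invocation (a sketch; the template file name is illustrative):
+#   ./ompts_parser.pl -test -lang=c -o=test_omp_barrier.c omp_barrier.c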
+
+
+# Using Getopt::Long to extract the program options
+use Getopt::Long;
+# Using functions: Set of subroutines to modify the testcode
+use ompts_parserFunctions;
+
+# Getting given options
+GetOptions("-test" => \$test,"-crosstest" => \$crosstest, "-o=s" => \$outputfile, "-orphan" => \$orphan, "-f!", "-lang=s" => \$language);
+
+# Remaining arguments are the template files. 
+# Adding them to the list of files to be parsed if they exist.
+foreach $file(@ARGV)
+{
+	if(-e $file){ push(@sourcefiles,$file); }
+	else { print "Error: Could not find template file $file\n"; }
+}
+	
+# Checking if options were valid:
+#################################################################
+# preparations and checks for sourcefiles
+if(@sourcefiles == 0){die "No files to parse are specified!";}
+if($outputfile && (@sourcefiles != 1 || ($test && $crosstest) ) ){die "An output file was specified, but there are multiple input files or test types!";} 
+# preparations for orphan tests
+if($orphan){ $orphanprefix = "orphaned"; }
+else { $orphanprefix = ""; }
+# preparations for test / crosstest
+if($test){push(@testtypes,"test"); 
+# %checks['test']="check";
+}
+if($crosstest){push(@testtypes,"ctest");
+# %checks['crosstest']="crosscheck";
+}
+# preparations and checks for language
+if($language eq "c") { $extension = "c";}
+elsif($language eq "fortran" or $language eq "f") { $language = "f"; $extension = "f";}
+else { die "You must specify a valid language!"; }
+    
+
+# Reading the templates for the tests into @sources
+foreach $srcfile (@sourcefiles)
+{
+	# Reading the content of the current sourcefile into $src
+	$src = "";
+	open(TEST,$srcfile) or print "Error: Could not open template $srcfile\n";
+	while(<TEST>){ $src .= $_; }
+	close(TEST);
+	# Adding the content $src to the end of the list @sources
+	push(@sources,$src);
+}
+
+# Determining the source file of the main program and saving its name in $mainprocsrc
+if($language eq "c") { $mainprocsrc = "ompts_standaloneProc.c"; }
+elsif($language eq "f") { $mainprocsrc = "ompts_standaloneProc.f"; } 
+open(MAINPROC,$mainprocsrc) or die "Could not open the sourcefile for the main program $mainprocsrc";
+while(<MAINPROC>){
+	$mainproc .= $_;
+}
+
+foreach $testtype (@testtypes)
+{
+  foreach $src(@sources)
+  {
+# Some temporary test information:
+    ($description) = get_tag_values('ompts:testdescription',$src);
+    ($directive) = get_tag_values('ompts:directive',$src);
+    ($functionname) = get_tag_values('ompts:testcode:functionname',$src);
+
+    open(OUTFILE,">".$language.$orphanprefix.$testtype."_".$functionname.".".$extension) or die("Could not create the output file for $directive");
+
+# Creating the source for the test:
+    ($code) = get_tag_values('ompts:testcode',$src);
+# Putting together the functions and the main program:
+    $code .= $mainproc;
+    
+# get the parameters <ompts:orphan:parms> by joon
+# thanks to Dr. Yin Ma in Absoft
+    ($parms) = get_tag_values('ompts:orphan:parms',$code);
+    ($parms) = leave_single_space($parms);
+# remove the 'ompts:orphan:parms' tags from the code, by joon
+    ($code) = replace_tags('ompts:orphan:parms','',$code);
+    
+# Make modifications for the orphaned test version if necessary:
+    if($orphan)
+    {
+# Get the global variables:
+      @defs = get_tag_values("ompts:orphan:vars",$code);
+      $orphvarsdef = "";
+      foreach $_ (@defs)
+      {
+	#print $_;
+	if(not /^[ ]*$/gs) { $orphvarsdef = join("\n",$orphvarsdef,$_); } 
+	#print "OK\n".$orphvarsdef; 
+      }
+      if($language eq "f")
+      {
+# Generate the orphan subroutines:
+	$orphfuncs = create_orph_fortranfunctions($testtype, $code, $parms);
+# Replace orphan regions by function calls:
+	($code) = orphan_regions2fortranfunctions( $testtype, $code, $parms );
+	($code) = enlarge_tags('ompts:orphan:vars','','',($code));
+	($code) = enlarge_tags('ompts:orphan:parms','','',($code));
+	# to find the orphan call statement and add parameters
+    
+# Put all together:
+	$code = $code . $orphfuncs;
+      }
+      elsif($language eq "c")
+      {
+# Generate predeclarations for orphan functions:
+	$orphfuncsdefs = orph_functions_declarations($code);
+# Generate the orphan functions:
+	$orphfuncs = create_orph_cfunctions($code);
+# Replace orphan regions by function calls:
+	($code) = orphan_regions2cfunctions($code);
+# Deleting the former declarations of the variables in the orphan regions:
+	($code) = delete_tags('ompts:orphan:vars',($code));
+# Put all together:
+	$code = "#include \"omp_testsuite.h\"\n".$orphvarsdef . $orphfuncsdefs . $code . $orphfuncs;
+      }
+      else {
+	print "An error occurred!";
+      }
+    }
+# remove parameters between <ompts:orphan:parms> tags, added by joon
+    ($code) = replace_tags('ompts:orphan:parms','',($code));
+    
+# Remove the marks for the orphan regions and their variables:
+    ($code) = enlarge_tags('ompts:orphan','','',($code));
+    ($code) = enlarge_tags('ompts:orphan:vars','','',($code));
+
+# remove remaining tags for the orphaned directive parameters, added by joon
+    ($code) = enlarge_tags('ompts:orphan:parms','','',($code));
+    
+    if($testtype eq "test") {
+# Remove the marks for the testcode and remove the code for the crosstests: 
+      ($code) = enlarge_tags('ompts:check','','',($code));
+      ($code) = delete_tags('ompts:crosscheck',($code));		
+    }
+    elsif($testtype eq "ctest") {
+# Remove the marks for the crosstestcode and remove the code for the tests: 
+      ($code) = enlarge_tags('ompts:crosscheck','','',($code));
+      ($code) = delete_tags('ompts:check',($code));		
+    }
+# Making some final modifications:
+    ($code) = replace_tags('testfunctionname',$testtype."_".$functionname,($code));
+    ($code) = replace_tags('directive',$directive,($code));
+    ($code) = replace_tags('description',$description,($code));
+    ($code) = enlarge_tags('ompts:testcode:functionname',$testtype."_",'',($code) );
+#	$code =  "\#include \"omp_testsuite.h\"\n".$code;
+# Write the result into the file and close it:
+    print OUTFILE $code;
+    close(OUTFILE);
+  }
+}
diff --git a/final/testsuite/ompts_parserFunctions.pm b/final/testsuite/ompts_parserFunctions.pm
new file mode 100755
index 0000000..38cedb0
--- /dev/null
+++ b/final/testsuite/ompts_parserFunctions.pm
@@ -0,0 +1,280 @@
+#!/usr/bin/perl -w
+
+# functions.pm
+# This package contains a set of subroutines to modify the templates for the OpenMP Testsuite.
+
+
+################################################################################
+# subroutines to extract, modify or delete tags from the template
+################################################################################
+
+# LIST get_tag_values( $tagname, $string )
+# subroutine to get the text enclosed by a tag.
+# Returns a list containing the inner texts of the found tags.
+sub get_tag_values
+{
+	my ( $tagname, $string );
+	( $tagname, $string ) = @_;
+	my (@tmp,@tmp2);
+   	@tmp = split(/\<$tagname\>/,$string); 
+	foreach $_(@tmp){
+		push(@tmp2,split(/\<\/$tagname\>/));
+	}
+	my(@result,$i);
+	$i=1; # counter to get only every second item
+	foreach $_(@tmp2){
+		if($i%2 eq 0){
+			push(@result,$_);
+		}
+		$i++;
+	}
+	return @result;
+}
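+# Example (a sketch): get_tag_values('d', 'a <d>x</d> b <d>y</d> c')
+# returns ('x', 'y'). Note that the alternation above relies on text
+# surrounding the tags, which the testsuite templates always provide.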
+
+# LIST replace_tags( $tagname, $replacestring, @list )
+# subroutine to replace tags and their content by a replacement string. 
+# Returns a list of the strings after conversion.
+sub replace_tags
+{
+	my ($tagname, $replacestring, @stringlist, @result);
+	($tagname, $replacestring, @stringlist) = @_;
+	foreach $_(@stringlist) {
+		s#\<$tagname\>(.*?)\<\/$tagname\>#$replacestring#gs;
+		push(@result,$_);
+	}
+	return @result;
+}
+
+# LIST enlarge_tags( $tagname, $before, $after, @list )
+# subroutine to replace the tags enclosing a region by the given before and
+# after strings, keeping the region's content. 
+# Returns a list of the strings after conversion.
+sub enlarge_tags
+{
+	my ($tagname, $before, $after, @stringlist,@result);
+	($tagname, $before, $after, @stringlist) = @_;
+	foreach $_(@stringlist) {
+		s#\<$tagname\>(.*?)\<\/$tagname\>#$before$1$after#gs;
+		push(@result,$_);
+	}
+	return @result;
+}
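+# Example (a sketch): enlarge_tags('chk', '/*', '*/', 'a<chk>b</chk>c')
+# yields ('a/*b*/c'), i.e. the tags are turned into the given delimiters.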
+
+# LIST delete_tags( $tagname, @list )
+# subroutine to delete tags together with their content from a string. 
+# Returns a list of the cleared strings.
+sub delete_tags
+{
+	my($tagname,@stringlist);
+	($tagname, @stringlist) = @_;
+	my(@result);
+	foreach $_(@stringlist) {
+		s#\<$tagname\>(.*?)\<\/$tagname\>##gs;
+		push(@result,$_);
+	}
+	return @result;
+}
+
+
+
+################################################################################
+# subroutines for generating "orphaned" tests
+################################################################################
+
+# SCALAR create_orph_cfunctions( $code )
+# returns a string containing the definitions of the functions for the 
+# orphan regions.
+sub create_orph_cfunctions
+{
+	my ($code,@defs);
+	($code) = @_;
+	@defs = get_tag_values('ompts:orphan',$code);
+	($functionname) = get_tag_values('ompts:testcode:functionname',$code);
+	my ( @result,$functionsrc, $i);
+	$functionsrc =  "\n/* Automatically generated definitions of the orphan functions */\n";
+
+	$i = 1;
+	foreach (@defs) {
+		$functionsrc .= "\nvoid orph$i\_$functionname (FILE * logFile) {";
+		$functionsrc .= $_;
+		$functionsrc .= "\n}\n";
+		$i++;
+	}
+	$functionsrc .= "/* End of automatically generated definitions */\n";
+	return $functionsrc;
+}
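+# For a template whose functionname is omp_foo and which contains one orphan
+# region, this generates code of the following shape (illustrative):
+#   void orph1_omp_foo (FILE * logFile) { ...code of the orphan region... }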
+
+# SCALAR create_orph_fortranfunctions( $prefix, $code )
+# returns a string containing the definitions of the functions for the 
+# orphan regions.
+sub create_orph_fortranfunctions
+{
+	my ($prefix,$code,@defs,$orphan_parms);
+	($prefix,$code,$orphan_parms) = @_;
+	@defs = get_tag_values('ompts:orphan',$code);
+
+    #to remove space and put a single space
+    if($orphan_parms ne "")
+    {
+      $orphan_parms =~ s/[ \t]+//sg;
+      $orphan_parms =~ s/[ \t]+\n/\n/sg;
+    }
+    
+	($orphanvarsdefs) = get_tag_values('ompts:orphan:vars',$code);
+	foreach (@varsdef) {
+		if (not /^[ ]*$/gs) { $orphanvarsdefs = join("\n",$orphanvarsdefs,$_);}
+	}
+	($functionname) = get_tag_values('ompts:testcode:functionname',$code);
+	my ( @result,$functionsrc, $i);
+	$functionsrc =  "\n! Definitions of the orphan functions\n";
+	$i = 1;
+	foreach $_(@defs)
+	{
+		$functionsrc .= "\n      SUBROUTINE orph$i\_$prefix\_$functionname\($orphan_parms\)\n      ";
+        $functionsrc .= "INCLUDE \"omp_testsuite.f\"\n";
+		$functionsrc .= $orphanvarsdefs."\n";
+		$functionsrc .= $_;
+		$functionsrc .= "\n";
+		$functionsrc .= "      END SUBROUTINE\n! End of definition\n\n";
+		$i++;
+	}
+	return $functionsrc;
+}
+
+# LIST orphan_regions2cfunctions( $code )
+# replaces orphan regions by function calls in C/C++.
+sub orphan_regions2cfunctions
+{
+	my ($code, $i, $functionname);
+	($code) = @_;
+	$i = 1;
+	($functionname) = get_tag_values('ompts:testcode:functionname',$code);
+	# alias $_ to $code so that the match and substitution modify $code
+	for ($code) {
+		while( /\<ompts\:orphan\>(.*)\<\/ompts\:orphan\>/s) {
+			s#\<ompts\:orphan\>(.*?)\<\/ompts\:orphan\>#orph$i\_$functionname (logFile);#s;
+			$i++;
+		}
+	}
+	return $code;
+}
+
+# LIST orphan_regions2fortranfunctions( $prefix, $code, $parms )
+# replaces orphan regions by function calls in Fortran
+sub orphan_regions2fortranfunctions
+{
+	my ( $prefix, $code, $my_parms, $i, $functionname);
+	($prefix, $code, $my_parms) = @_;
+	$i = 1;
+	($functionname) = get_tag_values('ompts:testcode:functionname',$code);
+	foreach $_(($code))
+	{
+		while( /\<ompts\:orphan\>(.*)\<\/ompts\:orphan\>/s)
+		{
+			s#\<ompts\:orphan\>(.*?)\<\/ompts\:orphan\>#      CALL orph$i\_$prefix\_$functionname\($my_parms\);#s;
+			$i++;
+		}
+	}
+	return ($code);
+}
+
+# SCALAR orph_functions_declarations( $code )
+# returns a string including the declarations of the functions used 
+# in the orphan regions. The function names match those generated by 
+# create_orph_cfunctions.
+sub orph_functions_declarations
+{
+	my ($code);
+	($code) = @_;
+	my ( @defs, $result );
+	
+	# creating declarations for later used functions
+	$result .= "\n\n/* Declaration of the functions containing the code for the orphan regions */\n#include <stdio.h>\n";
+	@defs = get_tag_values('ompts:orphan',$code);
+	my ($functionname,$i);
+	($functionname) = get_tag_values('ompts:testcode:functionname',$code);
+	$i = 1;
+	foreach $_(@defs) {
+		$result .= "\nvoid orph$i\_$functionname ( FILE * logFile );";
+		$i++;
+	}
+	$result .= "\n\n/* End of declaration */\n\n";
+	return $result;
+}
+
+# SCALAR make_global_vars_def( $code )
+# returns a string including the declarations for the vars needed to
+# be declared global for the orphan region.
+sub make_global_vars_def
+{
+	my ( $code );
+	($code) = @_;
+	my ( @defs, $result, @tmp, @tmp2 ,$predefinitions);
+	
+	# creating global declarations for the variables.
+	$result  = "\n\n/* Declaration of the variables used in the orphan region. */\n";
+	
+	# get all tags containing the variable definitions
+	@defs = get_tag_values('ompts:orphan:vars',$code);
+	foreach $_(@defs)
+	{
+		# splitting the declarations within the same tag at the ';' delimiter
+		@tmp = split(/;/,$_);
+		foreach $_(@tmp)
+		{
+			# replacing newlines and double spaces
+			s/\n//gs;
+			s/  //gs;
+			# put the new declaration at the end of $result
+			if($_ ne ""){ $result .= "\n $_;"; }
+		}
+	}
+	$result .= "\n\n/* End of declaration. */\n\n";
+	return $result;
+}
+
+# SCALAR extern_vars_def( $code )
+# returns a string including the declarations for the vars needed to
+# be declared extern for the orphan region.
+sub extern_vars_def
+{
+	my ( $code );
+	($code) = @_;
+	my ( @defs, $result, @tmp, @tmp2 ,$predefinitions);
+	
+	# creating declarations for the extern variables.
+	$result  = "\n\n/* Declaration of the extern variables used in the orphan region. */\n";
+	# $result .= "\n#include <stdio.h>\n#include <omp.h>\n";
+	$result .= "\nextern FILE * logFile;";
+	
+	# get all tags containing the variable definitions
+	@defs = get_tag_values('ompts:orphan:vars',$code);
+	foreach $_(@defs)
+	{
+		# splitting the declarations within the same tag at the ';' delimiter
+		@tmp = split(/;/,$_);
+		foreach $_(@tmp)
+		{
+			# replacing newlines and double spaces
+			s/\n//gs;
+			s/  //gs;
+			# cutting off initializers
+			@tmp2 = split("=",$_);
+			# put the new declaration at the end of $result
+			$result .= "\nextern $tmp2[0];";
+		}
+	}
+	$result .= "\n\n/* End of declaration. */\n\n";
+	return $result;
+}
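+# Example (a sketch): for a variable tag containing "int sum = 7;" the loop
+# above emits a declaration of the form "extern int sum ;" (the initializer
+# is cut off at the '=').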
+
+sub leave_single_space
+{
+  my($str);
+  ($str)=@_;
+  if($str ne "")
+  {
+    $str =~ s/^[ \t]+/ /;
+    $str =~ s/[ \t]+\n$/\n/;
+    $str =~ s/[ \t]+//g;
+  }
+  return $str;
+}
+
+return 1;
diff --git a/final/testsuite/ompts_standaloneProc.c b/final/testsuite/ompts_standaloneProc.c
new file mode 100644
index 0000000..591b593
--- /dev/null
+++ b/final/testsuite/ompts_standaloneProc.c
@@ -0,0 +1,51 @@
+int main()
+{
+	int i;			/* Loop index */
+	int result;		/* return value of the program */
+	int failed=0; 		/* Number of failed tests */
+	int success=0;		/* number of succeeded tests */
+	static FILE * logFile;	/* pointer to the logfile */
+	static const char * logFileName = "<testfunctionname></testfunctionname>.log";	/* name of the logfile */
+
+
+	/* Open a new Logfile or overwrite the existing one. */
+	logFile = fopen(logFileName,"w+");
+
+	printf("######## OpenMP Validation Suite V %s ######\n", OMPTS_VERSION );
+	printf("## Repetitions: %3d                       ####\n",REPETITIONS);
+	printf("## Loop Count : %6d                    ####\n",LOOPCOUNT);
+	printf("##############################################\n");
+	printf("Testing <directive></directive>\n\n");
+
+	fprintf(logFile,"######## OpenMP Validation Suite V %s ######\n", OMPTS_VERSION );
+	fprintf(logFile,"## Repetitions: %3d                       ####\n",REPETITIONS);
+	fprintf(logFile,"## Loop Count : %6d                    ####\n",LOOPCOUNT);
+	fprintf(logFile,"##############################################\n");
+	fprintf(logFile,"Testing <directive></directive>\n\n");
+
+	for ( i = 0; i < REPETITIONS; i++ ) {
+		fprintf (logFile, "\n\n%d. run of <testfunctionname></testfunctionname> out of %d\n\n",i+1,REPETITIONS);
+		if(<testfunctionname></testfunctionname>(logFile)){
+			fprintf(logFile,"Test successful.\n");
+			success++;
+		}
+		else {
+			fprintf(logFile,"Error: Test failed.\n");
+			printf("Error: Test failed.\n");
+			failed++;
+		}
+	}
+
+	if(failed==0){
+		fprintf(logFile,"\nDirective worked without errors.\n");
+		printf("Directive worked without errors.\n");
+		result=0;
+	}
+	else{
+		fprintf(logFile,"\nDirective failed the test %i times out of %i. %i were successful\n",failed,REPETITIONS,success);
+		printf("Directive failed the test %i times out of %i.\n%i test(s) were successful\n",failed,REPETITIONS,success);
+		result = (int) (((double) failed / (double) REPETITIONS ) * 100 );
+	}
+	printf ("Result: %i\n", result);
+	return result;
+}
diff --git a/final/testsuite/ompts_standaloneProc.f b/final/testsuite/ompts_standaloneProc.f
new file mode 100644
index 0000000..4afcab6
--- /dev/null
+++ b/final/testsuite/ompts_standaloneProc.f
@@ -0,0 +1,68 @@
+!This is the main driver to invoke different test functions
+      PROGRAM <testfunctionname></testfunctionname>_main
+      IMPLICIT NONE
+      INTEGER failed, success !Number of failed/succeeded tests
+      INTEGER num_tests,crosschecked, crossfailed, j
+      INTEGER temp,temp1
+      INCLUDE "omp_testsuite.f"
+
+      INTEGER <testfunctionname></testfunctionname>
+
+
+      CHARACTER*50 logfilename !Pointer to logfile
+      INTEGER result 
+
+      num_tests = 0
+      crosschecked = 0
+      crossfailed = 0
+      result = 1
+      failed = 0
+      success = 0
+
+      !Open a new logfile or overwrite the existing one.
+      logfilename = "bin/fortran/<testfunctionname></testfunctionname>.log"
+!      WRITE (*,*) "Enter logFilename:" 
+!      READ  (*,*) logfilename
+
+      OPEN (1, FILE = logfilename)
+ 
+      WRITE (*,*) "######## OpenMP Validation Suite V 3.0a ######"
+      WRITE (*,*) "## Repetitions:", N 
+      WRITE (*,*) "## Loop Count :", LOOPCOUNT
+      WRITE (*,*) "##############################################"
+      WRITE (*,*)
+
+      crossfailed=0
+      result=1
+      WRITE (1,*) "--------------------------------------------------"
+      WRITE (1,*) "Testing <directive></directive>"
+      WRITE (1,*) "--------------------------------------------------"
+      WRITE (1,*) 
+      WRITE (1,*) "testname: <testfunctionname></testfunctionname>"
+      WRITE (1,*) "(Crosstests should fail)"
+      WRITE (1,*)
+      
+      DO j = 1, N
+        temp =  <testfunctionname></testfunctionname>()
+        IF (temp .EQ. 1) THEN
+          WRITE (1,*)  j, ". test successful."
+          success = success + 1
+        ELSE
+          WRITE (1,*) "Error: ",j, ". test failed."
+          failed = failed + 1
+        ENDIF
+      END DO
+
+      
+      IF (failed .EQ. 0) THEN
+        WRITE (1,*) "Directive worked without errors."
+        WRITE (*,*) "Directive worked without errors."
+        result = 0
+        WRITE (*,*) "Result:",result
+      ELSE
+        WRITE (1,*) "Directive failed the test ", failed, " times."
+        WRITE (*,*) "Directive failed the test ", failed, " times."
+        result = failed * 100 / N
+        WRITE (*,*) "Result:",result
+      ENDIF
+      CALL EXIT (result)
+      END PROGRAM 
diff --git a/final/testsuite/runtest.pl b/final/testsuite/runtest.pl
new file mode 100755
index 0000000..3472351
--- /dev/null
+++ b/final/testsuite/runtest.pl
@@ -0,0 +1,592 @@
+#!/usr/bin/env perl
+
+# runtest [options] FILENAME
+#
+# Read the file FILENAME. Each line contains a test.
+# Convert each template into a test and a crosstest.
+# If possible, generate orphaned test versions, too.
+# Use make to compile the tests.
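+#
+# Example invocation (a sketch, using the test list shipped with the suite):
+#   ./runtest.pl --lang=c testlist-c.txt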
+
+################################################################################
+# Global configuration options for the runtestscript itself:
+################################################################################
+
+# name of the global configuration file for the testsuite:
+$config_file    = "ompts.conf";
+$logfile        = "ompts.log"; # overwriteable by value in config file
+$env_set_threads_command = 'OMP_NUM_THREADS=%n; export OMP_NUM_THREADS;';
+$debug_mode     = 0;
+################################################################################
+# After this line the script part begins! Do not edit anything below
+################################################################################
+
+
+# Namespaces:
+use Getopt::Long;
+#use Unix::PID;
+use Data::Dumper;
+use ompts_parserFunctions;
+
+# Extracting given options
+GetOptions("help",
+      "listlanguages",
+      "lang=s",
+      "list",
+      "testinfo=s",
+      "numthreads=i",
+      "test=s",
+      "compile!",
+      "run!",
+      "orphan!",
+      "resultfile=s" => \$opt_resultsfile
+      );
+
+# Get global configuration options from config file:
+if(! -e $config_file){ error ("Could not find config file $config_file\n", 1);}
+open (CONFIG, "<$config_file") or error ("Could not open config file $config_file\n", 2);
+while (<CONFIG>) { $config .= $_; }
+close (CONFIG);
+
+($logfile) = get_tag_values ("logfile", $config);
+($timeout) = get_tag_values ("singletesttimeout", $config);
+($display_errors) = get_tag_values("displayerrors", $config);
+($display_warnings) = get_tag_values ("displaywarnings", $config);
+($numthreads) = get_tag_values ("numthreads", $config);
+($env_set_threads_command) = get_tag_values("envsetthreadscommand",$config);
+$env_set_threads_command =~ s/\%n/$numthreads/g;
+@languages = get_tag_values ("language", $config);
+
+if (!defined($opt_compile)) {$opt_compile = 1;}
+if (!defined($opt_run))     {$opt_run = 1;}
+if (!defined($opt_orphan)) {$opt_orphan = 1;}
+if (!defined($opt_resultsfile)) {($opt_resultsfile) = get_tag_values("resultsfile", $config);}
+if ( defined($opt_numthreads) && ($opt_numthreads > 0)) {$numthreads = $opt_numthreads;}
+if ($debug_mode) {
+print <<EOF;
+Testsuite configuration:
+Logfile = $logfile
+Timeout = $timeout seconds
+Language:  $opt_lang
+Display errors:   $display_errors
+Display warnings: $display_warnings
+Resultsfile:      $opt_resultsfile
+Numthreads: $numthreads
+------------------------------
+EOF
+}
+
+$num_constructs   = 0;
+$num_tests        = 0;
+$num_failed_tests = 0;
+$num_successful_tests   = 0;
+$num_verified_tests     = 0;
+$num_failed_compilation = 0;
+
+$num_normal_tests_failed = 0;
+$num_normal_tests_compile_error = 0;
+$num_normal_tests_timed_out = 0;
+$num_normal_tests_successful = 0;
+$num_normal_tests_verified = 0;
+
+$num_orphaned_tests_failed = 0;
+$num_orphaned_tests_compile_error = 0;
+$num_orphaned_tests_timed_out = 0;
+$num_orphaned_tests_successful = 0;
+$num_orphaned_tests_verified = 0;
+
+if ($opt_help)         { print_help_text ();   exit 0; }
+if ($opt_listlanguages){ print_avail_langs (); exit 0; }
+if ($opt_list)     { print_avail_tests ();   exit 0; }
+if ($opt_testinfo) { print_testinfo ();      exit 0; }
+if ($opt_test)     { write_result_file_head();
+                     execute_single_test (); exit 0; }
+if (-e $ARGV[0])   { write_result_file_head();
+                     execute_testlist($ARGV[0]); print_results();
+                     result_summary(); exit 0;}
+
+################################################################################
+# sub function definitions
+################################################################################
+
+# Function which prints the results file
+sub print_results
+{
+    system("echo; cat $opt_resultsfile; echo;");
+}
+
+# Function which prints a summary of all test
+sub result_summary
+{
+    my $num_directives = @test_results;
+
+    print <<EOF;
+
+Summary:
+S Number of tested OpenMP constructs:  $num_constructs
+S Number of used tests:                $num_tests
+S Number of failed tests:              $num_failed_tests
+S Number of successful tests:          $num_successful_tests
+S + of these were verified:            $num_verified_tests
+
+Normal tests:
+N Number of failed tests:              $num_normal_tests_failed
+N + of these failed compilation:       $num_normal_tests_compile_error
+N + of these timed out:                $num_normal_tests_timed_out
+N Number of successful tests:          $num_normal_tests_successful
+N + of these were verified:            $num_normal_tests_verified
+
+Orphaned tests:
+O Number of failed tests:              $num_orphaned_tests_failed
+O + of these failed compilation:       $num_orphaned_tests_compile_error
+O + of these timed out:                $num_orphaned_tests_timed_out
+O Number of successful tests:          $num_orphaned_tests_successful
+O + of these were verified:            $num_orphaned_tests_verified
+EOF
+
+}
+
+# Function that executes the tests specified in the given list
+sub execute_testlist
+{
+    my ($filename) = @_;
+    # opening testlist
+    open(TESTS,$filename) or error ("Could not open $filename\n", 1);
+TEST: while (<TESTS>) {
+        if (/^\s*#/) {next TEST;}
+        if (/^\s*$/) {next TEST;}
+        $opt_test = $_;
+        chomp ($opt_test);
+        execute_single_test ();
+    }
+# print Dumper(@test_results);
+}
+
+# Function that executes a system command but takes care of the global timeout
+# If the command does not finish within the timeout it returns 'TO', otherwise
+# it returns the exit status of the system command
+sub timed_sys_command
+{
+    my ($command) = @_;
+    my $exit_status = '-';
+
+# set up the timeout for the command
+    eval {
+        local $SIG{ALRM} = sub {die "alarm\n"};
+        alarm $timeout;
+        log_message_add ("Starting command \"$command\"");
+        $exit_status = system ($command);
+        alarm 0;
+    };
+# check if command finished during the maximum execution time
+    if ($@ eq "alarm\n") {
+# test timed out
+#		my $pid = Unix::PID->new();
+#		$pid->get_pidof($command, 1);
+#		$pid->kill();
+        if ($debug_mode) {
+	    log_message_add ("Command \"$command\" reached max execution time.\n");
+        }
+        return "TO";
+    }
+# test finished
+    return $exit_status;
+}
+
+# Function that runs a single test and, if it passes, its crosstest.
+# Returns a hash containing the percentage of passed test runs and the
+# certainty value of the crosstest.
+sub run_test
+{
+    my ($testname, $orphan) = @_;
+    my ($bin_name, $cbin_name);
+    my ($cmd, $exit_status, $failed);
+    my ($resulttest, $resultctest);
+
+# path to test and crosstest either in normal or in orphaned version
+    if ($orphan) {
+        $bin_name  = "bin/$opt_lang/orph_test_$testname";
+        $cbin_name = "bin/$opt_lang/orph_ctest_$testname";
+    } else {
+        $bin_name  = "bin/$opt_lang/test_$testname";
+        $cbin_name = "bin/$opt_lang/ctest_$testname";
+    }
+# Check if executables exist
+    if (! -e $bin_name) {
+        test_error ("Could not find executable \"$bin_name\".");
+        return ('test' => '-', 'crosstest' => '-');
+    }
+# run the test
+    $cmd = "$env_set_threads_command ./$bin_name >$bin_name.out";
+    print "Running test with $numthreads threads .";
+    $exit_status = timed_sys_command ($cmd);
+############################################################
+# Check if test finished within max execution time
+    if ($exit_status eq 'TO') {
+        print ".... failed (timeout)\n";
+        return ('test' => 'TO', 'crosstest' => '-')
+    }
+############################################################
+# check if all tests were successful
+    $failed = $exit_status >> 8;
+    if ($failed < 0 or $failed > 100) { $failed = 100; }
+    $resulttest = 100 - $failed;
+    if ($resulttest eq 100) {
+        print ".... success ...";
+    } else {
+        print ".... failed $failed\% of the tests\n";
+        return ('test' => $resulttest, 'crosstest' => '-');
+    }
+############################################################
+
+# do crosstest
+# check if executable exist
+    if (! -e $cbin_name) {
+        test_error ("Could not find executable \"$cbin_name\".");
+        print "... not verified (crosstest missing)\n";
+        return ('test' => $resulttest, 'crosstest' => '-');
+    }
+# run crosstest
+# Test was successful, so it makes sense to run the crosstest
+    $cmd = "$env_set_threads_command ./$cbin_name > $cbin_name.out";
+    $exit_status = timed_sys_command ($cmd);
+############################################################
+# Check if crosstest finished within max execution time
+    if ($exit_status eq 'TO') {
+        print "... not verified (timeout)\n";
+        return ('test' => $resulttest, 'crosstest' => 'TO');
+    }
+############################################################
+# test if crosstests failed as expected
+    $resultctest = $exit_status >> 8;
+    if ($resultctest > 0) {
+        print "... and verified with $resultctest\% certainty\n";
+    } else {
+        print "... but might be lucky\n";
+    }
+    return ('test' => $resulttest, 'crosstest' => $resultctest);
+############################################################
+}
+
+# Function that generates the test binaries out of the sourcecode
+sub compile_src
+{
+    my ($testname, $orphan) = @_;
+    print "Compiling sources ............";
+    if ($orphan) {
+# Make orphaned tests
+        $exec_name     = "bin/$opt_lang/orph_test_$testname";
+        $crossexe_name = "bin/$opt_lang/orph_ctest_$testname";
+        $resulttest  = system ("make $exec_name > $exec_name\_compile.log" );
+        $resultctest = system ("make $crossexe_name > $crossexe_name\_compile.log" );
+    } else {
+# Make test
+        $exec_name     = "bin/$opt_lang/test_$testname";
+        $crossexe_name = "bin/$opt_lang/ctest_$testname";
+        $resulttest  = system ("make $exec_name > $exec_name\_compile.log" );
+        $resultctest = system ("make $crossexe_name > $crossexe_name\_compile.log" );
+    }
+    if ($resulttest) { test_error ("Compilation of the test failed."); }
+    if ($resultctest){ test_error ("Compilation of the crosstest failed."); }
+
+    if ($resulttest or $resultctest) {
+        print ".... failed\n";
+        return 0;
+    } else {
+        print ".... success\n";
+        return 1;
+    }
+}
+
+# Function which prepares the directory structure:
+sub init_directory_structure
+{
+    my ($language) = @_;
+    if (-e "bin" && -d "bin") { warning ("Old binary directory detected!");}
+    else { system ("mkdir bin"); }
+    if (-e "bin/$language" && -d "bin/$language") {
+        warning ("Old binary directory for language $language found.");}
+    else { system ("mkdir bin/$language"); }
+}
+
+# Function that generates the sourcecode for the given test
+sub make_src
+{
+    my ($testname, $orphan) = @_;
+    my $template_file;
+    my $src_name;
+
+    $template_file = "$dir/$testname.$extension";
+    if (!-e $template_file) { test_error ("Could not find template for \"$testname\""); }
+
+    print "Generating sources ..........";
+    if ($orphan) {
+# Make orphaned tests
+        $src_name = "bin/$opt_lang/orph_test_$testname.$extension";
+        $resulttest = system ("./$templateparsername --test --orphan $template_file $src_name");
+        $src_name = "bin/$opt_lang/orph_ctest_$testname.$extension";
+        $resultctest = system ("./$templateparsername --crosstest --orphan $template_file $src_name");
+    } else {
+# Make test
+        $src_name = "bin/$opt_lang/test_$testname.$extension";
+        $resulttest = system ("./$templateparsername --test --noorphan $template_file $src_name");
+        $src_name = "bin/$opt_lang/ctest_$testname.$extension";
+        $resultctest = system ("./$templateparsername --crosstest --noorphan $template_file $src_name");
+    }
+    if ($resulttest) { test_error ("Generation of sourcecode for the test failed."); }
+    if ($resultctest){ test_error ("Generation of sourcecode for the crosstest failed."); }
+
+    if ($resulttest or $resultctest) {
+        print ".... failed\n";
+       return 0;
+    } else {
+       print ".... success\n";
+       return 1;
+    }
+}
+
+# Function which checks if a given test is orphanable
+sub test_is_orphanable
+{
+    my ($testname) = @_;
+    my $src;
+    my $file = "$dir/$testname.$extension";
+    if(! -e $file){ test_error ("Could not find test file $file\n");}
+    open (TEST, "<$file") or test_error ("Could not open test file $file\n");
+    while (<TEST>) { $src .= $_; }
+    close (TEST);
+    return $src =~/ompts:orphan/;
+}
+
+sub write_result_file_head
+{
+    open (RESULTS, ">$opt_resultsfile") or error ("Could not open file '$opt_resultsfile' to write results.", 1);
+    $resultline = sprintf "%-25s %-s\n", "#Tested Directive", "\tt\tct\tot\toct";
+    print RESULTS $resultline;
+}
+
+# Function which adds a result to the list of results
+sub add_result
+{
+    my ($testname, $result) = @_;
+#	print Dumper(@{$result});
+
+    $num_constructs++;
+
+    open (RESULTS, ">>$opt_resultsfile") or error ("Could not open file '$opt_resultsfile' to write results.", 1);
+
+    if (${$result}[0][0]) {
+		$num_tests ++;}
+
+	if ($opt_compile and ${$result}[0][1] eq 0) {
+		${$result}[0][2]{test}      = 'ce';
+		${$result}[0][2]{crosstest} = '-';
+		$num_normal_tests_compile_error++;
+	    $num_normal_tests_failed++;
+	}
+
+    if ($opt_run and ${$result}[0][2] and ${$result}[0][2]{test} ne 'ce') {
+        if (${$result}[0][2]{test} == 100) {
+            $num_normal_tests_successful++;
+            if (${$result}[0][2]{crosstest} == 100){
+				$num_normal_tests_verified++;}
+		} elsif (${$result}[0][2]{test} eq 'TO'){
+			$num_normal_tests_timed_out++;
+			$num_normal_tests_failed++;
+		} else {
+			$num_normal_tests_failed++;
+		}
+    }
+    $resultline = "${$result}[0][2]{test}\t${$result}[0][2]{crosstest}\t";
+
+    if (${$result}[1][0]) {
+		$num_tests ++;}
+    else { $resultline .= "-\t-\n"; }
+
+    if ($opt_compile and ${$result}[1][1] eq 0) {
+		${$result}[1][2]{test}      = 'ce';
+		${$result}[1][2]{crosstest} = '-';
+		$num_orphaned_tests_compile_error++;
+		$num_orphaned_tests_failed++;
+	}
+
+    if ($opt_run and ${$result}[1][2] and ${$result}[1][2]{test} ne 'ce') {
+        if (${$result}[1][2]{test} == 100) {
+            $num_orphaned_tests_successful++;
+            if (${$result}[1][2]{crosstest} == 100){
+				$num_orphaned_tests_verified++;}
+		} elsif (${$result}[1][2]{test} eq 'TO'){
+			$num_orphaned_tests_timed_out++;
+			$num_orphaned_tests_failed++;
+        } else {
+			$num_orphaned_tests_failed++;
+		}
+    }
+    $resultline .= "${$result}[1][2]{test}\t${$result}[1][2]{crosstest}\n";
+
+    $num_failed_tests = $num_normal_tests_failed + $num_orphaned_tests_failed;
+	$num_failed_compilation = $num_normal_tests_compile_error + $num_orphaned_tests_compile_error;
+	$num_successful_tests = $num_normal_tests_successful + $num_orphaned_tests_successful;
+	$num_verified_tests = $num_normal_tests_verified + $num_orphaned_tests_verified;
+
+    $resultline2 = sprintf "%-25s %-s", "$testname", "\t$resultline";
+    print RESULTS $resultline2;
+}
+
+# Function which executes a single test
+sub execute_single_test
+{
+    my @result;
+    init_language_settings ($opt_lang);
+    init_directory_structure ($opt_lang);
+    log_message_add ("Testing for \"$opt_test\":");
+    print "Testing for \"$opt_test\":\n";
+# tests in normal mode
+    if ($opt_compile){ $result[0][0] = make_src ($opt_test, 0);
+                       $result[0][1] = compile_src ($opt_test, 0);}
+    if ($opt_run && $result[0][1] == 1) {
+                       $result[0][2] = {run_test ($opt_test, 0)};}
+# tests in orphaned mode
+    if ($opt_orphan && test_is_orphanable($opt_test)){
+        log_message_add ("Testing for \"$opt_test\" in orphaned mode:");
+        print "+ orphaned mode:\n";
+        if ($opt_compile) { $result[1][0] = make_src ($opt_test, 1);
+                            $result[1][1] = compile_src ($opt_test, 1);}
+        if ($opt_run && $result[1][1] == 1) {
+                            $result[1][2] = {run_test ($opt_test, 1)};}
+    }
+    add_result($opt_test, \@result);
+}
+
+# Function that prints info about a given test
+sub print_testinfo
+{
+    init_language_settings($opt_lang);
+    my $doc = "";
+    my $file = $dir."/".$opt_testinfo.".".$extension;
+    if (! -e $file) {error ("Could not find template for test $opt_testinfo", 5);}
+    open (TEST,"<$file") or error ("Could not open template file \"$file\" for test $opt_testinfo", 6);
+    while (<TEST>) {$doc .= $_;}
+    close (TEST);
+
+    (my $omp_version) = get_tag_values ("ompts:ompversion", $doc);
+    (my $dependences) = get_tag_values ("ompts:dependences", $doc);
+    (my $description) = get_tag_values ("ompts:testdescription", $doc);
+    my $orphanable = 'no';
+    if ($doc =~ /ompts:orphan/) {$orphanable = 'yes';}
+    print <<EOF;
+Info for test $opt_testinfo:
+OpenMP standard: $omp_version
+Orphaned mode: $orphanable
+Dependencies:  $dependences
+Description:   $description
+EOF
+}
+
+# Function that initializes the settings for the given language
+sub init_language_settings
+{
+    my ($language) = @_;
+    foreach my $lang (@languages) {
+        (my $name) = get_tag_values ("languagename", $lang);
+        if ($name eq $language) {
+            ($extension) = get_tag_values ("fileextension", $lang);
+            ($dir)       = get_tag_values ("dir", $lang);
+            ($templateparsername) = get_tag_values ("templateparsername", $lang);
+            last;
+        }
+    }
+    # Check if we found the specified language in the config file
+    if (!$extension and !$dir) {
+      error ("Language $language could not be found.\n", 3);
+    }
+}
+
+
+
+# Function that prints all available tests for the given language
+sub print_avail_tests
+{
+    init_language_settings($opt_lang);
+    my @tests;
+    opendir(DIR,"$dir") or error ("Could not open directory $dir", 4);
+    while($_ = readdir(DIR)) { if (/\.$extension$/) {s/\.$extension//; push (@tests, $_);}}
+    closedir(DIR);
+    print "Found ".(@tests)." tests:\n". "-" x 30 . "\n";
+    foreach (@tests) { print $_."\n";}
+}
+
+# Function that prints all available tests for the given language
+sub print_avail_langs
+{
+    if (@languages > 0) {
+        print "Available languages:\n";
+        foreach (@languages) {
+            (my $name) = get_tag_values ("languagename", $_);
+            print "$name\n";
+        }
+    } else {
+        print "No languages available\n";
+    }
+}
+
+# Function that prints the error message
+sub print_help_text
+{
+    print <<EOF;
+runtest.pl [options] [FILE]
+
+Executes the tests listed in FILE. FILE has to contain the names of the tests,
+one test per line. Lines starting with '#' will be ignored.
+A language has to be specified for all commands except --help and --listlanguages.
+
+Options:
+  --help            displays this help message
+  --listlanguages   lists all available languages
+  --lang=s          select language
+  --list            list available tests for a language
+  --testinfo=NAME   show info for test NAME
+  --numthreads=NUM  set number of threads (overwrites config file settings)
+  --test=NAME       execute single test NAME
+  --nocompile       do not compile tests
+  --norun           do not run tests
+  --noorphan        switch off orphaned tests
+  --resultfile=NAME use NAME as resultfile (overwrites config file settings)
+EOF
+}
+
+# Function that writes an error message for a failed test / part of a test
+sub test_error
+{
+   my ($message) = @_;
+   log_message_add ("ERROR: $message");
+   if ($display_errors eq 1) { print STDERR "ERROR: $message\n"; }
+}
+
+# Function that prints a warning message
+sub warning {
+  my ($message) = @_;
+  if ($display_warnings eq 1) { print "Warning: $message\n"; }
+  log_message_add ("Warning: $message");
+}
+
+# Function that prints an error message and exits with the specified error code
+sub error {
+  my ($message, $error_code) = @_;
+  if ($display_errors eq 1) { print STDERR "ERROR: $message\n"; }
+  log_message_add ("ERROR: $message");
+  exit ($error_code);
+}
+
+# Function which adds a new entry into the logfile together with a timestamp
+sub log_message_add
+{
+    (my $message) = @_;
+    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime();
+    if(length($hour) == 1) { $hour="0$hour"; }
+    if(length($min) == 1)  { $min="0$min";   }
+    if(length($sec) == 1)  { $sec="0$sec";   }
+    $mon=$mon+1;
+    $year=$year+1900;
+    open (LOGFILE,">>$logfile") or die "ERROR: Could not create $logfile\n";
+    print LOGFILE "$mday/$mon/$year $hour.$min.$sec: $message\n";
+}
diff --git a/final/testsuite/template_parser_c.pl b/final/testsuite/template_parser_c.pl
new file mode 100755
index 0000000..3d6f346
--- /dev/null
+++ b/final/testsuite/template_parser_c.pl
@@ -0,0 +1,114 @@
+#!/usr/bin/env perl
+
+# ompts_parser [option] INFILE OUTFILE
+# 
+# Creates the tests and the crosstests for the OpenMP testsuite out of a template file given to the program.
+# 
+# Options:
+# --test: 	make test
+# --crosstest: 	make crosstest
+# --orphan	if possible generate tests using orphan 
+#
+# Return:
+#       Success:                 0
+#       Template not found:     -1
+#
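+# Example invocation (a sketch; runtest.pl calls this parser with paths of
+# this shape, taken from its make_src routine):
+#   ./template_parser_c.pl --test --noorphan c/omp_barrier.c bin/c/test_omp_barrier.c
+#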
+
+# Using Getopt::Long to extract the program options
+use Getopt::Long;
+# Using functions: Set of subroutines to modify the testcode
+use ompts_parserFunctions;
+
+# Getting given options
+GetOptions("test" => \$test,"crosstest" => \$crosstest, "orphan!" => \$orphan);
+
+# The remaining arguments are the template file and the output file.
+
+my $templatefile;
+my $sourcefile;
+my $mainprocsrc = "ompts_standaloneProc.c"; 
+
+$templatefile = $ARGV[0];
+$outfile = $ARGV[1];
+
+if (!-e $templatefile) {
+    print "Template file not found";
+    exit -1;
+}
+
+	
+# Checking if options were valid:
+#################################################################
+# preparations and checks for sourcefiles
+
+# Reading the template for the tests 
+open(TEST,$templatefile) or die "Error: Could not open template $templatefile\n";
+while(<TEST>){ $src .= $_; }
+close(TEST);
+
+# Reading the source of the main program $mainprocsrc into $mainproc
+open(MAINPROC,$mainprocsrc) or die "Could not open the sourcefile for the main program $mainprocsrc";
+while(<MAINPROC>){ $mainproc .= $_; }
+close (MAINPROC);
+
+# Some temporary test information:
+my ($description)  = get_tag_values ('ompts:testdescription',$src);
+my ($directive)    = get_tag_values ('ompts:directive',$src);
+my ($functionname) = get_tag_values ('ompts:testcode:functionname',$src);
+
+open (OUTFILE,">$outfile") or die "Could not create the output file for $directive";
+
+# Creating the source for the test:
+my ($code) = get_tag_values('ompts:testcode',$src);
+# Putting together the functions and the main program:
+$code .= $mainproc;
+
+my $testprefix = "";
+
+# Make modifications for the orphaned test version if necessary:
+if ($orphan) {
+# Get the global variables:
+    @defs = get_tag_values("ompts:orphan:vars",$code);
+    $orphvarsdef = "";
+    foreach (@defs) {
+        $orphvarsdef = join("\n",$orphvarsdef,$_); 
+    }
+# Generate predeclarations for orphan functions:
+    $orphfuncsdefs = orph_functions_declarations($code);
+# Generate the orphan functions:
+    $orphfuncs = create_orph_cfunctions($code);
+# Replace orphan regions by function calls:
+    $code = orphan_regions2cfunctions($code);
+# Deleting the former declarations of the variables in the orphan regions:
+    ($code) = delete_tags('ompts:orphan:vars',($code));
+# Put all together:
+    $code = "#include \"omp_testsuite.h\"\n" . $orphvarsdef . $orphfuncsdefs . $code . $orphfuncs;
+    $testprefix .= "orph_";
+}
+
+# Remove remaining marks for the orphan regions and their variables:
+($code) = enlarge_tags('ompts:orphan','','',($code));
+($code) = enlarge_tags('ompts:orphan:vars','','',($code));
+
+if($test) {
+# Remove the marks for the testcode and remove the code for the crosstests: 
+    ($code) = enlarge_tags('ompts:check','','',($code));
+    ($code) = delete_tags('ompts:crosscheck',($code));		
+    $testprefix .= "test_";
+}
+elsif($crosstest) {
+# Remove the marks for the crosstestcode and remove the code for the tests: 
+    ($code) = enlarge_tags('ompts:crosscheck','','',($code));
+    ($code) = delete_tags('ompts:check',($code));		
+    $testprefix .= "ctest_";
+}
+# Making some final modifications:
+($code) = replace_tags('testfunctionname',$testprefix.$functionname,($code));
+($code) = replace_tags('directive',$directive,($code));
+($code) = replace_tags('description',$description,($code));
+($code) = enlarge_tags('ompts:testcode:functionname',$testprefix,'',($code) );
+#	$code =  "\#include \"omp_testsuite.h\"\n".$code;
+# Write the result into the file and close it:
+print OUTFILE $code;
+close(OUTFILE);
diff --git a/final/testsuite/template_parser_fortran.pl b/final/testsuite/template_parser_fortran.pl
new file mode 100755
index 0000000..570a4e0
--- /dev/null
+++ b/final/testsuite/template_parser_fortran.pl
@@ -0,0 +1,115 @@
+#!/usr/bin/env perl
+
+# ompts_parser [option] INFILE OUTFILE
+# 
+# Creates the tests and the crosstests for the OpenMP testsuite out of a template file given to the program.
+# 
+# Options:
+# --test: 	make test
+# --crosstest: 	make crosstest
+# --orphan	if possible generate tests using orphan 
+#
+# Return:
+#       Success:                 0
+#       Template not found:     -1
+#
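+# Example invocation (a sketch; runtest.pl calls this parser with paths of
+# this shape, taken from its make_src routine):
+#   ./template_parser_fortran.pl --test --noorphan fortran/omp_barrier.f bin/fortran/test_omp_barrier.f
+#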
+
+# Using Getopt::Long to extract the program options
+use Getopt::Long;
+# Using functions: Set of subroutines to modify the testcode
+use ompts_parserFunctions;
+
+# Getting given options
+GetOptions("test" => \$test,"crosstest" => \$crosstest, "orphan!" => \$orphan);
+
+# The remaining arguments are the template file and the output file.
+
+my $templatefile;
+my $sourcefile;
+my $mainprocsrc = "ompts_standaloneProc.f"; 
+
+$templatefile = $ARGV[0];
+$outfile = $ARGV[1];
+
+if (!-e $templatefile) {
+    print "Template file not found";
+    exit -1;
+}
+
+	
+# Checking if options were valid:
+#################################################################
+# preparations and checks for sourcefiles
+
+# Reading the template for the tests 
+open(TEST,$templatefile) or die "Error: Could not open template $templatefile\n";
+while(<TEST>){ $src .= $_; }
+close(TEST);
+
+# Reading the source of the main program $mainprocsrc into $mainproc
+open(MAINPROC,$mainprocsrc) or die "Could not open the sourcefile for the main program $mainprocsrc";
+while(<MAINPROC>){ $mainproc .= $_; }
+close (MAINPROC);
+
+# Some temporary test information:
+($description)  = get_tag_values ('ompts:testdescription',$src);
+($directive)    = get_tag_values ('ompts:directive',$src);
+($functionname) = get_tag_values ('ompts:testcode:functionname',$src);
+
+open (OUTFILE,">$outfile") or die "Could not create the output file for $directive";
+
+# Creating the source for the test:
+($code) = get_tag_values('ompts:testcode',$src);
+# Putting together the functions and the main program:
+$code .= $mainproc;
+
+# thanks to Dr. Yin Ma in Absoft: get the parameters <ompts:orphan:parms>, by joon
+ ($parms) = get_tag_values('ompts:orphan:parms',($code));
+ ($parms) = leave_single_space($parms);
+ ($code) = replace_tags('ompts:orphan:parms','',$code);
+
+# Make modifications for the orphaned test version if necessary:
+if ($orphan) {
+# Get the global variables:
+    @defs = get_tag_values("ompts:orphan:vars",$code);
+    $orphvarsdef = "";
+    foreach (@defs) {
+        if (not /^[ ]*$/gs) { $orphvarsdef = join("\n",$orphvarsdef,$_); } 
+    }
+# Generate the orphan subroutines:
+        $orphfuncs = create_orph_fortranfunctions ("", ($code),($parms));
+# Replace orphan regions by functioncalls:
+        ($code) = orphan_regions2fortranfunctions ("", ($code),($parms));
+        ($code) = enlarge_tags ('ompts:orphan:vars','','',($code));
+# to find orphan call statement and add parameters, by joon
+        ($code) = enlarge_tags('ompts:orphan:parms','','',($code));
+# Put all together:
+        $code = $code . $orphfuncs;
+}
+
+# Remove remaining marks for the orphan regions and their variables:
+($code) = enlarge_tags('ompts:orphan','','',($code));
+($code) = enlarge_tags('ompts:orphan:vars','','',($code));
+# remove remaining tags for the orphaned directive parameters, added by joon
+($code) = enlarge_tags('ompts:orphan:parms','','',($code));
+
+if($test) {
+# Remove the marks for the testcode and remove the code for the crosstests: 
+    ($code) = enlarge_tags('ompts:check','','',($code));
+    ($code) = delete_tags('ompts:crosscheck',($code));		
+}
+elsif($crosstest) {
+# Remove the marks for the crosstestcode and remove the code for the tests: 
+    ($code) = enlarge_tags('ompts:crosscheck','','',($code));
+    ($code) = delete_tags('ompts:check',($code));		
+}
+# Making some final modifications:
+($code) = replace_tags('testfunctionname',"test_".$functionname,($code));
+($code) = replace_tags('directive',$directive,($code));
+($code) = replace_tags('description',$description,($code));
+($code) = enlarge_tags('ompts:testcode:functionname',"test_",'',($code) );
+#	$code =  "\#include \"omp_testsuite.h\"\n".$code;
+# Write the result into the file and close it:
+print OUTFILE $code;
+close(OUTFILE);
diff --git a/final/testsuite/testlist-c.txt b/final/testsuite/testlist-c.txt
new file mode 100644
index 0000000..813a218
--- /dev/null
+++ b/final/testsuite/testlist-c.txt
@@ -0,0 +1,68 @@
+# List containing the tests of the OpenMP Validation Suite available to check
+# the C implementation.
+has_openmp
+omp_atomic
+omp_barrier
+omp_critical
+omp_flush
+omp_for_firstprivate
+omp_for_lastprivate
+omp_for_ordered
+omp_for_private
+omp_for_reduction
+omp_for_schedule_dynamic
+omp_for_schedule_guided
+omp_for_schedule_static
+omp_for_nowait
+omp_get_num_threads
+omp_get_wtick
+omp_get_wtime
+omp_in_parallel
+omp_lock
+omp_master
+omp_nest_lock
+omp_parallel_copyin
+omp_parallel_for_firstprivate
+omp_parallel_for_lastprivate
+omp_parallel_for_ordered
+omp_parallel_for_private
+omp_parallel_for_reduction
+omp_parallel_num_threads
+omp_parallel_sections_firstprivate
+omp_parallel_sections_lastprivate
+omp_parallel_sections_private
+omp_parallel_sections_reduction
+omp_section_firstprivate
+omp_section_lastprivate
+omp_section_private
+omp_sections_reduction
+omp_sections_nowait
+omp_parallel_for_if
+omp_single_copyprivate
+omp_single_nowait
+omp_single_private
+omp_single
+omp_test_lock
+omp_test_nest_lock
+omp_threadprivate
+omp_parallel_default
+omp_parallel_shared
+omp_parallel_private
+omp_parallel_firstprivate
+omp_parallel_if
+omp_parallel_reduction
+
+# New tests for OpenMP 3.0
+omp_for_collapse
+omp_master_3
+omp_task
+omp_task_if
+omp_task_untied
+omp_task_shared
+omp_task_private
+omp_task_firstprivate
+omp_taskwait
+
+# New tests for OpenMP 3.1
+omp_taskyield
+omp_task_final
diff --git a/final/testsuite/testlist-f.txt b/final/testsuite/testlist-f.txt
new file mode 100644
index 0000000..55389a5
--- /dev/null
+++ b/final/testsuite/testlist-f.txt
@@ -0,0 +1,63 @@
+# List containing the tests of the OpenMP Validation Suite available to check
+# the Fortran implementation.
+do_firstprivate
+do_lastprivate
+do_ordered
+do_private
+do_reduction
+do_schedule_dynamic
+do_schedule_guided
+do_schedule_static
+has_openmp
+omp_atomic
+omp_barrier
+omp_copyin
+omp_critical
+omp_flush
+omp_get_num_threads
+omp_in_parallel
+omp_lock
+omp_master
+omp_nest_lock
+omp_test_nest_lock
+omp_nested
+omp_num_threads
+omp_testlock
+omp_threadprivate
+omp_get_wticks
+###omp_workshare_default
+omp_workshare
+### Is this test needed, or should the one above be extended?
+omp_wtime
+par_do_firstprivate
+par_do_lastprivate
+par_do_ordered
+par_do_private
+par_do_reduction
+par_section_firstprivate
+par_section_lastprivate
+par_section_private
+par_section_reduct
+section_firstprivate
+section_lastprivate
+section_private
+section_reduction
+omp_single
+single_copyprivate
+single_nowait
+single_private
+
+# New tests for OpenMP 3.0
+do_collapse
+omp_master_3
+omp_task
+omp_task_if
+omp_task_untied
+omp_task_shared
+omp_task_private
+omp_task_firstprivate
+omp_taskwait
+
+# New tests for OpenMP 3.1
+#omp_taskyield
+#omp_task_final
diff --git a/final/testsuite/tests_to_integrate/omp_set_unset_lock_hinted.c b/final/testsuite/tests_to_integrate/omp_set_unset_lock_hinted.c
new file mode 100644
index 0000000..ca01534
--- /dev/null
+++ b/final/testsuite/tests_to_integrate/omp_set_unset_lock_hinted.c
@@ -0,0 +1,67 @@
+/******************************************************************************\
+  Extended version of omp_set_unset_lock.c for testing hinted locks.
+  Check to make sure OpenMP locks guarantee mutual 
+  exclusion for multiple threads.
+\******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+void cscall(int id, int n[1000], int *passed, omp_lock_t *lock) {
+    int i;
+
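+    /* Take the lock, fill the shared buffer with this thread's id, then
+       verify that nothing changed: any interleaving by another thread
+       would overwrite entries and clear the passed flag. */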
+    omp_set_lock( lock );
+    for (i = 0; i < 1000; i++) {
+        n[i] = id;
+    }
+    for (i = 0; i < 1000; i++) {
+        if ( n[i] != id ) {
+            *passed = 0;
+        }
+    }
+    omp_unset_lock( lock );
+}
+
+int hinted_lock(kmp_lock_hint_t hint) {
+    int passed, n[1000], j, id;
+    omp_lock_t lock;
+    
+    passed = 1;
+
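+    /* Intel runtime extension: initialize the lock with a performance hint. */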
+    kmp_init_lock_hinted(&lock, hint);
+
+    #pragma omp parallel shared(n, passed, lock) private(id, j)	
+    {
+        id = omp_get_thread_num();
+        for (j = 1; j <= 10000; j++) {
+            cscall( id, n, &passed, &lock );
+        }
+    }
+
+    omp_destroy_lock(&lock);
+
+    if (passed) {
+        return 0;
+    } else {
+        return 1;
+    }
+}
+
+int main() {
+    int ret = 0;
+    ret += hinted_lock(kmp_lock_hint_none);
+    ret += hinted_lock(kmp_lock_hint_contended);
+    ret += hinted_lock(kmp_lock_hint_uncontended);
+    ret += hinted_lock(kmp_lock_hint_nonspeculative);
+    ret += hinted_lock(kmp_lock_hint_speculative);
+    // This one will emit a warning on machines without TSX support.
+    ret += hinted_lock(kmp_lock_hint_adaptive);
+    if (ret) {
+        printf(" Test %s failed\n", __FILE__);
+        return 1;
+    } else {
+        printf(" Test %s passed\n", __FILE__);
+        return 0;
+    }
+}
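+
+/* Illustrative sketch only, not called by the test: the same hinted-lock
+ * pattern protecting a shared counter. kmp_init_lock_hinted and the
+ * kmp_lock_hint_* values are Intel runtime extensions assumed, as above,
+ * to be declared in omp.h. */
+static int demo_hinted_counter(void) {
+    omp_lock_t lock;
+    int counter = 0, nthreads = 0;
+
+    kmp_init_lock_hinted(&lock, kmp_lock_hint_contended);
+
+    #pragma omp parallel shared(counter, nthreads, lock)
+    {
+        #pragma omp single
+        nthreads = omp_get_num_threads();
+
+        /* A hinted lock is used exactly like a regular omp_lock_t. */
+        omp_set_lock(&lock);
+        counter++;
+        omp_unset_lock(&lock);
+    }
+
+    omp_destroy_lock(&lock);
+    return counter == nthreads;  /* 1 on success */
+}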
diff --git a/final/testsuite/tests_to_integrate/omp_set_unset_lock_hinted.f b/final/testsuite/tests_to_integrate/omp_set_unset_lock_hinted.f
new file mode 100644
index 0000000..f743d38
--- /dev/null
+++ b/final/testsuite/tests_to_integrate/omp_set_unset_lock_hinted.f
@@ -0,0 +1,90 @@
+!
+! Modified version of omp_set_unset_lock.F, using hinted lock API
+!
+      program omp_set_unset_lock_hinted
+
+      integer ret
+#if defined(_OPENMP)
+      include 'omp_lib.h'
+
+      ret = 0
+      call hinted_lock(kmp_lock_hint_none, ret)
+      call hinted_lock(kmp_lock_hint_contended, ret)
+      call hinted_lock(kmp_lock_hint_uncontended, ret)
+      call hinted_lock(kmp_lock_hint_nonspeculative, ret)
+      call hinted_lock(kmp_lock_hint_speculative, ret)
+      call hinted_lock(kmp_lock_hint_adaptive, ret)
+#else
+      ret = 0
+#endif
+
+      if (ret .eq. 0) then
+          print *, 'Test omp_set_unset_lock_hinted.f passed'
+      else
+          print *, 'Test omp_set_unset_lock_hinted.f failed'
+      endif
+
+      stop
+      end
+
+      subroutine hinted_lock(lock_hint, ret)
+#if defined(_OPENMP)
+      include 'omp_lib.h'
+      integer(omp_lock_kind) lock
+      integer(kmp_lock_hint_kind) lock_hint
+#else
+      integer lock
+      integer lock_hint
+#endif
+      integer ret
+      logical passed
+      integer n(1000), j, id
+      
+      passed = .TRUE.
+
+      call kmp_init_lock_hinted(lock, lock_hint)
+
+!$omp  parallel
+!$omp&     shared       (n,passed,lock)
+!$omp&     private      (id,j)      
+#if defined(_OPENMP) && !defined(_ASSURE)
+      id = omp_get_thread_num()
+#else
+      id = 0
+#endif
+#if defined(_ASSURE)
+      do j = 1, 10
+#else
+      do j = 1, 10000
+#endif
+          call cscall(id, n, passed, lock)
+      enddo
+!$omp  end parallel
+
+      call omp_destroy_lock(lock)
+
+      if (.not. passed) then
+          ret = ret + 1
+      endif
+
+      end
+
+      subroutine cscall(id, n, passed, lock)
+#if defined(_OPENMP)
+        include 'omp_lib.h'
+      integer(omp_lock_kind) lock
+#else
+      integer lock
+#endif
+      integer id, i, n(1000)
+      logical passed
+
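+!     Fill the shared array with this thread's id under the lock, then
+!     verify it was not modified; a mismatch means mutual exclusion failed.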
+      call omp_set_lock(lock)
+      do i = 1,1000
+          n(i) = id
+      enddo
+      do i = 1,1000
+          if (n(i) .ne. id) passed = .FALSE.
+      enddo
+      call omp_unset_lock(lock)
+      end
diff --git a/final/www/README.txt b/final/www/README.txt
new file mode 100644
index 0000000..3b594bd
--- /dev/null
+++ b/final/www/README.txt
@@ -0,0 +1,143 @@
+
+               README for the LLVM* OpenMP* Runtime Library
+               ============================================
+
+How to Build Documentation
+==========================
+
+The main documentation is in Doxygen* format, and this distribution
+should come with pre-built PDF documentation in doc/Reference.pdf.  
+However, an HTML version can be built by executing:
+
+% doxygen doc/doxygen/config 
+
+in the runtime directory.
+
+That will produce HTML documentation in the doc/doxygen/generated
+directory, which can be accessed by pointing a web browser at the
+index.html file there.
+
+If you don't have Doxygen installed, you can download it from
+www.doxygen.org.
+
+
+How to Build the LLVM* OpenMP* Runtime Library
+==============================================
+
+The library can be built either using CMake, or using a makefile that
+in turn invokes various Perl scripts.  For porting, for non-x86
+architectures, and for those already familiar with CMake, the CMake
+route may be easier than the makefile described here.
+
+Building with CMake
+===================
+The runtime/Build_With_CMake.txt file has a description of how to
+build with CMake.
+
+Building with the Makefile
+==========================
+The Makefile at the top-level will attempt to detect what it needs to
+build the LLVM* OpenMP* Runtime Library.  To see the default settings, 
+type:
+
+make info
+
+You can change the Makefile's behavior with the following options:
+
+omp_root:    The path to the top-level directory containing the top-level
+	     Makefile.  By default, this will take on the value of the 
+	     current working directory.
+
+omp_os:      Operating system.  By default, the build will attempt to 
+	     detect this. Currently supports "linux", "freebsd", "macos", and
+	     "windows".
+
+arch:        Architecture. By default, the build will attempt to 
+	     detect this if not specified by the user. Currently 
+	     supported values are
+                 "32" for IA-32 architecture 
+                 "32e" for Intel(R) 64 architecture
+                 "mic" for Intel(R) Many Integrated Core Architecture
+                 "arm" for ARM* architecture
+                 "aarch64" for Aarch64 (64-bit ARM) architecture
+                 "ppc64" for IBM(R) Power architecture (big endian)
+                 "ppc64le" for IBM(R) Power architecture (little endian)
+
+             If "mic" is specified then "icc" will be used as the
+	     compiler, and appropriate k1om binutils will be used. The
+	     necessary packages must be installed on the build machine
+	     for this to be possible (but an Intel(R) Xeon Phi(TM)
+	     coprocessor card is not required to build the library).
+
+compiler:    Which compiler to use for the build.  Defaults to "icc" 
+	     or "icl" depending on the value of omp_os. Also supports 
+	     some versions of "gcc"* when omp_os is "linux". The selected 
+	     compiler should be installed and in the user's path. The 
+	     corresponding Fortran compiler should also be in the path. 
+	     See "Supported RTL Build Configurations" below for more 
+	     information on compiler versions.
+
+mode:        Library mode: default is "release".  Also supports "debug".
+
+jobs:        The number of parallel jobs for the underlying call to make.
+	     This value is sent as the parameter to the -j flag for make.
+	     This value defaults to "1", but can be set to any positive integer.
+
+To use any of the options above, simply add <option_name>=<value>.  For
+example, if you want to build with gcc instead of icc, type:
+
+make compiler=gcc
+
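+Assuming the options combine in the obvious way (not verified here), a
+debug build with gcc using four parallel make jobs would be:
+
+make compiler=gcc mode=debug jobs=4
+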
+On OS X* machines, it is possible to build universal (or fat) libraries which
+include both IA-32 architecture and Intel(R) 64 architecture objects in a
+single archive; just build the 32 and 32e libraries separately, then invoke 
+make again with a special argument as follows:
+
+make compiler=clang build_args=fat
+
+Supported RTL Build Configurations
+==================================
+
+Supported Architectures: IA-32 architecture, Intel(R) 64, and 
+Intel(R) Many Integrated Core Architecture
+
+              ----------------------------------------------
+              |   icc/icl     |    gcc      |   clang      |
+--------------|---------------|-------------|--------------|
+| Linux* OS   |   Yes(1,5)    |  Yes(2,4)   | Yes(4,6,7)   |
+| FreeBSD*    |   No          |  No         | Yes(4,6,7,8) |
+| OS X*       |   Yes(1,3,4)  |  No         | Yes(4,6,7)   |
+| Windows* OS |   Yes(1,4)    |  No         | No           |
+------------------------------------------------------------
+
+(1) On IA-32 architecture and Intel(R) 64, icc/icl versions 12.x are 
+    supported (12.1 is recommended).
+(2) GCC* version 4.6.2 is supported.
+(3) For icc on OS X*, OS X* version 10.5.8 is supported.
+(4) Intel(R) Many Integrated Core Architecture not supported.
+(5) On Intel(R) Many Integrated Core Architecture, icc/icl versions 13.0 
+    or later are required.
+(6) Clang* version 3.3 is supported.
+(7) Clang* currently does not offer a software-implemented 128 bit extended 
+    precision type.  Thus, all entry points reliant on this type are removed
+    from the library and cannot be called in the user program.  The following
+    functions are not available:
+    __kmpc_atomic_cmplx16_*
+    __kmpc_atomic_float16_*
+    __kmpc_atomic_*_fp
+(8) Community contribution provided AS IS, not tested by Intel.
+
+Front-end Compilers that work with this RTL
+===========================================
+
+The following compilers are known to do compatible code generation for
+this RTL: clang (from the OpenMP development branch at
+http://clang-omp.github.io/ ), Intel compilers, GCC.  See the documentation
+for more details.
+
+-----------------------------------------------------------------------
+
+Notices
+=======
+
+*Other names and brands may be claimed as the property of others.
diff --git a/final/www/Reference.pdf b/final/www/Reference.pdf
new file mode 100644
index 0000000..d6faf7e
--- /dev/null
+++ b/final/www/Reference.pdf
Binary files differ
diff --git a/final/www/content.css b/final/www/content.css
new file mode 100644
index 0000000..dca6a32
--- /dev/null
+++ b/final/www/content.css
@@ -0,0 +1,27 @@
+html { margin: 0px; } body { margin: 8px; }
+
+html, body {
+  padding:0px;
+  font-size:small; font-family:"Lucida Grande", "Lucida Sans Unicode", Arial, Verdana, Helvetica, sans-serif; background-color: #fff; color: #222;
+  line-height:1.5;
+}
+
+h1, h2, h3, tt { color: #000 }
+
+h1 { padding-top:0px; margin-top:0px;}
+h2 { color:#333333; padding-top:0.5em; }
+h3 { padding-top: 0.5em; margin-bottom: -0.25em; color:#2d58b7}
+li { padding-bottom: 0.5em; }
+ul { padding-left:1.5em; }
+
+/* Slides */
+IMG.img_slide {
+    display: block;
+    margin-left: auto;
+    margin-right: auto
+}
+
+.itemTitle { color:#2d58b7 }
+
+/* Tables */
+tr { vertical-align:top }
diff --git a/final/www/index.html b/final/www/index.html
new file mode 100644
index 0000000..6ef4f25
--- /dev/null
+++ b/final/www/index.html
@@ -0,0 +1,220 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+          "http://www.w3.org/TR/html4/strict.dtd">
+<!-- Material used from: HTML 4.01 specs: http://www.w3.org/TR/html401/ -->
+<html>
+<head>
+  <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
+  <title>OpenMP* : Support for the OpenMP language</title>
+  <link type="text/css" rel="stylesheet" href="menu.css">
+  <link type="text/css" rel="stylesheet" href="content.css">
+</head>
+
+<body>
+<div id="menu">
+  <div>
+    <a href="http://llvm.org/">LLVM Home</a>
+  </div>
+
+  <div class="submenu">
+    <label>OpenMP Info</label>
+    <a href="/index.html">About</a>
+  </div>
+
+  <div class="submenu">
+    <label>Quick Links</label>
+    <a href="http://lists.llvm.org/mailman/listinfo/openmp-dev">openmp-dev</a>
+    <a href="http://lists.llvm.org/mailman/listinfo/openmp-commits">openmp-commits</a>
+    <a href="http://llvm.org/bugs/">Bug Reports</a>
+    <a href="http://llvm.org/svn/llvm-project/openmp/trunk/">Browse SVN</a>
+    <a href="http://llvm.org/viewvc/llvm-project/openmp/trunk/">Browse ViewVC</a>
+  </div>
+</div>
+
+<div id="content">
+  <!--*********************************************************************-->
+  <h1>OpenMP&reg;: Support for the OpenMP language</h1>
+  <!--*********************************************************************-->
+
+  <p>The OpenMP subproject of LLVM is intended to contain all of the
+     components required to build an executing OpenMP program that are
+     outside the compiler itself.
+  </p>
+
+  <p>Here you can find:
+    <ul>
+      <li>
+        the code for the runtime library against which
+        code compiled by the OpenMP/Clang compiler must be linked before it
+        can run. This code is also available
+        at <a href="http://openmprtl.org/">openmprtl.org</a>; we intend to
+        keep the different sites in sync.
+      </li>
+      <li>
+        the library that supports offload to target devices (in
+        "offload")
+      </li>
+      <li>
+        the OpenUH test suite, which is being integrated.
+      </li>
+    </ul>
+  </p>
+
+  <p>Support for the parts of the OpenMP 4.0 language that are not
+  associated with the "target" constructs is contained in the
+  "runtime" directory. Support for offloading computation via the
+  "target" directive is in the separate "offload" directory. That
+  builds a library that provides the interfaces for transferring code
+  and data to an attached computational device. Initial support here
+  is for the Intel&reg; Xeon Phi&trade; coprocessor, but work is
+  beginning to support other attached computing devices, and the
+  design is intended to be general. The README.txt in the "offload"
+  directory describes how to build the offload library.
+  </p>
+
+  <p>All of the code here is <a
+     href="http://llvm.org/docs/DeveloperPolicy.html#license">dual licensed</a>
+     under the MIT license and the UIUC License (a BSD-like license).
+     The LICENSE.txt file at the top of the OpenMP project contains
+     the license text and associated patent grants.
+  </p>
+
+  <!--=====================================================================-->
+  <h2 id="goals">Features and Goals</h2>
+  <!--=====================================================================-->
+
+    <ul>
+        <li>Correctness as defined by the 
+          <a href="http://www.openmp.org/mp-documents/OpenMP3.1.pdf">OpenMP
+          3.1 standard (PDF)</a> now, and <a href="http://www.openmp.org/mp-documents/OpenMP4.0.0.pdf">OpenMP
+          4.0 standard (PDF)</a> in the future.</li>
+        <li>High performance.</li>
+        <li>ABI compatibility with <a href="http://gcc.gnu.org">GCC</a> and
+        <a href="http://software.intel.com/en-us/intel-compilers">Intel's
+        existing OpenMP compilers</a>.
+        With this release we have restored compatibility with OpenMP
+        3.1 code compiled by gcc 4.9; however, we do not support
+        OpenMP 4.0 code that uses task cancellation when compiled
+        by gcc 4.9. How we will support such code remains a research issue.
+        </li>
+    </ul>
+
+  <!--=====================================================================-->
+  <h2 id="why">Why have the runtime code here?</h2>
+  <!--=====================================================================-->
+
+  <p>It makes sense to have the runtime sources in the same place
+    (and with the same license) as the compiler.
+  </p>
+
+  <!--=====================================================================-->
+  <h2 id="requirements">Platform Support</h2>
+  <!--=====================================================================-->
+
+  <p>The OpenMP runtime is known to work on 
+    <ul>
+      <li>ARM&reg;&nbsp; architecture processors</li>
+      <li>PowerPC&trade;&nbsp; processors</li>
+      <li>32 and 64 bit X86
+        processors when compiled with clang, with the Intel compiler
+        or with gcc, and also the Intel&reg;&nbsp;Xeon Phi&trade; product family, when compiled with
+        the Intel compiler.
+      </li>
+    </ul>
+    Ports to other architectures and operating systems are welcome.
+  </p>
+  
+  <p>A full OS and architecture compatibility matrix is in
+    <a href="README.txt">README.txt</a>
+  </p>
+
+
+  <!--=====================================================================-->
+  <h2 id="dir-structure">Status</h2>
+  <!--=====================================================================-->
+
+   <p>The runtime can be built with gcc, icc or clang. However, note
+   that a runtime built with clang cannot be guaranteed to work with
+   OpenMP code compiled by the other compilers, since clang does not support
+   a 128-bit float type, and cannot therefore generate the code used
+   for reductions of that type (which may occur in user code compiled
+   by the other compilers).
+   </p>
+ 
+   <p>The University of Houston has kindly contributed their test
+   suite (in the "testsuite" directory). Integration and use of this
+   for automatic testing is in progress.
+   </p>
+
+  <!--=====================================================================-->
+  <h2>Get it and get involved!</h2>
+  <!--=====================================================================-->
+
+  <p>First, please review our
+     <a href="http://llvm.org/docs/DeveloperPolicy.html">Developer's Policy</a>.</p>
+
+  <p>To check out the code, use:</p>
+
+  <ul>
+  <li><code>svn co http://llvm.org/svn/llvm-project/openmp/trunk openmp</code></li>
+  </ul>
+
+  <p>
+     Next:
+  </p>
+  
+  <ul>
+    <li><code>cd openmp/runtime</code></li>
+    <li><code>make compiler=gcc</code></li>
+  </ul>
+  
+  <p>Full details of how to build are in the
+    <a href="README.txt">README.txt</a>,
+    which also describes a CMake-based build system that may be more
+    convenient than the make and Perl based system on some platforms,
+    and for porting to new platforms.
+  </p>
+
+  <!--=====================================================================-->
+  <h3>Notes</h3>
+  <!--=====================================================================-->
+
+
+  <p>Send discussions to the
+  <a href="http://lists.llvm.org/mailman/listinfo/openmp-dev">OpenMP mailing list</a>.</p>
+
+
+  <!--=====================================================================-->
+  <h2>Design Documents</h2>
+  <!--=====================================================================-->
+
+<ul>
+<li><a href="Reference.pdf">Runtime design (PDF)</a></li>
+</ul>
+
+  <!--=====================================================================-->
+  <h2>Copyright notices</h2>
+  <!--=====================================================================-->
+<ul>
+<li>
+  The OpenMP name and the OpenMP logo are registered trademarks of the
+  OpenMP Architecture Review Board.
+</li>
+<li>
+  Intel is a trademark of Intel Corporation in the U.S. and/or other
+  countries.
+</li>
+<li>
+  PowerPC is a trademark of IBM Corporation in the U.S. and/or other
+  countries.
+</li>
+<li>
+  ARM is a trademark of ARM Limited in the U.S. and/or
+  other countries.
+</li>
+</ul>
+</div>
+</body>
+</html>
diff --git a/final/www/menu.css b/final/www/menu.css
new file mode 100644
index 0000000..4a887b1
--- /dev/null
+++ b/final/www/menu.css
@@ -0,0 +1,39 @@
+/***************/
+/* page layout */
+/***************/
+
+[id=menu] {
+	position:fixed;
+	width:25ex;
+}
+[id=content] {
+	/* *****  EDIT THIS VALUE IF CONTENT OVERLAPS MENU ***** */
+	position:absolute;
+	left:29ex;
+	padding-right:4ex;
+}
+
+/**************/
+/* menu style */
+/**************/
+
+#menu .submenu {
+	padding-top:1em;
+	display:block;
+}
+
+#menu label {
+	display:block;
+	font-weight: bold;
+	text-align: center;
+	background-color: rgb(192,192,192);
+}
+#menu a {
+	padding:0 .2em;
+	display:block;
+	text-align: center;
+	background-color: rgb(235,235,235);
+}
+#menu a:visited {
+	color:rgb(100,50,100);
+}