[Offload] Move `/openmp/libomptarget` to `/offload` (#75125) In a nutshell, this moves our libomptarget code to populate the offload subproject. With this commit, users need to enable the new LLVM/Offload subproject as a runtime in their cmake configuration. No further changes are expected for downstream code. Tests and other components still depend on OpenMP and have also not been renamed. The results below are for a build in which OpenMP and Offload are enabled runtimes. In addition to the pure `git mv`, we needed to adjust some CMake files. Nothing is intended to change semantics. ``` ninja check-offload ``` Works with the X86 and AMDGPU offload tests ``` ninja check-openmp ``` Still works but doesn't build offload tests anymore. ``` ls install/lib ``` Shows all expected libraries, incl. - `libomptarget.devicertl.a` - `libomptarget-nvptx-sm_90.bc` - `libomptarget.rtl.amdgpu.so` -> `libomptarget.rtl.amdgpu.so.18git` - `libomptarget.so` -> `libomptarget.so.18git` Fixes: https://github.com/llvm/llvm-project/issues/75124 --------- Co-authored-by: Saiyedul Islam <Saiyedul.Islam@amd.com> GitOrigin-RevId: 330d8983d25d08580fc1642fea48b2473f47a9da

commit: f99c47ef437ce6919c614d51a876c850997ba2a1 [log] [tgz]
author: Johannes Doerfert <johannes@jdoerfert.de> Mon Apr 22 09:51:33 2024 -0700
committer: Copybara-Service <copybara-worker@google.com> Mon Apr 22 09:55:47 2024 -0700
tree: e2ba8c2db9e0f937fe06bbfb087449f45c6bea19
parent: c2462eaedc665b5fea5d6ef3576dc5f29f456fce [diff]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c4ff76..95f2425 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt

@@ -113,7 +113,17 @@
 
 # Header install location
 if(${OPENMP_STANDALONE_BUILD})
-  set(LIBOMP_HEADERS_INSTALL_PATH "${CMAKE_INSTALL_INCLUDEDIR}")
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    execute_process(
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+      COMMAND ${CMAKE_CXX_COMPILER} --print-resource-dir
+      RESULT_VARIABLE COMMAND_RETURN_CODE
+      OUTPUT_VARIABLE COMPILER_RESOURCE_DIR
+    )
+    set(LIBOMP_HEADERS_INSTALL_PATH "${COMPILER_RESOURCE_DIR}/include")
+  else()
+    set(LIBOMP_HEADERS_INSTALL_PATH "${CMAKE_INSTALL_INCLUDEDIR}")
+  endif()
 else()
   include(GetClangResourceDir)
   get_clang_resource_dir(LIBOMP_HEADERS_INSTALL_PATH SUBDIR include)
@@ -123,19 +133,6 @@
 # to enable time profiling support in the OpenMP runtime.
 add_subdirectory(runtime)
 
-if (OPENMP_ENABLE_LIBOMPTARGET)
-  # Check that the library can actually be built.
-  if (APPLE OR WIN32)
-    message(FATAL_ERROR "libomptarget cannot be built on Windows and MacOS X!")
-  elseif (NOT "cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES)
-    message(FATAL_ERROR "Host compiler must support C++17 to build libomptarget!")
-  elseif (NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
-    message(FATAL_ERROR "libomptarget on 32-bit systems are not supported!")
-  endif()
-
-  add_subdirectory(libomptarget)
-endif()
-
 set(ENABLE_OMPT_TOOLS ON)
 # Currently tools are not tested well on Windows or MacOS X.
 if (APPLE OR WIN32)
@@ -148,6 +145,10 @@
   add_subdirectory(tools)
 endif()
 
+# Propagate OMPT support to offload
+set(LIBOMP_HAVE_OMPT_SUPPORT ${LIBOMP_HAVE_OMPT_SUPPORT} PARENT_SCOPE)
+set(LIBOMP_OMP_TOOLS_INCLUDE_DIR ${LIBOMP_OMP_TOOLS_INCLUDE_DIR} PARENT_SCOPE)
+
 option(OPENMP_MSVC_NAME_SCHEME "Build dll with MSVC naming scheme." OFF)
 
 # Build libompd.so

diff --git a/libomptarget/CMakeLists.txt b/libomptarget/CMakeLists.txt
deleted file mode 100644
index 531198f..0000000
--- a/libomptarget/CMakeLists.txt
+++ /dev/null

@@ -1,159 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build offloading library and related plugins.
-#
-##===----------------------------------------------------------------------===##
-
-if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
-  message(FATAL_ERROR "Direct configuration not supported, please use parent directory!")
-endif()
-
-# Add cmake directory to search for custom cmake functions.
-set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH})
-
-# Set the path of all resulting libraries to a unified location so that it can
-# be used for testing.
-set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR})
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR})
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LIBOMPTARGET_LIBRARY_DIR})
-
-if(NOT LLVM_LIBRARY_OUTPUT_INTDIR)
-  set(LIBOMPTARGET_INTDIR ${LIBOMPTARGET_LIBRARY_DIR})
-else()
-  set(LIBOMPTARGET_INTDIR ${LLVM_LIBRARY_OUTPUT_INTDIR})
-endif()
-
-# Message utilities.
-include(LibomptargetUtils)
-
-# Get dependencies for the different components of the project.
-include(LibomptargetGetDependencies)
-
-# LLVM source tree is required at build time for libomptarget
-if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
-  message(FATAL_ERROR "Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
-endif()
-
-include_directories(${LIBOMPTARGET_LLVM_INCLUDE_DIRS})
-
-# This is a list of all the targets that are supported/tested right now.
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu-LTO")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu-LTO")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu-LTO")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu-LTO")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-LTO")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-JIT-LTO")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} s390x-ibm-linux-gnu")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} s390x-ibm-linux-gnu-LTO")
-
-# Once the plugins for the different targets are validated, they will be added to
-# the list of supported targets in the current system.
-set (LIBOMPTARGET_SYSTEM_TARGETS "")
-set (LIBOMPTARGET_TESTED_PLUGINS "")
-
-# Check whether using debug mode. In debug mode, allow dumping progress
-# messages at runtime by default. Otherwise, it can be enabled
-# independently using the LIBOMPTARGET_ENABLE_DEBUG option.
-string( TOLOWER "${CMAKE_BUILD_TYPE}" LIBOMPTARGET_CMAKE_BUILD_TYPE)
-if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug)
-  option(LIBOMPTARGET_ENABLE_DEBUG "Allow debug output with the environment variable LIBOMPTARGET_DEBUG=1" ON)
-else()
-  option(LIBOMPTARGET_ENABLE_DEBUG "Allow debug output with the environment variable LIBOMPTARGET_DEBUG=1" OFF)
-endif()
-if(LIBOMPTARGET_ENABLE_DEBUG)
-  add_definitions(-DOMPTARGET_DEBUG)
-endif()
-
-# No exceptions and no RTTI, except if requested.
-set(offload_compile_flags -fno-exceptions)
-if(NOT LLVM_ENABLE_RTTI)
-  set(offload_compile_flags ${offload_compile_flags} -fno-rtti)
-endif()
-
-# TODO: Consider enabling LTO by default if supported.
-# https://cmake.org/cmake/help/latest/module/CheckIPOSupported.html can be used
-# to test for working LTO. However, before CMake 3.24 this will test the
-# default linker and ignore options such as LLVM_ENABLE_LLD. As a result, CMake
-# would test whether LTO works with the default linker but build with another one.
-# In a typical scenario, libomptarget is compiled with the in-tree Clang, but
-# linked with ld.gold, which requires the LLVMgold plugin, when it actually
-# would work with the lld linker (or also fail because the system lld is too old
-# to understand opaque pointers). Using gcc as the compiler would pass the test, but fail
-# when linking with lld since it does not understand gcc's LTO format.
-set(LIBOMPTARGET_USE_LTO FALSE CACHE BOOL "Use LTO for the offload runtimes if available")
-if (LIBOMPTARGET_USE_LTO)
-  # CMake sets CMAKE_CXX_COMPILE_OPTIONS_IPO depending on the compiler and is
-  # also what CheckIPOSupported uses to test support.
-  list(APPEND offload_compile_flags ${CMAKE_CXX_COMPILE_OPTIONS_IPO})
-  list(APPEND offload_link_flags ${CMAKE_CXX_COMPILE_OPTIONS_IPO})
-endif()
-
-# OMPT support for libomptarget
-# Follow host OMPT support and check if host support has been requested.
-# LIBOMP_HAVE_OMPT_SUPPORT indicates whether host OMPT support has been implemented.
-# LIBOMP_OMPT_SUPPORT indicates whether host OMPT support has been requested (default is ON).
-# LIBOMPTARGET_OMPT_SUPPORT indicates whether target OMPT support has been requested (default is ON).
-set(OMPT_TARGET_DEFAULT FALSE)
-if ((LIBOMP_HAVE_OMPT_SUPPORT) AND (LIBOMP_OMPT_SUPPORT) AND (NOT WIN32))
-  set (OMPT_TARGET_DEFAULT TRUE)
-endif()
-set(LIBOMPTARGET_OMPT_SUPPORT ${OMPT_TARGET_DEFAULT} CACHE BOOL "OMPT-target-support?")
-if ((OMPT_TARGET_DEFAULT) AND (LIBOMPTARGET_OMPT_SUPPORT))
-  add_definitions(-DOMPT_SUPPORT=1)
-  message(STATUS "OMPT target enabled")
-else()
-  set(LIBOMPTARGET_OMPT_SUPPORT FALSE)
-  message(STATUS "OMPT target disabled")
-endif()
-
-pythonize_bool(LIBOMPTARGET_OMPT_SUPPORT)
-
-set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LLVM_LIBC_GPU_BUILD} CACHE BOOL
-    "Libomptarget support for the GPU libc")
-pythonize_bool(LIBOMPTARGET_GPU_LIBC_SUPPORT)
-
-set(LIBOMPTARGET_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
-message(STATUS "OpenMP tools dir in libomptarget: ${LIBOMP_OMP_TOOLS_INCLUDE_DIR}")
-include_directories(${LIBOMP_OMP_TOOLS_INCLUDE_DIR})
-
-# Definitions for testing, for reuse when testing libomptarget-nvptx.
-set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${LIBOMP_INCLUDE_DIR}" CACHE STRING
-  "Path to folder containing omp.h")
-set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${LIBOMP_LIBRARY_DIR}" CACHE STRING
-  "Path to folder containing libomp.so, and libLLVMSupport.so with profiling enabled")
-set(LIBOMPTARGET_LLVM_LIBRARY_DIR "${LLVM_LIBRARY_DIR}" CACHE STRING
-  "Path to folder containing llvm library libomptarget.so")
-set(LIBOMPTARGET_LLVM_LIBRARY_INTDIR "${LIBOMPTARGET_INTDIR}" CACHE STRING
-  "Path to folder where intermediate libraries will be output")
-
-# Build offloading plugins and device RTLs if they are available.
-add_subdirectory(plugins-nextgen)
-add_subdirectory(DeviceRTL)
-add_subdirectory(tools)
-
-# Build target agnostic offloading library.
-add_subdirectory(src)
-
-# Add tests.
-add_subdirectory(test)
-
-# Add unit tests if GMock/GTest is present
-if (EXISTS ${LLVM_THIRD_PARTY_DIR}/unittest)
-  if (NOT TARGET llvm_gtest)
-    add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest)
-  endif()
-  add_subdirectory(unittests)
-endif()

diff --git a/libomptarget/DeviceRTL/CMakeLists.txt b/libomptarget/DeviceRTL/CMakeLists.txt
deleted file mode 100644
index 2e7f28d..0000000
--- a/libomptarget/DeviceRTL/CMakeLists.txt
+++ /dev/null

@@ -1,315 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build the DeviceRTL for all toolchains that are available
-#
-##===----------------------------------------------------------------------===##
-
-set(LIBOMPTARGET_BUILD_DEVICERTL_BCLIB TRUE CACHE BOOL
-  "Can be set to false to disable building this library.")
-
-if (NOT LIBOMPTARGET_BUILD_DEVICERTL_BCLIB)
-  libomptarget_say("Not building DeviceRTL: Disabled by LIBOMPTARGET_BUILD_DEVICERTL_BCLIB")
-  return()
-endif()
-
-# Check to ensure the host system is a supported host architecture.
-if(NOT ${CMAKE_SIZEOF_VOID_P} EQUAL "8")
-  libomptarget_say("Not building DeviceRTL: Runtime does not support 32-bit hosts")
-  return()
-endif()
-
-if (LLVM_DIR)
-  # Builds that use pre-installed LLVM have LLVM_DIR set.
-  # A standalone or LLVM_ENABLE_RUNTIMES=openmp build takes this route
-  find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
-  find_program(PACKAGER_TOOL clang-offload-packager PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
-  find_program(LINK_TOOL llvm-link PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
-  find_program(OPT_TOOL opt PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
-  if ((NOT CLANG_TOOL) OR (NOT LINK_TOOL) OR (NOT OPT_TOOL) OR (NOT PACKAGER_TOOL))
-    libomptarget_say("Not building DeviceRTL. Missing clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL}, opt: ${OPT_TOOL}, or clang-offload-packager: ${PACKAGER_TOOL}")
-    return()
-  else()
-    libomptarget_say("Building DeviceRTL. Using clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} and opt: ${OPT_TOOL}")
-  endif()
-elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD)
-  # LLVM in-tree builds may use CMake target names to discover the tools.
-  # A LLVM_ENABLE_PROJECTS=openmp build takes this route
-  set(CLANG_TOOL $<TARGET_FILE:clang>)
-  set(PACKAGER_TOOL $<TARGET_FILE:clang-offload-packager>)
-  set(LINK_TOOL $<TARGET_FILE:llvm-link>)
-  set(OPT_TOOL $<TARGET_FILE:opt>)
-  libomptarget_say("Building DeviceRTL. Using clang from in-tree build")
-else()
-  libomptarget_say("Not building DeviceRTL. No appropriate clang found")
-  return()
-endif()
-
-set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR})
-set(include_directory ${devicertl_base_directory}/include)
-set(source_directory ${devicertl_base_directory}/src)
-
-set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
-                             "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx1010"
-                             "gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035"
-                             "gfx1036;gfx1100;gfx1101;gfx1102;gfx1103;gfx1150"
-                             "gfx1151")
-set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
-                            "sm_70;sm_72;sm_75;sm_80;sm_86;sm_87;sm_89;sm_90")
-set(all_gpu_architectures
-    "${all_amdgpu_architectures};${all_nvptx_architectures}")
-
-set(LIBOMPTARGET_DEVICE_ARCHITECTURES "all" CACHE STRING
-    "List of device architectures to be used to compile the OpenMP DeviceRTL.")
-
-if(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "all")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_gpu_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
-       LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "native")
-  if(NOT LIBOMPTARGET_NVPTX_ARCH AND NOT LIBOMPTARGET_AMDGPU_ARCH)
-    libomptarget_error_say(
-      "Could not find 'amdgpu-arch' and 'nvptx-arch' tools required for 'auto'")
-  elseif(NOT LIBOMPTARGET_FOUND_NVIDIA_GPU AND NOT LIBOMPTARGET_FOUND_AMDGPU_GPU)
-    libomptarget_error_say("No AMD or NVIDIA GPU found on the system when using 'auto'")
-  endif()
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES
-      "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
-endif()
-list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
-
-set(include_files
-  ${include_directory}/Allocator.h
-  ${include_directory}/Configuration.h
-  ${include_directory}/Debug.h
-  ${include_directory}/Interface.h
-  ${include_directory}/LibC.h
-  ${include_directory}/Mapping.h
-  ${include_directory}/State.h
-  ${include_directory}/Synchronization.h
-  ${include_directory}/Types.h
-  ${include_directory}/Utils.h
-)
-
-set(src_files
-  ${source_directory}/Allocator.cpp
-  ${source_directory}/Configuration.cpp
-  ${source_directory}/Debug.cpp
-  ${source_directory}/Kernel.cpp
-  ${source_directory}/LibC.cpp
-  ${source_directory}/Mapping.cpp
-  ${source_directory}/Misc.cpp
-  ${source_directory}/Parallelism.cpp
-  ${source_directory}/Reduction.cpp
-  ${source_directory}/State.cpp
-  ${source_directory}/Synchronization.cpp
-  ${source_directory}/Tasking.cpp
-  ${source_directory}/Utils.cpp
-  ${source_directory}/Workshare.cpp
-)
-
-# We disable the slp vectorizer during the runtime optimization to avoid
-# vectorized accesses to the shared state. Generally, those are "good" but
-# the optimizer pipeline (esp. Attributor) does not fully support vectorized
-# instructions yet and we end up missing out on way more important constant
-# propagation. That said, we will run the vectorizer again after the runtime 
-# has been linked into the user program.
-set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512 -mllvm -vectorize-slp=false )
-set(link_opt_flags  -O3        -openmp-opt-disable -attributor-enable=module -vectorize-slp=false )
-set(link_export_flag -passes=internalize -internalize-public-api-file=${source_directory}/exports)
-
-# If the user built with the GPU C library enabled we will use that instead.
-if(${LIBOMPTARGET_GPU_LIBC_SUPPORT})
-  list(APPEND clang_opt_flags -DOMPTARGET_HAS_LIBC)
-endif()
-
-# Prepend -I to each list element
-set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
-list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
-
-# Set flags for LLVM Bitcode compilation.
-set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
-              ${clang_opt_flags} --offload-device-only
-             -nocudalib -nogpulib -nostdinc
-             -fopenmp -fopenmp-cuda-mode
-             -Wno-unknown-cuda-version
-             -DOMPTARGET_DEVICE_RUNTIME
-             -I${include_directory}
-             -I${devicertl_base_directory}/../include
-             ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
-)
-
-# first create an object target
-add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
-function(compileDeviceRTLLibrary target_cpu target_name target_triple)
-  set(target_bc_flags ${ARGN})
-
-  set(bc_files "")
-  foreach(src ${src_files})
-    get_filename_component(infile ${src} ABSOLUTE)
-    get_filename_component(outfile ${src} NAME)
-    set(outfile "${outfile}-${target_cpu}.bc")
-    set(depfile "${outfile}.d")
-
-    add_custom_command(OUTPUT ${outfile}
-      COMMAND ${CLANG_TOOL}
-      ${bc_flags}
-      --offload-arch=${target_cpu}
-      ${target_bc_flags}
-      -MD -MF ${depfile}
-      ${infile} -o ${outfile}
-      DEPENDS ${infile}
-      DEPFILE ${depfile}
-      COMMENT "Building LLVM bitcode ${outfile}"
-      VERBATIM
-    )
-    if(TARGET clang)
-      # Add a file-level dependency to ensure that clang is up-to-date.
-      # By default, add_custom_command only builds clang if the
-      # executable is missing.
-      add_custom_command(OUTPUT ${outfile}
-        DEPENDS clang
-        APPEND
-      )
-    endif()
-    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
-
-    list(APPEND bc_files ${outfile})
-  endforeach()
-
-  set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
-
-  # Link to a bitcode library.
-  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
-      COMMAND ${LINK_TOOL}
-        -o ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name} ${bc_files}
-      DEPENDS ${bc_files}
-      COMMENT "Linking LLVM bitcode ${bclib_name}"
-  )
-
-  if(TARGET llvm-link)
-    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
-      DEPENDS llvm-link
-      APPEND)
-  endif()
-
-  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/internalized_${bclib_name}
-      COMMAND ${OPT_TOOL} ${link_export_flag} ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
-                      -o ${CMAKE_CURRENT_BINARY_DIR}/internalized_${bclib_name}
-      DEPENDS ${source_directory}/exports ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
-      COMMENT "Internalizing LLVM bitcode ${bclib_name}"
-  )
-  if(TARGET opt)
-    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/internalized_${bclib_name}
-      DEPENDS opt
-      APPEND)
-  endif()
-
-  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-      COMMAND ${OPT_TOOL} ${link_opt_flags} ${CMAKE_CURRENT_BINARY_DIR}/internalized_${bclib_name}
-                      -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-      DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/internalized_${bclib_name}
-      COMMENT "Optimizing LLVM bitcode ${bclib_name}"
-  )
-  if(TARGET opt)
-    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-      DEPENDS opt
-      APPEND)
-  endif()
-
-  set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")
-  add_custom_target(${bclib_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
-
-  # Copy library to destination.
-  add_custom_command(TARGET ${bclib_target_name} POST_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-                    ${LIBOMPTARGET_LIBRARY_DIR})
-  add_dependencies(omptarget.devicertl.${target_name} ${bclib_target_name})
-
-  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name} ${LIBOMPTARGET_LIBRARY_DIR}/${bclib_name})
-
-  # Install bitcode library under the lib destination folder.
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-
-  set(target_feature "")
-  if("${target_triple}" STREQUAL "nvptx64-nvidia-cuda")
-    set(target_feature "feature=+ptx63")
-  endif()
-
-  # Package the bitcode into an LLVM offloading binary and embed it in an ELF for the static library
-  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
-      COMMAND ${PACKAGER_TOOL} -o ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
-        "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=${target_cpu},kind=openmp"
-      DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-      COMMENT "Packaging LLVM offloading binary ${bclib_name}.out"
-  )
-  if(TARGET clang-offload-packager)
-    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
-      DEPENDS clang-offload-packager
-      APPEND)
-  endif()
-
-  set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}-${target_cpu}.o")
-  add_custom_command(OUTPUT ${output_name}
-    COMMAND ${CLANG_TOOL} --std=c++17 -c -nostdlib
-            -Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
-            -o ${output_name}
-            ${source_directory}/Stub.cpp
-    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name} ${source_directory}/Stub.cpp
-    COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}-${target_cpu}.o"
-    VERBATIM
-  )
-  if(TARGET clang)
-    add_custom_command(OUTPUT ${output_name}
-      DEPENDS clang
-      APPEND)
-  endif()
-
-  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${output_name})
-  set_property(TARGET omptarget.devicertl.all_objs APPEND PROPERTY IMPORTED_OBJECTS ${output_name})
-
-  if (CMAKE_EXPORT_COMPILE_COMMANDS)
-    set(ide_target_name omptarget-ide-${target_name}-${target_cpu})
-    add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
-    target_compile_options(${ide_target_name} PRIVATE
-      -fopenmp --offload-arch=${target_cpu} -fopenmp-cuda-mode
-      -mllvm -openmp-opt-disable
-      -foffload-lto -fvisibility=hidden --offload-device-only
-      -nocudalib -nogpulib -nostdinc -Wno-unknown-cuda-version
-    )
-    target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512)
-    target_include_directories(${ide_target_name} PRIVATE
-      ${include_directory}
-      ${devicertl_base_directory}/../include
-      ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
-    )
-    install(TARGETS ${ide_target_name} EXCLUDE_FROM_ALL)
-  endif()
-endfunction()
-
-# Generate a Bitcode library for all the gpu architectures the user requested.
-add_custom_target(omptarget.devicertl.nvptx)
-add_custom_target(omptarget.devicertl.amdgpu)
-foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
-  if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
-  elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
-  else()
-    libomptarget_error_say("Unknown GPU architecture '${gpu_arch}'")
-  endif()
-endforeach()
-
-# Archive all the object files generated above into a static library
-add_library(omptarget.devicertl STATIC)
-set_target_properties(omptarget.devicertl PROPERTIES
-  ARCHIVE_OUTPUT_DIRECTORY "${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}"
-  LINKER_LANGUAGE CXX
-)
-target_link_libraries(omptarget.devicertl PRIVATE omptarget.devicertl.all_objs)
-
-install(TARGETS omptarget.devicertl ARCHIVE DESTINATION ${OPENMP_INSTALL_LIBDIR})

diff --git a/libomptarget/DeviceRTL/include/Allocator.h b/libomptarget/DeviceRTL/include/Allocator.h
deleted file mode 100644
index a28eb0f..0000000
--- a/libomptarget/DeviceRTL/include/Allocator.h
+++ /dev/null

@@ -1,44 +0,0 @@
-//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_ALLOCATOR_H
-#define OMPTARGET_ALLOCATOR_H
-
-#include "Types.h"
-
-// Forward declaration.
-struct KernelEnvironmentTy;
-
-#pragma omp begin declare target device_type(nohost)
-
-namespace ompx {
-
-namespace allocator {
-
-static uint64_t constexpr ALIGNMENT = 16;
-
-/// Initialize the allocator according to \p KernelEnvironment
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
-
-/// Allocate \p Size bytes.
-[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
-alloc(uint64_t Size);
-
-/// Free the allocation pointed to by \p Ptr.
-void free(void *Ptr);
-
-} // namespace allocator
-
-} // namespace ompx
-
-#pragma omp end declare target
-
-#endif

diff --git a/libomptarget/DeviceRTL/include/Configuration.h b/libomptarget/DeviceRTL/include/Configuration.h
deleted file mode 100644
index 8e6f5c8..0000000
--- a/libomptarget/DeviceRTL/include/Configuration.h
+++ /dev/null

@@ -1,68 +0,0 @@
-//===--- Configuration.h - OpenMP device configuration interface -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// API to query the global (constant) device environment.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_CONFIGURATION_H
-#define OMPTARGET_CONFIGURATION_H
-
-#include "Shared/Environment.h"
-
-#include "Types.h"
-
-namespace ompx {
-namespace config {
-
-/// Return the number of devices in the system, same number as returned on the
-/// host by omp_get_num_devices.
-uint32_t getNumDevices();
-
-/// Return the device number in the system for omp_get_device_num.
-uint32_t getDeviceNum();
-
-/// Return the user-chosen debug level.
-uint32_t getDebugKind();
-
-/// Return if teams oversubscription is assumed
-uint32_t getAssumeTeamsOversubscription();
-
-/// Return if threads oversubscription is assumed
-uint32_t getAssumeThreadsOversubscription();
-
-/// Return the amount of dynamic shared memory that was allocated at launch.
-uint64_t getDynamicMemorySize();
-
-/// Returns the cycles per second of the device's fixed frequency clock.
-uint64_t getClockFrequency();
-
-/// Returns the pointer to the beginning of the indirect call table.
-void *getIndirectCallTablePtr();
-
-/// Returns the size of the indirect call table.
-uint64_t getIndirectCallTableSize();
-
-/// Returns the hardware parallelism of the device.
-uint64_t getHardwareParallelism();
-
-/// Return if debugging is enabled for the given debug kind.
-bool isDebugMode(DeviceDebugKind Level);
-
-/// Indicates if this kernel may require thread-specific states, or if it was
-/// explicitly disabled by the user.
-bool mayUseThreadStates();
-
-/// Indicates if this kernel may require data environments for nested
-/// parallelism, or if it was explicitly disabled by the user.
-bool mayUseNestedParallelism();
-
-} // namespace config
-} // namespace ompx
-
-#endif

diff --git a/libomptarget/DeviceRTL/include/Debug.h b/libomptarget/DeviceRTL/include/Debug.h
deleted file mode 100644
index 22998f4..0000000
--- a/libomptarget/DeviceRTL/include/Debug.h
+++ /dev/null

@@ -1,49 +0,0 @@
-//===-------- Debug.h ---- Debug utilities ------------------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_DEBUG_H
-#define OMPTARGET_DEVICERTL_DEBUG_H
-
-#include "Configuration.h"
-#include "LibC.h"
-
-/// Assertion
-///
-/// {
-extern "C" {
-void __assert_assume(bool condition);
-void __assert_fail(const char *expr, const char *file, unsigned line,
-                   const char *function);
-void __assert_fail_internal(const char *expr, const char *msg, const char *file,
-                            unsigned line, const char *function);
-}
-
-#define ASSERT(expr, msg)                                                      \
-  {                                                                            \
-    if (config::isDebugMode(DeviceDebugKind::Assertion) && !(expr))            \
-      __assert_fail_internal(#expr, msg, __FILE__, __LINE__,                   \
-                             __PRETTY_FUNCTION__);                             \
-    else                                                                       \
-      __assert_assume(expr);                                                   \
-  }
-#define UNREACHABLE(msg)                                                       \
-  PRINT(msg);                                                                  \
-  __builtin_trap();                                                            \
-  __builtin_unreachable();
-
-///}
-
-#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);
-#define PRINT(str) PRINTF("%s", str)
-
-///}
-
-#endif

diff --git a/libomptarget/DeviceRTL/include/Interface.h b/libomptarget/DeviceRTL/include/Interface.h
deleted file mode 100644
index f4854ed..0000000
--- a/libomptarget/DeviceRTL/include/Interface.h
+++ /dev/null

@@ -1,365 +0,0 @@
-//===-------- Interface.h - OpenMP interface ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_INTERFACE_H
-#define OMPTARGET_DEVICERTL_INTERFACE_H
-
-#include "Shared/Environment.h"
-
-#include "Types.h"
-
-/// External API
-///
-///{
-
-extern "C" {
-
-/// ICV: dyn-var, constant 0
-///
-/// setter: ignored.
-/// getter: returns 0.
-///
-///{
-void omp_set_dynamic(int);
-int omp_get_dynamic(void);
-///}
-
-/// ICV: nthreads-var, integer
-///
-/// scope: data environment
-///
-/// setter: ignored.
-/// getter: returns false.
-///
-/// implementation notes:
-///
-///
-///{
-void omp_set_num_threads(int);
-int omp_get_max_threads(void);
-///}
-
-/// ICV: thread-limit-var, computed
-///
-/// getter: returns the thread limit defined during launch.
-///
-///{
-int omp_get_thread_limit(void);
-///}
-
-/// ICV: max-active-level-var, constant 1
-///
-/// setter: ignored.
-/// getter: returns 1.
-///
-///{
-void omp_set_max_active_levels(int);
-int omp_get_max_active_levels(void);
-///}
-
-/// ICV: places-partition-var
-///
-///
-///{
-///}
-
-/// ICV: active-level-var, 0 or 1
-///
-/// getter: returns 0 or 1.
-///
-///{
-int omp_get_active_level(void);
-///}
-
-/// ICV: level-var
-///
-/// getter: returns parallel region nesting
-///
-///{
-int omp_get_level(void);
-///}
-
-/// ICV: run-sched-var
-///
-///
-///{
-void omp_set_schedule(omp_sched_t, int);
-void omp_get_schedule(omp_sched_t *, int *);
-///}
-
-/// TODO this is incomplete.
-int omp_get_num_threads(void);
-int omp_get_thread_num(void);
-void omp_set_nested(int);
-
-int omp_get_nested(void);
-
-void omp_set_max_active_levels(int Level);
-
-int omp_get_max_active_levels(void);
-
-omp_proc_bind_t omp_get_proc_bind(void);
-
-int omp_get_num_places(void);
-
-int omp_get_place_num_procs(int place_num);
-
-void omp_get_place_proc_ids(int place_num, int *ids);
-
-int omp_get_place_num(void);
-
-int omp_get_partition_num_places(void);
-
-void omp_get_partition_place_nums(int *place_nums);
-
-int omp_get_cancellation(void);
-
-void omp_set_default_device(int deviceId);
-
-int omp_get_default_device(void);
-
-int omp_get_num_devices(void);
-
-int omp_get_device_num(void);
-
-int omp_get_num_teams(void);
-
-int omp_get_team_num();
-
-int omp_get_initial_device(void);
-
-void *llvm_omp_target_dynamic_shared_alloc();
-
-/// Synchronization
-///
-///{
-void omp_init_lock(omp_lock_t *Lock);
-
-void omp_destroy_lock(omp_lock_t *Lock);
-
-void omp_set_lock(omp_lock_t *Lock);
-
-void omp_unset_lock(omp_lock_t *Lock);
-
-int omp_test_lock(omp_lock_t *Lock);
-///}
-
-/// Tasking
-///
-///{
-int omp_in_final(void);
-
-int omp_get_max_task_priority(void);
-///}
-
-/// Misc
-///
-///{
-double omp_get_wtick(void);
-
-double omp_get_wtime(void);
-///}
-}
-
-extern "C" {
-/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be
-/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be
-/// called by any thread, allocation happens *per thread*.
-void *__kmpc_alloc_shared(uint64_t Bytes);
-
-/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like
-/// a stack (push/pop). Can be called by any thread. \p Ptr has to have been
-/// allocated by __kmpc_alloc_shared by the same thread.
-void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
-
-/// Get a pointer to the memory buffer containing dynamically allocated shared
-/// memory configured at launch.
-void *__kmpc_get_dynamic_shared();
-
-/// Allocate sufficient space for \p NumArgs sequential `void*` and store the
-/// allocation address in \p GlobalArgs.
-///
-/// Called by the main thread prior to a parallel region.
-///
-/// We also remember it in GlobalArgsPtr to ensure the worker threads and
-/// deallocation function know the allocation address too.
-void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs);
-
-/// Deallocate the memory allocated by __kmpc_begin_sharing_variables.
-///
-/// Called by the main thread after a parallel region.
-void __kmpc_end_sharing_variables();
-
-/// Store the allocation address obtained via __kmpc_begin_sharing_variables in
-/// \p GlobalArgs.
-///
-/// Called by the worker threads in the parallel region (function).
-void __kmpc_get_shared_variables(void ***GlobalArgs);
-
-/// External interface to get the thread ID.
-uint32_t __kmpc_get_hardware_thread_id_in_block();
-
-/// External interface to get the number of threads.
-uint32_t __kmpc_get_hardware_num_threads_in_block();
-
-/// External interface to get the warp size.
-uint32_t __kmpc_get_warp_size();
-
-/// Kernel
-///
-///{
-// Forward declaration
-struct KernelEnvironmentTy;
-
-int8_t __kmpc_is_spmd_exec_mode();
-
-int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
-                           KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
-
-void __kmpc_target_deinit();
-
-///}
-
-/// Reduction
-///
-///{
-void *__kmpc_reduction_get_fixed_buffer();
-
-int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
-                                               uint64_t reduce_data_size,
-                                               void *reduce_data,
-                                               ShuffleReductFnTy shflFct,
-                                               InterWarpCopyFnTy cpyFct);
-
-int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
-    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
-    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
-    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
-    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct);
-///}
-
-/// Synchronization
-///
-///{
-void __kmpc_ordered(IdentTy *Loc, int32_t TId);
-
-void __kmpc_end_ordered(IdentTy *Loc, int32_t TId);
-
-int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId);
-
-void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId);
-
-void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId);
-
-void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId);
-
-int32_t __kmpc_master(IdentTy *Loc, int32_t TId);
-
-void __kmpc_end_master(IdentTy *Loc, int32_t TId);
-
-int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter);
-
-void __kmpc_end_masked(IdentTy *Loc, int32_t TId);
-
-int32_t __kmpc_single(IdentTy *Loc, int32_t TId);
-
-void __kmpc_end_single(IdentTy *Loc, int32_t TId);
-
-void __kmpc_flush(IdentTy *Loc);
-
-uint64_t __kmpc_warp_active_thread_mask(void);
-
-void __kmpc_syncwarp(uint64_t Mask);
-
-void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
-
-void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
-///}
-
-/// Parallelism
-///
-///{
-/// TODO
-void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn);
-
-/// TODO
-bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn);
-
-/// TODO
-void __kmpc_kernel_end_parallel();
-
-/// TODO
-void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
-
-/// TODO
-void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams,
-                           int32_t ThreadLimit);
-
-/// TODO
-uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t);
-
-///}
-
-/// Tasking
-///
-///{
-TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
-                                        size_t TaskSizeInclPrivateValues,
-                                        size_t SharedValuesSize,
-                                        TaskFnTy TaskFn);
-
-int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
-                        TaskDescriptorTy *TaskDescriptor);
-
-int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
-                                  TaskDescriptorTy *TaskDescriptor, int32_t,
-                                  void *, int32_t, void *);
-
-void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
-                               TaskDescriptorTy *TaskDescriptor);
-
-void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
-                                  TaskDescriptorTy *TaskDescriptor);
-
-void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
-                          void *);
-
-void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId);
-
-void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId);
-
-int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int);
-
-int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId);
-
-void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
-                     TaskDescriptorTy *TaskDescriptor, int,
-                     uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
-                     int32_t, uint64_t, void *);
-///}
-
-/// Misc
-///
-///{
-int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal);
-
-int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
-///}
-
-/// Shuffle
-///
-///{
-int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
-int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
-///}
-}
-
-#endif

diff --git a/libomptarget/DeviceRTL/include/LibC.h b/libomptarget/DeviceRTL/include/LibC.h
deleted file mode 100644
index dde86af..0000000
--- a/libomptarget/DeviceRTL/include/LibC.h
+++ /dev/null

@@ -1,25 +0,0 @@
-//===--------- LibC.h - Simple implementation of libc functions --- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_LIBC_H
-#define OMPTARGET_LIBC_H
-
-#include "Types.h"
-
-extern "C" {
-
-int memcmp(const void *lhs, const void *rhs, size_t count);
-void memset(void *dst, int C, size_t count);
-
-int printf(const char *format, ...);
-}
-
-#endif

diff --git a/libomptarget/DeviceRTL/include/Mapping.h b/libomptarget/DeviceRTL/include/Mapping.h
deleted file mode 100644
index 1659046..0000000
--- a/libomptarget/DeviceRTL/include/Mapping.h
+++ /dev/null

@@ -1,112 +0,0 @@
-//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_MAPPING_H
-#define OMPTARGET_MAPPING_H
-
-#include "Types.h"
-
-namespace ompx {
-
-namespace mapping {
-
-enum {
-  DIM_X = 0,
-  DIM_Y = 1,
-  DIM_Z = 2,
-};
-
-#pragma omp begin declare target device_type(nohost)
-
-inline constexpr uint32_t MaxThreadsPerTeam = 1024;
-
-#pragma omp end declare target
-
-/// Initialize the mapping machinery.
-void init(bool IsSPMD);
-
-/// Return true if the kernel is executed in SPMD mode.
-bool isSPMDMode();
-
-/// Return true if the kernel is executed in generic mode.
-bool isGenericMode();
-
-/// Return true if the executing thread is the main thread in generic mode.
-/// These functions will lookup state and it is required that that is OK for the
-/// thread and location. See also `isInitialThreadInLevel0` for a stateless
-/// alternative for certain situations, e.g. during initialization.
-bool isMainThreadInGenericMode();
-bool isMainThreadInGenericMode(bool IsSPMD);
-
-/// Return true if this thread is the initial thread in parallel level 0.
-///
-/// The thread for which this returns true should be used for single threaded
-/// initialization tasks. We pick a special thread to ensure there are no
-/// races between the initialization and the first read of initialized state.
-bool isInitialThreadInLevel0(bool IsSPMD);
-
-/// Return true if the executing thread has the lowest Id of the active threads
-/// in the warp.
-bool isLeaderInWarp();
-
-/// Return a mask describing all active threads in the warp.
-LaneMaskTy activemask();
-
-/// Return a mask describing all threads with a smaller Id in the warp.
-LaneMaskTy lanemaskLT();
-
-/// Return a mask describing all threads with a larger Id in the warp.
-LaneMaskTy lanemaskGT();
-
-/// Return the thread Id in the warp, in [0, getWarpSize()).
-uint32_t getThreadIdInWarp();
-
-/// Return the warp size, thus number of threads in the warp.
-uint32_t getWarpSize();
-
-/// Return the warp id in the block, in [0, getNumberOfWarpsInBlock()).
-uint32_t getWarpIdInBlock();
-
-/// Return the number of warps in the block.
-uint32_t getNumberOfWarpsInBlock();
-
-/// Return the thread Id in the block, in [0, getNumberOfThreadsInBlock(Dim)).
-uint32_t getThreadIdInBlock(int32_t Dim = DIM_X);
-
-/// Return the block size, thus number of threads in the block.
-uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X);
-
-/// Return the block Id in the kernel, in [0, getNumberOfBlocksInKernel(Dim)).
-uint32_t getBlockIdInKernel(int32_t Dim = DIM_X);
-
-/// Return the number of blocks in the kernel.
-uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X);
-
-/// Return the kernel size, thus number of threads in the kernel.
-uint32_t getNumberOfThreadsInKernel();
-
-/// Return the maximal number of threads in the block usable for a team (=
-/// parallel region).
-///
-/// Note: The version taking \p IsSPMD mode explicitly can be used during the
-/// initialization of the target region, that is before `mapping::isSPMDMode()`
-/// can be called by any thread other than the main one.
-uint32_t getMaxTeamThreads();
-uint32_t getMaxTeamThreads(bool IsSPMD);
-
-/// Return the number of processing elements on the device.
-uint32_t getNumberOfProcessorElements();
-
-} // namespace mapping
-
-} // namespace ompx
-
-#endif

diff --git a/libomptarget/DeviceRTL/include/State.h b/libomptarget/DeviceRTL/include/State.h
deleted file mode 100644
index 1a34903..0000000
--- a/libomptarget/DeviceRTL/include/State.h
+++ /dev/null

@@ -1,385 +0,0 @@
-//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_STATE_H
-#define OMPTARGET_STATE_H
-
-#include "Shared/Environment.h"
-
-#include "Debug.h"
-#include "Mapping.h"
-#include "Types.h"
-#include "Utils.h"
-
-// Forward declaration.
-struct KernelEnvironmentTy;
-
-#pragma omp begin declare target device_type(nohost)
-
-namespace ompx {
-
-namespace memory {
-
-/// Allocate \p Size bytes in shared memory, if possible, for \p Reason.
-///
-/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
-void *allocShared(uint64_t Size, const char *Reason);
-
-/// Free \p Ptr, allocated via allocShared, for \p Reason.
-///
-/// Note: See the restrictions on __kmpc_free_shared for proper usage.
-void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
-
-/// Allocate \p Size bytes in global memory, if possible, for \p Reason.
-void *allocGlobal(uint64_t Size, const char *Reason);
-
-/// Return a pointer to the dynamic shared memory buffer.
-void *getDynamicBuffer();
-
-/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
-void freeGlobal(void *Ptr, const char *Reason);
-
-} // namespace memory
-
-namespace state {
-
-inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
-
-struct ICVStateTy {
-  uint32_t NThreadsVar;
-  uint32_t LevelVar;
-  uint32_t ActiveLevelVar;
-  uint32_t Padding0Val;
-  uint32_t MaxActiveLevelsVar;
-  uint32_t RunSchedVar;
-  uint32_t RunSchedChunkVar;
-
-  bool operator==(const ICVStateTy &Other) const;
-
-  void assertEqual(const ICVStateTy &Other) const;
-};
-
-struct TeamStateTy {
-  void init(bool IsSPMD);
-
-  bool operator==(const TeamStateTy &) const;
-
-  void assertEqual(TeamStateTy &Other) const;
-
-  /// ICVs
-  ///
-  /// Preallocated storage for ICV values that are used if the threads have not
-  /// set a custom default. The latter is supported but unlikely and slow(er).
-  ///
-  ///{
-  ICVStateTy ICVState;
-  ///}
-
-  uint32_t ParallelTeamSize;
-  uint32_t HasThreadState;
-  ParallelRegionFnTy ParallelRegionFnVar;
-};
-
-extern TeamStateTy TeamState;
-#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
-
-struct ThreadStateTy {
-
-  /// ICVs have preallocated storage in the TeamStateTy which is used if a
-  /// thread has not set a custom value. The latter is supported but unlikely.
-  /// When it happens we will allocate dynamic memory to hold the values of all
-  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
-  /// ICV struct to hold them all. This is slower than alternatives but allows
-  /// users to pay only for what they use.
-  ///
-  state::ICVStateTy ICVState;
-
-  ThreadStateTy *PreviousThreadState;
-
-  void init() {
-    ICVState = TeamState.ICVState;
-    PreviousThreadState = nullptr;
-  }
-
-  void init(ThreadStateTy *PreviousTS) {
-    ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
-    PreviousThreadState = PreviousTS;
-  }
-};
-
-extern ThreadStateTy **ThreadStates;
-#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
-
-/// Initialize the state machinery. Must be called by all threads.
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
-          KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
-
-/// Return the kernel and kernel launch environment associated with the current
-/// kernel. The former is static and contains compile time information that
-/// holds for all instances of the kernel. The latter is dynamic and provides
-/// per-launch information.
-KernelEnvironmentTy &getKernelEnvironment();
-KernelLaunchEnvironmentTy &getKernelLaunchEnvironment();
-
-/// TODO
-enum ValueKind {
-  VK_NThreads,
-  VK_Level,
-  VK_ActiveLevel,
-  VK_MaxActiveLevels,
-  VK_RunSched,
-  // ---
-  VK_RunSchedChunk,
-  VK_ParallelRegionFn,
-  VK_ParallelTeamSize,
-  VK_HasThreadState,
-};
-
-/// TODO
-void enterDataEnvironment(IdentTy *Ident);
-
-/// TODO
-void exitDataEnvironment();
-
-/// Scope guard for a data environment: the constructor calls
-/// enterDataEnvironment(Ident) and the destructor calls exitDataEnvironment(),
-/// so the two calls stay paired for the lifetime of the object.
-struct DateEnvironmentRAII {
-  DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); }
-  ~DateEnvironmentRAII() { exitDataEnvironment(); }
-};
-
-/// TODO
-void resetStateForThread(uint32_t TId);
-
-/// Return a modifiable reference to the ICV member \p Var.
-///
-/// The team-wide copy in TeamState is returned if \p ForceTeamState is set,
-/// if thread states cannot be used (config::mayUseThreadStates()), or if
-/// TeamState.HasThreadState is not set. Otherwise the calling thread's entry in
-/// ThreadStates is used and created on demand.
-///
-/// \p Ident is not used by this function.
-inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var,
-                                       IdentTy *Ident, bool ForceTeamState) {
-  // Common case: no thread-private state is involved.
-  if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() ||
-                 !TeamState.HasThreadState))
-    return TeamState.ICVState.*Var;
-  uint32_t TId = mapping::getThreadIdInBlock();
-  // First modification by this thread: allocate its state from global memory
-  // and seed it from the team ICVs (ThreadStateTy::init()).
-  if (OMP_UNLIKELY(!ThreadStates[TId])) {
-    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
-        sizeof(ThreadStateTy), "ICV modification outside data environment"));
-    ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!");
-    TeamState.HasThreadState = true;
-    ThreadStates[TId]->init();
-  }
-  return ThreadStates[TId]->ICVState.*Var;
-}
-
-/// Return a reference to the ICV member \p Var without creating any thread
-/// state: the calling thread's private copy is used if one exists and may be
-/// consulted, otherwise the team-wide copy in TeamState.
-inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var,
-                            bool ForceTeamState) {
-  const auto ThreadId = mapping::getThreadIdInBlock();
-  // The team copy is used unless the caller did not force it, thread states
-  // are enabled, some thread has state, and this thread has an entry.
-  const bool UseTeamCopy = ForceTeamState || !config::mayUseThreadStates() ||
-                           !TeamState.HasThreadState ||
-                           !ThreadStates[ThreadId];
-  if (OMP_LIKELY(UseTeamCopy))
-    return TeamState.ICVState.*Var;
-  return ThreadStates[ThreadId]->ICVState.*Var;
-}
-
-/// Map \p Kind to the 32-bit state value it names and return a reference to
-/// it. For ICV kinds, \p IsReadonly selects between the plain lookup
-/// (lookupImpl) and the lookup that may create thread state
-/// (lookupForModify32Impl). VK_ParallelTeamSize and VK_HasThreadState always
-/// refer to the team state. Any other kind is treated as unreachable.
-[[gnu::always_inline, gnu::flatten]] inline uint32_t &
-lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
-  switch (Kind) {
-  case state::VK_NThreads:
-    return IsReadonly ? lookupImpl(&ICVStateTy::NThreadsVar, ForceTeamState)
-                      : lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident,
-                                              ForceTeamState);
-  case state::VK_Level:
-    return IsReadonly ? lookupImpl(&ICVStateTy::LevelVar, ForceTeamState)
-                      : lookupForModify32Impl(&ICVStateTy::LevelVar, Ident,
-                                              ForceTeamState);
-  case state::VK_ActiveLevel:
-    return IsReadonly
-               ? lookupImpl(&ICVStateTy::ActiveLevelVar, ForceTeamState)
-               : lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident,
-                                       ForceTeamState);
-  case state::VK_MaxActiveLevels:
-    return IsReadonly
-               ? lookupImpl(&ICVStateTy::MaxActiveLevelsVar, ForceTeamState)
-               : lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident,
-                                       ForceTeamState);
-  case state::VK_RunSched:
-    return IsReadonly ? lookupImpl(&ICVStateTy::RunSchedVar, ForceTeamState)
-                      : lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident,
-                                              ForceTeamState);
-  case state::VK_RunSchedChunk:
-    return IsReadonly
-               ? lookupImpl(&ICVStateTy::RunSchedChunkVar, ForceTeamState)
-               : lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident,
-                                       ForceTeamState);
-  case state::VK_ParallelTeamSize:
-    return TeamState.ParallelTeamSize;
-  case state::VK_HasThreadState:
-    return TeamState.HasThreadState;
-  default:
-    break;
-  }
-  // No other kind names a 32-bit value.
-  __builtin_unreachable();
-}
-
-/// Return a reference to the pointer-typed state value named by \p Kind. Only
-/// VK_ParallelRegionFn is handled, and it always refers to the team state;
-/// any other kind is treated as unreachable. \p IsReadonly and
-/// \p ForceTeamState are not used.
-[[gnu::always_inline, gnu::flatten]] inline void *&
-lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
-  if (Kind == state::VK_ParallelRegionFn)
-    return TeamState.ParallelRegionFnVar;
-  __builtin_unreachable();
-}
-
-/// A class without actual state used to provide a nice interface to lookup and
-/// update ICV values we can declare in global scope.
-template <typename Ty, ValueKind Kind> struct Value {
-  [[gnu::flatten, gnu::always_inline]] operator Ty() {
-    return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
-                  /*ForceTeamState=*/false);
-  }
-
-  [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) {
-    set(Other, /*IdentTy=*/nullptr);
-    return *this;
-  }
-
-  [[gnu::flatten, gnu::always_inline]] Value &operator++() {
-    inc(1, /*IdentTy=*/nullptr);
-    return *this;
-  }
-
-  [[gnu::flatten, gnu::always_inline]] Value &operator--() {
-    inc(-1, /*IdentTy=*/nullptr);
-    return *this;
-  }
-
-  [[gnu::flatten, gnu::always_inline]] void
-  assert_eq(const Ty &V, IdentTy *Ident = nullptr,
-            bool ForceTeamState = false) {
-    ASSERT(lookup(/*IsReadonly=*/true, Ident, ForceTeamState) == V, nullptr);
-  }
-
-private:
-  [[gnu::flatten, gnu::always_inline]] Ty &
-  lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
-    Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState);
-    return t;
-  }
-
-  [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) {
-    return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) +=
-            UpdateVal);
-  }
-
-  [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) {
-    return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) =
-                UpdateVal);
-  }
-
-  template <typename VTy, typename Ty2> friend struct ValueRAII;
-};
-
-/// A class without actual state used to provide
-/// a nice interface to lookup and update ICV values
-/// we can declare in global scope.
-template <typename Ty, ValueKind Kind> struct PtrValue {
-  [[gnu::flatten, gnu::always_inline]] operator Ty() {
-    return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
-                  /*ForceTeamState=*/false);
-  }
-
-  [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) {
-    set(Other);
-    return *this;
-  }
-
-private:
-  Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) {
-    return lookupPtr(Kind, IsReadonly, ForceTeamState);
-  }
-
-  Ty &set(Ty UpdateVal) {
-    return (lookup(/*IsReadonly=*/false, /*IdentTy=*/nullptr,
-                   /*ForceTeamState=*/false) = UpdateVal);
-  }
-
-  template <typename VTy, typename Ty2> friend struct ValueRAII;
-};
-
-/// Scoped override of a state value.
-///
-/// If \p Active is true, the constructor looks up a modifiable reference to
-/// the value behind \p V, asserts that it currently equals \p OldValue, and
-/// stores \p NewValue. The destructor writes \p OldValue back. If \p Active is
-/// false, neither the constructor nor the destructor touches the value.
-template <typename VTy, typename Ty> struct ValueRAII {
-  ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident,
-            bool ForceTeamState = false)
-      // When inactive, Ptr is set to utils::UndefPtr and never dereferenced.
-      : Ptr(Active ? &V.lookup(/*IsReadonly=*/false, Ident, ForceTeamState)
-                   : (Ty *)utils::UndefPtr),
-        Val(OldValue), Active(Active) {
-    if (!Active)
-      return;
-    ASSERT(*Ptr == OldValue, "ValueRAII initialization with wrong old value!");
-    *Ptr = NewValue;
-  }
-  ~ValueRAII() {
-    // Restore the value recorded at construction.
-    if (Active)
-      *Ptr = Val;
-  }
-
-private:
-  Ty *Ptr;     // Location of the overridden value (only valid when Active).
-  Ty Val;      // Value written back by the destructor.
-  bool Active; // Whether this guard modifies anything at all.
-};
-
-/// TODO
-inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState;
-
-/// TODO
-inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
-    ParallelRegionFn;
-
-void runAndCheckState(void(Func(void)));
-
-void assumeInitialState(bool IsSPMD);
-
-/// Return the value of the ParallelTeamSize ICV.
-int getEffectivePTeamSize();
-
-} // namespace state
-
-namespace icv {
-
-/// TODO
-inline state::Value<uint32_t, state::VK_NThreads> NThreads;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_Level> Level;
-
-/// The `active-level` describes which of the parallel level counted with the
-/// `level-var` is active. There can only be one.
-///
-/// active-level-var is 1, if ActiveLevelVar is not 0, otherwise it is 0.
-inline state::Value<uint32_t, state::VK_ActiveLevel> ActiveLevel;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels;
-
-/// TODO
-inline state::Value<uint32_t, state::VK_RunSched> RunSched;
-
-} // namespace icv
-
-} // namespace ompx
-
-#pragma omp end declare target
-
-#endif

diff --git a/libomptarget/DeviceRTL/include/Synchronization.h b/libomptarget/DeviceRTL/include/Synchronization.h
deleted file mode 100644
index af9e1a6..0000000
--- a/libomptarget/DeviceRTL/include/Synchronization.h
+++ /dev/null

@@ -1,140 +0,0 @@
-//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
-#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
-
-#include "Types.h"
-
-namespace ompx {
-
-namespace atomic {
-
-enum OrderingTy {
-  relaxed = __ATOMIC_RELAXED,
-  aquire = __ATOMIC_ACQUIRE,
-  release = __ATOMIC_RELEASE,
-  acq_rel = __ATOMIC_ACQ_REL,
-  seq_cst = __ATOMIC_SEQ_CST,
-};
-
-enum MemScopeTy {
-  all,    // All threads on all devices
-  device, // All threads on the device
-  cgroup  // All threads in the contention group, e.g. the team
-};
-
-/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
-uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
-             MemScopeTy MemScope = MemScopeTy::all);
-
-/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
-/// result is stored in \p *Addr;
-/// {
-
-#define ATOMIC_COMMON_OP(TY)                                                   \
-  TY add(TY *Addr, TY V, OrderingTy Ordering);                                 \
-  TY mul(TY *Addr, TY V, OrderingTy Ordering);                                 \
-  TY load(TY *Addr, OrderingTy Ordering);                                      \
-  void store(TY *Addr, TY V, OrderingTy Ordering);                             \
-  bool cas(TY *Addr, TY ExpectedV, TY DesiredV, OrderingTy OrderingSucc,       \
-           OrderingTy OrderingFail);
-
-#define ATOMIC_FP_ONLY_OP(TY)                                                  \
-  TY min(TY *Addr, TY V, OrderingTy Ordering);                                 \
-  TY max(TY *Addr, TY V, OrderingTy Ordering);
-
-#define ATOMIC_INT_ONLY_OP(TY)                                                 \
-  TY min(TY *Addr, TY V, OrderingTy Ordering);                                 \
-  TY max(TY *Addr, TY V, OrderingTy Ordering);                                 \
-  TY bit_or(TY *Addr, TY V, OrderingTy Ordering);                              \
-  TY bit_and(TY *Addr, TY V, OrderingTy Ordering);                             \
-  TY bit_xor(TY *Addr, TY V, OrderingTy Ordering);
-
-#define ATOMIC_FP_OP(TY)                                                       \
-  ATOMIC_FP_ONLY_OP(TY)                                                        \
-  ATOMIC_COMMON_OP(TY)
-
-#define ATOMIC_INT_OP(TY)                                                      \
-  ATOMIC_INT_ONLY_OP(TY)                                                       \
-  ATOMIC_COMMON_OP(TY)
-
-// This needs to be kept in sync with the header. Also the reason we don't use
-// templates here.
-ATOMIC_INT_OP(int8_t)
-ATOMIC_INT_OP(int16_t)
-ATOMIC_INT_OP(int32_t)
-ATOMIC_INT_OP(int64_t)
-ATOMIC_INT_OP(uint8_t)
-ATOMIC_INT_OP(uint16_t)
-ATOMIC_INT_OP(uint32_t)
-ATOMIC_INT_OP(uint64_t)
-ATOMIC_FP_OP(float)
-ATOMIC_FP_OP(double)
-
-#undef ATOMIC_INT_ONLY_OP
-#undef ATOMIC_FP_ONLY_OP
-#undef ATOMIC_COMMON_OP
-#undef ATOMIC_INT_OP
-#undef ATOMIC_FP_OP
-
-///}
-
-} // namespace atomic
-
-namespace synchronize {
-
-/// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
-
-/// Synchronize all threads in a warp identified by \p Mask.
-void warp(LaneMaskTy Mask);
-
-/// Synchronize all threads in a block and perform a fence before and after the
-/// barrier according to \p Ordering. Note that the fence might be part of the
-/// barrier.
-void threads(atomic::OrderingTy Ordering);
-
-/// Synchronizing threads is allowed even if they all hit different instances of
-/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
-/// restrictive in that it requires all threads to hit the same instance. The
-/// noinline is removed by the openmp-opt pass and helps to preserve the
-/// information till then.
-///{
-#pragma omp begin assumes ext_aligned_barrier
-
-/// Synchronize all threads in a block, they are reaching the same instruction
-/// (hence all threads in the block are "aligned"). Also perform a fence before
-/// and after the barrier according to \p Ordering. Note that the
-/// fence might be part of the barrier if the target offers this.
-[[gnu::noinline]] void threadsAligned(atomic::OrderingTy Ordering);
-
-#pragma omp end assumes
-///}
-
-} // namespace synchronize
-
-namespace fence {
-
-/// Memory fence with \p Ordering semantics for the team.
-void team(atomic::OrderingTy Ordering);
-
-/// Memory fence with \p Ordering semantics for the contention group.
-void kernel(atomic::OrderingTy Ordering);
-
-/// Memory fence with \p Ordering semantics for the system.
-void system(atomic::OrderingTy Ordering);
-
-} // namespace fence
-
-} // namespace ompx
-
-#endif

diff --git a/libomptarget/DeviceRTL/include/Types.h b/libomptarget/DeviceRTL/include/Types.h
deleted file mode 100644
index 2e12d9d..0000000
--- a/libomptarget/DeviceRTL/include/Types.h
+++ /dev/null

@@ -1,213 +0,0 @@
-//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_TYPES_H
-#define OMPTARGET_TYPES_H
-
-// Tell the compiler that we do not have any "call-like" inline assembly in the
-// device runtime. That means we cannot have inline assembly which will call
-// another function but only inline assembly that performs some operation or
-// side-effect and then continues execution with something on the existing call
-// stack.
-//
-// TODO: Find a good place for this
-#pragma omp assumes ext_no_call_asm
-
-/// Base type declarations for freestanding mode
-///
-///{
-using int8_t = char;
-using uint8_t = unsigned char;
-using int16_t = short;
-using uint16_t = unsigned short;
-using int32_t = int;
-using uint32_t = unsigned int;
-using int64_t = long;
-using uint64_t = unsigned long;
-using size_t = decltype(sizeof(char));
-// TODO: Properly implement this
-using intptr_t = int64_t;
-using uintptr_t = uint64_t;
-
-static_assert(sizeof(int8_t) == 1, "type size mismatch");
-static_assert(sizeof(uint8_t) == 1, "type size mismatch");
-static_assert(sizeof(int16_t) == 2, "type size mismatch");
-static_assert(sizeof(uint16_t) == 2, "type size mismatch");
-static_assert(sizeof(int32_t) == 4, "type size mismatch");
-static_assert(sizeof(uint32_t) == 4, "type size mismatch");
-static_assert(sizeof(int64_t) == 8, "type size mismatch");
-static_assert(sizeof(uint64_t) == 8, "type size mismatch");
-///}
-
-enum omp_proc_bind_t {
-  omp_proc_bind_false = 0,
-  omp_proc_bind_true = 1,
-  omp_proc_bind_master = 2,
-  omp_proc_bind_close = 3,
-  omp_proc_bind_spread = 4
-};
-
-enum omp_sched_t {
-  omp_sched_static = 1,  /* chunkSize >0 */
-  omp_sched_dynamic = 2, /* chunkSize >0 */
-  omp_sched_guided = 3,  /* chunkSize >0 */
-  omp_sched_auto = 4,    /* no chunkSize */
-};
-
-enum kmp_sched_t {
-  kmp_sched_static_chunk = 33,
-  kmp_sched_static_nochunk = 34,
-  kmp_sched_dynamic = 35,
-  kmp_sched_guided = 36,
-  kmp_sched_runtime = 37,
-  kmp_sched_auto = 38,
-
-  kmp_sched_static_balanced_chunk = 45,
-
-  kmp_sched_static_ordered = 65,
-  kmp_sched_static_nochunk_ordered = 66,
-  kmp_sched_dynamic_ordered = 67,
-  kmp_sched_guided_ordered = 68,
-  kmp_sched_runtime_ordered = 69,
-  kmp_sched_auto_ordered = 70,
-
-  kmp_sched_distr_static_chunk = 91,
-  kmp_sched_distr_static_nochunk = 92,
-  kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
-
-  kmp_sched_default = kmp_sched_static_nochunk,
-  kmp_sched_unordered_first = kmp_sched_static_chunk,
-  kmp_sched_unordered_last = kmp_sched_auto,
-  kmp_sched_ordered_first = kmp_sched_static_ordered,
-  kmp_sched_ordered_last = kmp_sched_auto_ordered,
-  kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
-  kmp_sched_distribute_last =
-      kmp_sched_distr_static_chunk_sched_static_chunkone,
-
-  /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
-   * Since we need to distinguish the three possible cases (no modifier,
-   * monotonic modifier, nonmonotonic modifier), we need separate bits for
-   * each modifier. The absence of monotonic does not imply nonmonotonic,
-   * especially since 4.5 says that the behaviour of the "no modifier" case
-   * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
-   *
-   * Since we're passing a full 32 bit value, we can use a couple of high
-   * bits for these flags; out of paranoia we avoid the sign bit.
-   *
-   * These modifiers can be or-ed into non-static schedules by the compiler
-   * to pass the additional information. They will be stripped early in the
-   * processing in __kmp_dispatch_init when setting up schedules, so
-   * most of the code won't ever see schedules with these bits set.
-   */
-  kmp_sched_modifier_monotonic = (1 << 29),
-  /**< Set if the monotonic schedule modifier was present */
-  kmp_sched_modifier_nonmonotonic = (1 << 30),
-/**< Set if the nonmonotonic schedule modifier was present */
-
-#define SCHEDULE_WITHOUT_MODIFIERS(s)                                          \
-  (enum kmp_sched_t)(                                                          \
-      (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
-#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0)
-#define SCHEDULE_HAS_NONMONOTONIC(s)                                           \
-  (((s)&kmp_sched_modifier_nonmonotonic) != 0)
-#define SCHEDULE_HAS_NO_MODIFIERS(s)                                           \
-  (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
-   0)
-
-};
-
-struct TaskDescriptorTy;
-using TaskFnTy = int32_t (*)(int32_t global_tid, TaskDescriptorTy *taskDescr);
-struct TaskDescriptorTy {
-  void *Payload;
-  TaskFnTy TaskFn;
-};
-
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
-
-#pragma omp begin declare variant match(                                       \
-        device = {arch(amdgcn)}, implementation = {extension(match_none)})
-using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
-
-namespace lanes {
-enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
-} // namespace lanes
-
-/// The ident structure that describes a source location. The struct is
-/// identical to the one in the kmp.h file. We maintain the same data structure
-/// for compatibility.
-struct IdentTy {
-  int32_t reserved_1;  /**<  might be used in Fortran; see above  */
-  int32_t flags;       /**<  also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
-                            identifies this union member  */
-  int32_t reserved_2;  /**<  not really used in Fortran any more; see above */
-  int32_t reserved_3;  /**<  source[4] in Fortran, do not use for C++  */
-  char const *psource; /**<  String describing the source location.
-                       The string is composed of semi-colon separated fields
-                       which describe the source file, the function and a pair
-                       of line numbers that delimit the construct. */
-};
-
-using __kmpc_impl_lanemask_t = LaneMaskTy;
-
-using ParallelRegionFnTy = void *;
-
-using CriticalNameTy = int32_t[8];
-
-struct omp_lock_t {
-  void *Lock;
-};
-
-using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num);
-using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id,
-                                   int16_t lane_offset, int16_t shortCircuit);
-using ListGlobalFnTy = void (*)(void *buffer, int idx, void *reduce_data);
-
-/// Macros for allocating variables in different address spaces.
-///{
-
-// Follows the pattern in interface.h
-typedef enum omp_allocator_handle_t {
-  omp_null_allocator = 0,
-  omp_default_mem_alloc = 1,
-  omp_large_cap_mem_alloc = 2,
-  omp_const_mem_alloc = 3,
-  omp_high_bw_mem_alloc = 4,
-  omp_low_lat_mem_alloc = 5,
-  omp_cgroup_mem_alloc = 6,
-  omp_pteam_mem_alloc = 7,
-  omp_thread_mem_alloc = 8,
-  KMP_ALLOCATOR_MAX_HANDLE = ~(0U)
-} omp_allocator_handle_t;
-
-#define __PRAGMA(STR) _Pragma(#STR)
-#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
-
-#define SHARED(NAME)                                                           \
-  NAME [[clang::loader_uninitialized]];                                        \
-  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
-
-// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
-//       now that's not the case.
-#define THREAD_LOCAL(NAME)                                                     \
-  [[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
-
-// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
-//       does?
-#define CONSTANT(NAME)                                                         \
-  [[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]
-
-///}
-
-#endif

diff --git a/libomptarget/DeviceRTL/include/Utils.h b/libomptarget/DeviceRTL/include/Utils.h
deleted file mode 100644
index 4ab0aea..0000000
--- a/libomptarget/DeviceRTL/include/Utils.h
+++ /dev/null

@@ -1,96 +0,0 @@
-//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICERTL_UTILS_H
-#define OMPTARGET_DEVICERTL_UTILS_H
-
-#include "Types.h"
-
-#pragma omp begin declare target device_type(nohost)
-
-namespace ompx {
-namespace utils {
-
-/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread
-/// is identified by \p Mask.
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
-
-/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
-uint64_t pack(uint32_t LowBits, uint32_t HighBits);
-
-/// Unpack \p Val into \p LowBits and \p HighBits.
-void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits);
-
-/// Round up \p V to a \p Boundary.
-template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
-  return (V + Boundary - 1) / Boundary * Boundary;
-}
-
-/// Advance \p Ptr by \p Bytes bytes.
-template <typename Ty1, typename Ty2> inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) {
-  return reinterpret_cast<Ty1 *>(reinterpret_cast<char *>(Ptr) + Bytes);
-}
-
-/// Return the first bit set in \p V.
-inline uint32_t ffs(uint32_t V) {
-  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
-  return __builtin_ffs(V);
-}
-
-/// Return the first bit set in \p V.
-inline uint32_t ffs(uint64_t V) {
-  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
-  return __builtin_ffsl(V);
-}
-
-/// Return the number of bits set in \p V.
-inline uint32_t popc(uint32_t V) {
-  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
-  return __builtin_popcount(V);
-}
-
-/// Return the number of bits set in \p V.
-inline uint32_t popc(uint64_t V) {
-  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
-  return __builtin_popcountl(V);
-}
-
-/// Return \p V aligned "upwards" according to \p Align.
-template <typename Ty1, typename Ty2> inline Ty1 align_up(Ty1 V, Ty2 Align) {
-  return ((V + Ty1(Align) - 1) / Ty1(Align)) * Ty1(Align);
-}
-/// Return \p V aligned "downwards" according to \p Align.
-template <typename Ty1, typename Ty2> inline Ty1 align_down(Ty1 V, Ty2 Align) {
-  return V - V % Align;
-}
-
-/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)).
-bool isSharedMemPtr(void *Ptr);
-
-/// Return \p V type-punned as \p DstTy.
-template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
-  return *((DstTy *)(&V));
-}
-
-/// A pointer variable that has by design an `undef` value. Use with care.
-[[clang::loader_uninitialized]] static void *const UndefPtr;
-
-#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
-#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
-
-} // namespace utils
-} // namespace ompx
-
-#pragma omp end declare target
-
-#endif

diff --git a/libomptarget/DeviceRTL/include/generated_microtask_cases.gen b/libomptarget/DeviceRTL/include/generated_microtask_cases.gen
deleted file mode 100644
index a05f6da..0000000
--- a/libomptarget/DeviceRTL/include/generated_microtask_cases.gen
+++ /dev/null

@@ -1,797 +0,0 @@
-case 0:
-((void (*)(int32_t *, int32_t *))fn)(&global_tid, &bound_tid);
-break;
-case 1:
-((void (*)(int32_t *, int32_t *, void *))fn)(&global_tid, &bound_tid, args[0]);
-break;
-case 2:
-((void (*)(int32_t *, int32_t *, void *, void *))fn)(&global_tid, &bound_tid,
-                                                     args[0], args[1]);
-break;
-case 3:
-((void (*)(int32_t *, int32_t *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2]);
-break;
-case 4:
-((void (*)(int32_t *, int32_t *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3]);
-break;
-case 5:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4]);
-break;
-case 6:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5]);
-break;
-case 7:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6]);
-break;
-case 8:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7]);
-break;
-case 9:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
-                                       args[1], args[2], args[3], args[4],
-                                       args[5], args[6], args[7], args[8]);
-break;
-case 10:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
-                                               args[1], args[2], args[3],
-                                               args[4], args[5], args[6],
-                                               args[7], args[8], args[9]);
-break;
-case 11:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10]);
-break;
-case 12:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11]);
-break;
-case 13:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12]);
-break;
-case 14:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13]);
-break;
-case 15:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14]);
-break;
-case 16:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7], args[8], args[9], args[10], args[11],
-                               args[12], args[13], args[14], args[15]);
-break;
-case 17:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
-                                       args[1], args[2], args[3], args[4],
-                                       args[5], args[6], args[7], args[8],
-                                       args[9], args[10], args[11], args[12],
-                                       args[13], args[14], args[15], args[16]);
-break;
-case 18:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17]);
-break;
-case 19:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18]);
-break;
-case 20:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19]);
-break;
-case 21:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20]);
-break;
-case 22:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21]);
-break;
-case 23:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22]);
-break;
-case 24:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7], args[8], args[9], args[10], args[11],
-                               args[12], args[13], args[14], args[15], args[16],
-                               args[17], args[18], args[19], args[20], args[21],
-                               args[22], args[23]);
-break;
-case 25:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(&global_tid, &bound_tid, args[0],
-                                       args[1], args[2], args[3], args[4],
-                                       args[5], args[6], args[7], args[8],
-                                       args[9], args[10], args[11], args[12],
-                                       args[13], args[14], args[15], args[16],
-                                       args[17], args[18], args[19], args[20],
-                                       args[21], args[22], args[23], args[24]);
-break;
-case 26:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25]);
-break;
-case 27:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26]);
-break;
-case 28:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27]);
-break;
-case 29:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27], args[28]);
-break;
-case 30:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29]);
-break;
-case 31:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27], args[28],
-                       args[29], args[30]);
-break;
-case 32:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7], args[8], args[9], args[10], args[11],
-                               args[12], args[13], args[14], args[15], args[16],
-                               args[17], args[18], args[19], args[20], args[21],
-                               args[22], args[23], args[24], args[25], args[26],
-                               args[27], args[28], args[29], args[30],
-                               args[31]);
-break;
-case 33:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32]);
-break;
-case 34:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33]);
-break;
-case 35:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34]);
-break;
-case 36:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35]);
-break;
-case 37:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36]);
-break;
-case 38:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37]);
-break;
-case 39:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27], args[28],
-                       args[29], args[30], args[31], args[32], args[33],
-                       args[34], args[35], args[36], args[37], args[38]);
-break;
-case 40:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7], args[8], args[9], args[10], args[11],
-                               args[12], args[13], args[14], args[15], args[16],
-                               args[17], args[18], args[19], args[20], args[21],
-                               args[22], args[23], args[24], args[25], args[26],
-                               args[27], args[28], args[29], args[30], args[31],
-                               args[32], args[33], args[34], args[35], args[36],
-                               args[37], args[38], args[39]);
-break;
-case 41:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40]);
-break;
-case 42:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41]);
-break;
-case 43:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42]);
-break;
-case 44:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43]);
-break;
-case 45:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44]);
-break;
-case 46:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45]);
-break;
-///  DONE TO HERE
-case 47:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27], args[28],
-                       args[29], args[30], args[31], args[32], args[33],
-                       args[34], args[35], args[36], args[37], args[38],
-                       args[39], args[40], args[41], args[42], args[43],
-                       args[44], args[45], args[46]);
-break;
-case 48:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47]);
-break;
-case 49:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48]);
-break;
-case 50:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49]);
-break;
-case 51:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50]);
-break;
-case 52:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51]);
-break;
-case 53:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52]);
-break;
-case 54:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53]);
-break;
-case 55:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54]);
-break;
-case 56:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(&global_tid, &bound_tid, args[0], args[1],
-                               args[2], args[3], args[4], args[5], args[6],
-                               args[7], args[8], args[9], args[10], args[11],
-                               args[12], args[13], args[14], args[15], args[16],
-                               args[17], args[18], args[19], args[20], args[21],
-                               args[22], args[23], args[24], args[25], args[26],
-                               args[27], args[28], args[29], args[30], args[31],
-                               args[32], args[33], args[34], args[35], args[36],
-                               args[37], args[38], args[39], args[40], args[41],
-                               args[42], args[43], args[44], args[45], args[46],
-                               args[47], args[48], args[49], args[50], args[51],
-                               args[52], args[53], args[54], args[55]);
-break;
-case 57:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56]);
-break;
-case 58:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57]);
-break;
-case 59:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57], args[58]);
-break;
-case 60:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57], args[58], args[59]);
-break;
-case 61:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57], args[58], args[59], args[60]);
-break;
-case 62:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57], args[58], args[59], args[60], args[61]);
-break;
-case 63:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *))fn)(&global_tid, &bound_tid, args[0], args[1], args[2],
-                       args[3], args[4], args[5], args[6], args[7], args[8],
-                       args[9], args[10], args[11], args[12], args[13],
-                       args[14], args[15], args[16], args[17], args[18],
-                       args[19], args[20], args[21], args[22], args[23],
-                       args[24], args[25], args[26], args[27], args[28],
-                       args[29], args[30], args[31], args[32], args[33],
-                       args[34], args[35], args[36], args[37], args[38],
-                       args[39], args[40], args[41], args[42], args[43],
-                       args[44], args[45], args[46], args[47], args[48],
-                       args[49], args[50], args[51], args[52], args[53],
-                       args[54], args[55], args[56], args[57], args[58],
-                       args[59], args[60], args[61], args[62]);
-break;
-case 64:
-((void (*)(int32_t *, int32_t *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *, void *, void *, void *, void *, void *, void *,
-           void *, void *))fn)(
-    &global_tid, &bound_tid, args[0], args[1], args[2], args[3], args[4],
-    args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12],
-    args[13], args[14], args[15], args[16], args[17], args[18], args[19],
-    args[20], args[21], args[22], args[23], args[24], args[25], args[26],
-    args[27], args[28], args[29], args[30], args[31], args[32], args[33],
-    args[34], args[35], args[36], args[37], args[38], args[39], args[40],
-    args[41], args[42], args[43], args[44], args[45], args[46], args[47],
-    args[48], args[49], args[50], args[51], args[52], args[53], args[54],
-    args[55], args[56], args[57], args[58], args[59], args[60], args[61],
-    args[62], args[63]);
-break;

diff --git a/libomptarget/DeviceRTL/src/Allocator.cpp b/libomptarget/DeviceRTL/src/Allocator.cpp
deleted file mode 100644
index c9c940d..0000000
--- a/libomptarget/DeviceRTL/src/Allocator.cpp
+++ /dev/null

@@ -1,81 +0,0 @@
-//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Configuration.h"
-#include "Mapping.h"
-#include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
-
-using namespace ompx;
-
-#pragma omp begin declare target device_type(nohost)
-
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility(
-      "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
-    __omp_rtl_device_memory_pool_tracker;
-
-/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
-/// directly.
-struct BumpAllocatorTy final {
-
-  void *alloc(uint64_t Size) {
-    Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
-
-    if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
-                  atomic::seq_cst);
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
-                  atomic::seq_cst);
-      atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
-                  atomic::seq_cst);
-      atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
-                  atomic::seq_cst);
-    }
-
-    uint64_t *Data =
-        reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
-    uint64_t End =
-        reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
-
-    uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
-    if (OldData + Size > End)
-      __builtin_trap();
-
-    return reinterpret_cast<void *>(OldData);
-  }
-
-  void free(void *) {}
-};
-
-BumpAllocatorTy BumpAllocator;
-
-/// allocator namespace implementation
-///
-///{
-
-void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
-  // TODO: Check KernelEnvironment for an allocator choice as soon as we have
-  // more than one.
-}
-
-void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
-
-void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
-
-///}
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/Configuration.cpp b/libomptarget/DeviceRTL/src/Configuration.cpp
deleted file mode 100644
index ef0c366..0000000
--- a/libomptarget/DeviceRTL/src/Configuration.cpp
+++ /dev/null

@@ -1,89 +0,0 @@
-//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the data object of the constant device environment and the
-// query API.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Configuration.h"
-#include "State.h"
-#include "Types.h"
-
-using namespace ompx;
-
-#pragma omp begin declare target device_type(nohost)
-
-// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
-[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_nested_parallelism = 0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_threads_oversubscription =
-    0;
-[[gnu::weak]] extern const uint32_t __omp_rtl_assume_teams_oversubscription = 0;
-
-// This variable should be visible to the plugin so we override the default
-// hidden visibility.
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility("protected")]] DeviceEnvironmentTy
-    CONSTANT(__omp_rtl_device_environment);
-
-uint32_t config::getAssumeTeamsOversubscription() {
-  return __omp_rtl_assume_teams_oversubscription;
-}
-
-uint32_t config::getAssumeThreadsOversubscription() {
-  return __omp_rtl_assume_threads_oversubscription;
-}
-
-uint32_t config::getDebugKind() {
-  return __omp_rtl_debug_kind & __omp_rtl_device_environment.DeviceDebugKind;
-}
-
-uint32_t config::getNumDevices() {
-  return __omp_rtl_device_environment.NumDevices;
-}
-
-uint32_t config::getDeviceNum() {
-  return __omp_rtl_device_environment.DeviceNum;
-}
-
-uint64_t config::getDynamicMemorySize() {
-  return __omp_rtl_device_environment.DynamicMemSize;
-}
-
-uint64_t config::getClockFrequency() {
-  return __omp_rtl_device_environment.ClockFrequency;
-}
-
-void *config::getIndirectCallTablePtr() {
-  return reinterpret_cast<void *>(
-      __omp_rtl_device_environment.IndirectCallTable);
-}
-
-uint64_t config::getHardwareParallelism() {
-  return __omp_rtl_device_environment.HardwareParallelism;
-}
-
-uint64_t config::getIndirectCallTableSize() {
-  return __omp_rtl_device_environment.IndirectCallTableSize;
-}
-
-bool config::isDebugMode(DeviceDebugKind Kind) {
-  return config::getDebugKind() & uint32_t(Kind);
-}
-
-bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
-
-bool config::mayUseNestedParallelism() {
-  if (__omp_rtl_assume_no_nested_parallelism)
-    return false;
-  return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
-}
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/Debug.cpp b/libomptarget/DeviceRTL/src/Debug.cpp
deleted file mode 100644
index 31cd54e..0000000
--- a/libomptarget/DeviceRTL/src/Debug.cpp
+++ /dev/null

@@ -1,45 +0,0 @@
-//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains debug utilities
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Configuration.h"
-#include "Debug.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Types.h"
-
-using namespace ompx;
-
-#pragma omp begin declare target device_type(nohost)
-
-extern "C" {
-void __assert_assume(bool condition) { __builtin_assume(condition); }
-
-void __assert_fail(const char *expr, const char *file, unsigned line,
-                   const char *function) {
-  __assert_fail_internal(expr, nullptr, file, line, function);
-}
-void __assert_fail_internal(const char *expr, const char *msg, const char *file,
-                            unsigned line, const char *function) {
-  if (msg) {
-    PRINTF("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function,
-           msg, expr);
-  } else {
-    PRINTF("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr);
-  }
-  __builtin_trap();
-}
-}
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/Kernel.cpp b/libomptarget/DeviceRTL/src/Kernel.cpp
deleted file mode 100644
index 95d4c72..0000000
--- a/libomptarget/DeviceRTL/src/Kernel.cpp
+++ /dev/null

@@ -1,157 +0,0 @@
-//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the kernel entry points for the device.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Debug.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-#include "Types.h"
-
-#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
-
-using namespace ompx;
-
-#pragma omp begin declare target device_type(nohost)
-
-static void
-inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
-                    KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
-  // Order is important here.
-  synchronize::init(IsSPMD);
-  mapping::init(IsSPMD);
-  state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
-  allocator::init(IsSPMD, KernelEnvironment);
-}
-
-/// Simple generic state machine for worker threads.
-static void genericStateMachine(IdentTy *Ident) {
-  uint32_t TId = mapping::getThreadIdInBlock();
-
-  do {
-    ParallelRegionFnTy WorkFn = nullptr;
-
-    // Wait for the signal that we have a new work function.
-    synchronize::threads(atomic::seq_cst);
-
-    // Retrieve the work function from the runtime.
-    bool IsActive = __kmpc_kernel_parallel(&WorkFn);
-
-    // If there is nothing more to do, break out of the state machine by
-    // returning to the caller.
-    if (!WorkFn)
-      return;
-
-    if (IsActive) {
-      ASSERT(!mapping::isSPMDMode(), nullptr);
-      ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
-      __kmpc_kernel_end_parallel();
-    }
-
-    synchronize::threads(atomic::seq_cst);
-
-  } while (true);
-}
-
-extern "C" {
-
-/// Initialization
-///
-/// \param KernelEnvironment       Provides the configuration (exec mode, ...).
-/// \param KernelLaunchEnvironment Forwarded to the runtime initialization.
-int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
-                           KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
-  ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
-  bool IsSPMD = Configuration.ExecMode &
-                llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
-  bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
-  if (IsSPMD) {
-    inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
-                        KernelLaunchEnvironment);
-    synchronize::threadsAligned(atomic::relaxed);
-  } else {
-    inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
-                        KernelLaunchEnvironment);
-    // No need to wait since only the main threads will execute user
-    // code and workers will run into a barrier right away.
-  }
-
-  if (IsSPMD) {
-    state::assumeInitialState(IsSPMD);
-
-    // Synchronize to ensure the assertions above are in an aligned region.
-    // The barrier is eliminated later.
-    synchronize::threadsAligned(atomic::relaxed);
-    return -1;
-  }
-
-  if (mapping::isInitialThreadInLevel0(IsSPMD))
-    return -1;
-
-  // Enter the generic state machine if enabled and if this thread can possibly
-  // be an active worker thread.
-  //
-  // The latter check is important for NVIDIA Pascal (but not Volta) and AMD
-  // GPU.  In those cases, a single thread can apparently satisfy a barrier on
-  // behalf of all threads in the same warp.  Thus, it would not be safe for
-  // other threads in the main thread's warp to reach the first
-  // synchronize::threads call in genericStateMachine before the main thread
-  // reaches its corresponding synchronize::threads call: that would permit all
-  // active worker threads to proceed before the main thread has actually set
-  // state::ParallelRegionFn, and then they would immediately quit without
-  // doing any work.  mapping::getMaxTeamThreads() does not include any of the
-  // main thread's warp, so none of its threads can ever be active worker
-  // threads.
-  if (UseGenericStateMachine &&
-      mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD))
-    genericStateMachine(KernelEnvironment.Ident);
-
-  return mapping::getThreadIdInBlock();
-}
-
-/// De-Initialization
-///
-/// In non-SPMD, this function releases the workers trapped in a state machine
-/// and also any memory dynamically allocated by the runtime.
-///
-/// Takes no parameters; the execution mode is read via mapping::isSPMDMode().
-///
-void __kmpc_target_deinit() {
-  bool IsSPMD = mapping::isSPMDMode();
-  if (IsSPMD)
-    return;
-
-  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
-    // Signal the workers to exit the state machine and exit the kernel.
-    state::ParallelRegionFn = nullptr;
-  } else if (!state::getKernelEnvironment()
-                  .Configuration.UseGenericStateMachine) {
-    // Retrieve the work function just to ensure we always call
-    // __kmpc_kernel_parallel even if a custom state machine is used.
-    // TODO: this is not super pretty. The problem is we create the call to
-    // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it
-    // is not there yet. Thus, we assume we never reach it from
-    // __kmpc_target_deinit. That allows us to remove the store in there to
-    // ParallelRegionFn, which leads to bad results later on.
-    ParallelRegionFnTy WorkFn = nullptr;
-    __kmpc_kernel_parallel(&WorkFn);
-    ASSERT(WorkFn == nullptr, nullptr);
-  }
-}
-
-int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
-}
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/LibC.cpp b/libomptarget/DeviceRTL/src/LibC.cpp
deleted file mode 100644
index e587c30..0000000
--- a/libomptarget/DeviceRTL/src/LibC.cpp
+++ /dev/null

@@ -1,75 +0,0 @@
-//===------- LibC.cpp - Simple implementation of libc functions --- C++ ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "LibC.h"
-
-#pragma omp begin declare target device_type(nohost)
-
-namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t);
-}
-
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
-extern "C" int32_t vprintf(const char *, void *);
-namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
-  return vprintf(Format, Arguments);
-}
-} // namespace impl
-#pragma omp end declare variant
-
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-
-#ifdef OMPTARGET_HAS_LIBC
-// TODO: Remove this handling once we have varargs support.
-extern "C" struct FILE *stdout;
-extern "C" int32_t rpc_fprintf(FILE *, const char *, void *, uint64_t);
-
-namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
-  return rpc_fprintf(stdout, Format, Arguments, Size);
-}
-} // namespace impl
-#else
-// We do not have a vprintf implementation for AMD GPU so we use a stub.
-namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
-  return -1;
-}
-} // namespace impl
-#endif
-#pragma omp end declare variant
-
-extern "C" {
-
-int memcmp(const void *lhs, const void *rhs, size_t count) {
-  auto *L = reinterpret_cast<const unsigned char *>(lhs);
-  auto *R = reinterpret_cast<const unsigned char *>(rhs);
-
-  for (size_t I = 0; I < count; ++I)
-    if (L[I] != R[I])
-      return (int)L[I] - (int)R[I];
-
-  return 0;
-}
-
-void memset(void *dst, int C, size_t count) {
-  auto *dstc = reinterpret_cast<char *>(dst);
-  for (size_t I = 0; I < count; ++I)
-    dstc[I] = C;
-}
-
-/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
-int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
-  return impl::omp_vprintf(Format, Arguments, Size);
-}
-}
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/Mapping.cpp b/libomptarget/DeviceRTL/src/Mapping.cpp
deleted file mode 100644
index b2028a8..0000000
--- a/libomptarget/DeviceRTL/src/Mapping.cpp
+++ /dev/null

@@ -1,367 +0,0 @@
-//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Mapping.h"
-#include "Interface.h"
-#include "State.h"
-#include "Types.h"
-#include "Utils.h"
-
-#pragma omp begin declare target device_type(nohost)
-
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-
-using namespace ompx;
-
-namespace ompx {
-namespace impl {
-
-// Forward declarations of functions to be defined for AMDGCN and NVPTX.
-const llvm::omp::GV &getGridValue();
-LaneMaskTy activemask();
-LaneMaskTy lanemaskLT();
-LaneMaskTy lanemaskGT();
-uint32_t getThreadIdInWarp();
-uint32_t getThreadIdInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInKernel();
-uint32_t getBlockIdInKernel(int32_t Dim);
-uint32_t getNumberOfBlocksInKernel(int32_t Dim);
-uint32_t getWarpIdInBlock();
-uint32_t getNumberOfWarpsInBlock();
-
-/// AMDGCN Implementation
-///
-///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-
-const llvm::omp::GV &getGridValue() {
-  return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
-}
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workgroup_size_x();
-  case 1:
-    return __builtin_amdgcn_workgroup_size_y();
-  case 2:
-    return __builtin_amdgcn_workgroup_size_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
-
-LaneMaskTy lanemaskLT() {
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
-  return Mask & Ballot;
-}
-
-LaneMaskTy lanemaskGT() {
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  if (Lane == (mapping::getWarpSize() - 1))
-    return 0;
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
-  return Mask & Ballot;
-}
-
-uint32_t getThreadIdInWarp() {
-  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workitem_id_x();
-  case 1:
-    return __builtin_amdgcn_workitem_id_y();
-  case 2:
-    return __builtin_amdgcn_workitem_id_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
-  return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
-         __builtin_amdgcn_grid_size_z();
-}
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workgroup_id_x();
-  case 1:
-    return __builtin_amdgcn_workgroup_id_y();
-  case 2:
-    return __builtin_amdgcn_workgroup_id_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
-  case 1:
-    return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
-  case 2:
-    return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpIdInBlock() {
-  return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
-  return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
-}
-
-#pragma omp end declare variant
-///}
-
-/// NVPTX Implementation
-///
-///{
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_ntid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_ntid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_ntid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
-
-LaneMaskTy activemask() { return __nvvm_activemask(); }
-
-LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-
-LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_tid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_tid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_tid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_ctaid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_ctaid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_ctaid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_nctaid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_nctaid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_nctaid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
-  return impl::getNumberOfThreadsInBlock(0) *
-         impl::getNumberOfBlocksInKernel(0) *
-         impl::getNumberOfThreadsInBlock(1) *
-         impl::getNumberOfBlocksInKernel(1) *
-         impl::getNumberOfThreadsInBlock(2) *
-         impl::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t getWarpIdInBlock() {
-  return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
-  return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
-         mapping::getWarpSize();
-}
-
-#pragma omp end declare variant
-///}
-
-uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
-
-} // namespace impl
-} // namespace ompx
-
-/// We have to be deliberate about the distinction of `mapping::` and `impl::`
-/// below to avoid repeating assumptions or including irrelevant ones.
-///{
-
-static bool isInLastWarp() {
-  uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
-                     ~(mapping::getWarpSize() - 1);
-  return mapping::getThreadIdInBlock() == MainTId;
-}
-
-bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
-  if (IsSPMD || icv::Level)
-    return false;
-
-  // Check if this is the first thread of the last warp in the block.
-  return isInLastWarp();
-}
-
-bool mapping::isMainThreadInGenericMode() {
-  return mapping::isMainThreadInGenericMode(mapping::isSPMDMode());
-}
-
-bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
-  if (IsSPMD)
-    return mapping::getThreadIdInBlock() == 0;
-  return isInLastWarp();
-}
-
-bool mapping::isLeaderInWarp() {
-  __kmpc_impl_lanemask_t Active = mapping::activemask();
-  __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
-  return utils::popc(Active & LaneMaskLT) == 0;
-}
-
-LaneMaskTy mapping::activemask() { return impl::activemask(); }
-
-LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
-
-LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
-
-uint32_t mapping::getThreadIdInWarp() {
-  uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
-  ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
-  return ThreadIdInWarp;
-}
-
-uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
-  uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
-  return ThreadIdInBlock;
-}
-
-uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
-
-uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
-  uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
-  // If we are in generic (non-SPMD) mode, remove the main thread's warp.
-  return BlockSize - (!IsSPMD * impl::getWarpSize());
-}
-uint32_t mapping::getMaxTeamThreads() {
-  return mapping::getMaxTeamThreads(mapping::isSPMDMode());
-}
-
-uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
-  return impl::getNumberOfThreadsInBlock(Dim);
-}
-
-uint32_t mapping::getNumberOfThreadsInKernel() {
-  return impl::getNumberOfThreadsInKernel();
-}
-
-uint32_t mapping::getWarpIdInBlock() {
-  uint32_t WarpID = impl::getWarpIdInBlock();
-  ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
-  return WarpID;
-}
-
-uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
-  uint32_t BlockId = impl::getBlockIdInKernel(Dim);
-  ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
-  return BlockId;
-}
-
-uint32_t mapping::getNumberOfWarpsInBlock() {
-  uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
-  ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
-  return NumberOfWarpsInBlocks;
-}
-
-uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
-  uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
-  ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
-  return NumberOfBlocks;
-}
-
-uint32_t mapping::getNumberOfProcessorElements() {
-  return static_cast<uint32_t>(config::getHardwareParallelism());
-}
-
-///}
-
-/// Execution mode
-///
-///{
-
-// TODO: This is a workaround for initialization coming from kernels outside of
-//       the TU. We will need to solve this more correctly in the future.
-[[gnu::weak]] int SHARED(IsSPMDMode);
-
-void mapping::init(bool IsSPMD) {
-  if (mapping::isInitialThreadInLevel0(IsSPMD))
-    IsSPMDMode = IsSPMD;
-}
-
-bool mapping::isSPMDMode() { return IsSPMDMode; }
-
-bool mapping::isGenericMode() { return !isSPMDMode(); }
-///}
-
-extern "C" {
-[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() {
-  return mapping::getThreadIdInBlock();
-}
-
-[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
-  return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
-}
-
-[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
-  return impl::getWarpSize();
-}
-}
-
-#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME)                                \
-  extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); }
-
-_TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock)
-_TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
-_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
-_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/Misc.cpp b/libomptarget/DeviceRTL/src/Misc.cpp
deleted file mode 100644
index c24af94..0000000
--- a/libomptarget/DeviceRTL/src/Misc.cpp
+++ /dev/null

@@ -1,134 +0,0 @@
-//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Configuration.h"
-#include "Types.h"
-
-#include "Debug.h"
-
-#pragma omp begin declare target device_type(nohost)
-
-namespace ompx {
-namespace impl {
-
-double getWTick();
-
-double getWTime();
-
-/// AMDGCN Implementation
-///
-///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-
-double getWTick() {
-  // The number of ticks per second for the AMDGPU clock varies by card and can
-  // only be retrieved by querying the driver. We rely on the device environment
-  // to inform us what the proper frequency is.
-  return 1.0 / config::getClockFrequency();
-}
-
-double getWTime() {
-  uint64_t NumTicks = 0;
-  if constexpr (__has_builtin(__builtin_amdgcn_s_sendmsg_rtnl))
-    NumTicks = __builtin_amdgcn_s_sendmsg_rtnl(0x83);
-  else if constexpr (__has_builtin(__builtin_amdgcn_s_memrealtime))
-    NumTicks = __builtin_amdgcn_s_memrealtime();
-  else if constexpr (__has_builtin(__builtin_amdgcn_s_memtime))
-    NumTicks = __builtin_amdgcn_s_memtime();
-
-  return static_cast<double>(NumTicks) * getWTick();
-}
-
-#pragma omp end declare variant
-
-/// NVPTX Implementation
-///
-///{
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
-
-double getWTick() {
-  // Timer precision is 1ns
-  return ((double)1E-9);
-}
-
-double getWTime() {
-  uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
-  return static_cast<double>(nsecs) * getWTick();
-}
-
-#pragma omp end declare variant
-
-/// Look up the device-side function for the host pointer \p HstPtr in the table
-/// provided by the device plugin. The table is an array of (host, device)
-/// pointer pairs sorted by host pointer. Returns \p HstPtr if it has no entry.
-void *indirectCallLookup(void *HstPtr) {
-  if (!HstPtr)
-    return nullptr;
-
-  struct IndirectCallTable {
-    void *HstPtr;
-    void *DevPtr;
-  };
-  IndirectCallTable *Table =
-      reinterpret_cast<IndirectCallTable *>(config::getIndirectCallTablePtr());
-  uint64_t TableSize = config::getIndirectCallTableSize();
-
-  // If the table is empty we assume this is a device pointer.
-  if (!Table || !TableSize)
-    return HstPtr;
-
-  uint32_t Left = 0;
-  uint32_t Right = TableSize;
-
-  // If the pointer is definitely not contained in the table we exit early.
-  if (HstPtr < Table[Left].HstPtr || HstPtr > Table[Right - 1].HstPtr)
-    return HstPtr;
-
-  // Binary search on [Left, Right); a rejected Current is always excluded.
-  while (Left != Right) {
-    uint32_t Current = Left + (Right - Left) / 2;
-    if (Table[Current].HstPtr == HstPtr)
-      return Table[Current].DevPtr;
-    if (HstPtr < Table[Current].HstPtr)
-      Right = Current;
-    else
-      Left = Current + 1;
-  }
-
-  // If we searched the whole table and found nothing this is a device pointer.
-  return HstPtr;
-}
-
-} // namespace impl
-} // namespace ompx
-
-/// Interfaces
-///
-///{
-
-extern "C" {
-int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
-
-int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
-
-double omp_get_wtick(void) { return ompx::impl::getWTick(); }
-
-double omp_get_wtime(void) { return ompx::impl::getWTime(); }
-
-void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
-  return ompx::impl::indirectCallLookup(HstPtr);
-}
-}
-
-///}
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/Parallelism.cpp b/libomptarget/DeviceRTL/src/Parallelism.cpp
deleted file mode 100644
index 031a5ce..0000000
--- a/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ /dev/null

@@ -1,313 +0,0 @@
-//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Parallel implementation in the GPU. Here is the pattern:
-//
-//    while (not finished) {
-//
-//    if (master) {
-//      sequential code, decide which par loop to do, or if finished
-//     __kmpc_kernel_prepare_parallel() // exec by master only
-//    }
-//    syncthreads // A
-//    __kmpc_kernel_parallel() // exec by all
-//    if (this thread is included in the parallel) {
-//      switch () for all parallel loops
-//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
-//    }
-//
-//
-//    The reason we don't exec end_parallel for the threads not included
-//    in the parallel loop is that for each barrier in the parallel
-//    region, these non-included threads will cycle through the
-//    syncthread A. Thus they must preserve their current threadId that
-//    is larger than thread in team.
-//
-//    To make a long story short...
-//
-//===----------------------------------------------------------------------===//
-
-#include "Debug.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
-
-using namespace ompx;
-
-#pragma omp begin declare target device_type(nohost)
-
-namespace {
-
-/// Compute the number of threads that execute a parallel region.
-///
-/// \p NumThreadsClause is the value of the num_threads clause; -1 selects
-/// icv::NThreads instead. A non-zero request that is below
-/// mapping::getMaxTeamThreads() caps the result.
-uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
-  uint32_t Requested;
-  if (NumThreadsClause == -1)
-    Requested = icv::NThreads;
-  else
-    Requested = NumThreadsClause;
-
-  uint32_t NumThreads = mapping::getMaxTeamThreads();
-  const bool RequestLowersLimit = Requested != 0 && Requested < NumThreads;
-  if (RequestLowersLimit)
-    NumThreads = Requested;
-
-  // SPMD mode allows any number of threads.
-  if (mapping::isSPMDMode())
-    return NumThreads;
-
-  // For generic mode we round down to a multiple of WARPSIZE since it is legal
-  // to do so in OpenMP; anything below a full warp becomes a single thread.
-  if (NumThreads < mapping::getWarpSize())
-    return 1;
-
-  const uint32_t WarpMask = ~((uint32_t)mapping::getWarpSize() - 1);
-  return NumThreads & WarpMask;
-}
-
-// Invoke an outlined parallel function unwrapping arguments (up to 32).
-//
-// Dispatches on \p nargs to one of the `case` labels pulled in from
-// generated_microtask_cases.gen. A value of \p nargs without a matching
-// generated case prints a diagnostic and traps.
-// NOTE(review): the generated cases presumably call \p fn with \p global_tid,
-// \p bound_tid and the first \p nargs entries of \p args -- confirm against
-// the generator.
-[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
-                                              int32_t bound_tid, void *fn,
-                                              void **args, int64_t nargs) {
-  switch (nargs) {
-#include "generated_microtask_cases.gen"
-  default:
-    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
-    __builtin_trap();
-  }
-}
-
-} // namespace
-
-extern "C" {
-
-/// Execute the outlined parallel region \p fn on the SPMD path.
-///
-/// The thread count comes from determineNumberOfThreads(\p num_threads).
-/// PTeamSize is 0 when that count equals mapping::getMaxTeamThreads(), in
-/// which case every thread runs the region; otherwise only threads with
-/// TId < PTeamSize run it. \p args / \p nargs are passed to invokeMicrotask.
-/// The team-wide ParallelTeamSize, ActiveLevel and Level values are set for
-/// the duration of the inner scope and asserted to be back at 1/0/0 afterwards.
-[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
-                                                   int32_t num_threads,
-                                                   void *fn, void **args,
-                                                   const int64_t nargs) {
-  uint32_t TId = mapping::getThreadIdInBlock();
-  uint32_t NumThreads = determineNumberOfThreads(num_threads);
-  uint32_t PTeamSize =
-      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
-  // Avoid the race between the read of the `icv::Level` above and the write
-  // below by synchronizing all threads here.
-  synchronize::threadsAligned(atomic::seq_cst);
-  {
-    // Note that the order here is important. `icv::Level` has to be updated
-    // last or the other updates will cause a thread specific state to be
-    // created.
-    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
-                                          1u, TId == 0, ident,
-                                          /*ForceTeamState=*/true);
-    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
-                                     /*ForceTeamState=*/true);
-    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
-                               /*ForceTeamState=*/true);
-
-    // Synchronize all threads after the main thread (TId == 0) set up the
-    // team state properly.
-    synchronize::threadsAligned(atomic::acq_rel);
-
-    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
-                                      /*ForceTeamState=*/true);
-    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
-    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);
-
-    // Ensure we synchronize before we run user code to avoid invalidating the
-    // assumptions above.
-    synchronize::threadsAligned(atomic::relaxed);
-
-    if (!PTeamSize || TId < PTeamSize) // PTeamSize == 0: all threads take part.
-      invokeMicrotask(TId, 0, fn, args, nargs);
-
-    // Synchronize all threads at the end of a parallel region.
-    synchronize::threadsAligned(atomic::seq_cst);
-  }
-
-  // Synchronize all threads to make sure every thread exits the scope above;
-  // otherwise the following assertions and the assumption in
-  // __kmpc_target_deinit may not hold.
-  synchronize::threadsAligned(atomic::acq_rel);
-
-  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
-  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
-  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);
-
-  // Ensure we synchronize to create an aligned region around the assumptions.
-  synchronize::threadsAligned(atomic::relaxed);
-
-  return;
-}
-
-/// Entry point for a parallel region with outlined body \p fn.
-///
-/// Paths taken, in order:
-///  - Serialized: \p if_expr is 0, a thread state exists, or (when nested
-///    parallelism may be used) icv::Level is non-zero. The caller runs \p fn
-///    itself inside a new data environment with Level incremented.
-///  - SPMD mode: forwarded to __kmpc_parallel_spmd.
-///  - Generic mode with a single thread: the caller runs \p fn with Level
-///    temporarily set to 1.
-///  - Generic mode, active region: \p args are copied into storage obtained
-///    from __kmpc_begin_sharing_variables, \p wrapper_fn is published in
-///    state::ParallelRegionFn, and two barriers bracket the workers' execution.
-///
-/// The second (unnamed) parameter and \p proc_bind are not used here.
-[[clang::always_inline]] void
-__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
-                   int32_t num_threads, int proc_bind, void *fn,
-                   void *wrapper_fn, void **args, int64_t nargs) {
-  uint32_t TId = mapping::getThreadIdInBlock();
-
-  // Assert the parallelism level is zero if disabled by the user.
-  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
-         "nested parallelism while disabled");
-
-  // Handle the serialized case first, same for SPMD/non-SPMD:
-  // 1) if-clause(0)
-  // 2) parallel in task or other thread state inducing construct
-  // 3) nested parallel regions
-  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
-                   (config::mayUseNestedParallelism() && icv::Level))) {
-    state::DateEnvironmentRAII DERAII(ident);
-    ++icv::Level;
-    invokeMicrotask(TId, 0, fn, args, nargs);
-    return;
-  }
-
-  // From this point forward we know that there is no thread state used.
-  ASSERT(state::HasThreadState == false, nullptr);
-
-  uint32_t NumThreads = determineNumberOfThreads(num_threads);
-  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
-  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
-  if (mapping::isSPMDMode()) {
-    // This was moved to its own routine so it could be called directly
-    // in certain situations to avoid resource consumption of unused
-    // logic in parallel_51.
-    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);
-
-    return;
-  }
-
-  // We do *not* create a new data environment because all threads in the team
-  // that are active are now running this parallel region. They share the
-  // TeamState, which has an increase level-var and potentially active-level
-  // set, but they do not have individual ThreadStates yet. If they ever
-  // modify the ICVs beyond this point a ThreadStates will be allocated.
-
-  bool IsActiveParallelRegion = NumThreads > 1;
-  if (!IsActiveParallelRegion) {
-    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
-    invokeMicrotask(TId, 0, fn, args, nargs);
-    return;
-  }
-
-  void **GlobalArgs = nullptr;
-  if (nargs) {
-    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
-    // Copy args[0 .. nargs-1] into GlobalArgs. Counts up to 16 use the
-    // fallthrough chain below; larger counts use the loop in `default`.
-    switch (nargs) {
-    default:
-      for (int I = 0; I < nargs; I++)
-        GlobalArgs[I] = args[I];
-      break;
-    case 16:
-      GlobalArgs[15] = args[15];
-      [[fallthrough]];
-    case 15:
-      GlobalArgs[14] = args[14];
-      [[fallthrough]];
-    case 14:
-      GlobalArgs[13] = args[13];
-      [[fallthrough]];
-    case 13:
-      GlobalArgs[12] = args[12];
-      [[fallthrough]];
-    case 12:
-      GlobalArgs[11] = args[11];
-      [[fallthrough]];
-    case 11:
-      GlobalArgs[10] = args[10];
-      [[fallthrough]];
-    case 10:
-      GlobalArgs[9] = args[9];
-      [[fallthrough]];
-    case 9:
-      GlobalArgs[8] = args[8];
-      [[fallthrough]];
-    case 8:
-      GlobalArgs[7] = args[7];
-      [[fallthrough]];
-    case 7:
-      GlobalArgs[6] = args[6];
-      [[fallthrough]];
-    case 6:
-      GlobalArgs[5] = args[5];
-      [[fallthrough]];
-    case 5:
-      GlobalArgs[4] = args[4];
-      [[fallthrough]];
-    case 4:
-      GlobalArgs[3] = args[3];
-      [[fallthrough]];
-    case 3:
-      GlobalArgs[2] = args[2];
-      [[fallthrough]];
-    case 2:
-      GlobalArgs[1] = args[1];
-      [[fallthrough]];
-    case 1:
-      GlobalArgs[0] = args[0];
-      [[fallthrough]];
-    case 0:
-      break;
-    }
-  }
-
-  {
-    // Note that the order here is important. `icv::Level` has to be updated
-    // last or the other updates will cause a thread specific state to be
-    // created.
-    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
-                                          1u, true, ident,
-                                          /*ForceTeamState=*/true);
-    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
-                                          (void *)nullptr, true, ident,
-                                          /*ForceTeamState=*/true);
-    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
-                                     /*ForceTeamState=*/true);
-    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
-                               /*ForceTeamState=*/true);
-
-    // Master signals work to activate workers.
-    synchronize::threads(atomic::seq_cst);
-    // Master waits for workers to signal.
-    synchronize::threads(atomic::seq_cst);
-  }
-
-  // Paired with the __kmpc_begin_sharing_variables call above.
-  if (nargs)
-    __kmpc_end_sharing_variables();
-}
-
-/// Hand the currently published parallel region function to a worker.
-///
-/// Stores state::ParallelRegionFn into \p WorkFn. \returns false if that value
-/// is null (the termination signal from the master) or if the calling thread's
-/// id is not below state::getEffectivePTeamSize(); true otherwise.
-[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
-  // Work function and arguments for L1 parallel region.
-  *WorkFn = state::ParallelRegionFn;
-
-  // A null work function is the termination signal from the master.
-  const bool HasWork = !!*WorkFn;
-  if (!HasWork)
-    return false;
-
-  // Only workers inside the effective team size participate in the region.
-  const uint32_t ThreadId = mapping::getThreadIdInBlock();
-  const auto EffectiveTeamSize = state::getEffectivePTeamSize();
-  return ThreadId < EffectiveTeamSize;
-}
-
-/// Called by a worker after it finished a parallel region: resets the state
-/// of the calling thread. Asserts, before and after, that the kernel is not in
-/// SPMD mode.
-[[clang::noinline]] void __kmpc_kernel_end_parallel() {
-  ASSERT(!mapping::isSPMDMode(), nullptr);
-
-  // In case we have modified an ICV for this thread before a ThreadState was
-  // created. We drop it now to not contaminate the next parallel region.
-  state::resetStateForThread(mapping::getThreadIdInBlock());
-
-  ASSERT(!mapping::isSPMDMode(), nullptr);
-}
-
-// Returns omp_get_level(); both arguments are ignored.
-uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
-  return omp_get_level();
-}
-
-// Returns omp_get_thread_num(); the argument is ignored.
-int32_t __kmpc_global_thread_num(IdentTy *) {
-  return omp_get_thread_num();
-}
-
-// No-op: all arguments are ignored.
-void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
-                           int32_t thread_limit) {
-}
-
-// No-op: all arguments are ignored.
-void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
-}
-}
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/Reduction.cpp b/libomptarget/DeviceRTL/src/Reduction.cpp
deleted file mode 100644
index 744d1a3..0000000
--- a/libomptarget/DeviceRTL/src/Reduction.cpp
+++ /dev/null

@@ -1,319 +0,0 @@
-//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of reduction with KMPC interface.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Debug.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
-
-using namespace ompx;
-
-namespace {
-
-#pragma omp begin declare target device_type(nohost)
-
-/// Reduce \p reduce_data by calling \p shflFct with offsets that start at half
-/// the warp size and are halved until they reach zero (algorithm version 0).
-void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
-  uint32_t Offset = mapping::getWarpSize() / 2;
-  while (Offset > 0) {
-    // The lane id argument is not used by algorithm version 0.
-    shflFct(reduce_data, /*LaneId=*/0, /*Offset=*/Offset, /*AlgoVersion=*/0);
-    Offset /= 2;
-  }
-}
-
-/// Reduce \p reduce_data among \p size participants by repeatedly calling
-/// \p shflFct (algorithm version 1) with lane id \p tid. Each round uses half
-/// of the remaining count as offset, then replaces the remaining count by its
-/// half, rounded up.
-void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
-                               uint32_t size, uint32_t tid) {
-  uint32_t Remaining = size;
-  for (uint32_t Offset = Remaining / 2; Offset > 0; Offset = Remaining / 2) {
-    shflFct(reduce_data, /*LaneId=*/tid, /*Offset=*/Offset, /*AlgoVersion=*/1);
-    Remaining = (Remaining + 1) / 2;
-  }
-}
-
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
-// Reduction among the currently active lanes of a warp using \p shflFct with
-// algorithm version 2. Only compiled when not targeting __CUDA_ARCH__ >= 700.
-// Returns 1 in the lane whose logical lane id ends up being 0, else 0.
-static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
-                                          ShuffleReductFnTy shflFct) {
-  uint32_t size, remote_id, physical_lane_id;
-  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
-  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  // Twice the number of active lanes below this one; halved at the top of
-  // every loop iteration.
-  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
-  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
-  do {
-    Liveness = mapping::activemask(); // Re-sampled on every iteration.
-    remote_id = utils::ffs(Liveness & lanemask_gt);
-    size = utils::popc(Liveness);
-    logical_lane_id /= 2;
-    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
-            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
-  } while (logical_lane_id % 2 == 0 && size > 1);
-  return (logical_lane_id == 0);
-}
-#endif
-
-/// Intra-team reduction of \p reduce_data using the warp shuffle helper
-/// \p shflFct and the inter-warp copy helper \p cpyFct.
-///
-/// \returns 1 immediately if omp_get_num_threads() is 1. Otherwise the result
-/// is non-zero in the thread whose block thread id is 0 (the generic-mode main
-/// thread is treated as id 0), except on the dispersed-lane path, which
-/// returns the result of gpu_irregular_simd_reduce.
-static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
-                                            ShuffleReductFnTy shflFct,
-                                            InterWarpCopyFnTy cpyFct) {
-  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false))
-    BlockThreadId = 0;
-  uint32_t NumThreads = omp_get_num_threads();
-  if (NumThreads == 1)
-    return 1;
-    /*
-     * This reduce function handles reduction within a team. It handles
-     * parallel regions in both L1 and L2 parallelism levels. It also
-     * supports Generic, SPMD, and NoOMP modes.
-     *
-     * 1. Reduce within a warp.
-     * 2. Warp master copies value to warp 0 via shared memory.
-     * 3. Warp 0 reduces to a single value.
-     * 4. The reduced value is available in the thread that returns 1.
-     */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
-      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
-
-  // Volta execution model:
-  // For the Generic execution mode a parallel region either has 1 thread and
-  // beyond that, always a multiple of 32. For the SPMD execution mode we may
-  // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-  }
-  return BlockThreadId == 0;
-#else
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  if (Liveness == lanes::All) // Full warp
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/utils::popc(Liveness),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-  else { // Dispersed lanes. Only threads in L2
-         // parallel region may enter here; return
-         // early.
-    return gpu_irregular_simd_reduce(reduce_data, shflFct);
-  }
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-
-    return BlockThreadId == 0;
-  }
-
-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
-  return BlockThreadId == 0;
-#endif // __CUDA_ARCH__ >= 700
-}
-
-/// Map \p s to 1 if it is smaller than the warp size; otherwise clear the bits
-/// of \p s that lie below the warp size.
-uint32_t roundToWarpsize(uint32_t s) {
-  const bool BelowOneWarp = s < mapping::getWarpSize();
-  if (BelowOneWarp)
-    return 1;
-
-  const unsigned LowBits = (unsigned)(mapping::getWarpSize() - 1);
-  return s & ~LowBits;
-}
-
-/// Return the smaller of \p x and \p y.
-uint32_t kmpcMin(uint32_t x, uint32_t y) {
-  if (x < y)
-    return x;
-  return y;
-}
-
-} // namespace
-
-extern "C" {
-// Forwards to nvptx_parallel_reduce_nowait and returns its result. \p Loc and
-// \p reduce_data_size are not used.
-int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
-                                               uint64_t reduce_data_size,
-                                               void *reduce_data,
-                                               ShuffleReductFnTy shflFct,
-                                               InterWarpCopyFnTy cpyFct) {
-  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
-}
-
-/// Cross-team reduction through \p GlobalBuffer, which holds
-/// \p num_of_records slots.
-///
-/// In generic mode only the main thread continues past the entry check; every
-/// other thread returns 0 right away. The participating thread with id 0
-/// ("master") waits until its team id is below IterCnt + num_of_records, then
-/// either copies (\p lgcpyFct) or reduces (\p lgredFct) \p reduce_data into
-/// slot TeamId % num_of_records and bumps the shared counter. The team that
-/// observes ChunkTeamCount == NumTeams - Bound - 1 combines the buffer slots
-/// (\p glcpyFct / \p glredFct) and reduces across its threads with
-/// \p shflFct / \p cpyFct.
-///
-/// \p Loc and \p reduce_data_size are not used.
-/// \returns 1 in the master thread of that last team, 0 in all other threads.
-int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
-    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
-    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
-    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
-    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
-  // Terminate all threads in non-SPMD mode except for the master thread.
-  uint32_t ThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isGenericMode()) {
-    if (!mapping::isMainThreadInGenericMode())
-      return 0;
-    ThreadId = 0;
-  }
-
-  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
-  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
-  // In non-generic mode all workers participate in the teams reduction.
-  // In generic mode only the team master participates in the teams
-  // reduction because the workers are waiting for parallel work.
-  uint32_t NumThreads = omp_get_num_threads();
-  uint32_t TeamId = omp_get_team_num();
-  uint32_t NumTeams = omp_get_num_teams();
-  static unsigned SHARED(Bound);
-  static unsigned SHARED(ChunkTeamCount);
-
-  // Block progress for teams greater than the current upper
-  // limit. We always only allow a number of teams less or equal
-  // to the number of slots in the buffer.
-  bool IsMaster = (ThreadId == 0);
-  while (IsMaster) {
-    Bound = atomic::load(&IterCnt, atomic::aquire);
-    if (TeamId < Bound + num_of_records)
-      break;
-  }
-
-  if (IsMaster) {
-    int ModBockId = TeamId % num_of_records;
-    // The first num_of_records teams initialize their slot; later teams
-    // reduce into the slot they map to.
-    if (TeamId < num_of_records) {
-      lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
-    } else
-      lgredFct(GlobalBuffer, ModBockId, reduce_data);
-
-    // Propagate the memory writes above to the world.
-    fence::kernel(atomic::release);
-
-    // Increment team counter.
-    // This counter is incremented by all teams in the current
-    // num_of_records chunk.
-    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
-                                 atomic::MemScopeTy::device);
-  }
-
-  // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
-  // state machine.
-  if (mapping::isSPMDMode())
-    synchronize::threadsAligned(atomic::acq_rel);
-
-  // reduce_data is global or shared so before being reduced within the
-  // warp we need to bring it in local memory:
-  // local_reduce_data = reduce_data[i]
-  //
-  // Example for 3 reduction variables a, b, c (of potentially different
-  // types):
-  //
-  // buffer layout (struct of arrays):
-  // a, a, ..., a, b, b, ... b, c, c, ... c
-  // |__________|
-  //     num_of_records
-  //
-  // local_data_reduce layout (struct):
-  // a, b, c
-  //
-  // Each thread will have a local struct containing the values to be
-  // reduced:
-  //      1. do reduction within each warp.
-  //      2. do reduction across warps.
-  //      3. write the final result to the main reduction variable
-  //         by returning 1 in the thread holding the reduction result.
-
-  // Check if this is the very last team.
-  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
-  if (ChunkTeamCount == NumTeams - Bound - 1) {
-    // Ensure we see the global memory writes by other teams
-    fence::kernel(atomic::aquire);
-
-    //
-    // Last team processing.
-    //
-    if (ThreadId >= NumRecs)
-      return 0;
-    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
-    if (ThreadId >= NumThreads)
-      return 0;
-
-    // Load from buffer and reduce.
-    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
-    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
-      glredFct(GlobalBuffer, i, reduce_data);
-
-    // Reduce across warps to the warp master.
-    if (NumThreads > 1) {
-      gpu_regular_warp_reduce(reduce_data, shflFct);
-
-      // When we have more than [mapping::getWarpSize()] number of threads
-      // a block reduction is performed here.
-      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
-      if (ActiveThreads > mapping::getWarpSize()) {
-        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
-                               mapping::getWarpSize();
-        // Gather all the reduced values from each warp
-        // to the first warp.
-        cpyFct(reduce_data, WarpsNeeded);
-
-        uint32_t WarpId = ThreadId / mapping::getWarpSize();
-        if (WarpId == 0)
-          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                    ThreadId);
-      }
-    }
-
-    if (IsMaster) {
-      // Reset both counters before reporting the result.
-      Cnt = 0;
-      IterCnt = 0;
-      return 1;
-    }
-    return 0;
-  }
-  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
-    // Allow SIZE number of teams to proceed writing their
-    // intermediate results to the global buffer.
-    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
-  }
-
-  return 0;
-}
-}
-
-// Returns the ReductionBuffer member of the kernel launch environment.
-void *__kmpc_reduction_get_fixed_buffer() {
-  auto &LaunchEnvironment = state::getKernelLaunchEnvironment();
-  return LaunchEnvironment.ReductionBuffer;
-}
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/State.cpp b/libomptarget/DeviceRTL/src/State.cpp
deleted file mode 100644
index a1e4fa2..0000000
--- a/libomptarget/DeviceRTL/src/State.cpp
+++ /dev/null

@@ -1,484 +0,0 @@
-//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/Environment.h"
-
-#include "Allocator.h"
-#include "Configuration.h"
-#include "Debug.h"
-#include "Interface.h"
-#include "LibC.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
-
-using namespace ompx;
-
-#pragma omp begin declare target device_type(nohost)
-
-/// Memory implementation
-///
-///{
-
-/// External symbol to access dynamic shared memory.
-[[gnu::aligned(
-    allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
-#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
-
-/// The kernel environment passed to the init method by the compiler.
-static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
-
-/// The kernel launch environment passed as argument to the kernel by the
-/// runtime.
-static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
-
-///}
-
-namespace {
-
-/// Fallback implementations are missing to trigger a link time error.
-/// Implementations for new devices, including the host, should go into a
-/// dedicated begin/end declare variant.
-///
-/// When __AMDGPU__ is defined, weak definitions forwarding to
-/// allocator::alloc / allocator::free are provided; otherwise malloc and free
-/// are only declared (weak, leaf).
-///
-///{
-extern "C" {
-#ifdef __AMDGPU__
-
-[[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); }
-[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
-
-#else
-
-[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
-[[gnu::weak, gnu::leaf]] void free(void *Ptr);
-
-#endif
-}
-///}
-
-/// A stack-like allocator backed by a fixed-size buffer.
-///
-/// It is used through a malloc/free style interface (push/pop) but behaves
-/// like a stack. As implemented here, every thread of the block owns an
-/// equally sized slice of \c Data together with a byte-sized counter in
-/// \c Usage, so a thread has to push and pop symmetrically.
-struct SharedMemorySmartStackTy {
-  /// Reset the calling thread's usage counter. Must be called by all threads.
-  void init(bool IsSPMD);
-
-  /// Reserve \p Bytes for the calling thread and return the address of the
-  /// reservation. Each thread can call this function.
-  void *push(uint64_t Bytes);
-
-  /// Release the calling thread's last reservation, which is located at \p Ptr
-  /// and was requested with \p Bytes. Each thread can call this function.
-  void pop(void *Ptr, uint32_t Bytes);
-
-private:
-  /// Size in bytes of the slice of \c Data reserved for one thread: the
-  /// scratchpad size divided by the number of threads in the block, aligned
-  /// down to allocator::ALIGNMENT.
-  uint32_t computeThreadStorageTotal() {
-    const uint32_t ThreadsInBlock = mapping::getNumberOfThreadsInBlock();
-    const auto BytesPerThread = state::SharedScratchpadSize / ThreadsInBlock;
-    return utils::align_down(BytesPerThread, allocator::ALIGNMENT);
-  }
-
-  /// Address at which thread \p TId will place its next reservation: the start
-  /// of its slice plus the bytes it currently has in use.
-  void *getThreadDataTop(uint32_t TId) {
-    const uint32_t SliceBegin = computeThreadStorageTotal() * TId;
-    return &Data[SliceBegin + Usage[TId]];
-  }
-
-  /// The actual storage, divided among the threads of the block.
-  [[gnu::aligned(
-      allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
-  /// Bytes currently in use, one counter per thread.
-  [[gnu::aligned(
-      allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
-};
-
-// NOTE(review): presumably bounds the per-thread share so that it fits the
-// byte-sized Usage counters; the bound is computed with MaxThreadsPerTeam
-// rather than the actual block size -- confirm it is sufficient.
-static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
-              "Shared scratchpad of this size not supported yet.");
-
-/// The allocation of a single shared memory scratchpad.
-static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
-
-// Zero the usage counter of the calling thread. \p IsSPMD is not used.
-void SharedMemorySmartStackTy::init(bool IsSPMD) {
-  const auto ThreadId = mapping::getThreadIdInBlock();
-  Usage[ThreadId] = 0;
-}
-
-// Reserve \p Bytes (rounded up to allocator::ALIGNMENT) for the calling
-// thread. If the request does not fit into the thread's slice, the memory is
-// taken from memory::allocGlobal instead.
-void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
-  /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
-  /// be passed in as an argument and the stack rewritten to support it.
-  const uint64_t PaddedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
-
-  uint32_t Capacity = computeThreadStorageTotal();
-
-  // The main thread in generic mode gets the space of its entire warp as the
-  // other threads do not participate in any computation at all.
-  if (mapping::isMainThreadInGenericMode())
-    Capacity *= mapping::getWarpSize();
-
-  int TId = mapping::getThreadIdInBlock();
-  const bool Fits = Usage[TId] + PaddedBytes <= Capacity;
-  if (!Fits) {
-    // Slow path: fall back to global memory.
-    if (config::isDebugMode(DeviceDebugKind::CommonIssues))
-      PRINT("Shared memory stack full, fallback to dynamic allocation of global "
-            "memory will negatively impact performance.\n");
-    void *GlobalMemory = memory::allocGlobal(
-        PaddedBytes, "Slow path shared memory allocation, insufficient "
-                     "shared memory stack memory!");
-    ASSERT(GlobalMemory != nullptr, "nullptr returned by malloc!");
-    return GlobalMemory;
-  }
-
-  // Fast path: hand out the current top of the slice and advance the counter.
-  void *Top = getThreadDataTop(TId);
-  Usage[TId] += PaddedBytes;
-  return Top;
-}
-
-// Release a reservation of \p Bytes located at \p Ptr. Pointers that are not
-// shared memory pointers are handed to memory::freeGlobal; otherwise the
-// calling thread's usage counter is decreased by the aligned size.
-void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
-  const uint64_t PaddedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
-  if (!utils::isSharedMemPtr(Ptr)) {
-    memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
-    return;
-  }
-
-  int TId = mapping::getThreadIdInBlock();
-  Usage[TId] -= PaddedBytes;
-}
-
-} // namespace
-
-// Returns the address of the external DynamicSharedBuffer symbol.
-void *memory::getDynamicBuffer() {
-  return DynamicSharedBuffer;
-}
-
-// Pushes \p Bytes on the shared memory stack. \p Reason is not used.
-void *memory::allocShared(uint64_t Bytes, const char *Reason) {
-  void *Allocation = SharedMemorySmartStack.push(Bytes);
-  return Allocation;
-}
-
-// Pops \p Ptr / \p Bytes from the shared memory stack. \p Reason is not used.
-void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
-  SharedMemorySmartStack.pop(Ptr, Bytes);
-}
-
-// Allocates \p Bytes via malloc and returns the result, which may be nullptr.
-// A failed allocation is reported only in CommonIssues debug mode.
-// \p Reason is not used.
-void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
-  void *Allocation = malloc(Bytes);
-  if (config::isDebugMode(DeviceDebugKind::CommonIssues)) {
-    if (Allocation == nullptr)
-      PRINT("nullptr returned by malloc!\n");
-  }
-  return Allocation;
-}
-
-// Releases \p Ptr via free. \p Reason is not used.
-void memory::freeGlobal(void *Ptr, const char *Reason) {
-  free(Ptr);
-}
-
-///}
-
-// Member-wise comparison of the six ICV values. All comparisons are always
-// evaluated and combined with bitwise AND.
-bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
-  bool AllEqual = (NThreadsVar == Other.NThreadsVar);
-  AllEqual &= (LevelVar == Other.LevelVar);
-  AllEqual &= (ActiveLevelVar == Other.ActiveLevelVar);
-  AllEqual &= (MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
-  AllEqual &= (RunSchedVar == Other.RunSchedVar);
-  AllEqual &= (RunSchedChunkVar == Other.RunSchedChunkVar);
-  return AllEqual;
-}
-
-// Assert, one member at a time, that this ICV state matches \p Other. Checks
-// the same six members that operator== compares.
-void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
-  ASSERT(NThreadsVar == Other.NThreadsVar, nullptr);
-  ASSERT(LevelVar == Other.LevelVar, nullptr);
-  ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr);
-  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar, nullptr);
-  ASSERT(RunSchedVar == Other.RunSchedVar, nullptr);
-  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr);
-}
-
-// Reset the team state to its initial values. \p IsSPMD is not used.
-void state::TeamStateTy::init(bool IsSPMD) {
-  ICVState.NThreadsVar = 0;
-  ICVState.LevelVar = 0;
-  ICVState.ActiveLevelVar = 0;
-  ICVState.Padding0Val = 0;
-  ICVState.MaxActiveLevelsVar = 1;
-  ICVState.RunSchedVar = omp_sched_static;
-  ICVState.RunSchedChunkVar = 1;
-  ParallelTeamSize = 1;
-  HasThreadState = false;
-  ParallelRegionFnVar = nullptr;
-}
-
-// Compares the ICV state, HasThreadState and ParallelTeamSize. All three
-// comparisons are always evaluated and combined with bitwise AND.
-bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
-  bool AllEqual = (ICVState == Other.ICVState);
-  AllEqual &= (HasThreadState == Other.HasThreadState);
-  AllEqual &= (ParallelTeamSize == Other.ParallelTeamSize);
-  return AllEqual;
-}
-
-// Assert that this team state matches \p Other: the ICV state member by
-// member, then ParallelTeamSize and HasThreadState.
-void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
-  ICVState.assertEqual(Other.ICVState);
-  ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr);
-  ASSERT(HasThreadState == Other.HasThreadState, nullptr);
-}
-
-state::TeamStateTy SHARED(ompx::state::TeamState);
-state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
-
-namespace {
-
-/// Select the value to report for a query about nesting level \p Level:
-///  - \p DefaultVal for level 0,
-///  - \p OutOfBoundsVal when \p Level is negative or above omp_get_level(),
-///  - \p DefaultVal when \p Level differs from the active-level ICV,
-///  - \p Val otherwise.
-int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
-                             int OutOfBoundsVal = -1) {
-  if (Level == 0)
-    return DefaultVal;
-
-  const int CurrentLevel = omp_get_level();
-  if (OMP_UNLIKELY(Level < 0 || Level > CurrentLevel))
-    return OutOfBoundsVal;
-
-  const int CurrentActiveLevel = icv::ActiveLevel;
-  if (OMP_UNLIKELY(Level != CurrentActiveLevel))
-    return DefaultVal;
-
-  return Val;
-}
-
-} // namespace
-
-/// Kernel-entry state initialization. Every caller runs
-/// SharedMemorySmartStack.init; only the caller for which
-/// mapping::isInitialThreadInLevel0 holds resets TeamState, clears
-/// ThreadStates, and records the addresses of the kernel environment and the
-/// kernel launch environment.
-void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
-                 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
-  SharedMemorySmartStack.init(IsSPMD);
-  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
-    TeamState.init(IsSPMD);
-    ThreadStates = nullptr;
-    KernelEnvironmentPtr = &KernelEnvironment;
-    KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
-  }
-}
-
-/// Return the kernel environment recorded by state::init. The pointer is
-/// dereferenced without a null check.
-KernelEnvironmentTy &state::getKernelEnvironment() {
-  return *KernelEnvironmentPtr;
-}
-
-/// Return the kernel launch environment recorded by state::init. The pointer
-/// is dereferenced without a null check.
-KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() {
-  return *KernelLaunchEnvironmentPtr;
-}
-
-/// Push a new thread state for the calling thread.
-///
-/// A ThreadStateTy is allocated, initialized from the thread's current entry in
-/// ThreadStates, and then installed as that entry. The ThreadStates array is
-/// allocated lazily: a thread that observes a null array allocates a zeroed
-/// one and tries to publish it with a compare-and-swap; a thread that loses the
-/// race frees its own array. \p Ident is not used.
-void state::enterDataEnvironment(IdentTy *Ident) {
-  ASSERT(config::mayUseThreadStates(),
-         "Thread state modified while explicitly disabled!");
-  // Same condition as the assertion above, handled as an early exit.
-  if (!config::mayUseThreadStates())
-    return;
-
-  unsigned TId = mapping::getThreadIdInBlock();
-  // NOTE(review): the result of allocGlobal is used below without a null check.
-  ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(
-      memory::allocGlobal(sizeof(ThreadStateTy), "ThreadStates alloc"));
-  // View the ThreadStates pointer as an integer so it can be accessed with the
-  // integer atomic load/cas helpers.
-  uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
-  if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
-    uint32_t Bytes =
-        sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
-    void *ThreadStatesPtr =
-        memory::allocGlobal(Bytes, "Thread state array allocation");
-    // NOTE(review): memset is reached even if the allocation returned null.
-    memset(ThreadStatesPtr, 0, Bytes);
-    // Publish the array only if ThreadStates is still null; otherwise another
-    // thread won the race and this copy is released.
-    if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
-                     reinterpret_cast<uintptr_t>(ThreadStatesPtr),
-                     atomic::seq_cst, atomic::seq_cst))
-      memory::freeGlobal(ThreadStatesPtr,
-                         "Thread state array allocated multiple times");
-    ASSERT(atomic::load(ThreadStatesBitsPtr, atomic::seq_cst),
-           "Expected valid thread states bit!");
-  }
-  // Initialize from the thread's previous entry, then make the new state the
-  // current one.
-  NewThreadState->init(ThreadStates[TId]);
-  TeamState.HasThreadState = true;
-  ThreadStates[TId] = NewThreadState;
-}
-
-/// Pop the calling thread's current thread state by delegating to
-/// resetStateForThread with the thread's id in the block.
-void state::exitDataEnvironment() {
-  ASSERT(config::mayUseThreadStates(),
-         "Thread state modified while explicitly disabled!");
-
-  unsigned TId = mapping::getThreadIdInBlock();
-  resetStateForThread(TId);
-}
-
-/// Free the current thread state of thread \p TId and restore the one recorded
-/// in its PreviousThreadState. Does nothing when thread states are disabled,
-/// when the team has no thread state, or when the thread has no entry.
-void state::resetStateForThread(uint32_t TId) {
-  if (!config::mayUseThreadStates())
-    return;
-  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
-    return;
-
-  // Read the link to the previous state before the current one is freed.
-  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
-  memory::freeGlobal(ThreadStates[TId], "ThreadStates dealloc");
-  ThreadStates[TId] = PreviousThreadState;
-}
-
-/// Run \p Func and assert that TeamState is the same afterwards as the copy
-/// taken before the call.
-void state::runAndCheckState(void(Func(void))) {
-  TeamStateTy OldTeamState = TeamState;
-  // Check the freshly made copy against the live state before running Func.
-  OldTeamState.assertEqual(TeamState);
-
-  Func();
-
-  OldTeamState.assertEqual(TeamState);
-}
-
-/// Assert that TeamState equals a freshly initialized team state and that the
-/// current execution mode matches \p IsSPMD.
-void state::assumeInitialState(bool IsSPMD) {
-  TeamStateTy InitialTeamState;
-  InitialTeamState.init(IsSPMD);
-  InitialTeamState.assertEqual(TeamState);
-  ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr);
-}
-
-/// Return the parallel team size, or the maximum number of team threads when
-/// the stored parallel team size is 0.
-int state::getEffectivePTeamSize() {
-  int StoredSize = state::ParallelTeamSize;
-  if (StoredSize)
-    return StoredSize;
-  return mapping::getMaxTeamThreads();
-}
-
-/// Device-side definitions of the omp_* user API routines. Several of them are
-/// fixed-value or no-op implementations, as noted below.
-extern "C" {
-// The argument is ignored and the getter always reports 0.
-void omp_set_dynamic(int V) {}
-
-int omp_get_dynamic(void) { return 0; }
-
-void omp_set_num_threads(int V) { icv::NThreads = V; }
-
-// A non-positive nthreads ICV falls back to the maximum team thread count.
-int omp_get_max_threads(void) {
-  int NT = icv::NThreads;
-  return NT > 0 ? NT : mapping::getMaxTeamThreads();
-}
-
-int omp_get_level(void) {
-  int LevelVar = icv::Level;
-  ASSERT(LevelVar >= 0, nullptr);
-  return LevelVar;
-}
-
-// Both routines below collapse the active-level ICV to 0 or 1.
-int omp_get_active_level(void) { return !!icv::ActiveLevel; }
-
-int omp_in_parallel(void) { return !!icv::ActiveLevel; }
-
-void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
-  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
-  *ChunkSize = state::RunSchedChunk;
-}
-
-void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
-  icv::RunSched = (int)ScheduleKind;
-  state::RunSchedChunk = ChunkSize;
-}
-
-// Thread id in the block if Level is the active level, 0 for level 0 or an
-// inactive level, -1 when Level is out of bounds.
-int omp_get_ancestor_thread_num(int Level) {
-  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
-}
-
-int omp_get_thread_num(void) {
-  return omp_get_ancestor_thread_num(omp_get_level());
-}
-
-// Effective parallel team size if Level is the active level, 1 for level 0 or
-// an inactive level, -1 when Level is out of bounds.
-int omp_get_team_size(int Level) {
-  return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
-}
-
-// Only nesting level 1 reports more than one thread.
-int omp_get_num_threads(void) {
-  return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
-}
-
-int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
-
-int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
-
-// The argument is ignored and the getter always reports false.
-void omp_set_nested(int) {}
-
-int omp_get_nested(void) { return false; }
-
-// Any positive request is stored as 1, everything else as 0.
-void omp_set_max_active_levels(int Levels) {
-  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
-}
-
-int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
-
-omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }
-
-// The place-related routines below return fixed values or are unimplemented.
-int omp_get_num_places(void) { return 0; }
-
-int omp_get_place_num_procs(int) { return omp_get_num_procs(); }
-
-void omp_get_place_proc_ids(int, int *) {
-  // TODO
-}
-
-int omp_get_place_num(void) { return 0; }
-
-int omp_get_partition_num_places(void) { return 0; }
-
-void omp_get_partition_place_nums(int *) {
-  // TODO
-}
-
-int omp_get_cancellation(void) { return 0; }
-
-// The argument is ignored and the getter always reports -1.
-void omp_set_default_device(int) {}
-
-int omp_get_default_device(void) { return -1; }
-
-int omp_get_num_devices(void) { return config::getNumDevices(); }
-
-int omp_get_device_num(void) { return config::getDeviceNum(); }
-
-int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
-
-int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
-
-int omp_get_initial_device(void) { return -1; }
-
-int omp_is_initial_device(void) { return 0; }
-}
-
-/// Runtime entry points for shared-memory allocation and for passing argument
-/// pointers between threads (see the sharing-space comment below).
-extern "C" {
-[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) {
-  return memory::allocShared(Bytes, "Frontend alloc shared");
-}
-
-[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
-  memory::freeShared(Ptr, Bytes, "Frontend free shared");
-}
-
-void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }
-
-// The next two functions are aliases that forward to
-// __kmpc_get_dynamic_shared.
-void *llvm_omp_target_dynamic_shared_alloc() {
-  return __kmpc_get_dynamic_shared();
-}
-
-void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
-
-/// Allocate storage in shared memory to communicate arguments from the main
-/// thread to the workers in generic mode. If we exceed
-/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
-constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
-
-[[clang::loader_uninitialized]] static void
-    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
-#pragma omp allocate(SharedMemVariableSharingSpace)                            \
-    allocator(omp_pteam_mem_alloc)
-[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
-#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
-    allocator(omp_pteam_mem_alloc)
-
-// Point the sharing-space pointer at the static array when nArgs fits,
-// otherwise at a freshly allocated buffer, and hand it back via GlobalArgs.
-void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
-  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
-    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
-  } else {
-    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
-        nArgs * sizeof(void *), "new extended args");
-    ASSERT(SharedMemVariableSharingSpacePtr != nullptr,
-           "Nullptr returned by malloc!");
-  }
-  *GlobalArgs = SharedMemVariableSharingSpacePtr;
-}
-
-// Free the sharing space only if it is not the static array.
-void __kmpc_end_sharing_variables() {
-  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
-    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
-}
-
-void __kmpc_get_shared_variables(void ***GlobalArgs) {
-  *GlobalArgs = SharedMemVariableSharingSpacePtr;
-}
-}
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/Stub.cpp b/libomptarget/DeviceRTL/src/Stub.cpp
deleted file mode 100644
index e833423..0000000
--- a/libomptarget/DeviceRTL/src/Stub.cpp
+++ /dev/null

@@ -1 +0,0 @@
-// This is an empty file used to create a device fatbinary.

diff --git a/libomptarget/DeviceRTL/src/Synchronization.cpp b/libomptarget/DeviceRTL/src/Synchronization.cpp
deleted file mode 100644
index 80ba87b..0000000
--- a/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ /dev/null

@@ -1,596 +0,0 @@
-//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Include all synchronization.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Synchronization.h"
-
-#include "Debug.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Types.h"
-#include "Utils.h"
-
-#pragma omp begin declare target device_type(nohost)
-
-using namespace ompx;
-
-namespace impl {
-
-/// Atomics
-///
-///{
-/// NOTE: This function needs to be implemented by every target.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope);
-
-/// Atomic fetch-and-add on \p Address with ordering \p Ordering at device
-/// memory scope; returns the result of __scoped_atomic_fetch_add.
-template <typename Ty>
-Ty atomicAdd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_add(Address, Val, Ordering,
-                                   __MEMORY_SCOPE_DEVICE);
-}
-
-/// Atomically multiply the value at \p Address by \p V using a
-/// load / compare-and-swap retry loop.
-///
-/// \returns the value that was stored at \p Address immediately before the
-/// successful update, i.e. the value the winning CAS replaced. (The previous
-/// implementation returned a variable that was never assigned.)
-template <typename Ty>
-Ty atomicMul(Ty *Address, Ty V, atomic::OrderingTy Ordering) {
-  Ty TypedCurrentVal, TypedNewVal;
-  bool Success;
-  do {
-    TypedCurrentVal = atomic::load(Address, Ordering);
-    TypedNewVal = TypedCurrentVal * V;
-    // Retry whenever another thread changed the value between load and CAS.
-    Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering,
-                          atomic::relaxed);
-  } while (!Success);
-  return TypedCurrentVal;
-}
-
-/// Atomic load implemented as an atomic add of zero, so it is performed as a
-/// read-modify-write on \p Address.
-template <typename Ty> Ty atomicLoad(Ty *Address, atomic::OrderingTy Ordering) {
-  return atomicAdd(Address, Ty(0), Ordering);
-}
-
-/// Atomically store \p Val to \p Address with ordering \p Ordering at device
-/// memory scope.
-template <typename Ty>
-void atomicStore(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
-  __scoped_atomic_store_n(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE);
-}
-
-/// Strong compare-and-swap at device memory scope: replace the value at
-/// \p Address with \p DesiredV if it equals \p ExpectedV.
-///
-/// \returns the bool result of __scoped_atomic_compare_exchange. \p ExpectedV
-/// is taken by value, so the observed value is not reported back to the
-/// caller.
-template <typename Ty>
-bool atomicCAS(Ty *Address, Ty ExpectedV, Ty DesiredV,
-               atomic::OrderingTy OrderingSucc,
-               atomic::OrderingTy OrderingFail) {
-  return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
-                                          OrderingSucc, OrderingFail,
-                                          __MEMORY_SCOPE_DEVICE);
-}
-
-/// Atomic fetch-and-min on \p Address at device memory scope; returns the
-/// result of __scoped_atomic_fetch_min.
-template <typename Ty>
-Ty atomicMin(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_min(Address, Val, Ordering,
-                                   __MEMORY_SCOPE_DEVICE);
-}
-
-/// Atomic fetch-and-max on \p Address at device memory scope; returns the
-/// result of __scoped_atomic_fetch_max.
-template <typename Ty>
-Ty atomicMax(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_max(Address, Val, Ordering,
-                                   __MEMORY_SCOPE_DEVICE);
-}
-
-// TODO: Implement this with __atomic_fetch_max and remove the duplication.
-/// Atomic minimum for a floating-point type \p Ty, implemented with integer
-/// atomics on the value's bit pattern: a signed-integer min (\p STy) when
-/// \p Val >= 0, otherwise an unsigned-integer max (\p UTy).
-///
-/// \returns the previous contents of \p Address as a \p Ty. The integer result
-/// of the underlying atomic is a bit pattern, so it is punned back to \p Ty
-/// rather than converted numerically.
-template <typename Ty, typename STy, typename UTy>
-Ty atomicMinFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
-  if (Val >= 0)
-    return utils::convertViaPun<Ty>(
-        atomicMin((STy *)Address, utils::convertViaPun<STy>(Val), Ordering));
-  return utils::convertViaPun<Ty>(
-      atomicMax((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering));
-}
-
-/// Atomic maximum for a floating-point type \p Ty, implemented with integer
-/// atomics on the value's bit pattern: a signed-integer max (\p STy) when
-/// \p Val >= 0, otherwise an unsigned-integer min (\p UTy).
-///
-/// \returns the previous contents of \p Address as a \p Ty. The integer result
-/// of the underlying atomic is a bit pattern, so it is punned back to \p Ty
-/// rather than converted numerically.
-template <typename Ty, typename STy, typename UTy>
-Ty atomicMaxFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
-  if (Val >= 0)
-    return utils::convertViaPun<Ty>(
-        atomicMax((STy *)Address, utils::convertViaPun<STy>(Val), Ordering));
-  return utils::convertViaPun<Ty>(
-      atomicMin((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering));
-}
-
-/// Atomic fetch-and-or on \p Address at device memory scope; returns the
-/// result of __scoped_atomic_fetch_or.
-template <typename Ty>
-Ty atomicOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_or(Address, Val, Ordering,
-                                  __MEMORY_SCOPE_DEVICE);
-}
-
-/// Atomic fetch-and-and on \p Address at device memory scope; returns the
-/// result of __scoped_atomic_fetch_and.
-template <typename Ty>
-Ty atomicAnd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_and(Address, Val, Ordering,
-                                   __MEMORY_SCOPE_DEVICE);
-}
-
-/// Atomic fetch-and-xor on \p Address at device memory scope; returns the
-/// result of __scoped_atomic_fetch_xor.
-template <typename Ty>
-Ty atomicXOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_xor(Address, Val, Ordering,
-                                   __MEMORY_SCOPE_DEVICE);
-}
-
-/// Atomically store \p Val to \p Address at device memory scope and return the
-/// value __scoped_atomic_exchange wrote to its result slot.
-uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
-                        atomic::OrderingTy Ordering) {
-  uint32_t R;
-  __scoped_atomic_exchange(Address, &Val, &R, Ordering, __MEMORY_SCOPE_DEVICE);
-  return R;
-}
-///}
-
-// Forward declarations of functions to be defined for AMDGCN and NVPTX.
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope);
-void namedBarrierInit();
-void namedBarrier();
-void fenceTeam(atomic::OrderingTy Ordering);
-void fenceKernel(atomic::OrderingTy Ordering);
-void fenceSystem(atomic::OrderingTy Ordering);
-void syncWarp(__kmpc_impl_lanemask_t);
-void syncThreads(atomic::OrderingTy Ordering);
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
-void unsetLock(omp_lock_t *);
-int testLock(omp_lock_t *);
-void initLock(omp_lock_t *);
-void destroyLock(omp_lock_t *);
-void setLock(omp_lock_t *);
-void unsetCriticalLock(omp_lock_t *);
-void setCriticalLock(omp_lock_t *);
-
-/// AMDGCN Implementation
-///
-///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-
-/// AMDGCN atomicInc. Dispatches on the runtime \p Ordering and \p MemScope so
-/// that every call to __builtin_amdgcn_atomic_inc32 receives a literal order
-/// and a literal scope string.
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope) {
-  // builtin_amdgcn_atomic_inc32 should expand to this switch when
-  // passed a runtime value, but does not do so yet. Workaround here.
-
-  // Inner switch: one call per memory scope for a fixed ORDER.
-  // NOTE(review): there is no default case, so a MemScope value other than the
-  // three listed would fall through into the next ordering's case label.
-#define ScopeSwitch(ORDER)                                                     \
-  switch (MemScope) {                                                          \
-  case atomic::MemScopeTy::all:                                                \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "");                     \
-  case atomic::MemScopeTy::device:                                             \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent");                \
-  case atomic::MemScopeTy::cgroup:                                             \
-    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup");            \
-  }
-
-  // Outer switch: one case label per ordering, each expanding ScopeSwitch.
-#define Case(ORDER)                                                            \
-  case ORDER:                                                                  \
-    ScopeSwitch(ORDER)
-
-  switch (Ordering) {
-  default:
-    __builtin_unreachable();
-    Case(atomic::relaxed);
-    Case(atomic::aquire);
-    Case(atomic::release);
-    Case(atomic::acq_rel);
-    Case(atomic::seq_cst);
-#undef Case
-#undef ScopeSwitch
-  }
-}
-
-uint32_t SHARED(namedBarrierTracker);
-
-/// Reset the named-barrier tracker to zero with a release store.
-void namedBarrierInit() {
-  // Don't have global ctors, and shared memory is not zero init
-  atomic::store(&namedBarrierTracker, 0u, atomic::release);
-}
-
-/// AMDGCN named barrier built on the shared namedBarrierTracker counter.
-///
-/// The leader lane of each warp increments the counter. The leader that brings
-/// the low 16 bits to NumWaves - 1 clears them and bumps the high 16 bits
-/// (the generation); every other leader spins until the generation changes.
-/// Team fences bracket the whole sequence.
-void namedBarrier() {
-  uint32_t NumThreads = omp_get_num_threads();
-  // assert(NumThreads % 32 == 0);
-
-  uint32_t WarpSize = mapping::getWarpSize();
-  uint32_t NumWaves = NumThreads / WarpSize;
-
-  fence::team(atomic::aquire);
-
-  // named barrier implementation for amdgcn.
-  // Uses two 16 bit unsigned counters. One for the number of waves to have
-  // reached the barrier, and one to count how many times the barrier has been
-  // passed. These are packed in a single atomically accessed 32 bit integer.
-  // Low bits for the number of waves, assumed zero before this call.
-  // High bits to count the number of times the barrier has been passed.
-
-  // precondition: NumWaves != 0;
-  // invariant: NumWaves * WarpSize == NumThreads;
-  // precondition: NumWaves < 0xffffu;
-
-  // Increment the low 16 bits once, using the lowest active thread.
-  if (mapping::isLeaderInWarp()) {
-    // `load` holds the counter value from before this wave's increment.
-    uint32_t load = atomic::add(&namedBarrierTracker, 1,
-                                atomic::relaxed); // commutative
-
-    // Record the number of times the barrier has been passed
-    uint32_t generation = load & 0xffff0000u;
-
-    if ((load & 0x0000ffffu) == (NumWaves - 1)) {
-      // Reached NumWaves in low bits so this is the last wave.
-      // Set low bits to zero and increment high bits
-      load += 0x00010000u; // wrap is safe
-      load &= 0xffff0000u; // because bits zeroed second
-
-      // Reset the wave counter and release the waiting waves
-      atomic::store(&namedBarrierTracker, load, atomic::relaxed);
-    } else {
-      // more waves still to go, spin until generation counter changes
-      do {
-        __builtin_amdgcn_s_sleep(0);
-        load = atomic::load(&namedBarrierTracker, atomic::relaxed);
-      } while ((load & 0xffff0000u) == generation);
-    }
-  }
-  fence::team(atomic::release);
-}
-
-// sema checking of amdgcn_fence is aggressive. Intention is to patch clang
-// so that it is usable within a template environment and so that a runtime
-// value of the memory order is expanded to this switch within clang/llvm.
-/// Issue an AMDGCN fence at "workgroup" scope for the given runtime ordering.
-/// Only aquire, release, acq_rel and seq_cst are handled; any other value
-/// (including relaxed) reaches __builtin_unreachable().
-void fenceTeam(atomic::OrderingTy Ordering) {
-  switch (Ordering) {
-  default:
-    __builtin_unreachable();
-  case atomic::aquire:
-    return __builtin_amdgcn_fence(atomic::aquire, "workgroup");
-  case atomic::release:
-    return __builtin_amdgcn_fence(atomic::release, "workgroup");
-  case atomic::acq_rel:
-    return __builtin_amdgcn_fence(atomic::acq_rel, "workgroup");
-  case atomic::seq_cst:
-    return __builtin_amdgcn_fence(atomic::seq_cst, "workgroup");
-  }
-}
-/// Issue an AMDGCN fence at "agent" scope for the given runtime ordering.
-/// Only aquire, release, acq_rel and seq_cst are handled; any other value
-/// (including relaxed) reaches __builtin_unreachable().
-void fenceKernel(atomic::OrderingTy Ordering) {
-  switch (Ordering) {
-  default:
-    __builtin_unreachable();
-  case atomic::aquire:
-    return __builtin_amdgcn_fence(atomic::aquire, "agent");
-  case atomic::release:
-    return __builtin_amdgcn_fence(atomic::release, "agent");
-  case atomic::acq_rel:
-    return __builtin_amdgcn_fence(atomic::acq_rel, "agent");
-  case atomic::seq_cst:
-    return __builtin_amdgcn_fence(atomic::seq_cst, "agent");
-  }
-}
-/// Issue an AMDGCN fence with the empty scope string for the given runtime
-/// ordering. Only aquire, release, acq_rel and seq_cst are handled; any other
-/// value (including relaxed) reaches __builtin_unreachable().
-void fenceSystem(atomic::OrderingTy Ordering) {
-  switch (Ordering) {
-  default:
-    __builtin_unreachable();
-  case atomic::aquire:
-    return __builtin_amdgcn_fence(atomic::aquire, "");
-  case atomic::release:
-    return __builtin_amdgcn_fence(atomic::release, "");
-  case atomic::acq_rel:
-    return __builtin_amdgcn_fence(atomic::acq_rel, "");
-  case atomic::seq_cst:
-    return __builtin_amdgcn_fence(atomic::seq_cst, "");
-  }
-}
-
-/// AMDGCN warp synchronization: emits a wave barrier. The lane mask argument
-/// is ignored.
-void syncWarp(__kmpc_impl_lanemask_t) {
-  // This is a no-op on current AMDGPU hardware but it is used by the optimizer
-  // to enforce convergent behaviour between control flow graphs.
-  __builtin_amdgcn_wave_barrier();
-}
-
-/// AMDGCN block barrier with optional fences around it.
-///
-/// For relaxed ordering only the barrier is executed. For acq_rel the barrier
-/// is preceded by a release fence and followed by an aquire fence; for every
-/// other ordering both fences are seq_cst.
-void syncThreads(atomic::OrderingTy Ordering) {
-  if (Ordering != atomic::relaxed)
-    fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
-
-  __builtin_amdgcn_s_barrier();
-
-  if (Ordering != atomic::relaxed)
-    fenceTeam(Ordering == atomic::acq_rel ? atomic::aquire : atomic::seq_cst);
-}
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
-
-// TODO: Don't have wavefront lane locks. Possibly can't have them.
-// The omp_lock_t operations are not implemented for AMDGCN; every one of them
-// traps when called.
-void unsetLock(omp_lock_t *) { __builtin_trap(); }
-int testLock(omp_lock_t *) { __builtin_trap(); }
-void initLock(omp_lock_t *) { __builtin_trap(); }
-void destroyLock(omp_lock_t *) { __builtin_trap(); }
-void setLock(omp_lock_t *) { __builtin_trap(); }
-
-constexpr uint32_t UNSET = 0;
-constexpr uint32_t SET = 1;
-
-/// Release the critical-section lock by exchanging UNSET into it with acq_rel
-/// ordering; the previous value is discarded.
-void unsetCriticalLock(omp_lock_t *Lock) {
-  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
-}
-
-/// Acquire the critical-section lock. Only the thread whose id in the warp
-/// equals the lowest set bit of the active mask performs the acquisition; it
-/// spins on a relaxed CAS (UNSET -> SET), sleeping between attempts, with a
-/// release fence before and an aquire fence after the loop. Other threads
-/// return immediately.
-void setCriticalLock(omp_lock_t *Lock) {
-  uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
-  if (mapping::getThreadIdInWarp() == LowestActiveThread) {
-    fenceKernel(atomic::release);
-    while (!atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::relaxed,
-                      atomic::relaxed)) {
-      __builtin_amdgcn_s_sleep(32);
-    }
-    fenceKernel(atomic::aquire);
-  }
-}
-
-#pragma omp end declare variant
-///}
-
-/// NVPTX Implementation
-///
-///{
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
-
-/// NVPTX atomicInc: forwards to __nvvm_atom_inc_gen_ui. \p Ordering and
-/// \p MemScope are not used.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope) {
-  return __nvvm_atom_inc_gen_ui(Address, Val);
-}
-
-void namedBarrierInit() {}
-
-/// NVPTX named barrier: synchronizes omp_get_num_threads() threads on barrier
-/// number 7. The thread count is asserted to be a multiple of 32.
-void namedBarrier() {
-  uint32_t NumThreads = omp_get_num_threads();
-  ASSERT(NumThreads % 32 == 0, nullptr);
-
-  // The named barrier for active parallel threads of a team in an L1 parallel
-  // region to synchronize with each other.
-  constexpr int BarrierNo = 7;
-  __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
-}
-
-// NVPTX fences: the requested ordering is ignored and a membar of the matching
-// scope (cta, gl, sys) is issued.
-void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }
-
-void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); }
-
-void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }
-
-void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
-
-/// NVPTX block barrier on barrier number 8. \p Ordering is not used.
-void syncThreads(atomic::OrderingTy Ordering) {
-  constexpr int BarrierNo = 8;
-  __nvvm_barrier_sync(BarrierNo);
-}
-
-void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
-
-constexpr uint32_t OMP_SPIN = 1000;
-constexpr uint32_t UNSET = 0;
-constexpr uint32_t SET = 1;
-
-// TODO: This seems to hide a bug in the declare variant handling. If it is
-// called before it is defined here, the overload won't happen. Investigate
-// later!
-/// Release \p Lock by exchanging UNSET into it with seq_cst ordering; the
-/// previous value is discarded.
-void unsetLock(omp_lock_t *Lock) {
-  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
-}
-
-/// Return the current value of \p Lock, read with an atomic add of zero.
-/// NOTE(review): this only reads the lock word and never sets it, so the
-/// result is non-zero when the lock is held. Confirm this matches what callers
-/// of omp_test_lock expect.
-int testLock(omp_lock_t *Lock) {
-  return atomicAdd((uint32_t *)Lock, 0u, atomic::seq_cst);
-}
-
-void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
-
-void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
-
-/// Acquire \p Lock, spinning until a compare-and-swap from UNSET to SET
-/// succeeds. Between failed attempts the thread busy-waits on the clock
-/// register for OMP_SPIN * block-id cycles.
-///
-/// atomicCAS returns true on success, so the loop must continue while it
-/// returns false. The previous `!= UNSET` comparison treated the bool as the
-/// old lock value, which left the loop without the lock when it was contended.
-void setLock(omp_lock_t *Lock) {
-  // TODO: not sure spinning is a good idea here..
-  while (!atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
-                    atomic::seq_cst)) {
-    int32_t start = __nvvm_read_ptx_sreg_clock();
-    int32_t now;
-    for (;;) {
-      now = __nvvm_read_ptx_sreg_clock();
-      // Account for the clock register wrapping around since `start`.
-      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
-      if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) {
-        break;
-      }
-    }
-  } // retry until the CAS succeeds
-}
-
-#pragma omp end declare variant
-///}
-
-} // namespace impl
-
-/// Initialize synchronization state: the named barrier is set up unless the
-/// kernel runs in SPMD mode.
-void synchronize::init(bool IsSPMD) {
-  if (IsSPMD)
-    return;
-  impl::namedBarrierInit();
-}
-
-void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
-
-/// Forward to the target-specific impl::syncThreads.
-void synchronize::threads(atomic::OrderingTy Ordering) {
-  impl::syncThreads(Ordering);
-}
-
-/// Forward to the target-specific impl::syncThreadsAligned.
-void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
-  impl::syncThreadsAligned(Ordering);
-}
-
-void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
-
-void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }
-
-void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }
-
-// Defines atomic::add, atomic::mul, atomic::load and atomic::cas for type TY,
-// each forwarding to the impl:: template of the same operation.
-#define ATOMIC_COMMON_OP(TY)                                                   \
-  TY atomic::add(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
-    return impl::atomicAdd(Addr, V, Ordering);                                 \
-  }                                                                            \
-  TY atomic::mul(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
-    return impl::atomicMul(Addr, V, Ordering);                                 \
-  }                                                                            \
-  TY atomic::load(TY *Addr, atomic::OrderingTy Ordering) {                     \
-    return impl::atomicLoad(Addr, Ordering);                                   \
-  }                                                                            \
-  bool atomic::cas(TY *Addr, TY ExpectedV, TY DesiredV,                        \
-                   atomic::OrderingTy OrderingSucc,                            \
-                   atomic::OrderingTy OrderingFail) {                          \
-    return impl::atomicCAS(Addr, ExpectedV, DesiredV, OrderingSucc,            \
-                           OrderingFail);                                      \
-  }
-
-// Defines atomic::min, atomic::max and atomic::store for floating-point type
-// TY. min/max use the FP helpers with the signed (STY) and unsigned (UTY)
-// integer types; store writes the value punned to UTY.
-#define ATOMIC_FP_ONLY_OP(TY, STY, UTY)                                        \
-  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
-    return impl::atomicMinFP<TY, STY, UTY>(Addr, V, Ordering);                 \
-  }                                                                            \
-  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
-    return impl::atomicMaxFP<TY, STY, UTY>(Addr, V, Ordering);                 \
-  }                                                                            \
-  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
-    impl::atomicStore(reinterpret_cast<UTY *>(Addr),                           \
-                      utils::convertViaPun<UTY>(V), Ordering);                 \
-  }
-
-// Defines atomic::min, atomic::max, atomic::bit_or, atomic::bit_and,
-// atomic::bit_xor and atomic::store for integer type TY, each forwarding to
-// the matching impl:: template.
-#define ATOMIC_INT_ONLY_OP(TY)                                                 \
-  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
-    return impl::atomicMin<TY>(Addr, V, Ordering);                             \
-  }                                                                            \
-  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
-    return impl::atomicMax<TY>(Addr, V, Ordering);                             \
-  }                                                                            \
-  TY atomic::bit_or(TY *Addr, TY V, atomic::OrderingTy Ordering) {             \
-    return impl::atomicOr(Addr, V, Ordering);                                  \
-  }                                                                            \
-  TY atomic::bit_and(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
-    return impl::atomicAnd(Addr, V, Ordering);                                 \
-  }                                                                            \
-  TY atomic::bit_xor(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
-    return impl::atomicXOr(Addr, V, Ordering);                                 \
-  }                                                                            \
-  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
-    impl::atomicStore(Addr, V, Ordering);                                      \
-  }
-
-#define ATOMIC_FP_OP(TY, STY, UTY)                                             \
-  ATOMIC_FP_ONLY_OP(TY, STY, UTY)                                              \
-  ATOMIC_COMMON_OP(TY)
-
-#define ATOMIC_INT_OP(TY)                                                      \
-  ATOMIC_INT_ONLY_OP(TY)                                                       \
-  ATOMIC_COMMON_OP(TY)
-
-// This needs to be kept in sync with the header. Also the reason we don't use
-// templates here.
-ATOMIC_INT_OP(int8_t)
-ATOMIC_INT_OP(int16_t)
-ATOMIC_INT_OP(int32_t)
-ATOMIC_INT_OP(int64_t)
-ATOMIC_INT_OP(uint8_t)
-ATOMIC_INT_OP(uint16_t)
-ATOMIC_INT_OP(uint32_t)
-ATOMIC_INT_OP(uint64_t)
-ATOMIC_FP_OP(float, int32_t, uint32_t)
-ATOMIC_FP_OP(double, int64_t, uint64_t)
-
-#undef ATOMIC_INT_ONLY_OP
-#undef ATOMIC_FP_ONLY_OP
-#undef ATOMIC_COMMON_OP
-#undef ATOMIC_INT_OP
-#undef ATOMIC_FP_OP
-
-/// Forward to the target-specific impl::atomicInc.
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
-                     atomic::MemScopeTy MemScope) {
-  return impl::atomicInc(Addr, V, Ordering, MemScope);
-}
-
-void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
-
-void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
-
-/// Runtime (__kmpc_*), lock (omp_*_lock) and ompx_* synchronization entry
-/// points.
-extern "C" {
-// The ordered begin/end entry points are no-ops.
-void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
-
-void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}
-
-// Performs a regular barrier and always reports 0.
-int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
-  __kmpc_barrier(Loc, TId);
-  return 0;
-}
-
-// Barrier dispatch: the main thread in generic mode only flushes, SPMD mode
-// uses the simple SPMD barrier, and everything else uses the named barrier.
-void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
-  if (mapping::isMainThreadInGenericMode())
-    return __kmpc_flush(Loc);
-
-  if (mapping::isSPMDMode())
-    return __kmpc_barrier_simple_spmd(Loc, TId);
-
-  impl::namedBarrier();
-}
-
-[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
-  synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
-}
-
-[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc,
-                                                       int32_t TId) {
-  synchronize::threads(atomic::OrderingTy::seq_cst);
-}
-
-// True only for the thread whose omp_get_thread_num() is 0.
-int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
-  return omp_get_thread_num() == 0;
-}
-
-void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}
-
-// True only for the thread whose omp_get_thread_num() equals Filter.
-int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) {
-  return omp_get_thread_num() == Filter;
-}
-
-void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {}
-
-// single is implemented as master: thread 0 executes the region.
-int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
-  return __kmpc_master(Loc, TId);
-}
-
-void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
-  // The barrier is explicitly called.
-}
-
-void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); }
-
-uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }
-
-void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }
-
-// The critical name object itself is used as the lock storage.
-void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
-  impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
-}
-
-void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
-  impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
-}
-
-// The omp_*_lock routines forward to the target-specific impl:: functions.
-void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
-
-void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
-
-void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
-
-void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
-
-int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }
-
-// ompx block synchronization: the aligned variants call syncThreadsAligned,
-// the divergent variant calls syncThreads.
-void ompx_sync_block(int Ordering) {
-  impl::syncThreadsAligned(atomic::OrderingTy(Ordering));
-}
-void ompx_sync_block_acq_rel() {
-  impl::syncThreadsAligned(atomic::OrderingTy::acq_rel);
-}
-void ompx_sync_block_divergent(int Ordering) {
-  impl::syncThreads(atomic::OrderingTy(Ordering));
-}
-} // extern "C"
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/Tasking.cpp b/libomptarget/DeviceRTL/src/Tasking.cpp
deleted file mode 100644
index 2dc3356..0000000
--- a/libomptarget/DeviceRTL/src/Tasking.cpp
+++ /dev/null

@@ -1,107 +0,0 @@
-//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Task implementation support.
-//
-// TODO: We should not allocate and execute the task in two steps. A new API is
-//       needed for that though.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Interface.h"
-#include "State.h"
-#include "Types.h"
-#include "Utils.h"
-
-using namespace ompx;
-
-#pragma omp begin declare target device_type(nohost)
-
-extern "C" {
-
-TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
-                                        size_t TaskSizeInclPrivateValues,
-                                        size_t SharedValuesSize,
-                                        TaskFnTy TaskFn) {
-  auto TaskSizeInclPrivateValuesPadded =
-      utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *)));
-  auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize;
-  TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal(
-      TaskSizeTotal, "explicit task descriptor");
-  TaskDescriptor->Payload =
-      utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded);
-  TaskDescriptor->TaskFn = TaskFn;
-
-  return TaskDescriptor;
-}
-
-int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
-                        TaskDescriptorTy *TaskDescriptor) {
-  return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
-}
-
-int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
-                                  TaskDescriptorTy *TaskDescriptor, int32_t,
-                                  void *, int32_t, void *) {
-  state::DateEnvironmentRAII DERAII(Loc);
-
-  TaskDescriptor->TaskFn(0, TaskDescriptor);
-
-  memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
-  return 0;
-}
-
-void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
-                               TaskDescriptorTy *TaskDescriptor) {
-  state::enterDataEnvironment(Loc);
-}
-
-void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
-                                  TaskDescriptorTy *TaskDescriptor) {
-  state::exitDataEnvironment();
-
-  memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
-}
-
-void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
-                          void *) {}
-
-void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {}
-
-void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {}
-
-int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; }
-
-int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; }
-
-void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
-                     TaskDescriptorTy *TaskDescriptor, int,
-                     uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
-                     int32_t, uint64_t, void *) {
-  // Skip task entirely if empty iteration space.
-  if (*LowerBound > *UpperBound)
-    return;
-
-  // The compiler has already stored lb and ub in the TaskDescriptorTy structure
-  // as we are using a single task to execute the entire loop, we can leave
-  // the initial task_t untouched
-  __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
-}
-
-int omp_in_final(void) {
-  // treat all tasks as final... Specs may expect runtime to keep
-  // track more precisely if a task was actively set by users... This
-  // is not explicitly specified; will treat as if runtime can
-  // actively decide to put a non-final task into a final one.
-  return 1;
-}
-
-int omp_get_max_task_priority(void) { return 0; }
-}
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/Utils.cpp b/libomptarget/DeviceRTL/src/Utils.cpp
deleted file mode 100644
index d07ac0f..0000000
--- a/libomptarget/DeviceRTL/src/Utils.cpp
+++ /dev/null

@@ -1,122 +0,0 @@
-//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "Utils.h"
-
-#include "Debug.h"
-#include "Interface.h"
-#include "Mapping.h"
-
-#pragma omp begin declare target device_type(nohost)
-
-using namespace ompx;
-
-namespace impl {
-
-bool isSharedMemPtr(const void *Ptr) { return false; }
-
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
-  static_assert(sizeof(unsigned long) == 8, "");
-  *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
-  *HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
-}
-
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
-  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
-}
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
-                    int32_t Width);
-
-/// AMDGCN Implementation
-///
-///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
-  int Width = mapping::getWarpSize();
-  int Self = mapping::getThreadIdInWarp();
-  int Index = SrcLane + (Self & ~(Width - 1));
-  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
-                    int32_t Width) {
-  int Self = mapping::getThreadIdInWarp();
-  int Index = Self + LaneDelta;
-  Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
-  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-bool isSharedMemPtr(const void *Ptr) {
-  return __builtin_amdgcn_is_shared(
-      (const __attribute__((address_space(0))) void *)Ptr);
-}
-#pragma omp end declare variant
-///}
-
-/// NVPTX Implementation
-///
-///{
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
-  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
-  int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
-  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-}
-
-bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
-
-#pragma omp end declare variant
-///}
-} // namespace impl
-
-uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
-  return impl::Pack(LowBits, HighBits);
-}
-
-void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
-  impl::Unpack(Val, &LowBits, &HighBits);
-}
-
-int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
-  return impl::shuffle(Mask, Var, SrcLane);
-}
-
-int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
-                           int32_t Width) {
-  return impl::shuffleDown(Mask, Var, Delta, Width);
-}
-
-bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
-
-extern "C" {
-int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
-  return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
-}
-
-int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
-  uint32_t lo, hi;
-  utils::unpack(Val, lo, hi);
-  hi = impl::shuffleDown(lanes::All, hi, Delta, Width);
-  lo = impl::shuffleDown(lanes::All, lo, Delta, Width);
-  return utils::pack(lo, hi);
-}
-}
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/Workshare.cpp b/libomptarget/DeviceRTL/src/Workshare.cpp
deleted file mode 100644
index bcb7c5a..0000000
--- a/libomptarget/DeviceRTL/src/Workshare.cpp
+++ /dev/null

@@ -1,890 +0,0 @@
-//===----- Workshare.cpp -  OpenMP workshare implementation ------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the KMPC interface
-// for the loop construct plus other worksharing constructs that use the same
-// interface as loops.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Debug.h"
-#include "Interface.h"
-#include "Mapping.h"
-#include "State.h"
-#include "Synchronization.h"
-#include "Types.h"
-#include "Utils.h"
-
-using namespace ompx;
-
-// TODO:
-struct DynamicScheduleTracker {
-  int64_t Chunk;
-  int64_t LoopUpperBound;
-  int64_t NextLowerBound;
-  int64_t Stride;
-  kmp_sched_t ScheduleType;
-  DynamicScheduleTracker *NextDST;
-};
-
-#define ASSERT0(...)
-
-// used by the library for the interface with the app
-#define DISPATCH_FINISHED 0
-#define DISPATCH_NOTFINISHED 1
-
-// used by dynamic scheduling
-#define FINISHED 0
-#define NOT_FINISHED 1
-#define LAST_CHUNK 2
-
-#pragma omp begin declare target device_type(nohost)
-
-// TODO: This variable is a hack inherited from the old runtime.
-static uint64_t SHARED(Cnt);
-
-template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
-  ////////////////////////////////////////////////////////////////////////////////
-  // Loop with static scheduling with chunk
-
-  // Generic implementation of OMP loop scheduling with static policy
-  /*! \brief Calculate initial bounds for static loop and stride
-   *  @param[in] loc location in code of the call (not used here)
-   *  @param[in] global_tid global thread id
-   *  @param[in] schetype type of scheduling (see omptarget-nvptx.h)
-   *  @param[in] plastiter pointer to last iteration
-   *  @param[in,out] pointer to loop lower bound. it will contain value of
-   *  lower bound of first chunk
-   *  @param[in,out] pointer to loop upper bound. It will contain value of
-   *  upper bound of first chunk
-   *  @param[in,out] pointer to loop stride. It will contain value of stride
-   *  between two successive chunks executed by the same thread
-   *  @param[in] loop increment bump
-   *  @param[in] chunk size
-   */
-
-  // helper function for static chunk
-  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
-                             T entityId, T numberOfEntities) {
-    // each thread executes multiple chunks all of the same size, except
-    // the last one
-    // distance between two successive chunks
-    stride = numberOfEntities * chunk;
-    lb = lb + entityId * chunk;
-    T inputUb = ub;
-    ub = lb + chunk - 1; // Clang uses i <= ub
-    // Say ub' is the begining of the last chunk. Then who ever has a
-    // lower bound plus a multiple of the increment equal to ub' is
-    // the last one.
-    T beginingLastChunk = inputUb - (inputUb % chunk);
-    last = ((beginingLastChunk - lb) % stride) == 0;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Loop with static scheduling without chunk
-
-  // helper function for static no chunk
-  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
-                               T entityId, T numberOfEntities) {
-    // No chunk size specified.  Each thread or warp gets at most one
-    // chunk; chunks are all almost of equal size
-    T loopSize = ub - lb + 1;
-
-    chunk = loopSize / numberOfEntities;
-    T leftOver = loopSize - chunk * numberOfEntities;
-
-    if (entityId < leftOver) {
-      chunk++;
-      lb = lb + entityId * chunk;
-    } else {
-      lb = lb + entityId * chunk + leftOver;
-    }
-
-    T inputUb = ub;
-    ub = lb + chunk - 1; // Clang uses i <= ub
-    last = lb <= inputUb && inputUb <= ub;
-    stride = loopSize; // make sure we only do 1 chunk per warp
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Support for Static Init
-
-  static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
-                              T *plower, T *pupper, ST *pstride, ST chunk,
-                              bool IsSPMDExecutionMode) {
-    int32_t gtid = omp_get_thread_num();
-    int numberOfActiveOMPThreads = omp_get_num_threads();
-
-    // All warps that are in excess of the maximum requested, do
-    // not execute the loop
-    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
-            "current thread is not needed here; error");
-
-    // copy
-    int lastiter = 0;
-    T lb = *plower;
-    T ub = *pupper;
-    ST stride = *pstride;
-
-    // init
-    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
-    case kmp_sched_static_chunk: {
-      if (chunk > 0) {
-        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
-                       numberOfActiveOMPThreads);
-        break;
-      }
-      [[fallthrough]];
-    } // note: if chunk <=0, use nochunk
-    case kmp_sched_static_balanced_chunk: {
-      if (chunk > 0) {
-        // round up to make sure the chunk is enough to cover all iterations
-        T tripCount = ub - lb + 1; // +1 because ub is inclusive
-        T span = (tripCount + numberOfActiveOMPThreads - 1) /
-                 numberOfActiveOMPThreads;
-        // perform chunk adjustment
-        chunk = (span + chunk - 1) & ~(chunk - 1);
-
-        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
-        T oldUb = ub;
-        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
-                       numberOfActiveOMPThreads);
-        if (ub > oldUb)
-          ub = oldUb;
-        break;
-      }
-      [[fallthrough]];
-    } // note: if chunk <=0, use nochunk
-    case kmp_sched_static_nochunk: {
-      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
-                       numberOfActiveOMPThreads);
-      break;
-    }
-    case kmp_sched_distr_static_chunk: {
-      if (chunk > 0) {
-        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
-                       omp_get_num_teams());
-        break;
-      }
-      [[fallthrough]];
-    } // note: if chunk <=0, use nochunk
-    case kmp_sched_distr_static_nochunk: {
-      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
-                       omp_get_num_teams());
-      break;
-    }
-    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
-      ForStaticChunk(lastiter, lb, ub, stride, chunk,
-                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
-                     omp_get_num_teams() * numberOfActiveOMPThreads);
-      break;
-    }
-    default: {
-      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
-      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
-                     numberOfActiveOMPThreads);
-      break;
-    }
-    }
-    // copy back
-    *plastiter = lastiter;
-    *plower = lb;
-    *pupper = ub;
-    *pstride = stride;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Support for dispatch Init
-
-  static int OrderedSchedule(kmp_sched_t schedule) {
-    return schedule >= kmp_sched_ordered_first &&
-           schedule <= kmp_sched_ordered_last;
-  }
-
-  static void dispatch_init(IdentTy *loc, int32_t threadId,
-                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
-                            DynamicScheduleTracker *DST) {
-    int tid = mapping::getThreadIdInBlock();
-    T tnum = omp_get_num_threads();
-    T tripCount = ub - lb + 1; // +1 because ub is inclusive
-    ASSERT0(LT_FUSSY, threadId < tnum,
-            "current thread is not needed here; error");
-
-    /* Currently just ignore the monotonic and non-monotonic modifiers
-     * (the compiler isn't producing them * yet anyway).
-     * When it is we'll want to look at them somewhere here and use that
-     * information to add to our schedule choice. We shouldn't need to pass
-     * them on, they merely affect which schedule we can legally choose for
-     * various dynamic cases. (In particular, whether or not a stealing scheme
-     * is legal).
-     */
-    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
-
-    // Process schedule.
-    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
-      if (OrderedSchedule(schedule))
-        __kmpc_barrier(loc, threadId);
-      schedule = kmp_sched_static_chunk;
-      chunk = tripCount; // one thread gets the whole loop
-    } else if (schedule == kmp_sched_runtime) {
-      // process runtime
-      omp_sched_t rtSched;
-      int ChunkInt;
-      omp_get_schedule(&rtSched, &ChunkInt);
-      chunk = ChunkInt;
-      switch (rtSched) {
-      case omp_sched_static: {
-        if (chunk > 0)
-          schedule = kmp_sched_static_chunk;
-        else
-          schedule = kmp_sched_static_nochunk;
-        break;
-      }
-      case omp_sched_auto: {
-        schedule = kmp_sched_static_chunk;
-        chunk = 1;
-        break;
-      }
-      case omp_sched_dynamic:
-      case omp_sched_guided: {
-        schedule = kmp_sched_dynamic;
-        break;
-      }
-      }
-    } else if (schedule == kmp_sched_auto) {
-      schedule = kmp_sched_static_chunk;
-      chunk = 1;
-    } else {
-      // ASSERT(LT_FUSSY,
-      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
-      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
-      //        (long long)chunk);
-    }
-
-    // init schedules
-    if (schedule == kmp_sched_static_chunk) {
-      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
-      // save sched state
-      DST->ScheduleType = schedule;
-      // save ub
-      DST->LoopUpperBound = ub;
-      // compute static chunk
-      ST stride;
-      int lastiter = 0;
-      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
-      // save computed params
-      DST->Chunk = chunk;
-      DST->NextLowerBound = lb;
-      DST->Stride = stride;
-    } else if (schedule == kmp_sched_static_balanced_chunk) {
-      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
-      // save sched state
-      DST->ScheduleType = schedule;
-      // save ub
-      DST->LoopUpperBound = ub;
-      // compute static chunk
-      ST stride;
-      int lastiter = 0;
-      // round up to make sure the chunk is enough to cover all iterations
-      T span = (tripCount + tnum - 1) / tnum;
-      // perform chunk adjustment
-      chunk = (span + chunk - 1) & ~(chunk - 1);
-
-      T oldUb = ub;
-      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
-      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
-      if (ub > oldUb)
-        ub = oldUb;
-      // save computed params
-      DST->Chunk = chunk;
-      DST->NextLowerBound = lb;
-      DST->Stride = stride;
-    } else if (schedule == kmp_sched_static_nochunk) {
-      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
-      // save sched state
-      DST->ScheduleType = schedule;
-      // save ub
-      DST->LoopUpperBound = ub;
-      // compute static chunk
-      ST stride;
-      int lastiter = 0;
-      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
-      // save computed params
-      DST->Chunk = chunk;
-      DST->NextLowerBound = lb;
-      DST->Stride = stride;
-    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      // save data
-      DST->ScheduleType = schedule;
-      if (chunk < 1)
-        chunk = 1;
-      DST->Chunk = chunk;
-      DST->LoopUpperBound = ub;
-      DST->NextLowerBound = lb;
-      __kmpc_barrier(loc, threadId);
-      if (tid == 0) {
-        Cnt = 0;
-        fence::team(atomic::seq_cst);
-      }
-      __kmpc_barrier(loc, threadId);
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Support for dispatch next
-
-  static uint64_t NextIter() {
-    __kmpc_impl_lanemask_t active = mapping::activemask();
-    uint32_t leader = utils::ffs(active) - 1;
-    uint32_t change = utils::popc(active);
-    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
-    unsigned int rank = utils::popc(active & lane_mask_lt);
-    uint64_t warp_res = 0;
-    if (rank == 0) {
-      warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
-    }
-    warp_res = utils::shuffle(active, warp_res, leader);
-    return warp_res + rank;
-  }
-
-  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
-                              T loopUpperBound) {
-    T N = NextIter();
-    lb = loopLowerBound + N * chunkSize;
-    ub = lb + chunkSize - 1; // Clang uses i <= ub
-
-    // 3 result cases:
-    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
-    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
-    //  NOT_FINISHED
-    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
-    // a.
-    if (lb <= loopUpperBound && ub < loopUpperBound) {
-      return NOT_FINISHED;
-    }
-    // b.
-    if (lb <= loopUpperBound) {
-      ub = loopUpperBound;
-      return LAST_CHUNK;
-    }
-    // c. if we are here, we are in case 'c'
-    lb = loopUpperBound + 2;
-    ub = loopUpperBound + 1;
-    return FINISHED;
-  }
-
-  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
-                           T *plower, T *pupper, ST *pstride,
-                           DynamicScheduleTracker *DST) {
-    // ID of a thread in its own warp
-
-    // automatically selects thread or warp ID based on selected implementation
-    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
-            "current thread is not needed here; error");
-    // retrieve schedule
-    kmp_sched_t schedule = DST->ScheduleType;
-
-    // xxx reduce to one
-    if (schedule == kmp_sched_static_chunk ||
-        schedule == kmp_sched_static_nochunk) {
-      T myLb = DST->NextLowerBound;
-      T ub = DST->LoopUpperBound;
-      // finished?
-      if (myLb > ub) {
-        return DISPATCH_FINISHED;
-      }
-      // not finished, save current bounds
-      ST chunk = DST->Chunk;
-      *plower = myLb;
-      T myUb = myLb + chunk - 1; // Clang uses i <= ub
-      if (myUb > ub)
-        myUb = ub;
-      *pupper = myUb;
-      *plast = (int32_t)(myUb == ub);
-
-      // increment next lower bound by the stride
-      ST stride = DST->Stride;
-      DST->NextLowerBound = myLb + stride;
-      return DISPATCH_NOTFINISHED;
-    }
-    ASSERT0(LT_FUSSY,
-            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
-            "bad sched");
-    T myLb, myUb;
-    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
-                                    DST->LoopUpperBound);
-
-    if (finished == FINISHED)
-      return DISPATCH_FINISHED;
-
-    // not finished (either not finished or last chunk)
-    *plast = (int32_t)(finished == LAST_CHUNK);
-    *plower = myLb;
-    *pupper = myUb;
-    *pstride = 1;
-
-    return DISPATCH_NOTFINISHED;
-  }
-
-  static void dispatch_fini() {
-    // nothing
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // end of template class that encapsulate all the helper functions
-  ////////////////////////////////////////////////////////////////////////////////
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (dyn loops)
-////////////////////////////////////////////////////////////////////////////////
-
-// TODO: This is a stopgap. We probably want to expand the dispatch API to take
-//       an DST pointer which can then be allocated properly without malloc.
-static DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);
-
-// Create a new DST, link the current one, and define the new as current.
-static DynamicScheduleTracker *pushDST() {
-  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
-      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
-  *NewDST = DynamicScheduleTracker({0});
-  NewDST->NextDST = ThreadDSTPtr;
-  ThreadDSTPtr = NewDST;
-  return ThreadDSTPtr;
-}
-
-// Return the current DST.
-static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }
-
-// Pop the current DST and restore the last one.
-static void popDST() {
-  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
-  memory::freeGlobal(ThreadDSTPtr, "remove DST");
-  ThreadDSTPtr = OldDST;
-}
-
-extern "C" {
-
-// init
-void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
-                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
-  DynamicScheduleTracker *DST = pushDST();
-  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
-                             uint32_t lb, uint32_t ub, int32_t st,
-                             int32_t chunk) {
-  DynamicScheduleTracker *DST = pushDST();
-  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
-                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
-  DynamicScheduleTracker *DST = pushDST();
-  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
-                             uint64_t lb, uint64_t ub, int64_t st,
-                             int64_t chunk) {
-  DynamicScheduleTracker *DST = pushDST();
-  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
-}
-
-// next
-int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
-                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
-  DynamicScheduleTracker *DST = peekDST();
-  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
-                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
-  DynamicScheduleTracker *DST = peekDST();
-  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
-                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
-  DynamicScheduleTracker *DST = peekDST();
-  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
-                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
-  DynamicScheduleTracker *DST = peekDST();
-  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st, DST);
-}
-
-// fini
-void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
-  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
-  popDST();
-}
-
-void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
-  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
-  popDST();
-}
-
-void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
-  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
-  popDST();
-}
-
-void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
-  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
-  popDST();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (static loops)
-////////////////////////////////////////////////////////////////////////////////
-
-void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
-                              int32_t schedtype, int32_t *plastiter,
-                              int32_t *plower, int32_t *pupper,
-                              int32_t *pstride, int32_t incr, int32_t chunk) {
-  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
-                               int32_t schedtype, int32_t *plastiter,
-                               uint32_t *plower, uint32_t *pupper,
-                               int32_t *pstride, int32_t incr, int32_t chunk) {
-  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
-                              int32_t schedtype, int32_t *plastiter,
-                              int64_t *plower, int64_t *pupper,
-                              int64_t *pstride, int64_t incr, int64_t chunk) {
-  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
-                               int32_t schedtype, int32_t *plastiter,
-                               uint64_t *plower, uint64_t *pupper,
-                               int64_t *pstride, int64_t incr, int64_t chunk) {
-  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
-                                     int32_t schedtype, int32_t *plastiter,
-                                     int32_t *plower, int32_t *pupper,
-                                     int32_t *pstride, int32_t incr,
-                                     int32_t chunk) {
-  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
-                                      int32_t schedtype, int32_t *plastiter,
-                                      uint32_t *plower, uint32_t *pupper,
-                                      int32_t *pstride, int32_t incr,
-                                      int32_t chunk) {
-  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
-                                     int32_t schedtype, int32_t *plastiter,
-                                     int64_t *plower, int64_t *pupper,
-                                     int64_t *pstride, int64_t incr,
-                                     int64_t chunk) {
-  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
-                                      int32_t schedtype, int32_t *plastiter,
-                                      uint64_t *plower, uint64_t *pupper,
-                                      int64_t *pstride, int64_t incr,
-                                      int64_t chunk) {
-  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
-      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      mapping::isSPMDMode());
-}
-
-void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
-
-void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
-}
-
-namespace ompx {
-
-/// Helper class to hide the generic loop nest and provide the template argument
-/// throughout.
-template <typename Ty> class StaticLoopChunker {
-
-  /// Generic loop nest that handles block and/or thread distribution in the
-  /// absence of user specified chunk sizes. This implicitly picks a block chunk
-  /// size equal to the number of threads in the block and a thread chunk size
-  /// equal to one. In contrast to the chunked version we can get away with a
-  /// single loop in this case
-  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
-                                        Ty NumBlocks, Ty BId, Ty NumThreads,
-                                        Ty TId, Ty NumIters,
-                                        bool OneIterationPerThread) {
-    Ty KernelIteration = NumBlocks * NumThreads;
-
-    // Start index in the normalized space.
-    Ty IV = BId * NumThreads + TId;
-    ASSERT(IV >= 0, "Bad index");
-
-    // Cover the entire iteration space, assumptions in the caller might allow
-    // to simplify this loop to a conditional.
-    if (IV < NumIters) {
-      do {
-
-        // Execute the loop body.
-        LoopBody(IV, Arg);
-
-        // Every thread executed one block and thread chunk now.
-        IV += KernelIteration;
-
-        if (OneIterationPerThread)
-          return;
-
-      } while (IV < NumIters);
-    }
-  }
-
-  /// Generic loop nest that handles block and/or thread distribution in the
-  /// presence of user specified chunk sizes (for at least one of them).
-  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
-                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
-                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
-                                        Ty NumIters,
-                                        bool OneIterationPerThread) {
-    Ty KernelIteration = NumBlocks * BlockChunk;
-
-    // Start index in the chunked space.
-    Ty IV = BId * BlockChunk + TId;
-    ASSERT(IV >= 0, "Bad index");
-
-    // Cover the entire iteration space, assumptions in the caller might allow
-    // to simplify this loop to a conditional.
-    do {
-
-      Ty BlockChunkLeft =
-          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
-      Ty ThreadChunkLeft =
-          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
-
-      while (ThreadChunkLeft--) {
-
-        // Given the blocking it's hard to keep track of what to execute.
-        if (IV >= NumIters)
-          return;
-
-        // Execute the loop body.
-        LoopBody(IV, Arg);
-
-        if (OneIterationPerThread)
-          return;
-
-        ++IV;
-      }
-
-      IV += KernelIteration;
-
-    } while (IV < NumIters);
-  }
-
-public:
-  /// Worksharing `for`-loop.
-  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
-    ASSERT(NumIters >= 0, "Bad iteration count");
-    ASSERT(ThreadChunk >= 0, "Bad thread count");
-
-    // All threads need to participate but we don't know if we are in a
-    // parallel at all or if the user might have used a `num_threads` clause
-    // on the parallel and reduced the number compared to the block size.
-    // Since nested parallels are possible too we need to get the thread id
-    // from the `omp` getter and not the mapping directly.
-    Ty TId = omp_get_thread_num();
-
-    // There are no blocks involved here.
-    Ty BlockChunk = 0;
-    Ty NumBlocks = 1;
-    Ty BId = 0;
-
-    // If the thread chunk is not specified we pick a default now.
-    if (ThreadChunk == 0)
-      ThreadChunk = 1;
-
-    // If we know we have more threads than iterations we can indicate that to
-    // avoid an outer loop.
-    bool OneIterationPerThread = false;
-    if (config::getAssumeThreadsOversubscription()) {
-      ASSERT(NumThreads >= NumIters, "Broken assumption");
-      OneIterationPerThread = true;
-    }
-
-    if (ThreadChunk != 1)
-      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
-                                ThreadChunk, NumThreads, TId, NumIters,
-                                OneIterationPerThread);
-    else
-      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
-                                NumIters, OneIterationPerThread);
-  }
-
-  /// Worksharing `distribute`-loop.
-  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                         Ty NumIters, Ty BlockChunk) {
-    ASSERT(icv::Level == 0, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
-
-    ASSERT(NumIters >= 0, "Bad iteration count");
-    ASSERT(BlockChunk >= 0, "Bad block count");
-
-    // There are no threads involved here.
-    Ty ThreadChunk = 0;
-    Ty NumThreads = 1;
-    Ty TId = 0;
-    ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");
-
-    // All teams need to participate.
-    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
-    Ty BId = mapping::getBlockIdInKernel();
-
-    // If the block chunk is not specified we pick a default now.
-    if (BlockChunk == 0)
-      BlockChunk = NumThreads;
-
-    // If we know we have more blocks than iterations we can indicate that to
-    // avoid an outer loop.
-    bool OneIterationPerThread = false;
-    if (config::getAssumeTeamsOversubscription()) {
-      ASSERT(NumBlocks >= NumIters, "Broken assumption");
-      OneIterationPerThread = true;
-    }
-
-    if (BlockChunk != NumThreads)
-      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
-                                ThreadChunk, NumThreads, TId, NumIters,
-                                OneIterationPerThread);
-    else
-      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
-                                NumIters, OneIterationPerThread);
-
-    ASSERT(icv::Level == 0, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
-  }
-
-  /// Worksharing `distribute parallel for`-loop.
-  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
-                            void *Arg, Ty NumIters, Ty NumThreads,
-                            Ty BlockChunk, Ty ThreadChunk) {
-    ASSERT(icv::Level == 1, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-
-    ASSERT(NumIters >= 0, "Bad iteration count");
-    ASSERT(BlockChunk >= 0, "Bad block count");
-    ASSERT(ThreadChunk >= 0, "Bad thread count");
-
-    // All threads need to participate but the user might have used a
-    // `num_threads` clause on the parallel and reduced the number compared to
-    // the block size.
-    Ty TId = mapping::getThreadIdInBlock();
-
-    // All teams need to participate.
-    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
-    Ty BId = mapping::getBlockIdInKernel();
-
-    // If the block chunk is not specified we pick a default now.
-    if (BlockChunk == 0)
-      BlockChunk = NumThreads;
-
-    // If the thread chunk is not specified we pick a default now.
-    if (ThreadChunk == 0)
-      ThreadChunk = 1;
-
-    // If we know we have more threads (across all blocks) than iterations we
-    // can indicate that to avoid an outer loop.
-    bool OneIterationPerThread = false;
-    if (config::getAssumeTeamsOversubscription() &
-        config::getAssumeThreadsOversubscription()) {
-      OneIterationPerThread = true;
-      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
-    }
-
-    if (BlockChunk != NumThreads || ThreadChunk != 1)
-      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
-                                ThreadChunk, NumThreads, TId, NumIters,
-                                OneIterationPerThread);
-    else
-      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
-                                NumIters, OneIterationPerThread);
-
-    ASSERT(icv::Level == 1, "Bad distribute");
-    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
-    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
-  }
-};
-
-} // namespace ompx
-
-#define OMP_LOOP_ENTRY(BW, TY)                                                 \
-  [[gnu::flatten, clang::always_inline]] void                                  \
-      __kmpc_distribute_for_static_loop##BW(                                   \
-          IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,       \
-          TY num_threads, TY block_chunk, TY thread_chunk) {                   \
-    ompx::StaticLoopChunker<TY>::DistributeFor(                                \
-        loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk);  \
-  }                                                                            \
-  [[gnu::flatten, clang::always_inline]] void                                  \
-      __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *),  \
-                                        void *arg, TY num_iters,               \
-                                        TY block_chunk) {                      \
-    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1,       \
-                                            block_chunk);                      \
-  }                                                                            \
-  [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW(      \
-      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
-      TY num_threads, TY thread_chunk) {                                       \
-    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1, num_threads, \
-                                     thread_chunk);                            \
-  }
-
-extern "C" {
-OMP_LOOP_ENTRY(_4, int32_t)
-OMP_LOOP_ENTRY(_4u, uint32_t)
-OMP_LOOP_ENTRY(_8, int64_t)
-OMP_LOOP_ENTRY(_8u, uint64_t)
-}
-
-#pragma omp end declare target

diff --git a/libomptarget/DeviceRTL/src/exports b/libomptarget/DeviceRTL/src/exports
deleted file mode 100644
index 288ddf9..0000000
--- a/libomptarget/DeviceRTL/src/exports
+++ /dev/null

@@ -1,18 +0,0 @@
-omp_*
-ompx_*
-*llvm_*
-__kmpc_*
-
-__omp_rtl_debug_kind
-__omp_rtl_assume_no_thread_state
-__omp_rtl_assume_no_nested_parallelism
-
-_ZN4ompx*
-
-IsSPMDMode
-
-malloc
-free
-memcmp
-printf
-__assert_fail

diff --git a/libomptarget/README.txt b/libomptarget/README.txt
deleted file mode 100644
index 4d30c10..0000000
--- a/libomptarget/README.txt
+++ /dev/null

@@ -1,27 +0,0 @@
-
-    README for the LLVM* OpenMP* Offloading Runtime Library (libomptarget)
-    ======================================================================
-
-For details about building, please look at README.rst in the parent directory
-and the build instructions as well as FAQ at https://openmp.llvm.org.
-
-Architectures Supported
-=======================
-The current library has only been tested on the Linux operating system and the
-following host architectures:
-* Intel(R) 64 architecture
-* IBM(R) Power architecture (big endian)
-* IBM(R) Power architecture (little endian)
-* ARM(R) AArch64 architecture (little endian)
-
-The currently supported offloading device architectures are:
-* Intel(R) or AMD(R) 64-bit architecture (x86_64)
-* IBM(R) Power architecture (big endian)
-* IBM(R) Power architecture (little endian)
-* ARM(R) AArch64 architecture (little endian)
-* CUDA(R) enabled 64-bit NVIDIA(R) GPU architectures
-* AMD(R) enabled 64-bit AMD(R) GPU architectures
-
------------------------------------------------------------------------
-
-*Other names and brands may be claimed as the property of others.

diff --git a/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
deleted file mode 100644
index bbf2b98..0000000
--- a/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
+++ /dev/null

@@ -1,110 +0,0 @@
-#
-#//===----------------------------------------------------------------------===//
-#//
-#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-#// See https://llvm.org/LICENSE.txt for license information.
-#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#//
-#//===----------------------------------------------------------------------===//
-#
-
-# Try to detect in the system several dependencies required by the different
-# components of libomptarget. These are the dependencies we have:
-#
-# libffi : required to launch target kernels given function and argument
-#          pointers.
-# CUDA : required to control offloading to NVIDIA GPUs.
-
-include (FindPackageHandleStandardArgs)
-
-################################################################################
-# Looking for LLVM...
-################################################################################
-
-if (OPENMP_STANDALONE_BUILD)
-  # Complete LLVM package is required for building libomptarget
-  # in an out-of-tree mode.
-  find_package(LLVM REQUIRED)
-  message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
-  message(STATUS "Using LLVM in: ${LLVM_DIR}")
-  list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS ${LLVM_INCLUDE_DIRS})
-  list(APPEND CMAKE_MODULE_PATH ${LLVM_CMAKE_DIR})
-  include(AddLLVM)
-  if(TARGET omptarget)
-    message(FATAL_ERROR "CMake target 'omptarget' already exists. "
-                        "Use an LLVM installation that doesn't expose its 'omptarget' target.")
-  endif()
-else()
-  # Note that OPENMP_STANDALONE_BUILD is FALSE, when
-  # openmp is built with -DLLVM_ENABLE_RUNTIMES="openmp" vs
-  # -DLLVM_ENABLE_PROJECTS="openmp", but openmp build
-  # is actually done as a standalone project build with many
-  # LLVM CMake variables propagated to it.
-  list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS
-    ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include
-    )
-  message(STATUS
-    "Using LLVM include directories: ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
-endif()
-
-################################################################################
-# Looking for libffi...
-################################################################################
-find_package(FFI QUIET)
-set(LIBOMPTARGET_DEP_LIBFFI_FOUND ${FFI_FOUND})
-
-################################################################################
-# Looking for CUDA...
-################################################################################
-
-find_package(CUDAToolkit QUIET)
-set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDAToolkit_FOUND})
-
-################################################################################
-# Looking for NVIDIA GPUs...
-################################################################################
-set(LIBOMPTARGET_DEP_CUDA_ARCH "sm_35")
-
-if(TARGET nvptx-arch)
-  get_property(LIBOMPTARGET_NVPTX_ARCH TARGET nvptx-arch PROPERTY LOCATION)
-else()
-  find_program(LIBOMPTARGET_NVPTX_ARCH NAMES nvptx-arch
-               PATHS ${LLVM_TOOLS_BINARY_DIR}/bin)
-endif()
-
-if(LIBOMPTARGET_NVPTX_ARCH)
-  execute_process(COMMAND ${LIBOMPTARGET_NVPTX_ARCH}
-                  OUTPUT_VARIABLE LIBOMPTARGET_NVPTX_ARCH_OUTPUT
-                  OUTPUT_STRIP_TRAILING_WHITESPACE)
-  string(REPLACE "\n" ";" nvptx_arch_list "${LIBOMPTARGET_NVPTX_ARCH_OUTPUT}")
-  if(nvptx_arch_list)
-    set(LIBOMPTARGET_FOUND_NVIDIA_GPU TRUE)
-    set(LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST "${nvptx_arch_list}")
-    list(GET nvptx_arch_list 0 LIBOMPTARGET_DEP_CUDA_ARCH)
-  endif()
-endif()
-
-
-################################################################################
-# Looking for AMD GPUs...
-################################################################################
-
-if(TARGET amdgpu-arch)
-  get_property(LIBOMPTARGET_AMDGPU_ARCH TARGET amdgpu-arch PROPERTY LOCATION)
-else()
-  find_program(LIBOMPTARGET_AMDGPU_ARCH NAMES amdgpu-arch
-               PATHS ${LLVM_TOOLS_BINARY_DIR}/bin)
-endif()
-
-if(LIBOMPTARGET_AMDGPU_ARCH)
-  execute_process(COMMAND ${LIBOMPTARGET_AMDGPU_ARCH}
-                  OUTPUT_VARIABLE LIBOMPTARGET_AMDGPU_ARCH_OUTPUT
-                  OUTPUT_STRIP_TRAILING_WHITESPACE)
-  string(REPLACE "\n" ";" amdgpu_arch_list "${LIBOMPTARGET_AMDGPU_ARCH_OUTPUT}")
-  if(amdgpu_arch_list)
-    set(LIBOMPTARGET_FOUND_AMDGPU_GPU TRUE)
-    set(LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST "${amdgpu_arch_list}")
-  endif()
-endif()
-
-set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB})

diff --git a/libomptarget/cmake/Modules/LibomptargetUtils.cmake b/libomptarget/cmake/Modules/LibomptargetUtils.cmake
deleted file mode 100644
index 7339cc0..0000000
--- a/libomptarget/cmake/Modules/LibomptargetUtils.cmake
+++ /dev/null

@@ -1,27 +0,0 @@
-#
-#//===----------------------------------------------------------------------===//
-#//
-#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-#// See https://llvm.org/LICENSE.txt for license information.
-#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#//
-#//===----------------------------------------------------------------------===//
-#
-
-# void libomptarget_say(string message_to_user);
-# - prints out message_to_user
-macro(libomptarget_say message_to_user)
-  message(STATUS "LIBOMPTARGET: ${message_to_user}")
-endmacro()
-
-# void libomptarget_warning_say(string message_to_user);
-# - prints out message_to_user with a warning
-macro(libomptarget_warning_say message_to_user)
-  message(WARNING "LIBOMPTARGET: ${message_to_user}")
-endmacro()
-
-# void libomptarget_error_say(string message_to_user);
-# - prints out message_to_user with an error and exits cmake
-macro(libomptarget_error_say message_to_user)
-  message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}")
-endmacro()

diff --git a/libomptarget/docs/declare_target_indirect.md b/libomptarget/docs/declare_target_indirect.md
deleted file mode 100644
index 443a5ab..0000000
--- a/libomptarget/docs/declare_target_indirect.md
+++ /dev/null

@@ -1,127 +0,0 @@
-# Overview
-
-The indirect clause enables **indirect device invocation** for a procedure:
-> 19 An indirect call to the device version of a procedure on a device other than the host<br>
-> 20 device, through a function pointer (C/C++), a pointer to a member function (C++) or<br>
-> 21 a procedure pointer (Fortran) that refers to the host version of the procedure.
-
-# Compiler support
-### Offload entry metadata (C++ FE)
-
-For each function declared as **declare target indirect**, the C++ FE generates the following offload metadata:
-
-```c++
-// Entry 0 -> Kind of this type of metadata (2)
-// Entry 1 -> Mangled name of the function.
-// Entry 2 -> Order the entry was created.
-```
-
-The offloading metadata uses new `OffloadEntriesInfoManagerTy::OffloadingEntryInfoKinds::OffloadingEntryInfoDeviceIndirectFunc`  metadata kind.
-
-### Offload entries table
-
-The offload entries table that is created for the host and for each of the device images currently has entries for **declare target** global variables, **omp target** outlined functions and constructor/destructor thunks for **declare target** global variables.
-
-
-The compiler will also produce an entry for each procedure listed in the **indirect** clause of a **declare target** construct:
-```C++
-struct __tgt_offload_entry {
-  void *addr;       // Pointer to the function
-  char *name;       // Name of the function
-  size_t size;      // 0 for function
-  int32_t flags;    // OpenMPOffloadingDeclareTargetFlags::OMP_DECLARE_TARGET_FPTR
-  int32_t reserved; // Reserved
-};
-```
-
-### Run-time dispatch in device code
-
-When an indirect function call is generated by a FE in **device code** it translates the original function pointer (which may be an address of a host function) into the device function pointer using a translation API, and uses the resulting function pointer for the call.
-
-Original call code:
-
-```
-  %0 = load void ()*, void ()** %fptr.addr
-  call void %0()
-```
-
-Becomes this:
-
-```
-  %0 = load void ()*, void ()** %fptr.addr
-  %1 = bitcast void ()* %0 to i8*
-  %call = call i8* @__kmpc_target_translate_fptr(i8* %1)
-  %fptr_device = bitcast i8* %call to void ()*
-  call void %fptr_device()
-```
-
-Device RTLs must provide the translation API:
-
-```c++
-// Translate \p FnPtr identifying a host function into a function pointer
-// identifying its device counterpart.
-// If \p FnPtr matches an address of any host function
-// declared as 'declare target indirect', then the API
-// must return an address of the same function compiled
-// for the device. If \p FnPtr does not match an address
-// of any host function, then the API returns \p FnPtr
-// unchanged.
-EXTERN void *__kmpc_target_translate_fptr(void *FnPtr);
-```
-
-# Runtime handling of function pointers
-
-`OpenMPOffloadingDeclareTargetFlags::OMP_DECLARE_TARGET_FPTR` is a new flag to distinguish offload entries for function pointers from other function entries.  Unlike other function entries (with `size` equal to 0) `omptarget::InitLibrary()` will establish mapping for function pointer entries in `Device.HostDataToTargetMap`.
-
-For each `OMP_DECLARE_TARGET_FPTR` entry in the offload entries table `libomptarget` creates an entry of the following type:
-
-```c++
-struct __omp_offloading_fptr_map_ty {
-  int64_t host_ptr; // key
-  int64_t tgt_ptr;  // value
-};
-```
-
-Where `host_ptr` is `__tgt_offload_entry::addr` in a **host** offload entry, and `tgt_ptr` is `__tgt_offload_entry::addr` in the corresponding **device** offload entry (which may be found using the populated `Device.HostDataToTargetMap`).
-
-When all `__omp_offloading_fptr_map_ty` entries are collected in a single host array, `libomptarget` sorts the table by `host_ptr` values and passes it to the device plugin for registration, if the plugin supports the optional `__tgt_rtl_set_function_ptr_map` API.
-
-Plugins may provide the following API, if they want to support **declare target indirect** functionality:
-
-```c++
-// Register in a target implementation defined way a table
-// of __omp_offloading_fptr_map_ty entries providing
-// mapping between host and device addresses of 'declare target indirect'
-// functions. \p table_size is the number of elements in \p table_host_ptr
-// array.
-EXTERN void __tgt_rtl_set_function_ptr_map(
-    int32_t device_id, uint64_t table_size, __omp_offloading_fptr_map_ty *table_host_ptr);
-```
-
-# Sample implementation
-
-This section describes one potential implementation.
-
-A FE may define the following global symbols for each translation module containing **declare target indirect**, when compiling this module for a device:
-
-```c++
-// Mapping between host and device functions declared as
-// 'declare target indirect'.
-__attribute__((weak)) struct __omp_offloading_fptr_map_ty {
-  int64_t host_ptr; // key
-  int64_t tgt_ptr;  // value
-} *__omp_offloading_fptr_map_p = 0;
-
-// Number of elements in __omp_offloading_fptr_map_p table.
-__attribute__((weak)) uint64_t __omp_offloading_fptr_map_size = 0;
-```
-
-`__tgt_rtl_set_function_ptr_map(int32_t device_id, uint64_t table_size, __omp_offloading_fptr_map_ty *table_host_ptr)` allocates device memory of size `sizeof(__omp_offloading_fptr_map_ty) * table_size`, and transfers the contents of `table_host_ptr` array into this device memory.  An address of the allocated device memory area is then assigned to `__omp_offloading_fptr_map_p` global variables on the device.  For example, in **CUDA**, a device address of `__omp_offloading_fptr_map_p` may be taken by calling `cuModuleGetGlobal`, and then a pointer-sized data transfer will initialize `__omp_offloading_fptr_map_p` to point to the device copy of `table_host_ptr` array.  `__omp_offloading_fptr_map_size` is assigned to `table_size` the same way.
-
-An alternative implementation of `__tgt_rtl_set_function_ptr_map` may invoke a device kernel that will do the assignments.
-
-`__kmpc_target_translate_fptr(void *FnPtr)` API uses binary search to match `FnPtr` against `host_ptr` inside the device table pointed to by `__omp_offloading_fptr_map_p`.  If the matching key is found, it returns the corresponding `tgt_ptr`, otherwise, it returns `FnPtr`.
-
-# TODO: Optimization for non-unified_shared_memory
-
-If a program does not use **requires unified_shared_memory**, and all function pointers are mapped (not a requirement of the OpenMP spec), then an implementation may avoid the runtime dispatch code for indirect function calls (i.e. `__kmpc_target_translate_fptr` is not needed), and `__tgt_rtl_set_function_ptr_map` is not needed either.  `libomptarget` will just map the function pointers as regular data pointers via `Device.HostDataToTargetMap`.

diff --git a/libomptarget/include/DeviceImage.h b/libomptarget/include/DeviceImage.h
deleted file mode 100644
index ba46a83..0000000
--- a/libomptarget/include/DeviceImage.h
+++ /dev/null

@@ -1,44 +0,0 @@
-//===-- DeviceImage.h - Representation of the device code/image -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_DEVICE_IMAGE_H
-#define OMPTARGET_DEVICE_IMAGE_H
-
-#include "OffloadEntry.h"
-#include "Shared/APITypes.h"
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/iterator.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Object/OffloadBinary.h"
-
-#include <memory>
-
-class DeviceImageTy {
-
-  std::unique_ptr<llvm::object::OffloadBinary> Binary;
-
-  __tgt_bin_desc *BinaryDesc;
-  __tgt_device_image Image;
-
-public:
-  DeviceImageTy(__tgt_bin_desc &BinaryDesc, __tgt_device_image &Image);
-
-  __tgt_device_image &getExecutableImage() { return Image; }
-  __tgt_bin_desc &getBinaryDesc() { return *BinaryDesc; }
-
-  auto entries() {
-    return llvm::make_range(Image.EntriesBegin, Image.EntriesEnd);
-  }
-};
-
-#endif // OMPTARGET_DEVICE_IMAGE_H

diff --git a/libomptarget/include/ExclusiveAccess.h b/libomptarget/include/ExclusiveAccess.h
deleted file mode 100644
index 09b3aac..0000000
--- a/libomptarget/include/ExclusiveAccess.h
+++ /dev/null

@@ -1,101 +0,0 @@
-//===---- ExclusiveAccess.h - Helper for exclusive access data structures -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_EXCLUSIVE_ACCESS
-#define OMPTARGET_EXCLUSIVE_ACCESS
-
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <mutex>
-
-/// Forward declaration.
-template <typename Ty> struct Accessor;
-
-/// A protected object is a simple wrapper to allocate an object of type \p Ty
-/// together with a mutex that guards accesses to the object. The only way to
-/// access the object is through the "exclusive accessor" which will lock the
-/// mutex accordingly.
-template <typename Ty> struct ProtectedObj {
-  using AccessorTy = Accessor<Ty>;
-
-  /// Get an exclusive access Accessor object. \p DoNotGetAccess allows
-  /// creating an accessor that does not own anything, based on a boolean
-  /// condition.
-  AccessorTy getExclusiveAccessor(bool DoNotGetAccess = false);
-
-private:
-  Ty Obj;
-  std::mutex Mtx;
-  friend struct Accessor<Ty>;
-};
-
-/// Helper to provide transparent exclusive access to protected objects.
-template <typename Ty> struct Accessor {
-  /// Default constructor does not own anything and cannot access anything.
-  Accessor() : Ptr(nullptr) {}
-
-  /// Constructor to get exclusive access by locking the mutex protecting the
-  /// underlying object.
-  Accessor(ProtectedObj<Ty> &PO) : Ptr(&PO) { lock(); }
-
-  /// Constructor to get exclusive access by taking it from \p Other.
-  Accessor(Accessor<Ty> &&Other) : Ptr(Other.Ptr) { Other.Ptr = nullptr; }
-
-  Accessor(Accessor &Other) = delete;
-
-  /// If the object is still owned when the lifetime ends we give up access.
-  ~Accessor() { unlock(); }
-
-  /// Give up access to the underlying object, virtually "destroying" the
-  /// accessor even if the object is still alive.
-  void destroy() {
-    unlock();
-    Ptr = nullptr;
-  }
-
-  /// Provide transparent access to the underlying object.
-  Ty &operator*() {
-    assert(Ptr && "Trying to access an object through a non-owning (or "
-                  "destroyed) accessor!");
-    return Ptr->Obj;
-  }
-  Ty *operator->() {
-    assert(Ptr && "Trying to access an object through a non-owning (or "
-                  "destroyed) accessor!");
-    return &Ptr->Obj;
-  }
-
-private:
-  /// Lock the underlying object if there is one.
-  void lock() {
-    if (Ptr)
-      Ptr->Mtx.lock();
-  }
-
-  /// Unlock the underlying object if there is one.
-  void unlock() {
-    if (Ptr)
-      Ptr->Mtx.unlock();
-  }
-
-  /// Pointer to the underlying object or null if the accessor lost access,
-  /// e.g., after a destroy call.
-  ProtectedObj<Ty> *Ptr;
-};
-
-template <typename Ty>
-Accessor<Ty> ProtectedObj<Ty>::getExclusiveAccessor(bool DoNotGetAccess) {
-  if (DoNotGetAccess)
-    return Accessor<Ty>();
-  return Accessor<Ty>(*this);
-}
-
-#endif

diff --git a/libomptarget/include/OffloadEntry.h b/libomptarget/include/OffloadEntry.h
deleted file mode 100644
index da1de81..0000000
--- a/libomptarget/include/OffloadEntry.h
+++ /dev/null

@@ -1,46 +0,0 @@
-//===-- OffloadEntry.h - Representation of offload entries ------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_OFFLOAD_ENTRY_H
-#define OMPTARGET_OFFLOAD_ENTRY_H
-
-#include "Shared/APITypes.h"
-
-#include "omptarget.h"
-
-#include "llvm/ADT/StringRef.h"
-
-class DeviceImageTy;
-
-class OffloadEntryTy {
-  DeviceImageTy &DeviceImage;
-  __tgt_offload_entry &OffloadEntry;
-
-public:
-  OffloadEntryTy(DeviceImageTy &DeviceImage, __tgt_offload_entry &OffloadEntry)
-      : DeviceImage(DeviceImage), OffloadEntry(OffloadEntry) {}
-
-  bool isGlobal() const { return getSize() != 0; }
-  size_t getSize() const { return OffloadEntry.size; }
-
-  void *getAddress() const { return OffloadEntry.addr; }
-  llvm::StringRef getName() const { return OffloadEntry.name; }
-  const char *getNameAsCStr() const { return OffloadEntry.name; }
-  __tgt_bin_desc *getBinaryDescription() const;
-
-  bool isLink() const { return hasFlags(OMP_DECLARE_TARGET_LINK); }
-
-  bool hasFlags(OpenMPOffloadingDeclareTargetFlags Flags) const {
-    return Flags & OffloadEntry.flags;
-  }
-};
-
-#endif // OMPTARGET_OFFLOAD_ENTRY_H

diff --git a/libomptarget/include/OffloadPolicy.h b/libomptarget/include/OffloadPolicy.h
deleted file mode 100644
index 858d9c3..0000000
--- a/libomptarget/include/OffloadPolicy.h
+++ /dev/null

@@ -1,63 +0,0 @@
-//===-- OffloadPolicy.h - Configuration of offload behavior -----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Configuration for offload behavior, e.g., if offload is disabled, can be
-// disabled, is mandatory, etc.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_OFFLOAD_POLICY_H
-#define OMPTARGET_OFFLOAD_POLICY_H
-
-#include "PluginManager.h"
-
-enum kmp_target_offload_kind_t {
-  tgt_disabled = 0,
-  tgt_default = 1,
-  tgt_mandatory = 2
-};
-
-extern "C" int __kmpc_get_target_offload(void) __attribute__((weak));
-
-class OffloadPolicy {
-
-  OffloadPolicy(PluginManager &PM) {
-    // TODO: Check for OpenMP.
-    switch ((kmp_target_offload_kind_t)__kmpc_get_target_offload()) {
-    case tgt_disabled:
-      Kind = DISABLED;
-      return;
-    case tgt_mandatory:
-      Kind = MANDATORY;
-      return;
-    default:
-      if (PM.getNumDevices()) {
-        DP("Default TARGET OFFLOAD policy is now mandatory "
-           "(devices were found)\n");
-        Kind = MANDATORY;
-      } else {
-        DP("Default TARGET OFFLOAD policy is now disabled "
-           "(no devices were found)\n");
-        Kind = DISABLED;
-      }
-      return;
-    };
-  }
-
-public:
-  static const OffloadPolicy &get(PluginManager &PM) {
-    static OffloadPolicy OP(PM);
-    return OP;
-  }
-
-  enum OffloadPolicyKind { DISABLED, MANDATORY };
-
-  OffloadPolicyKind Kind = MANDATORY;
-};
-
-#endif // OMPTARGET_OFFLOAD_POLICY_H

diff --git a/libomptarget/include/OpenMP/InternalTypes.h b/libomptarget/include/OpenMP/InternalTypes.h
deleted file mode 100644
index bd84c38..0000000
--- a/libomptarget/include/OpenMP/InternalTypes.h
+++ /dev/null

@@ -1,152 +0,0 @@
-//===-- OpenMP/InternalTypes.h -- Internal OpenMP Types ------------- C++ -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Private type declarations and helper macros for OpenMP.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_OPENMP_INTERNAL_TYPES_H
-#define OMPTARGET_OPENMP_INTERNAL_TYPES_H
-
-#include <cstddef>
-#include <cstdint>
-
-extern "C" {
-
-// Compiler sends us this info:
-typedef struct kmp_depend_info {
-  intptr_t base_addr;
-  size_t len;
-  struct {
-    bool in : 1;
-    bool out : 1;
-    bool mtx : 1;
-  } flags;
-} kmp_depend_info_t;
-
-typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
-  /* Compiler flags */             /* Total compiler flags must be 16 bits */
-  unsigned tiedness : 1;           /* task is either tied (1) or untied (0) */
-  unsigned final : 1;              /* task is final(1) so execute immediately */
-  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
-                              code path */
-  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
-                                     invoke destructors from the runtime */
-  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
-                         context of the RTL) */
-  unsigned priority_specified : 1; /* set if the compiler provides priority
-                                      setting for the task */
-  unsigned detachable : 1;         /* 1 == can detach */
-  unsigned hidden_helper : 1;      /* 1 == hidden helper task */
-  unsigned reserved : 8;           /* reserved for compiler use */
-
-  /* Library flags */       /* Total library flags must be 16 bits */
-  unsigned tasktype : 1;    /* task is either explicit(1) or implicit (0) */
-  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
-  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
-  // (1) or may be deferred (0)
-  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
-  // (0) [>= 2 threads]
-  /* If either team_serial or tasking_ser is set, task team may be NULL */
-  /* Task State Flags: */
-  unsigned started : 1;    /* 1==started, 0==not started     */
-  unsigned executing : 1;  /* 1==executing, 0==not executing */
-  unsigned complete : 1;   /* 1==complete, 0==not complete   */
-  unsigned freed : 1;      /* 1==freed, 0==allocated        */
-  unsigned native : 1;     /* 1==gcc-compiled task, 0==intel */
-  unsigned reserved31 : 7; /* reserved for library use */
-} kmp_tasking_flags_t;
-
-struct kmp_task;
-typedef int32_t (*kmp_routine_entry_t)(int32_t, struct kmp_task *);
-typedef struct kmp_task {
-  void *shareds;
-  kmp_routine_entry_t routine;
-  int32_t part_id;
-} kmp_task_t;
-
-int32_t __kmpc_global_thread_num(void *) __attribute__((weak));
-bool __kmpc_omp_has_task_team(int32_t gtid) __attribute__((weak));
-void **__kmpc_omp_get_target_async_handle_ptr(int32_t gtid)
-    __attribute__((weak));
-
-/**
- * The argument set that is passed from asynchronous memory copy to block
- * version of memory copy invoked in helper task
- */
-struct TargetMemcpyArgsTy {
-  /**
-   * Common attribuutes
-   */
-  void *Dst;
-  const void *Src;
-  int DstDevice;
-  int SrcDevice;
-
-  /**
-   * The flag that denotes single dimensional or rectangle dimensional copy
-   */
-  bool IsRectMemcpy;
-
-  /**
-   * Arguments for single dimensional copy
-   */
-  size_t Length;
-  size_t DstOffset;
-  size_t SrcOffset;
-
-  /**
-   * Arguments for rectangle dimensional copy
-   */
-  size_t ElementSize;
-  int NumDims;
-  const size_t *Volume;
-  const size_t *DstOffsets;
-  const size_t *SrcOffsets;
-  const size_t *DstDimensions;
-  const size_t *SrcDimensions;
-
-  /**
-   * Constructor for single dimensional copy
-   */
-  TargetMemcpyArgsTy(void *Dst, const void *Src, size_t Length,
-                     size_t DstOffset, size_t SrcOffset, int DstDevice,
-                     int SrcDevice)
-      : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
-        IsRectMemcpy(false), Length(Length), DstOffset(DstOffset),
-        SrcOffset(SrcOffset), ElementSize(0), NumDims(0), Volume(0),
-        DstOffsets(0), SrcOffsets(0), DstDimensions(0), SrcDimensions(0){};
-
-  /**
-   * Constructor for rectangle dimensional copy
-   */
-  TargetMemcpyArgsTy(void *Dst, const void *Src, size_t ElementSize,
-                     int NumDims, const size_t *Volume,
-                     const size_t *DstOffsets, const size_t *SrcOffsets,
-                     const size_t *DstDimensions, const size_t *SrcDimensions,
-                     int DstDevice, int SrcDevice)
-      : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
-        IsRectMemcpy(true), Length(0), DstOffset(0), SrcOffset(0),
-        ElementSize(ElementSize), NumDims(NumDims), Volume(Volume),
-        DstOffsets(DstOffsets), SrcOffsets(SrcOffsets),
-        DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){};
-};
-
-struct TargetMemsetArgsTy {
-  // Common attributes of a memset operation
-  void *Ptr;
-  int C;
-  size_t N;
-  int DeviceNum;
-
-  // no constructors defined, because this is a PoD
-};
-
-} // extern "C"
-
-#endif // OMPTARGET_OPENMP_INTERNAL_TYPES_H

diff --git a/libomptarget/include/OpenMP/InteropAPI.h b/libomptarget/include/OpenMP/InteropAPI.h
deleted file mode 100644
index 71c7876..0000000
--- a/libomptarget/include/OpenMP/InteropAPI.h
+++ /dev/null

@@ -1,43 +0,0 @@
-//===-- OpenMP/InteropAPI.h - OpenMP interoperability types and API - C++ -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_OPENMP_INTEROP_API_H
-#define OMPTARGET_OPENMP_INTEROP_API_H
-
-#include "omp.h"
-
-#include "omptarget.h"
-
-extern "C" {
-
-typedef enum kmp_interop_type_t {
-  kmp_interop_type_unknown = -1,
-  kmp_interop_type_platform,
-  kmp_interop_type_device,
-  kmp_interop_type_tasksync,
-} kmp_interop_type_t;
-
-/// The interop value type, aka. the interop object.
-typedef struct omp_interop_val_t {
-  /// Device and interop-type are determined at construction time and fix.
-  omp_interop_val_t(intptr_t device_id, kmp_interop_type_t interop_type)
-      : interop_type(interop_type), device_id(device_id) {}
-  const char *err_str = nullptr;
-  __tgt_async_info *async_info = nullptr;
-  __tgt_device_info device_info;
-  const kmp_interop_type_t interop_type;
-  const intptr_t device_id;
-  const omp_foreign_runtime_ids_t vendor_id = cuda;
-  const intptr_t backend_type_id = omp_interop_backend_type_cuda_1;
-} omp_interop_val_t;
-
-} // extern "C"
-
-#endif // OMPTARGET_OPENMP_INTEROP_API_H

diff --git a/libomptarget/include/OpenMP/Mapping.h b/libomptarget/include/OpenMP/Mapping.h
deleted file mode 100644
index b9f5c16..0000000
--- a/libomptarget/include/OpenMP/Mapping.h
+++ /dev/null

@@ -1,538 +0,0 @@
-//===-- OpenMP/Mapping.h - OpenMP/OpenACC pointer mapping -------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Declarations for managing host-to-device pointer mappings.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_OPENMP_MAPPING_H
-#define OMPTARGET_OPENMP_MAPPING_H
-
-#include "ExclusiveAccess.h"
-#include "Shared/EnvironmentVar.h"
-#include "omptarget.h"
-
-#include <cstdint>
-#include <mutex>
-#include <string>
-
-#include "llvm/ADT/SmallSet.h"
-
-struct DeviceTy;
-class AsyncInfoTy;
-
-using map_var_info_t = void *;
-
-class MappingConfig {
-
-  MappingConfig() {
-    BoolEnvar ForceAtomic = BoolEnvar("LIBOMPTARGET_MAP_FORCE_ATOMIC", true);
-    UseEventsForAtomicTransfers = ForceAtomic;
-  }
-
-public:
-  static const MappingConfig &get() {
-    static MappingConfig MP;
-    return MP;
-  };
-
-  /// Flag to indicate if we use events to ensure the atomicity of
-  /// map clauses or not. Can be modified with an environment variable.
-  bool UseEventsForAtomicTransfers = true;
-};
-
-/// Information about shadow pointers.
-struct ShadowPtrInfoTy {
-  void **HstPtrAddr = nullptr;
-  void *HstPtrVal = nullptr;
-  void **TgtPtrAddr = nullptr;
-  void *TgtPtrVal = nullptr;
-
-  bool operator==(const ShadowPtrInfoTy &Other) const {
-    return HstPtrAddr == Other.HstPtrAddr;
-  }
-};
-
-inline bool operator<(const ShadowPtrInfoTy &lhs, const ShadowPtrInfoTy &rhs) {
-  return lhs.HstPtrAddr < rhs.HstPtrAddr;
-}
-
-/// Map between host data and target data.
-struct HostDataToTargetTy {
-  const uintptr_t HstPtrBase; // host info.
-  const uintptr_t HstPtrBegin;
-  const uintptr_t HstPtrEnd;       // non-inclusive.
-  const map_var_info_t HstPtrName; // Optional source name of mapped variable.
-
-  const uintptr_t TgtAllocBegin; // allocated target memory
-  const uintptr_t TgtPtrBegin; // mapped target memory = TgtAllocBegin + padding
-
-private:
-  static const uint64_t INFRefCount = ~(uint64_t)0;
-  static std::string refCountToStr(uint64_t RefCount) {
-    return RefCount == INFRefCount ? "INF" : std::to_string(RefCount);
-  }
-
-  struct StatesTy {
-    StatesTy(uint64_t DRC, uint64_t HRC)
-        : DynRefCount(DRC), HoldRefCount(HRC) {}
-    /// The dynamic reference count is the standard reference count as of OpenMP
-    /// 4.5.  The hold reference count is an OpenMP extension for the sake of
-    /// OpenACC support.
-    ///
-    /// The 'ompx_hold' map type modifier is permitted only on "omp target" and
-    /// "omp target data", and "delete" is permitted only on "omp target exit
-    /// data" and associated runtime library routines.  As a result, we really
-    /// need to implement "reset" functionality only for the dynamic reference
-    /// counter.  Likewise, only the dynamic reference count can be infinite
-    /// because, for example, omp_target_associate_ptr and "omp declare target
-    /// link" operate only on it.  Nevertheless, it's actually easier to follow
-    /// the code (and requires less assertions for special cases) when we just
-    /// implement these features generally across both reference counters here.
-    /// Thus, it's the users of this class that impose those restrictions.
-    ///
-    uint64_t DynRefCount;
-    uint64_t HoldRefCount;
-
-    /// A map of shadow pointers associated with this entry, the keys are host
-    /// pointer addresses to identify stale entries.
-    llvm::SmallSet<ShadowPtrInfoTy, 2> ShadowPtrInfos;
-
-    /// Pointer to the event corresponding to the data update of this map.
-    /// Note: At present this event is created when the first data transfer from
-    /// host to device is issued, and only being used for H2D. It is not used
-    /// for data transfer in another direction (device to host). It is still
-    /// unclear whether we need it for D2H. If in the future we need similar
-    /// mechanism for D2H, and if the event cannot be shared between them, Event
-    /// should be written as <tt>void *Event[2]</tt>.
-    void *Event = nullptr;
-
-    /// Number of threads currently holding a reference to the entry at a
-    /// targetDataEnd. This is used to ensure that only the last thread that
-    /// references this entry will actually delete it.
-    int32_t DataEndThreadCount = 0;
-  };
-  // When HostDataToTargetTy is used by std::set, std::set::iterator is const
-  // use unique_ptr to make States mutable.
-  const std::unique_ptr<StatesTy> States;
-
-public:
-  HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E,
-                     uintptr_t TgtAllocBegin, uintptr_t TgtPtrBegin,
-                     bool UseHoldRefCount, map_var_info_t Name = nullptr,
-                     bool IsINF = false)
-      : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name),
-        TgtAllocBegin(TgtAllocBegin), TgtPtrBegin(TgtPtrBegin),
-        States(std::make_unique<StatesTy>(UseHoldRefCount ? 0
-                                          : IsINF         ? INFRefCount
-                                                          : 1,
-                                          !UseHoldRefCount ? 0
-                                          : IsINF          ? INFRefCount
-                                                           : 1)) {}
-
-  /// Get the total reference count.  This is smarter than just getDynRefCount()
-  /// + getHoldRefCount() because it handles the case where at least one is
-  /// infinity and the other is non-zero.
-  uint64_t getTotalRefCount() const {
-    if (States->DynRefCount == INFRefCount ||
-        States->HoldRefCount == INFRefCount)
-      return INFRefCount;
-    return States->DynRefCount + States->HoldRefCount;
-  }
-
-  /// Get the dynamic reference count.
-  uint64_t getDynRefCount() const { return States->DynRefCount; }
-
-  /// Get the hold reference count.
-  uint64_t getHoldRefCount() const { return States->HoldRefCount; }
-
-  /// Get the event bound to this data map.
-  void *getEvent() const { return States->Event; }
-
-  /// Add a new event, if necessary.
-  /// Returns OFFLOAD_FAIL if something went wrong, OFFLOAD_SUCCESS otherwise.
-  int addEventIfNecessary(DeviceTy &Device, AsyncInfoTy &AsyncInfo) const;
-
-  /// Functions that manages the number of threads referencing the entry in a
-  /// targetDataEnd.
-  void incDataEndThreadCount() { ++States->DataEndThreadCount; }
-
-  [[nodiscard]] int32_t decDataEndThreadCount() {
-    return --States->DataEndThreadCount;
-  }
-
-  [[nodiscard]] int32_t getDataEndThreadCount() const {
-    return States->DataEndThreadCount;
-  }
-
-  /// Set the event bound to this data map.
-  void setEvent(void *Event) const { States->Event = Event; }
-
-  /// Reset the specified reference count unless it's infinity.  Reset to 1
-  /// (even if currently 0) so it can be followed by a decrement.
-  void resetRefCount(bool UseHoldRefCount) const {
-    uint64_t &ThisRefCount =
-        UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
-    if (ThisRefCount != INFRefCount)
-      ThisRefCount = 1;
-  }
-
-  /// Increment the specified reference count unless it's infinity.
-  void incRefCount(bool UseHoldRefCount) const {
-    uint64_t &ThisRefCount =
-        UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
-    if (ThisRefCount != INFRefCount) {
-      ++ThisRefCount;
-      assert(ThisRefCount < INFRefCount && "refcount overflow");
-    }
-  }
-
-  /// Decrement the specified reference count unless it's infinity or zero, and
-  /// return the total reference count.
-  uint64_t decRefCount(bool UseHoldRefCount) const {
-    uint64_t &ThisRefCount =
-        UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
-    uint64_t OtherRefCount =
-        UseHoldRefCount ? States->DynRefCount : States->HoldRefCount;
-    (void)OtherRefCount;
-    if (ThisRefCount != INFRefCount) {
-      if (ThisRefCount > 0)
-        --ThisRefCount;
-      else
-        assert(OtherRefCount >= 0 && "total refcount underflow");
-    }
-    return getTotalRefCount();
-  }
-
-  /// Is the dynamic (and thus the total) reference count infinite?
-  bool isDynRefCountInf() const { return States->DynRefCount == INFRefCount; }
-
-  /// Convert the dynamic reference count to a debug string.
-  std::string dynRefCountToStr() const {
-    return refCountToStr(States->DynRefCount);
-  }
-
-  /// Convert the hold reference count to a debug string.
-  std::string holdRefCountToStr() const {
-    return refCountToStr(States->HoldRefCount);
-  }
-
-  /// Should one decrement of the specified reference count (after resetting it
-  /// if \c AfterReset) remove this mapping?
-  bool decShouldRemove(bool UseHoldRefCount, bool AfterReset = false) const {
-    uint64_t ThisRefCount =
-        UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
-    uint64_t OtherRefCount =
-        UseHoldRefCount ? States->DynRefCount : States->HoldRefCount;
-    if (OtherRefCount > 0)
-      return false;
-    if (AfterReset)
-      return ThisRefCount != INFRefCount;
-    return ThisRefCount == 1;
-  }
-
-  /// Add the shadow pointer info \p ShadowPtrInfo to this entry but only if the
-  /// the target ptr value was not already present in the existing set of shadow
-  /// pointers. Return true if something was added.
-  bool addShadowPointer(const ShadowPtrInfoTy &ShadowPtrInfo) const {
-    auto Pair = States->ShadowPtrInfos.insert(ShadowPtrInfo);
-    if (Pair.second)
-      return true;
-    // Check for a stale entry, if found, replace the old one.
-    if ((*Pair.first).TgtPtrVal == ShadowPtrInfo.TgtPtrVal)
-      return false;
-    States->ShadowPtrInfos.erase(ShadowPtrInfo);
-    return addShadowPointer(ShadowPtrInfo);
-  }
-
-  /// Apply \p CB to all shadow pointers of this entry. Returns OFFLOAD_FAIL if
-  /// \p CB returned OFFLOAD_FAIL for any of them, otherwise this returns
-  /// OFFLOAD_SUCCESS. The entry is locked for this operation.
-  template <typename CBTy> int foreachShadowPointerInfo(CBTy CB) const {
-    for (auto &It : States->ShadowPtrInfos)
-      if (CB(const_cast<ShadowPtrInfoTy &>(It)) == OFFLOAD_FAIL)
-        return OFFLOAD_FAIL;
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// Lock this entry for exclusive access. Ensure to get exclusive access to
-  /// HDTTMap first!
-  void lock() const { Mtx.lock(); }
-
-  /// Unlock this entry to allow other threads inspecting it.
-  void unlock() const { Mtx.unlock(); }
-
-private:
-  // Mutex that needs to be held before the entry is inspected or modified. The
-  // HDTTMap mutex needs to be held before trying to lock any HDTT Entry.
-  mutable std::mutex Mtx;
-};
-
-/// Wrapper around the HostDataToTargetTy to be used in the HDTT map. In
-/// addition to the HDTT pointer we store the key value explicitly. This
-/// allows the set to inspect (sort/search/...) this entry without an additional
-/// load of HDTT. HDTT is a pointer to allow the modification of the set without
-/// invalidating HDTT entries which can now be inspected at the same time.
-struct HostDataToTargetMapKeyTy {
-  uintptr_t KeyValue;
-
-  HostDataToTargetMapKeyTy(void *Key) : KeyValue(uintptr_t(Key)) {}
-  HostDataToTargetMapKeyTy(uintptr_t Key) : KeyValue(Key) {}
-  HostDataToTargetMapKeyTy(HostDataToTargetTy *HDTT)
-      : KeyValue(HDTT->HstPtrBegin), HDTT(HDTT) {}
-  HostDataToTargetTy *HDTT;
-};
-inline bool operator<(const HostDataToTargetMapKeyTy &LHS,
-                      const uintptr_t &RHS) {
-  return LHS.KeyValue < RHS;
-}
-inline bool operator<(const uintptr_t &LHS,
-                      const HostDataToTargetMapKeyTy &RHS) {
-  return LHS < RHS.KeyValue;
-}
-inline bool operator<(const HostDataToTargetMapKeyTy &LHS,
-                      const HostDataToTargetMapKeyTy &RHS) {
-  return LHS.KeyValue < RHS.KeyValue;
-}
-
-/// This struct will be returned by \p DeviceTy::getTargetPointer which provides
-/// more data than just a target pointer. A TargetPointerResultTy that has a non
-/// null Entry owns the entry. As long as the TargetPointerResultTy (TPR) exists
-/// the entry is locked. To give up ownership without destroying the TPR use the
-/// reset() function.
-struct TargetPointerResultTy {
-  struct FlagTy {
-    /// If the map table entry is just created
-    unsigned IsNewEntry : 1;
-    /// If the pointer is actually a host pointer (when unified memory enabled)
-    unsigned IsHostPointer : 1;
-    /// If the pointer is present in the mapping table.
-    unsigned IsPresent : 1;
-    /// Flag indicating that this was the last user of the entry and the ref
-    /// count is now 0.
-    unsigned IsLast : 1;
-    /// If the pointer is contained.
-    unsigned IsContained : 1;
-  } Flags = {0, 0, 0, 0, 0};
-
-  TargetPointerResultTy(const TargetPointerResultTy &) = delete;
-  TargetPointerResultTy &operator=(const TargetPointerResultTy &TPR) = delete;
-  TargetPointerResultTy() {}
-
-  TargetPointerResultTy(FlagTy Flags, HostDataToTargetTy *Entry,
-                        void *TargetPointer)
-      : Flags(Flags), TargetPointer(TargetPointer), Entry(Entry) {
-    if (Entry)
-      Entry->lock();
-  }
-
-  TargetPointerResultTy(TargetPointerResultTy &&TPR)
-      : Flags(TPR.Flags), TargetPointer(TPR.TargetPointer), Entry(TPR.Entry) {
-    TPR.Entry = nullptr;
-  }
-
-  TargetPointerResultTy &operator=(TargetPointerResultTy &&TPR) {
-    if (&TPR != this) {
-      std::swap(Flags, TPR.Flags);
-      std::swap(Entry, TPR.Entry);
-      std::swap(TargetPointer, TPR.TargetPointer);
-    }
-    return *this;
-  }
-
-  ~TargetPointerResultTy() {
-    if (Entry)
-      Entry->unlock();
-  }
-
-  bool isPresent() const { return Flags.IsPresent; }
-
-  bool isHostPointer() const { return Flags.IsHostPointer; }
-
-  bool isContained() const { return Flags.IsContained; }
-
-  /// The corresponding target pointer
-  void *TargetPointer = nullptr;
-
-  HostDataToTargetTy *getEntry() const { return Entry; }
-  void setEntry(HostDataToTargetTy *HDTTT,
-                HostDataToTargetTy *OwnedTPR = nullptr) {
-    if (Entry)
-      Entry->unlock();
-    Entry = HDTTT;
-    if (Entry && Entry != OwnedTPR)
-      Entry->lock();
-  }
-
-  void reset() { *this = TargetPointerResultTy(); }
-
-private:
-  /// The corresponding map table entry which is stable.
-  HostDataToTargetTy *Entry = nullptr;
-};
-
-struct LookupResult {
-  struct {
-    unsigned IsContained : 1;
-    unsigned ExtendsBefore : 1;
-    unsigned ExtendsAfter : 1;
-  } Flags;
-
-  LookupResult() : Flags({0, 0, 0}), TPR() {}
-
-  TargetPointerResultTy TPR;
-};
-
-// This structure stores information of a mapped memory region.
-struct MapComponentInfoTy {
-  void *Base;
-  void *Begin;
-  int64_t Size;
-  int64_t Type;
-  void *Name;
-  MapComponentInfoTy() = default;
-  MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type,
-                     void *Name)
-      : Base(Base), Begin(Begin), Size(Size), Type(Type), Name(Name) {}
-};
-
-// This structure stores all components of a user-defined mapper. The number of
-// components are dynamically decided, so we utilize C++ STL vector
-// implementation here.
-struct MapperComponentsTy {
-  llvm::SmallVector<MapComponentInfoTy> Components;
-  int32_t size() { return Components.size(); }
-};
-
-// The mapper function pointer type. It follows the signature below:
-// void .omp_mapper.<type_name>.<mapper_id>.(void *rt_mapper_handle,
-//                                           void *base, void *begin,
-//                                           size_t size, int64_t type,
-//                                           void * name);
-typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t,
-                                void *);
-
-// Function pointer type for targetData* functions (targetDataBegin,
-// targetDataEnd and targetDataUpdate).
-typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
-                                   void **, int64_t *, int64_t *,
-                                   map_var_info_t *, void **, AsyncInfoTy &,
-                                   bool);
-
-void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device,
-                               bool toStdOut = false);
-
-int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
-                    void **ArgsBase, void **Args, int64_t *ArgSizes,
-                    int64_t *ArgTypes, map_var_info_t *ArgNames,
-                    void **ArgMappers, AsyncInfoTy &AsyncInfo,
-                    bool FromMapper = false);
-
-int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
-                  void **ArgBases, void **Args, int64_t *ArgSizes,
-                  int64_t *ArgTypes, map_var_info_t *ArgNames,
-                  void **ArgMappers, AsyncInfoTy &AsyncInfo,
-                  bool FromMapper = false);
-
-int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
-                     void **ArgsBase, void **Args, int64_t *ArgSizes,
-                     int64_t *ArgTypes, map_var_info_t *ArgNames,
-                     void **ArgMappers, AsyncInfoTy &AsyncInfo,
-                     bool FromMapper = false);
-
-struct MappingInfoTy {
-  MappingInfoTy(DeviceTy &Device) : Device(Device) {}
-
-  /// Host data to device map type with a wrapper key indirection that allows
-  /// concurrent modification of the entries without invalidating the underlying
-  /// entries.
-  using HostDataToTargetListTy =
-      std::set<HostDataToTargetMapKeyTy, std::less<>>;
-
-  /// The HDTTMap is a protected object that can only be accessed by one thread
-  /// at a time.
-  ProtectedObj<HostDataToTargetListTy> HostDataToTargetMap;
-
-  /// The type used to access the HDTT map.
-  using HDTTMapAccessorTy = decltype(HostDataToTargetMap)::AccessorTy;
-
-  /// Lookup the mapping of \p HstPtrBegin in \p HDTTMap. The accessor ensures
-  /// exclusive access to the HDTT map.
-  LookupResult lookupMapping(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
-                             int64_t Size,
-                             HostDataToTargetTy *OwnedTPR = nullptr);
-
-  /// Get the target pointer based on host pointer begin and base. If the
-  /// mapping already exists, the target pointer will be returned directly. In
-  /// addition, if required, the memory region pointed by \p HstPtrBegin of size
-  /// \p Size will also be transferred to the device. If the mapping doesn't
-  /// exist, and if unified shared memory is not enabled, a new mapping will be
-  /// created and the data will also be transferred accordingly. nullptr will be
-  /// returned because of any of following reasons:
-  /// - Data allocation failed;
-  /// - The user tried to do an illegal mapping;
-  /// - Data transfer issue fails.
-  TargetPointerResultTy getTargetPointer(
-      HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, void *HstPtrBase,
-      int64_t TgtPadding, int64_t Size, map_var_info_t HstPtrName,
-      bool HasFlagTo, bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount,
-      bool HasCloseModifier, bool HasPresentModifier, bool HasHoldModifier,
-      AsyncInfoTy &AsyncInfo, HostDataToTargetTy *OwnedTPR = nullptr,
-      bool ReleaseHDTTMap = true);
-
-  /// Return the target pointer for \p HstPtrBegin in \p HDTTMap. The accessor
-  /// ensures exclusive access to the HDTT map.
-  void *getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
-                       int64_t Size);
-
-  /// Return the target pointer begin (where the data will be moved).
-  /// Used by targetDataBegin, targetDataEnd, targetDataUpdate and target.
-  /// - \p UpdateRefCount and \p UseHoldRefCount controls which and if the entry
-  /// reference counters will be decremented.
-  /// - \p MustContain enforces that the query must not extend beyond an already
-  /// mapped entry to be valid.
-  /// - \p ForceDelete deletes the entry regardless of its reference counting
-  /// (unless it is infinite).
-  /// - \p FromDataEnd tracks the number of threads referencing the entry at
-  /// targetDataEnd for delayed deletion purpose.
-  [[nodiscard]] TargetPointerResultTy
-  getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool UpdateRefCount,
-                 bool UseHoldRefCount, bool MustContain = false,
-                 bool ForceDelete = false, bool FromDataEnd = false);
-
-  /// Remove the \p Entry from the data map. Expect the entry's total reference
-  /// count to be zero and the caller thread to be the last one using it. \p
-  /// HDTTMap ensure the caller holds exclusive access and can modify the map.
-  /// Return \c OFFLOAD_SUCCESS if the map entry existed, and return \c
-  /// OFFLOAD_FAIL if not. It is the caller's responsibility to skip calling
-  /// this function if the map entry is not expected to exist because \p
-  /// HstPtrBegin uses shared memory.
-  [[nodiscard]] int eraseMapEntry(HDTTMapAccessorTy &HDTTMap,
-                                  HostDataToTargetTy *Entry, int64_t Size);
-
-  /// Deallocate the \p Entry from the device memory and delete it. Return \c
-  /// OFFLOAD_SUCCESS if the deallocation operations executed successfully, and
-  /// return \c OFFLOAD_FAIL otherwise.
-  [[nodiscard]] int deallocTgtPtrAndEntry(HostDataToTargetTy *Entry,
-                                          int64_t Size);
-
-  int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
-  int disassociatePtr(void *HstPtrBegin);
-
-  /// Print information about the transfer from \p HstPtr to \p TgtPtr (or vice
-  /// versa if \p H2D is false). If there is an existing mapping, or if \p Entry
-  /// is set, the associated metadata will be printed as well.
-  void printCopyInfo(void *TgtPtr, void *HstPtr, int64_t Size, bool H2D,
-                     HostDataToTargetTy *Entry,
-                     MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr);
-
-private:
-  DeviceTy &Device;
-};
-
-#endif // OMPTARGET_OPENMP_MAPPING_H

diff --git a/libomptarget/include/OpenMP/OMPT/Callback.h b/libomptarget/include/OpenMP/OMPT/Callback.h
deleted file mode 100644
index 89c5731..0000000
--- a/libomptarget/include/OpenMP/OMPT/Callback.h
+++ /dev/null

@@ -1,105 +0,0 @@
-//===-- OpenMP/OMPT/Callback.h - OpenMP Tooling callbacks -------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Interface used by target-independent runtimes to coordinate registration and
-// invocation of OMPT callbacks and initialization / finalization.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_OPENMP_OMPT_CALLBACK_H
-#define OMPTARGET_OPENMP_OMPT_CALLBACK_H
-
-#ifdef OMPT_SUPPORT
-
-#include "omp-tools.h"
-
-#pragma push_macro("DEBUG_PREFIX")
-#undef DEBUG_PREFIX
-#define DEBUG_PREFIX "OMPT"
-
-#define FOREACH_OMPT_TARGET_CALLBACK(macro)                                    \
-  FOREACH_OMPT_DEVICE_EVENT(macro)                                             \
-  FOREACH_OMPT_NOEMI_EVENT(macro)                                              \
-  FOREACH_OMPT_EMI_EVENT(macro)
-
-#define performIfOmptInitialized(stmt)                                         \
-  do {                                                                         \
-    if (llvm::omp::target::ompt::Initialized) {                                \
-      stmt;                                                                    \
-    }                                                                          \
-  } while (0)
-
-#define performOmptCallback(CallbackName, ...)                                 \
-  do {                                                                         \
-    if (ompt_callback_##CallbackName##_fn)                                     \
-      ompt_callback_##CallbackName##_fn(__VA_ARGS__);                          \
-  } while (0)
-
-/// Function type def used for maintaining unique target region, target
-/// operations ids
-typedef uint64_t (*IdInterfaceTy)();
-
-namespace llvm {
-namespace omp {
-namespace target {
-namespace ompt {
-
-#define declareOmptCallback(Name, Type, Code) extern Name##_t Name##_fn;
-FOREACH_OMPT_NOEMI_EVENT(declareOmptCallback)
-FOREACH_OMPT_EMI_EVENT(declareOmptCallback)
-#undef declareOmptCallback
-
-/// This function will call an OpenMP API function. Which in turn will lookup a
-/// given enum value of type \p ompt_callbacks_t and copy the address of the
-/// corresponding callback function into the provided pointer.
-/// The pointer to the runtime function is passed during 'initializeLibrary'.
-/// \p which the enum value of the requested callback function
-/// \p callback the destination pointer where the address shall be copied
-extern ompt_get_callback_t lookupCallbackByCode;
-
-/// Lookup function to be used by the lower layer (e.g. the plugin). This
-/// function has to be provided when actually calling callback functions like
-/// 'ompt_callback_device_initialize_fn' (param: 'lookup').
-/// The pointer to the runtime function is passed during 'initializeLibrary'.
-/// \p InterfaceFunctionName the name of the OMPT callback function to look up
-extern ompt_function_lookup_t lookupCallbackByName;
-
-/// This is the function called by the higher layer (libomp / libomptarget)
-/// responsible for initializing OMPT in this library. This is passed to libomp
-/// as part of the OMPT connector object.
-/// \p lookup to be used to query callbacks registered with libomp
-/// \p initial_device_num initial device num (id) provided by libomp
-/// \p tool_data as provided by the tool
-int initializeLibrary(ompt_function_lookup_t lookup, int initial_device_num,
-                      ompt_data_t *tool_data);
-
-/// This function is passed to libomp / libomptarget as part of the OMPT
-/// connector object. It is called by libomp during finalization of OMPT in
-/// libomptarget -OR- by libomptarget during finalization of OMPT in the plugin.
-/// \p tool_data as provided by the tool
-void finalizeLibrary(ompt_data_t *tool_data);
-
-/// This function will connect the \p initializeLibrary and \p finalizeLibrary
-/// functions to their respective higher layer.
-void connectLibrary();
-
-/// OMPT initialization status; false if initializeLibrary has not been executed
-extern bool Initialized;
-
-} // namespace ompt
-} // namespace target
-} // namespace omp
-} // namespace llvm
-
-#pragma pop_macro("DEBUG_PREFIX")
-
-#else
-#define performIfOmptInitialized(stmt)
-#endif // OMPT_SUPPORT
-
-#endif // OMPTARGET_OPENMP_OMPT_CALLBACK_H

diff --git a/libomptarget/include/OpenMP/OMPT/Connector.h b/libomptarget/include/OpenMP/OMPT/Connector.h
deleted file mode 100644
index c7b3774..0000000
--- a/libomptarget/include/OpenMP/OMPT/Connector.h
+++ /dev/null

@@ -1,109 +0,0 @@
-//===-- OpenMP/OMPT/Connector.h - OpenMP Tooling lib connector -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Support used by OMPT implementation to establish communication between
-// various OpenMP runtime libraries: host openmp library, target-independent
-// runtime library, and device-dependent runtime libraries.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_OPENMP_OMPT_CONNECTOR_H
-#define OMPTARGET_OPENMP_OMPT_CONNECTOR_H
-
-#ifdef OMPT_SUPPORT
-
-#include "llvm/Support/DynamicLibrary.h"
-
-#include <memory>
-#include <string>
-
-#include "omp-tools.h"
-#include "omptarget.h"
-
-#include "Shared/Debug.h"
-
-#pragma push_macro("DEBUG_PREFIX")
-#undef DEBUG_PREFIX
-#define DEBUG_PREFIX "OMPT"
-
-/// Type for the function to be invoked for connecting two libraries.
-typedef void (*OmptConnectRtnTy)(ompt_start_tool_result_t *result);
-
-/// Establish connection between openmp runtime libraries
-///
-/// This class is used to communicate between an OMPT implementation in
-/// libomptarget and libomp. It is also used to communicate between an
-/// OMPT implementation in a device-specific plugin and
-/// libomptarget. The decision whether OMPT is enabled or not needs to
-/// be made when the library is loaded before any functions in the
-/// library are invoked. For that reason, an instance of this class is
-/// intended to be defined in the constructor for libomptarget or a
-/// plugin so that the decision about whether OMPT is supposed to be
-/// enabled is known before any interface function in the library is
-/// invoked.
-class OmptLibraryConnectorTy {
-public:
-  /// Use \p Ident to name the source library (loaded as `<Ident>.so`) and its
-  /// global connect function (`ompt_<Ident>_connect`); the destination is the
-  /// library that creates this object.
-  OmptLibraryConnectorTy(const char *Ident) {
-    LibIdent.append(Ident);
-    IsInitialized = false;
-  }
-  OmptLibraryConnectorTy() = delete;
-  /// Use \p OmptResult init to connect the two libraries denoted by this
-  /// object. The init function of \p OmptResult will be used during connection
-  /// and the fini function of \p OmptResult will be used during teardown.
-  void connect(ompt_start_tool_result_t *OmptResult) {
-    initialize();
-    if (!LibConnHandle)
-      return;
-    // Call the function provided by the source library for connect
-    LibConnHandle(OmptResult);
-  }
-
-private:
-  void initialize() {
-    if (IsInitialized)
-      return;
-
-    std::string ErrMsg;
-    std::string LibName = LibIdent;
-    LibName += ".so";
-
-    DP("OMPT: Trying to load library %s\n", LibName.c_str());
-    auto DynLibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
-        llvm::sys::DynamicLibrary::getPermanentLibrary(LibName.c_str(),
-                                                       &ErrMsg));
-    if (!DynLibHandle->isValid()) {
-      // The upper layer will bail out if the handle is null.
-      LibConnHandle = nullptr;
-    } else {
-      auto LibConnRtn = "ompt_" + LibIdent + "_connect";
-      DP("OMPT: Trying to get address of connection routine %s\n",
-         LibConnRtn.c_str());
-      LibConnHandle = reinterpret_cast<OmptConnectRtnTy>(
-          DynLibHandle->getAddressOfSymbol(LibConnRtn.c_str()));
-    }
-    DP("OMPT: Library connection handle = %p\n", LibConnHandle);
-    IsInitialized = true;
-  }
-
-  /// Ensure initialization occurs only once
-  bool IsInitialized;
-  /// Handle of connect routine provided by source library
-  OmptConnectRtnTy LibConnHandle;
-  /// Identifier of the source library; the library file name and the connect
-  /// routine name are both derived from it
-  std::string LibIdent;
-};
-
-#endif // OMPT_SUPPORT
-
-#pragma pop_macro("DEBUG_PREFIX")
-
-#endif // OMPTARGET_OPENMP_OMPT_CONNECTOR_H

diff --git a/libomptarget/include/OpenMP/OMPT/Interface.h b/libomptarget/include/OpenMP/OMPT/Interface.h
deleted file mode 100644
index 327fadf..0000000
--- a/libomptarget/include/OpenMP/OMPT/Interface.h
+++ /dev/null

@@ -1,300 +0,0 @@
-//===-- OpenMP/OMPT/Interface.h - OpenMP Tooling interfaces ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Declarations for OpenMP Tool callback dispatchers.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _OMPTARGET_OMPTINTERFACE_H
-#define _OMPTARGET_OMPTINTERFACE_H
-
-// Only provide functionality if target OMPT support is enabled
-#ifdef OMPT_SUPPORT
-#include <functional>
-#include <tuple>
-
-#include "Callback.h"
-#include "omp-tools.h"
-
-#include "llvm/Support/ErrorHandling.h"
-
-#define OMPT_IF_BUILT(stmt) stmt
-
-/// Callbacks for target regions require task_data representing the
-/// encountering task.
-/// Callbacks for target regions and target data ops require
-/// target_task_data representing the target task region.
-typedef ompt_data_t *(*ompt_get_task_data_t)();
-typedef ompt_data_t *(*ompt_get_target_task_data_t)();
-
-namespace llvm {
-namespace omp {
-namespace target {
-namespace ompt {
-
-/// Function pointers that will be used to track task_data and
-/// target_task_data.
-static ompt_get_task_data_t ompt_get_task_data_fn;
-static ompt_get_target_task_data_t ompt_get_target_task_data_fn;
-
-/// Used to maintain execution state for this thread
-class Interface {
-public:
-  /// Top-level function for invoking callback before device data allocation
-  void beginTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
-                            void **TgtPtrBegin, size_t Size, void *Code);
-
-  /// Top-level function for invoking callback after device data allocation
-  void endTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
-                          void **TgtPtrBegin, size_t Size, void *Code);
-
-  /// Top-level function for invoking callback before data submit
-  void beginTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin,
-                             int64_t DstDeviceId, void *DstPtrBegin,
-                             size_t Size, void *Code);
-
-  /// Top-level function for invoking callback after data submit
-  void endTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin,
-                           int64_t DstDeviceId, void *DstPtrBegin, size_t Size,
-                           void *Code);
-
-  /// Top-level function for invoking callback before device data deallocation
-  void beginTargetDataDelete(int64_t DeviceId, void *TgtPtrBegin, void *Code);
-
-  /// Top-level function for invoking callback after device data deallocation
-  void endTargetDataDelete(int64_t DeviceId, void *TgtPtrBegin, void *Code);
-
-  /// Top-level function for invoking callback before data retrieve
-  void beginTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin,
-                               int64_t DstDeviceId, void *DstPtrBegin,
-                               size_t Size, void *Code);
-
-  /// Top-level function for invoking callback after data retrieve
-  void endTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin,
-                             int64_t DstDeviceId, void *DstPtrBegin,
-                             size_t Size, void *Code);
-
-  /// Top-level function for invoking callback before kernel dispatch
-  void beginTargetSubmit(unsigned int NumTeams = 1);
-
-  /// Top-level function for invoking callback after kernel dispatch
-  void endTargetSubmit(unsigned int NumTeams = 1);
-
-  // Target region callbacks
-
-  /// Top-level function for invoking callback before target enter data
-  /// construct
-  void beginTargetDataEnter(int64_t DeviceId, void *Code);
-
-  /// Top-level function for invoking callback after target enter data
-  /// construct
-  void endTargetDataEnter(int64_t DeviceId, void *Code);
-
-  /// Top-level function for invoking callback before target exit data
-  /// construct
-  void beginTargetDataExit(int64_t DeviceId, void *Code);
-
-  /// Top-level function for invoking callback after target exit data
-  /// construct
-  void endTargetDataExit(int64_t DeviceId, void *Code);
-
-  /// Top-level function for invoking callback before target update construct
-  void beginTargetUpdate(int64_t DeviceId, void *Code);
-
-  /// Top-level function for invoking callback after target update construct
-  void endTargetUpdate(int64_t DeviceId, void *Code);
-
-  /// Top-level function for invoking callback before target construct
-  void beginTarget(int64_t DeviceId, void *Code);
-
-  /// Top-level function for invoking callback after target construct
-  void endTarget(int64_t DeviceId, void *Code);
-
-  // Callback getter: Target data operations
-  template <ompt_target_data_op_t OpType> auto getCallbacks() {
-    if constexpr (OpType == ompt_target_data_alloc ||
-                  OpType == ompt_target_data_alloc_async)
-      return std::make_pair(std::mem_fn(&Interface::beginTargetDataAlloc),
-                            std::mem_fn(&Interface::endTargetDataAlloc));
-
-    if constexpr (OpType == ompt_target_data_delete ||
-                  OpType == ompt_target_data_delete_async)
-      return std::make_pair(std::mem_fn(&Interface::beginTargetDataDelete),
-                            std::mem_fn(&Interface::endTargetDataDelete));
-
-    if constexpr (OpType == ompt_target_data_transfer_to_device ||
-                  OpType == ompt_target_data_transfer_to_device_async)
-      return std::make_pair(std::mem_fn(&Interface::beginTargetDataSubmit),
-                            std::mem_fn(&Interface::endTargetDataSubmit));
-
-    if constexpr (OpType == ompt_target_data_transfer_from_device ||
-                  OpType == ompt_target_data_transfer_from_device_async)
-      return std::make_pair(std::mem_fn(&Interface::beginTargetDataRetrieve),
-                            std::mem_fn(&Interface::endTargetDataRetrieve));
-
-    llvm_unreachable("Unhandled target data operation type!");
-  }
-
-  // Callback getter: Target region operations
-  template <ompt_target_t OpType> auto getCallbacks() {
-    if constexpr (OpType == ompt_target_enter_data ||
-                  OpType == ompt_target_enter_data_nowait)
-      return std::make_pair(std::mem_fn(&Interface::beginTargetDataEnter),
-                            std::mem_fn(&Interface::endTargetDataEnter));
-
-    if constexpr (OpType == ompt_target_exit_data ||
-                  OpType == ompt_target_exit_data_nowait)
-      return std::make_pair(std::mem_fn(&Interface::beginTargetDataExit),
-                            std::mem_fn(&Interface::endTargetDataExit));
-
-    if constexpr (OpType == ompt_target_update ||
-                  OpType == ompt_target_update_nowait)
-      return std::make_pair(std::mem_fn(&Interface::beginTargetUpdate),
-                            std::mem_fn(&Interface::endTargetUpdate));
-
-    if constexpr (OpType == ompt_target || OpType == ompt_target_nowait)
-      return std::make_pair(std::mem_fn(&Interface::beginTarget),
-                            std::mem_fn(&Interface::endTarget));
-
-    llvm_unreachable("Unknown target region operation type!");
-  }
-
-  // Callback getter: Kernel launch operation
-  template <ompt_callbacks_t OpType> auto getCallbacks() {
-    // We use 'ompt_callbacks_t', because no other enum is currently available
-    // to model a kernel launch / target submit operation.
-    if constexpr (OpType == ompt_callback_target_submit)
-      return std::make_pair(std::mem_fn(&Interface::beginTargetSubmit),
-                            std::mem_fn(&Interface::endTargetSubmit));
-
-    llvm_unreachable("Unhandled target operation!");
-  }
-
-  /// Setters for target region and target operation correlation ids
-  void setTargetDataValue(uint64_t DataValue) { TargetData.value = DataValue; }
-  void setTargetDataPtr(void *DataPtr) { TargetData.ptr = DataPtr; }
-  void setHostOpId(ompt_id_t OpId) { HostOpId = OpId; }
-
-  /// Getters for target region and target operation correlation ids
-  uint64_t getTargetDataValue() { return TargetData.value; }
-  void *getTargetDataPtr() { return TargetData.ptr; }
-  ompt_id_t getHostOpId() { return HostOpId; }
-
-private:
-  /// Target operations id
-  ompt_id_t HostOpId = 0;
-
-  /// Target region data
-  ompt_data_t TargetData = ompt_data_none;
-
-  /// Task data representing the encountering task
-  ompt_data_t *TaskData = nullptr;
-
-  /// Target task data representing the target task region
-  ompt_data_t *TargetTaskData = nullptr;
-
-  /// Used for marking begin of a data operation
-  void beginTargetDataOperation();
-
-  /// Used for marking end of a data operation
-  void endTargetDataOperation();
-
-  /// Used for marking begin of a target region
-  void beginTargetRegion();
-
-  /// Used for marking end of a target region
-  void endTargetRegion();
-};
-
-/// Thread local state for target region and associated metadata
-extern thread_local Interface RegionInterface;
-
-/// Thread local variable holding the return address.
-/// When using __builtin_return_address to set the return address,
-/// allow 0 as the only argument to avoid unpredictable effects.
-extern thread_local void *ReturnAddress;
-
-template <typename FuncTy, typename ArgsTy, size_t... IndexSeq>
-void InvokeInterfaceFunction(FuncTy Func, ArgsTy Args,
-                             std::index_sequence<IndexSeq...>) {
-  std::invoke(Func, RegionInterface, std::get<IndexSeq>(Args)...);
-}
-
-template <typename CallbackPairTy, typename... ArgsTy> class InterfaceRAII {
-public:
-  InterfaceRAII(CallbackPairTy Callbacks, ArgsTy... Args)
-      : Arguments(Args...), beginFunction(std::get<0>(Callbacks)),
-        endFunction(std::get<1>(Callbacks)) {
-    performIfOmptInitialized(begin());
-  }
-  ~InterfaceRAII() { performIfOmptInitialized(end()); }
-
-private:
-  void begin() {
-    auto IndexSequence =
-        std::make_index_sequence<std::tuple_size_v<decltype(Arguments)>>{};
-    InvokeInterfaceFunction(beginFunction, Arguments, IndexSequence);
-  }
-
-  void end() {
-    auto IndexSequence =
-        std::make_index_sequence<std::tuple_size_v<decltype(Arguments)>>{};
-    InvokeInterfaceFunction(endFunction, Arguments, IndexSequence);
-  }
-
-  std::tuple<ArgsTy...> Arguments;
-  typename CallbackPairTy::first_type beginFunction;
-  typename CallbackPairTy::second_type endFunction;
-};
-
-// InterfaceRAII's class template argument deduction guide
-template <typename CallbackPairTy, typename... ArgsTy>
-InterfaceRAII(CallbackPairTy Callbacks, ArgsTy... Args)
-    -> InterfaceRAII<CallbackPairTy, ArgsTy...>;
-
-/// Used to set and reset the thread-local return address. The RAII is expected
-/// to be created at a runtime entry point when the return address should be
-/// null. If so, the return address is set and \p IsSetter is set in the ctor.
-/// The dtor resets the return address only if the corresponding object set it.
-/// So if the RAII is called from a nested runtime function, the ctor/dtor will
-/// do nothing since the thread local return address is already set.
-class ReturnAddressSetterRAII {
-public:
-  ReturnAddressSetterRAII(void *RA) : IsSetter(false) {
-    // Handle nested calls. If already set, do not set again since it
-    // must be in a nested call.
-    if (ReturnAddress == nullptr) {
-      // Store the return address to a thread local variable.
-      ReturnAddress = RA;
-      IsSetter = true;
-    }
-  }
-  ~ReturnAddressSetterRAII() {
-    // Reset the return address if this object set it.
-    if (IsSetter)
-      ReturnAddress = nullptr;
-  }
-
-private:
-  // Did this object set the thread-local return address?
-  bool IsSetter;
-};
-
-} // namespace ompt
-} // namespace target
-} // namespace omp
-} // namespace llvm
-
-// The getter returns the address stored in the thread local variable.
-#define OMPT_GET_RETURN_ADDRESS llvm::omp::target::ompt::ReturnAddress
-
-#else
-#define OMPT_IF_BUILT(stmt)
-#endif
-
-#endif // _OMPTARGET_OMPTINTERFACE_H

diff --git a/libomptarget/include/OpenMP/omp.h b/libomptarget/include/OpenMP/omp.h
deleted file mode 100644
index b44c6af..0000000
--- a/libomptarget/include/OpenMP/omp.h
+++ /dev/null

@@ -1,155 +0,0 @@
-//===-- OpenMP/omp.h - Copies of OpenMP user facing types and APIs - C++ -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This copies some OpenMP user facing types and APIs for easy reach within the
-// implementation.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_OPENMP_OMP_H
-#define OMPTARGET_OPENMP_OMP_H
-
-#include <cstdint>
-
-#if defined(_WIN32)
-#define __KAI_KMPC_CONVENTION __cdecl
-#ifndef __KMP_IMP
-#define __KMP_IMP __declspec(dllimport)
-#endif
-#else
-#define __KAI_KMPC_CONVENTION
-#ifndef __KMP_IMP
-#define __KMP_IMP
-#endif
-#endif
-
-extern "C" {
-
-/// Type declarations
-///{
-
-typedef void *omp_depend_t;
-
-///}
-
-/// API declarations
-///{
-
-int omp_get_default_device(void) __attribute__((weak));
-
-///}
-
-/// InteropAPI
-///
-///{
-
-/// TODO: Include the `omp.h` of the current build
-/* OpenMP 5.1 interop */
-typedef intptr_t omp_intptr_t;
-
-/* 0..omp_get_num_interop_properties()-1 are reserved for implementation-defined
- * properties */
-typedef enum omp_interop_property {
-  omp_ipr_fr_id = -1,
-  omp_ipr_fr_name = -2,
-  omp_ipr_vendor = -3,
-  omp_ipr_vendor_name = -4,
-  omp_ipr_device_num = -5,
-  omp_ipr_platform = -6,
-  omp_ipr_device = -7,
-  omp_ipr_device_context = -8,
-  omp_ipr_targetsync = -9,
-  omp_ipr_first = -9
-} omp_interop_property_t;
-
-#define omp_interop_none 0
-
-typedef enum omp_interop_rc {
-  omp_irc_no_value = 1,
-  omp_irc_success = 0,
-  omp_irc_empty = -1,
-  omp_irc_out_of_range = -2,
-  omp_irc_type_int = -3,
-  omp_irc_type_ptr = -4,
-  omp_irc_type_str = -5,
-  omp_irc_other = -6
-} omp_interop_rc_t;
-
-typedef enum omp_interop_fr {
-  omp_ifr_cuda = 1,
-  omp_ifr_cuda_driver = 2,
-  omp_ifr_opencl = 3,
-  omp_ifr_sycl = 4,
-  omp_ifr_hip = 5,
-  omp_ifr_level_zero = 6,
-  omp_ifr_last = 7
-} omp_interop_fr_t;
-
-typedef void *omp_interop_t;
-
-/*!
- * The `omp_get_num_interop_properties` routine retrieves the number of
- * implementation-defined properties available for an `omp_interop_t` object.
- */
-int __KAI_KMPC_CONVENTION omp_get_num_interop_properties(const omp_interop_t);
-/*!
- * The `omp_get_interop_int` routine retrieves an integer property from an
- * `omp_interop_t` object.
- */
-omp_intptr_t __KAI_KMPC_CONVENTION
-omp_get_interop_int(const omp_interop_t, omp_interop_property_t, int *);
-/*!
- * The `omp_get_interop_ptr` routine retrieves a pointer property from an
- * `omp_interop_t` object.
- */
-void *__KAI_KMPC_CONVENTION omp_get_interop_ptr(const omp_interop_t,
-                                                omp_interop_property_t, int *);
-/*!
- * The `omp_get_interop_str` routine retrieves a string property from an
- * `omp_interop_t` object.
- */
-const char *__KAI_KMPC_CONVENTION
-omp_get_interop_str(const omp_interop_t, omp_interop_property_t, int *);
-/*!
- * The `omp_get_interop_name` routine retrieves a property name from an
- * `omp_interop_t` object.
- */
-const char *__KAI_KMPC_CONVENTION omp_get_interop_name(const omp_interop_t,
-                                                       omp_interop_property_t);
-/*!
- * The `omp_get_interop_type_desc` routine retrieves a description of the type
- * of a property associated with an `omp_interop_t` object.
- */
-const char *__KAI_KMPC_CONVENTION
-omp_get_interop_type_desc(const omp_interop_t, omp_interop_property_t);
-/*!
- * The `omp_get_interop_rc_desc` routine retrieves a description of the return
- * code associated with an `omp_interop_t` object.
- */
-extern const char *__KAI_KMPC_CONVENTION
-omp_get_interop_rc_desc(const omp_interop_t, omp_interop_rc_t);
-
-typedef enum omp_interop_backend_type_t {
-  // reserve 0
-  omp_interop_backend_type_cuda_1 = 1,
-} omp_interop_backend_type_t;
-
-typedef enum omp_foreign_runtime_ids {
-  cuda = 1,
-  cuda_driver = 2,
-  opencl = 3,
-  sycl = 4,
-  hip = 5,
-  level_zero = 6,
-} omp_foreign_runtime_ids_t;
-
-///} InteropAPI
-
-} // extern "C"
-
-#endif // OMPTARGET_OPENMP_OMP_H

diff --git a/libomptarget/include/PluginManager.h b/libomptarget/include/PluginManager.h
deleted file mode 100644
index eece752..0000000
--- a/libomptarget/include/PluginManager.h
+++ /dev/null

@@ -1,200 +0,0 @@
-//===-- PluginManager.h - Plugin loading and communication API --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Declarations for managing devices that are handled by RTL plugins.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_PLUGIN_MANAGER_H
-#define OMPTARGET_PLUGIN_MANAGER_H
-
-#include "DeviceImage.h"
-#include "ExclusiveAccess.h"
-#include "Shared/APITypes.h"
-#include "Shared/PluginAPI.h"
-#include "Shared/Requirements.h"
-
-#include "device.h"
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/iterator.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/Error.h"
-
-#include <cstdint>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <string>
-
-struct PluginManager;
-
-/// Plugin adaptors should be created via `PluginAdaptorTy::create` which will
-/// invoke the constructor and call `PluginAdaptorTy::init`. Eventual errors are
-/// reported back to the caller, otherwise a valid and initialized adaptor is
-/// returned.
-struct PluginAdaptorTy {
-  /// Try to create a plugin adaptor from a filename.
-  static llvm::Expected<std::unique_ptr<PluginAdaptorTy>>
-  create(const std::string &Name);
-
-  /// Name of the shared object file representing the plugin.
-  std::string Name;
-
-  /// Access to the shared object file representing the plugin.
-  std::unique_ptr<llvm::sys::DynamicLibrary> LibraryHandler;
-
-#define PLUGIN_API_HANDLE(NAME)                                                \
-  using NAME##_ty = decltype(__tgt_rtl_##NAME);                                \
-  NAME##_ty *NAME = nullptr;
-
-#include "Shared/PluginAPI.inc"
-#undef PLUGIN_API_HANDLE
-
-  /// Create a plugin adaptor for filename \p Name with a dynamic library \p DL.
-  PluginAdaptorTy(const std::string &Name,
-                  std::unique_ptr<llvm::sys::DynamicLibrary> DL);
-
-  /// Initialize the plugin adaptor, this can fail in which case the adaptor is
-  /// useless.
-  llvm::Error init();
-};
-
-/// Struct for the data required to handle plugins
-struct PluginManager {
-  /// Type of the devices container. We hand out DeviceTy& to queries which are
-  /// stable addresses regardless if the container changes.
-  using DeviceContainerTy = llvm::SmallVector<std::unique_ptr<DeviceTy>>;
-
-  /// Exclusive accessor type for the device container.
-  using ExclusiveDevicesAccessorTy = Accessor<DeviceContainerTy>;
-
-  PluginManager() {}
-
-  void init();
-
-  // Register a shared library with all (compatible) RTLs.
-  void registerLib(__tgt_bin_desc *Desc);
-
-  // Unregister a shared library from all RTLs.
-  void unregisterLib(__tgt_bin_desc *Desc);
-
-  void addDeviceImage(__tgt_bin_desc &TgtBinDesc,
-                      __tgt_device_image &TgtDeviceImage) {
-    DeviceImages.emplace_back(
-        std::make_unique<DeviceImageTy>(TgtBinDesc, TgtDeviceImage));
-  }
-
-  /// Initialize as many devices as possible for this plugin adaptor. Devices
-  /// that fail to initialize are ignored. Nothing is returned; the declared
-  /// return type is void.
-  void initDevices(PluginAdaptorTy &RTL);
-
-  /// Return the device presented to the user as device \p DeviceNo if it is
-  /// initialized and ready. Otherwise return an error explaining the problem.
-  llvm::Expected<DeviceTy &> getDevice(uint32_t DeviceNo);
-
-  /// Iterate over all initialized and ready devices registered with this
-  /// plugin.
-  auto devices(ExclusiveDevicesAccessorTy &DevicesAccessor) {
-    return llvm::make_pointee_range(*DevicesAccessor);
-  }
-
-  /// Iterate over all device images registered with this plugin.
-  auto deviceImages() { return llvm::make_pointee_range(DeviceImages); }
-
-  /// Translation table retrieved from the binary
-  HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
-  std::mutex TrlTblMtx; ///< For Translation Table
-  /// Host offload entries in order of image registration
-  llvm::SmallVector<__tgt_offload_entry *> HostEntriesBeginRegistrationOrder;
-
-  /// Map from ptrs on the host to an entry in the Translation Table
-  HostPtrToTableMapTy HostPtrToTableMap;
-  std::mutex TblMapMtx; ///< For HostPtrToTableMap
-
-  // Work around for plugins that call dlopen on shared libraries that call
-  // tgt_register_lib during their initialisation. Stash the pointers in a
-  // vector until the plugins are all initialised and then register them.
-  bool delayRegisterLib(__tgt_bin_desc *Desc) {
-    if (RTLsLoaded)
-      return false;
-    DelayedBinDesc.push_back(Desc);
-    return true;
-  }
-
-  void registerDelayedLibraries() {
-    // Only called by libomptarget constructor
-    RTLsLoaded = true;
-    for (auto *Desc : DelayedBinDesc)
-      __tgt_register_lib(Desc);
-    DelayedBinDesc.clear();
-  }
-
-  /// Return the number of usable devices.
-  int getNumDevices() { return getExclusiveDevicesAccessor()->size(); }
-
-  /// Return an exclusive handle to access the devices container.
-  ExclusiveDevicesAccessorTy getExclusiveDevicesAccessor() {
-    return Devices.getExclusiveAccessor();
-  }
-
-  int getNumUsedPlugins() const { return DeviceOffsets.size(); }
-
-  // Initialize all plugins.
-  void initAllPlugins();
-
-  /// Iterator range for all plugin adaptors (in use or not, but always valid).
-  auto pluginAdaptors() { return llvm::make_pointee_range(PluginAdaptors); }
-
-  /// Return the user provided requirements.
-  int64_t getRequirements() const { return Requirements.getRequirements(); }
-
-  /// Add \p Flags to the user provided requirements.
-  void addRequirements(int64_t Flags) { Requirements.addRequirements(Flags); }
-
-private:
-  bool RTLsLoaded = false;
-  llvm::SmallVector<__tgt_bin_desc *> DelayedBinDesc;
-
-  // List of all plugin adaptors, in use or not.
-  llvm::SmallVector<std::unique_ptr<PluginAdaptorTy>> PluginAdaptors;
-
-  // Mapping of plugin adaptors to offsets in the device table.
-  llvm::DenseMap<const PluginAdaptorTy *, int32_t> DeviceOffsets;
-
-  // Mapping of plugin adaptors to the number of used devices.
-  llvm::DenseMap<const PluginAdaptorTy *, int32_t> DeviceUsed;
-
-  // Set of all device images currently in use.
-  llvm::DenseSet<const __tgt_device_image *> UsedImages;
-
-  /// Executable images and information extracted from the input images passed
-  /// to the runtime.
-  llvm::SmallVector<std::unique_ptr<DeviceImageTy>> DeviceImages;
-
-  /// The user provided requirements.
-  RequirementCollection Requirements;
-
-  std::mutex RTLsMtx; ///< For RTLs
-
-  /// Devices associated with plugins, accesses to the container are exclusive.
-  ProtectedObj<DeviceContainerTy> Devices;
-};
-
-/// Initialize the plugin manager and OpenMP runtime.
-void initRuntime();
-
-/// Deinitialize the plugin and delete it.
-void deinitRuntime();
-
-extern PluginManager *PM;
-
-#endif // OMPTARGET_PLUGIN_MANAGER_H

diff --git a/libomptarget/include/Shared/APITypes.h b/libomptarget/include/Shared/APITypes.h
deleted file mode 100644
index e8fc277..0000000
--- a/libomptarget/include/Shared/APITypes.h
+++ /dev/null

@@ -1,117 +0,0 @@
-//===-- Shared/APITypes.h - Offload and plugin API types --------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines types used in the interface between the user code, the
-// target independent offload runtime library, and target dependent plugins.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_SHARED_API_TYPES_H
-#define OMPTARGET_SHARED_API_TYPES_H
-
-#include "Environment.h"
-
-#include "llvm/ADT/SmallVector.h"
-
-#include <cstddef>
-#include <cstdint>
-
-extern "C" {
-
-/// This struct is a record of an entry point or global. For a function
-/// entry point the size is expected to be zero
-struct __tgt_offload_entry {
-  void *addr;       // Pointer to the offload entry info (function or global)
-  char *name;       // Name of the function or global
-  size_t size;      // Size of the entry info (0 if it is a function)
-  int32_t flags;    // Flags associated with the entry, e.g. 'link'.
-  int32_t data;     // Extra data associated with the entry.
-};
-
-/// This struct is a record of the device image information
-struct __tgt_device_image {
-  void *ImageStart;                  // Pointer to the target code start
-  void *ImageEnd;                    // Pointer to the target code end
-  __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries
-  __tgt_offload_entry *EntriesEnd;   // End of table (non inclusive)
-};
-
-struct __tgt_device_info {
-  void *Context = nullptr;
-  void *Device = nullptr;
-};
-
-/// This struct is a record of all the host code that may be offloaded to a
-/// target.
-struct __tgt_bin_desc {
-  int32_t NumDeviceImages;          // Number of device types supported
-  __tgt_device_image *DeviceImages; // Array of device images (1 per dev. type)
-  __tgt_offload_entry *HostEntriesBegin; // Begin of table with all host entries
-  __tgt_offload_entry *HostEntriesEnd;   // End of table (non inclusive)
-};
-
-/// This struct contains the offload entries identified by the target runtime
-struct __tgt_target_table {
-  __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries
-  __tgt_offload_entry
-      *EntriesEnd; // End of the table with all the entries (non inclusive)
-};
-
-/// This struct contains a handle to a loaded binary in the plugin device.
-struct __tgt_device_binary {
-  uintptr_t handle;
-};
-
-// clang-format on
-
-/// This struct contains information exchanged between different asynchronous
-/// operations for device-dependent optimization and potential synchronization
-struct __tgt_async_info {
-  // A pointer to a queue-like structure where offloading operations are issued.
-  // We assume to use this structure to do synchronization. In CUDA backend, it
-  // is CUstream.
-  void *Queue = nullptr;
-
-  /// A collection of allocations that are associated with this stream and that
-  /// should be freed after finalization.
-  llvm::SmallVector<void *, 2> AssociatedAllocations;
-
-  /// The kernel launch environment used to issue a kernel. Stored here to
-  /// ensure it is a valid location while the transfer to the device is
-  /// happening.
-  KernelLaunchEnvironmentTy KernelLaunchEnvironment;
-};
-
-/// This struct contains all of the arguments to a target kernel region launch.
-struct KernelArgsTy {
-  uint32_t Version;   // Version of this struct for ABI compatibility.
-  uint32_t NumArgs;   // Number of arguments in each input pointer.
-  void **ArgBasePtrs; // Base pointer of each argument (e.g. a struct).
-  void **ArgPtrs;     // Pointer to the argument data.
-  int64_t *ArgSizes;  // Size of the argument data in bytes.
-  int64_t *ArgTypes;  // Type of the data (e.g. to / from).
-  void **ArgNames;    // Name of the data for debugging, possibly null.
-  void **ArgMappers;  // User-defined mappers, possibly null.
-  uint64_t Tripcount; // Tripcount for the teams / distribute loop, 0 otherwise.
-  struct {
-    uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
-    uint64_t Unused : 63;
-  } Flags;
-  uint32_t NumTeams[3];    // The number of teams (for x,y,z dimension).
-  uint32_t ThreadLimit[3]; // The number of threads (for x,y,z dimension).
-  uint32_t DynCGroupMem;   // Amount of dynamic cgroup memory requested.
-};
-static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t),
-              "Invalid struct size");
-static_assert(sizeof(KernelArgsTy) ==
-                  (8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
-                   4 * sizeof(void **) + 2 * sizeof(int64_t *)),
-              "Invalid struct size");
-}
-
-#endif // OMPTARGET_SHARED_API_TYPES_H

diff --git a/libomptarget/include/Shared/Debug.h b/libomptarget/include/Shared/Debug.h
deleted file mode 100644
index 7c3db8d..0000000
--- a/libomptarget/include/Shared/Debug.h
+++ /dev/null

@@ -1,201 +0,0 @@
-//===-- Shared/Debug.h - Target independent OpenMP target RTL -- C++ ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Routines used to provide debug messages and information from libomptarget
-// and plugin RTLs to the user.
-//
-// Each plugin RTL and libomptarget define TARGET_NAME and DEBUG_PREFIX for use
-// when sending messages to the user. These indicate which RTL sent the message
-//
-// Debug and information messages are controlled by the environment variables
-// LIBOMPTARGET_DEBUG and LIBOMPTARGET_INFO, which are each read once, the
-// first time libomptarget or a plugin RTL queries the corresponding level.
-//
-// To printf a pointer in hex with a fixed width of 16 digits and a leading 0x,
-// use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr));
-//
-// DPxMOD expands to:
-//   "0x%0*" PRIxPTR
-// where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a
-// specific platform, e.g. "lx" if uintptr_t is typedef'd as unsigned long:
-//   "0x%0*lx"
-//
-// Ultimately, the whole statement expands to:
-//   printf("ptr=0x%0*lx...\n",  // the 0* modifier expects an extra argument
-//                               // specifying the width of the output
-//   (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width
-//                               // 8 digits for 32bit systems
-//                               // 16 digits for 64bit
-//   (uintptr_t) ptr);
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_SHARED_DEBUG_H
-#define OMPTARGET_SHARED_DEBUG_H
-
-#include <atomic>
-#include <mutex>
-#include <string>
-
-/// 32-Bit field data attributes controlling information presented to the user.
-enum OpenMPInfoType : uint32_t {
-  // Print data arguments and attributes upon entering an OpenMP device kernel.
-  OMP_INFOTYPE_KERNEL_ARGS = 0x0001,
-  // Indicate when an address already exists in the device mapping table.
-  OMP_INFOTYPE_MAPPING_EXISTS = 0x0002,
-  // Dump the contents of the device pointer map at kernel exit or failure.
-  OMP_INFOTYPE_DUMP_TABLE = 0x0004,
-  // Indicate when an address is added to the device mapping table.
-  OMP_INFOTYPE_MAPPING_CHANGED = 0x0008,
-  // Print kernel information from target device plugins.
-  OMP_INFOTYPE_PLUGIN_KERNEL = 0x0010,
-  // Print whenever data is transferred to the device
-  OMP_INFOTYPE_DATA_TRANSFER = 0x0020,
-  // Print whenever data does not have a viable device counterpart.
-  OMP_INFOTYPE_EMPTY_MAPPING = 0x0040,
-  // Enable every flag.
-  OMP_INFOTYPE_ALL = 0xffffffff,
-};
-
-inline std::atomic<uint32_t> &getInfoLevelInternal() {
-  static std::atomic<uint32_t> InfoLevel;
-  static std::once_flag Flag{};
-  std::call_once(Flag, []() {
-    if (char *EnvStr = getenv("LIBOMPTARGET_INFO"))
-      InfoLevel.store(std::stoi(EnvStr));
-  });
-
-  return InfoLevel;
-}
-
-inline uint32_t getInfoLevel() { return getInfoLevelInternal().load(); }
-
-inline uint32_t getDebugLevel() {
-  static uint32_t DebugLevel = 0;
-  static std::once_flag Flag{};
-  std::call_once(Flag, []() {
-    if (char *EnvStr = getenv("LIBOMPTARGET_DEBUG"))
-      DebugLevel = std::stoi(EnvStr);
-  });
-
-  return DebugLevel;
-}
-
-#undef USED
-#undef GCC_VERSION
-
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS
-#endif
-#include <inttypes.h>
-#undef __STDC_FORMAT_MACROS
-
-#define DPxMOD "0x%0*" PRIxPTR
-#define DPxPTR(ptr) ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(ptr))
-#define GETNAME2(name) #name
-#define GETNAME(name) GETNAME2(name)
-
-/// Print a generic message string from libomptarget or a plugin RTL
-#define MESSAGE0(_str)                                                         \
-  do {                                                                         \
-    fprintf(stderr, GETNAME(TARGET_NAME) " message: %s\n", _str);              \
-  } while (0)
-
-/// Print a printf formatting string message from libomptarget or a plugin RTL
-#define MESSAGE(_str, ...)                                                     \
-  do {                                                                         \
-    fprintf(stderr, GETNAME(TARGET_NAME) " message: " _str "\n", __VA_ARGS__); \
-  } while (0)
-
-/// Print fatal error message with an error string and error identifier
-#define FATAL_MESSAGE0(_num, _str)                                             \
-  do {                                                                         \
-    fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d: %s\n", (int)_num,   \
-            _str);                                                             \
-    abort();                                                                   \
-  } while (0)
-
-/// Print fatal error message with a printf string and error identifier
-#define FATAL_MESSAGE(_num, _str, ...)                                         \
-  do {                                                                         \
-    fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d: " _str "\n",        \
-            (int)_num, __VA_ARGS__);                                           \
-    abort();                                                                   \
-  } while (0)
-
-/// Print a generic error string from libomptarget or a plugin RTL
-#define FAILURE_MESSAGE(...)                                                   \
-  do {                                                                         \
-    fprintf(stderr, GETNAME(TARGET_NAME) " error: ");                          \
-    fprintf(stderr, __VA_ARGS__);                                              \
-  } while (0)
-
-/// Print a generic information string used if LIBOMPTARGET_INFO=1
-#define INFO_MESSAGE(_num, ...) INFO_MESSAGE_TO(stderr, _num, __VA_ARGS__)
-
-#define INFO_MESSAGE_TO(_stdDst, _num, ...)                                    \
-  do {                                                                         \
-    fprintf(_stdDst, GETNAME(TARGET_NAME) " device %d info: ", (int)_num);     \
-    fprintf(_stdDst, __VA_ARGS__);                                             \
-  } while (0)
-
-// Debugging messages
-#ifdef OMPTARGET_DEBUG
-#include <stdio.h>
-
-#define DEBUGP(prefix, ...)                                                    \
-  {                                                                            \
-    fprintf(stderr, "%s --> ", prefix);                                        \
-    fprintf(stderr, __VA_ARGS__);                                              \
-  }
-
-/// Emit a message for debugging
-#define DP(...)                                                                \
-  do {                                                                         \
-    if (getDebugLevel() > 0) {                                                 \
-      DEBUGP(DEBUG_PREFIX, __VA_ARGS__);                                       \
-    }                                                                          \
-  } while (false)
-
-/// Emit a message for debugging or failure if debugging is disabled
-#define REPORT(...)                                                            \
-  do {                                                                         \
-    if (getDebugLevel() > 0) {                                                 \
-      DP(__VA_ARGS__);                                                         \
-    } else {                                                                   \
-      FAILURE_MESSAGE(__VA_ARGS__);                                            \
-    }                                                                          \
-  } while (false)
-#else
-#define DEBUGP(prefix, ...)                                                    \
-  {}
-#define DP(...)                                                                \
-  {}
-#define REPORT(...) FAILURE_MESSAGE(__VA_ARGS__);
-#endif // OMPTARGET_DEBUG
-
-/// Emit extra runtime information if debugging or a matching info flag is set
-#define INFO(_flags, _id, ...)                                                 \
-  do {                                                                         \
-    if (getDebugLevel() > 0) {                                                 \
-      DEBUGP(DEBUG_PREFIX, __VA_ARGS__);                                       \
-    } else if (getInfoLevel() & _flags) {                                      \
-      INFO_MESSAGE(_id, __VA_ARGS__);                                          \
-    }                                                                          \
-  } while (false)
-
-#define DUMP_INFO(toStdOut, _flags, _id, ...)                                  \
-  do {                                                                         \
-    if (toStdOut) {                                                            \
-      INFO_MESSAGE_TO(stdout, _id, __VA_ARGS__);                               \
-    } else {                                                                   \
-      INFO(_flags, _id, __VA_ARGS__);                                          \
-    }                                                                          \
-  } while (false)
-
-#endif // OMPTARGET_SHARED_DEBUG_H

diff --git a/libomptarget/include/Shared/Environment.h b/libomptarget/include/Shared/Environment.h
deleted file mode 100644
index d141146..0000000
--- a/libomptarget/include/Shared/Environment.h
+++ /dev/null

@@ -1,108 +0,0 @@
-//===-- Shared/Environment.h - OpenMP GPU environments ------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Environments shared between host and device.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_SHARED_ENVIRONMENT_H
-#define OMPTARGET_SHARED_ENVIRONMENT_H
-
-#ifdef OMPTARGET_DEVICE_RUNTIME
-#include "Types.h"
-#else
-#include "SourceInfo.h"
-
-#include <cstdint>
-
-using IdentTy = ident_t;
-#endif
-
-#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
-
-enum class DeviceDebugKind : uint32_t {
-  Assertion = 1U << 0,
-  FunctionTracing = 1U << 1,
-  CommonIssues = 1U << 2,
-  AllocationTracker = 1U << 3,
-};
-
-struct DeviceEnvironmentTy {
-  uint32_t DeviceDebugKind;
-  uint32_t NumDevices;
-  uint32_t DeviceNum;
-  uint32_t DynamicMemSize;
-  uint64_t ClockFrequency;
-  uintptr_t IndirectCallTable;
-  uint64_t IndirectCallTableSize;
-  uint64_t HardwareParallelism;
-};
-
-struct DeviceMemoryPoolTy {
-  void *Ptr;
-  uint64_t Size;
-};
-
-struct DeviceMemoryPoolTrackingTy {
-  uint64_t NumAllocations;
-  uint64_t AllocationTotal;
-  uint64_t AllocationMin;
-  uint64_t AllocationMax;
-
-  void combine(DeviceMemoryPoolTrackingTy &Other) {
-    NumAllocations += Other.NumAllocations;
-    AllocationTotal += Other.AllocationTotal;
-    AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
-                                                        : AllocationMin;
-    AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
-                                                        : AllocationMax;
-  }
-};
-
-// NOTE: Please don't change the order of those members as their indices are
-// used in the middle end. Always add the new data member at the end.
-// Different from KernelEnvironmentTy below, this structure contains members
-// that might be modified at runtime.
-struct DynamicEnvironmentTy {
-  /// Current indentation level for the function trace. Only accessed by thread
-  /// 0.
-  uint16_t DebugIndentionLevel;
-};
-
-// NOTE: Please don't change the order of those members as their indices are
-// used in the middle end. Always add the new data member at the end.
-struct ConfigurationEnvironmentTy {
-  uint8_t UseGenericStateMachine = 2;
-  uint8_t MayUseNestedParallelism = 2;
-  llvm::omp::OMPTgtExecModeFlags ExecMode = llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
-  // Information about (legal) launch configurations.
-  //{
-  int32_t MinThreads = -1;
-  int32_t MaxThreads = -1;
-  int32_t MinTeams = -1;
-  int32_t MaxTeams = -1;
-  int32_t ReductionDataSize = 0;
-  int32_t ReductionBufferLength = 0;
-  //}
-};
-
-// NOTE: Please don't change the order of those members as their indices are
-// used in the middle end. Always add the new data member at the end.
-struct KernelEnvironmentTy {
-  ConfigurationEnvironmentTy Configuration;
-  IdentTy *Ident = nullptr;
-  DynamicEnvironmentTy *DynamicEnv = nullptr;
-};
-
-struct KernelLaunchEnvironmentTy {
-  uint32_t ReductionCnt = 0;
-  uint32_t ReductionIterCnt = 0;
-  void *ReductionBuffer = nullptr;
-};
-
-#endif // OMPTARGET_SHARED_ENVIRONMENT_H

diff --git a/libomptarget/include/Shared/EnvironmentVar.h b/libomptarget/include/Shared/EnvironmentVar.h
deleted file mode 100644
index 4cbdad6..0000000
--- a/libomptarget/include/Shared/EnvironmentVar.h
+++ /dev/null

@@ -1,194 +0,0 @@
-//===-- Shared/EnvironmentVar.h - Environment variable handling -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_SHARED_ENVIRONMENT_VAR_H
-#define OMPTARGET_SHARED_ENVIRONMENT_VAR_H
-
-#include "Debug.h"
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Error.h"
-
-#include <sstream>
-#include <string>
-
-/// Utility class for parsing strings to other types.
-struct StringParser {
-  /// Parse a string to another type.
-  template <typename Ty> static bool parse(const char *Value, Ty &Result);
-};
-
-/// Class for reading and checking environment variables. Currently working with
-/// integer, floats, std::string and bool types.
-template <typename Ty> class Envar {
-  Ty Data;
-  bool IsPresent;
-  bool Initialized;
-
-public:
-  /// Auxiliary function to safely create envars. This static function safely
-  /// creates envars using fallible constructors. See the constructors to know
-  /// more details about the creation parameters.
-  template <typename... ArgsTy>
-  static llvm::Expected<Envar> create(ArgsTy &&...Args) {
-    llvm::Error Err = llvm::Error::success();
-    Envar Envar(std::forward<ArgsTy>(Args)..., Err);
-    if (Err)
-      return std::move(Err);
-    return std::move(Envar);
-  }
-
-  /// Create an empty envar. Cannot be consulted. This constructor is merely
-  /// for convenience. This constructor is not fallible.
-  Envar() : Data(Ty()), IsPresent(false), Initialized(false) {}
-
-  /// Create an envar with a name and an optional default. The Envar object will
-  /// take the value read from the environment variable, or the default if it
-  /// was not set or not correct. This constructor is not fallible.
-  Envar(llvm::StringRef Name, Ty Default = Ty())
-      : Data(Default), IsPresent(false), Initialized(true) {
-
-    if (const char *EnvStr = getenv(Name.data())) {
-      // Check whether the envar is defined and valid.
-      IsPresent = StringParser::parse<Ty>(EnvStr, Data);
-
-      if (!IsPresent) {
-        DP("Ignoring invalid value %s for envar %s\n", EnvStr, Name.data());
-        Data = Default;
-      }
-    }
-  }
-
-  Envar<Ty> &operator=(const Ty &V) {
-    Data = V;
-    Initialized = true;
-    return *this;
-  }
-
-  /// Get the definitive value.
-  const Ty &get() const {
-    // Abort with a fatal message in case this envar is not initialized.
-    if (!Initialized)
-      FATAL_MESSAGE0(1, "Consulting envar before initialization");
-
-    return Data;
-  }
-
-  /// Get the definitive value.
-  operator Ty() const { return get(); }
-
-  /// Indicate whether the environment variable was defined and valid.
-  bool isPresent() const { return IsPresent; }
-
-private:
-  /// This constructor should never fail but we provide it for convenience. This
-  /// way, the constructor can be used by the Envar::create() static function
-  /// to safely create this kind of envars.
-  Envar(llvm::StringRef Name, Ty Default, llvm::Error &Err)
-      : Envar(Name, Default) {
-    llvm::ErrorAsOutParameter EAO(&Err);
-    Err = llvm::Error::success();
-  }
-
-  /// Create an envar with a name, getter function and a setter function. The
-  /// Envar object will take the value read from the environment variable if
-  /// this value is accepted by the setter function. Otherwise, the getter
-  /// function will be executed to get the default value. The getter should be
-  /// of the form Error GetterFunctionTy(Ty &Value) and the setter should
-  /// be of the form Error SetterFunctionTy(Ty Value). This constructor has a
-  /// private visibility because it is a fallible constructor. Please use the
-  /// Envar::create() static function to safely create this object instead.
-  template <typename GetterFunctor, typename SetterFunctor>
-  Envar(llvm::StringRef Name, GetterFunctor Getter, SetterFunctor Setter,
-        llvm::Error &Err)
-      : Data(Ty()), IsPresent(false), Initialized(true) {
-    llvm::ErrorAsOutParameter EAO(&Err);
-    Err = init(Name, Getter, Setter);
-  }
-
-  template <typename GetterFunctor, typename SetterFunctor>
-  llvm::Error init(llvm::StringRef Name, GetterFunctor Getter,
-                   SetterFunctor Setter);
-};
-
-/// Define some common envar types.
-using IntEnvar = Envar<int>;
-using Int32Envar = Envar<int32_t>;
-using Int64Envar = Envar<int64_t>;
-using UInt32Envar = Envar<uint32_t>;
-using UInt64Envar = Envar<uint64_t>;
-using StringEnvar = Envar<std::string>;
-using BoolEnvar = Envar<bool>;
-
-template <>
-inline bool StringParser::parse(const char *ValueStr, bool &Result) {
-  std::string Value(ValueStr);
-
-  // Convert the string to lowercase.
-  std::transform(Value.begin(), Value.end(), Value.begin(),
-                 [](unsigned char c) { return std::tolower(c); });
-
-  // May be implemented with fancier C++ features, but let's keep it simple.
-  if (Value == "true" || Value == "yes" || Value == "on" || Value == "1")
-    Result = true;
-  else if (Value == "false" || Value == "no" || Value == "off" || Value == "0")
-    Result = false;
-  else
-    return false;
-
-  // Parsed correctly.
-  return true;
-}
-
-template <typename Ty>
-inline bool StringParser::parse(const char *Value, Ty &Result) {
-  assert(Value && "Parsed value cannot be null");
-
-  std::istringstream Stream(Value);
-  Stream >> Result;
-
-  return !Stream.fail();
-}
-
-template <typename Ty>
-template <typename GetterFunctor, typename SetterFunctor>
-inline llvm::Error Envar<Ty>::init(llvm::StringRef Name, GetterFunctor Getter,
-                                   SetterFunctor Setter) {
-  // Get the default value.
-  Ty Default;
-  if (llvm::Error Err = Getter(Default))
-    return Err;
-
-  if (const char *EnvStr = getenv(Name.data())) {
-    IsPresent = StringParser::parse<Ty>(EnvStr, Data);
-    if (IsPresent) {
-      // Check whether the envar value is actually valid.
-      llvm::Error Err = Setter(Data);
-      if (Err) {
-        // The setter reported an invalid value. Mark the user-defined value as
-        // not present and reset to the getter value (default).
-        IsPresent = false;
-        Data = Default;
-        DP("Setter of envar %s failed, resetting to %s\n", Name.data(),
-           std::to_string(Data).data());
-        consumeError(std::move(Err));
-      }
-    } else {
-      DP("Ignoring invalid value %s for envar %s\n", EnvStr, Name.data());
-      Data = Default;
-    }
-  } else {
-    Data = Default;
-  }
-
-  return llvm::Error::success();
-}
-
-#endif // OMPTARGET_SHARED_ENVIRONMENT_VAR_H

diff --git a/libomptarget/include/Shared/PluginAPI.h b/libomptarget/include/Shared/PluginAPI.h
deleted file mode 100644
index ecf669c..0000000
--- a/libomptarget/include/Shared/PluginAPI.h
+++ /dev/null

@@ -1,232 +0,0 @@
-//===-- Shared/PluginAPI.h - Target independent plugin API ------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines an interface between target independent OpenMP offload
-// runtime library libomptarget and target dependent plugin.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_SHARED_PLUGIN_API_H
-#define OMPTARGET_SHARED_PLUGIN_API_H
-
-#include <cstddef>
-#include <cstdint>
-
-#include "Shared/APITypes.h"
-
-extern "C" {
-
-// First method called on the plugin
-int32_t __tgt_rtl_init_plugin();
-
-// Return the number of available devices of the type supported by the
-// target RTL.
-int32_t __tgt_rtl_number_of_devices(void);
-
-// Return an integer different from zero if the provided device image can be
-// supported by the runtime. The functionality is similar to comparing the
-// result of __tgt_rtl_load_binary to NULL. However, this is meant to be a
-// lightweight query to determine if the RTL is suitable for an image without
-// having to load the library, which can be expensive.
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image);
-
-// Return an integer other than zero if data can be exchanged from SrcDevId
-// to DstDevId. If data is exchangeable, the device plugin should provide a
-// function to move data from the source device to the destination directly.
-int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int32_t DstDevId);
-
-// Initialize the requires flags for the device.
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags);
-
-// Initialize the specified device. In case of success return 0; otherwise
-// return an error code.
-int32_t __tgt_rtl_init_device(int32_t ID);
-
-// Pass an executable image section described by image to the specified
-// device. NOTE(review): this comment predates the current signature; the
-// function returns an int32_t status (presumably zero on success, like the
-// other entry points here) and reports the loaded binary through Binary
-// rather than returning a pointer to an address table; confirm in the plugin.
-int32_t __tgt_rtl_load_binary(int32_t ID, __tgt_device_image *Image,
-                              __tgt_device_binary *Binary);
-
-// Look up the device address of the named symbol in the given binary. Returns
-// non-zero on failure.
-int32_t __tgt_rtl_get_global(__tgt_device_binary Binary, uint64_t Size,
-                             const char *Name, void **DevicePtr);
-
-// Look up the device address of the named kernel in the given binary. Returns
-// non-zero on failure.
-int32_t __tgt_rtl_get_function(__tgt_device_binary Binary, const char *Name,
-                               void **DevicePtr);
-
-// Allocate data on the particular target device, of the specified size.
-// HostPtr is the address of the host data the allocated target data
-// will be associated with (HostPtr may be NULL if it is not known at
-// allocation time, like for example it would be for target data that
-// is allocated by omp_target_alloc() API). Return address of the
-// allocated data on the target that will be used by libomptarget.so to
-// initialize the target data mapping structures. These addresses are
-// used to generate a table of target variables to pass to
-// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in
-// case an error occurred on the target device. Kind dictates what allocator
-// to use (e.g. shared, host, device).
-void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr,
-                           int32_t Kind);
-
-// Pass the data content to the target device using the target address. In case
-// of success, return zero. Otherwise, return an error code.
-int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
-                              int64_t Size);
-
-int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr,
-                                    int64_t Size, __tgt_async_info *AsyncInfo);
-
-// Retrieve the data content from the target device using its address. In case
-// of success, return zero. Otherwise, return an error code.
-int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
-                                int64_t Size);
-
-// Asynchronous version of __tgt_rtl_data_retrieve
-int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr,
-                                      void *TargetPtr, int64_t Size,
-                                      __tgt_async_info *AsyncInfo);
-
-// Copy the data content from one target device to another target device using
-// its address. This operation does not need to copy data back to host and then
-// from host to another device. In case of success, return zero. Otherwise,
-// return an error code.
-int32_t __tgt_rtl_data_exchange(int32_t SrcID, void *SrcPtr, int32_t DstID,
-                                void *DstPtr, int64_t Size);
-
-// Asynchronous version of __tgt_rtl_data_exchange
-int32_t __tgt_rtl_data_exchange_async(int32_t SrcID, void *SrcPtr,
-                                      int32_t DesID, void *DstPtr, int64_t Size,
-                                      __tgt_async_info *AsyncInfo);
-
-// De-allocate the data referenced by target ptr on the device. In case of
-// success, return zero. Otherwise, return an error code. Kind dictates what
-// allocator to use (e.g. shared, host, device).
-int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr, int32_t Kind);
-
-// Transfer control to the offloaded entry Entry on the target device.
-// Args and Offsets are arrays of NumArgs size of target addresses and
-// offsets. An offset should be added to the target address before passing it
-// to the outlined function on device side. If AsyncInfo is nullptr, it is
-// synchronous; otherwise it is asynchronous. However, AsyncInfo may be
-// ignored on some platforms, like x86_64. In that case, it is synchronous. In
-// case of success, return zero. Otherwise, return an error code.
-int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
-                                    ptrdiff_t *Offsets, int32_t NumArgs);
-
-// Asynchronous version of __tgt_rtl_run_target_region
-int32_t __tgt_rtl_run_target_region_async(int32_t ID, void *Entry, void **Args,
-                                          ptrdiff_t *Offsets, int32_t NumArgs,
-                                          __tgt_async_info *AsyncInfo);
-
-// Similar to __tgt_rtl_run_target_region, but additionally specify the
-// number of teams to be created and a number of threads in each team. If
-// AsyncInfo is nullptr, it is synchronous; otherwise it is asynchronous.
-// However, AsyncInfo may be ignored on some platforms, like x86_64. In that
-// case, it is synchronous.
-int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
-                                         ptrdiff_t *Offsets, int32_t NumArgs,
-                                         int32_t NumTeams, int32_t ThreadLimit,
-                                         uint64_t LoopTripcount);
-
-// Asynchronous version of __tgt_rtl_run_target_team_region
-int32_t __tgt_rtl_run_target_team_region_async(
-    int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs,
-    int32_t NumTeams, int32_t ThreadLimit, uint64_t LoopTripcount,
-    __tgt_async_info *AsyncInfo);
-
-// Device synchronization. In case of success, return zero. Otherwise, return an
-// error code.
-int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfo);
-
-// Queries for the completion of asynchronous operations. Instead of blocking
-// the calling thread as __tgt_rtl_synchronize, the progress of the operations
-// stored in AsyncInfo->Queue is queried in a non-blocking manner, partially
-// advancing their execution. If all operations are completed, AsyncInfo->Queue
-// is set to nullptr. If there are still pending operations, AsyncInfo->Queue is
-// kept as a valid queue. In any case of success (i.e., successful query
-// with/without completing all operations), return zero. Otherwise, return an
-// error code.
-int32_t __tgt_rtl_query_async(int32_t ID, __tgt_async_info *AsyncInfo);
-
-// Set plugin's internal information flag externally.
-void __tgt_rtl_set_info_flag(uint32_t);
-
-// Print the device information
-void __tgt_rtl_print_device_info(int32_t ID);
-
-// Event related interfaces. It is expected to use the interfaces in the
-// following way:
-// 1) Create an event on the target device (__tgt_rtl_create_event).
-// 2) Record the event based on the status of \p AsyncInfo->Queue at the moment
-// of function call to __tgt_rtl_record_event. An event becomes "meaningful"
-// once it is recorded, such that others can depend on it.
-// 3) Call __tgt_rtl_wait_event to set dependence on the event. Whether the
-// operation is blocking or non-blocking depends on the target. It is expected
-// to be non-blocking, just set dependence and return.
-// 4) Call __tgt_rtl_sync_event to sync the event. It is expected to block the
-// thread calling the function.
-// 5) Destroy the event (__tgt_rtl_destroy_event).
-// {
-int32_t __tgt_rtl_create_event(int32_t ID, void **Event);
-
-int32_t __tgt_rtl_record_event(int32_t ID, void *Event,
-                               __tgt_async_info *AsyncInfo);
-
-int32_t __tgt_rtl_wait_event(int32_t ID, void *Event,
-                             __tgt_async_info *AsyncInfo);
-
-int32_t __tgt_rtl_sync_event(int32_t ID, void *Event);
-
-int32_t __tgt_rtl_destroy_event(int32_t ID, void *Event);
-// }
-
-int32_t __tgt_rtl_init_async_info(int32_t ID, __tgt_async_info **AsyncInfoPtr);
-int32_t __tgt_rtl_init_device_info(int32_t ID, __tgt_device_info *DeviceInfoPtr,
-                                   const char **ErrStr);
-
-// lock/pin host memory
-int32_t __tgt_rtl_data_lock(int32_t ID, void *HstPtr, int64_t Size,
-                            void **LockedPtr);
-
-// unlock/unpin host memory
-int32_t __tgt_rtl_data_unlock(int32_t ID, void *HstPtr);
-
-// Notify the plugin about a new mapping starting at the host address \p HstPtr
-// and \p Size bytes. The plugin may lock/pin that buffer to achieve optimal
-// memory transfers involving that buffer.
-int32_t __tgt_rtl_data_notify_mapped(int32_t ID, void *HstPtr, int64_t Size);
-
-// Notify the plugin about an existing mapping being unmapped, starting at the
-// host address \p HstPtr and \p Size bytes.
-int32_t __tgt_rtl_data_notify_unmapped(int32_t ID, void *HstPtr);
-
-// Set the global device identifier offset, such that the plugin may determine a
-// unique device number.
-int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset);
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                KernelArgsTy *KernelArgs,
-                                __tgt_async_info *AsyncInfoPtr);
-
-int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
-                                           void *VAddr, bool isRecord,
-                                           bool SaveOutput,
-                                           uint64_t &ReqPtrArgOffset);
-
-// Returns true if the device \p DeviceId suggests to use auto zero-copy.
-int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId);
-}
-
-#endif // OMPTARGET_SHARED_PLUGIN_API_H

diff --git a/libomptarget/include/Shared/PluginAPI.inc b/libomptarget/include/Shared/PluginAPI.inc
deleted file mode 100644
index e445da6..0000000
--- a/libomptarget/include/Shared/PluginAPI.inc
+++ /dev/null

@@ -1,51 +0,0 @@
-//===-- Shared/PluginAPI.inc - Target independent plugin API ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the names of the interface functions between target
-// independent offload runtime library and target dependent plugins.
-//
-//===----------------------------------------------------------------------===//
-
-// No include guards!
-
-PLUGIN_API_HANDLE(init_plugin);
-PLUGIN_API_HANDLE(is_valid_binary);
-PLUGIN_API_HANDLE(is_data_exchangable);
-PLUGIN_API_HANDLE(number_of_devices);
-PLUGIN_API_HANDLE(init_device);
-PLUGIN_API_HANDLE(load_binary);
-PLUGIN_API_HANDLE(get_global);
-PLUGIN_API_HANDLE(get_function);
-PLUGIN_API_HANDLE(data_alloc);
-PLUGIN_API_HANDLE(data_submit);
-PLUGIN_API_HANDLE(data_submit_async);
-PLUGIN_API_HANDLE(data_retrieve);
-PLUGIN_API_HANDLE(data_retrieve_async);
-PLUGIN_API_HANDLE(data_exchange);
-PLUGIN_API_HANDLE(data_exchange_async);
-PLUGIN_API_HANDLE(data_delete);
-PLUGIN_API_HANDLE(launch_kernel);
-PLUGIN_API_HANDLE(init_requires);
-PLUGIN_API_HANDLE(synchronize);
-PLUGIN_API_HANDLE(query_async);
-PLUGIN_API_HANDLE(set_info_flag);
-PLUGIN_API_HANDLE(print_device_info);
-PLUGIN_API_HANDLE(create_event);
-PLUGIN_API_HANDLE(record_event);
-PLUGIN_API_HANDLE(wait_event);
-PLUGIN_API_HANDLE(sync_event);
-PLUGIN_API_HANDLE(destroy_event);
-PLUGIN_API_HANDLE(init_async_info);
-PLUGIN_API_HANDLE(init_device_info);
-PLUGIN_API_HANDLE(data_lock);
-PLUGIN_API_HANDLE(data_unlock);
-PLUGIN_API_HANDLE(data_notify_mapped);
-PLUGIN_API_HANDLE(data_notify_unmapped);
-PLUGIN_API_HANDLE(set_device_offset);
-PLUGIN_API_HANDLE(initialize_record_replay);
-PLUGIN_API_HANDLE(use_auto_zero_copy);

diff --git a/libomptarget/include/Shared/Profile.h b/libomptarget/include/Shared/Profile.h
deleted file mode 100644
index 39817ba..0000000
--- a/libomptarget/include/Shared/Profile.h
+++ /dev/null

@@ -1,112 +0,0 @@
-//===-- Shared/Profile.h - Target independent OpenMP target RTL -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Macros to provide profile support via LLVM's time profiler.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_SHARED_PROFILE_H
-#define OMPTARGET_SHARED_PROFILE_H
-
-#include "Shared/Debug.h"
-#include "Shared/EnvironmentVar.h"
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/TimeProfiler.h"
-
-/// Class that holds the singleton profiler and allows to start/end events.
-class Profiler {
-
-  Profiler() {
-    if (!ProfileTraceFile.isPresent())
-      return;
-
-    // TODO: Add an alias without LIBOMPTARGET
-    // Flag to modify the profile granularity (in us).
-    Int32Envar ProfileGranularity =
-        Int32Envar("LIBOMPTARGET_PROFILE_GRANULARITY", 500);
-
-    llvm::timeTraceProfilerInitialize(ProfileGranularity /*us=*/,
-                                      "libomptarget");
-  }
-
-  ~Profiler() {
-    if (!ProfileTraceFile.isPresent())
-      return;
-
-    if (auto Err = llvm::timeTraceProfilerWrite(ProfileTraceFile.get(), "-"))
-      REPORT("Error writing out the time trace: %s\n",
-             llvm::toString(std::move(Err)).c_str());
-
-    llvm::timeTraceProfilerCleanup();
-  }
-
-  // TODO: Add an alias without LIBOMPTARGET
-  /// Flag to enable profiling which also specifies the file profile information
-  /// is stored in.
-  StringEnvar ProfileTraceFile = StringEnvar("LIBOMPTARGET_PROFILE");
-
-public:
-  static Profiler &get() {
-    static Profiler P;
-    return P;
-  }
-
-  /// Manually begin a time section, with the given \p Name and \p Detail.
-  /// Profiler copies the string data, so the pointers can be given into
-  /// temporaries. Time sections can be hierarchical; every Begin must have a
-  /// matching End pair but they can nest.
-  void beginSection(llvm::StringRef Name, llvm::StringRef Detail) {
-    llvm::timeTraceProfilerBegin(Name, Detail);
-  }
-  void beginSection(llvm::StringRef Name,
-                    llvm::function_ref<std::string()> Detail) {
-    llvm::timeTraceProfilerBegin(Name, Detail);
-  }
-
-  /// Manually end the last time section.
-  void endSection() { llvm::timeTraceProfilerEnd(); }
-};
-
-/// Time spend in the current scope, assigned to the function name.
-#define TIMESCOPE() llvm::TimeTraceScope TimeScope(__PRETTY_FUNCTION__)
-
-/// Time spend in the current scope, assigned to the function name and source
-/// info.
-#define TIMESCOPE_WITH_IDENT(IDENT)                                            \
-  SourceInfo SI(IDENT);                                                        \
-  llvm::TimeTraceScope TimeScope(__FUNCTION__, SI.getProfileLocation())
-
-/// Time spend in the current scope, assigned to the given name and source
-/// info.
-#define TIMESCOPE_WITH_NAME_AND_IDENT(NAME, IDENT)                             \
-  SourceInfo SI(IDENT);                                                        \
-  llvm::TimeTraceScope TimeScope(NAME, SI.getProfileLocation())
-
-/// Time spend in the current scope, assigned to the function name and source
-/// info and RegionTypeMsg.
-#define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT)                     \
-  SourceInfo SI(IDENT);                                                        \
-  std::string ProfileLocation = SI.getProfileLocation();                       \
-  std::string RTM = RegionTypeMsg;                                             \
-  llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
-
-/// Time spend in the current scope, assigned to the regionType
-/// with details from runtime
-#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT)        \
-  SourceInfo SI(IDENT);                                                        \
-  std::string ProfileLocation = SI.getProfileLocation();                       \
-  llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)
-
-/// Time spend in the current scope, assigned to the function name and source
-/// with details
-#define TIMESCOPE_WITH_DETAILS(Details)                                        \
-  llvm::TimeTraceScope TimeScope(__FUNCTION__, Details)
-
-#endif // OMPTARGET_SHARED_PROFILE_H

diff --git a/libomptarget/include/Shared/Requirements.h b/libomptarget/include/Shared/Requirements.h
deleted file mode 100644
index b16a165..0000000
--- a/libomptarget/include/Shared/Requirements.h
+++ /dev/null

@@ -1,101 +0,0 @@
-//===-- OpenMP/Requirements.h - User required requirements -----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Handling of the `omp requires` directive, e.g., requiring unified shared
-// memory.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_OPENMP_REQUIREMENTS_H
-#define OMPTARGET_OPENMP_REQUIREMENTS_H
-
-#include "Shared/Debug.h"
-
-#include "llvm/ADT/StringRef.h"
-
-#include <cassert>
-#include <cstdint>
-
-enum OpenMPOffloadingRequiresDirFlags : int64_t {
-  /// flag undefined.
-  OMP_REQ_UNDEFINED = 0x000,
-  /// no requires directive present.
-  OMP_REQ_NONE = 0x001,
-  /// reverse_offload clause.
-  OMP_REQ_REVERSE_OFFLOAD = 0x002,
-  /// unified_address clause.
-  OMP_REQ_UNIFIED_ADDRESS = 0x004,
-  /// unified_shared_memory clause.
-  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
-  /// dynamic_allocators clause.
-  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
-  /// Auto zero-copy extension:
-  /// when running on an APU, the GPU plugin may decide to
-  /// run in zero-copy even though the user did not program
-  /// their application with unified_shared_memory requirement.
-  OMPX_REQ_AUTO_ZERO_COPY = 0x020
-};
-
-class RequirementCollection {
-  int64_t SetFlags = OMP_REQ_UNDEFINED;
-
-  /// Check consistency between different requires flags (from different
-  /// translation units).
-  void checkConsistency(int64_t NewFlags, int64_t SetFlags,
-                        OpenMPOffloadingRequiresDirFlags Flag,
-                        llvm::StringRef Clause) {
-    if ((SetFlags & Flag) != (NewFlags & Flag)) {
-      FATAL_MESSAGE(2, "'#pragma omp requires %s' not used consistently!",
-                    Clause.data());
-    }
-  }
-
-public:
-  /// Register \p NewFlags as part of the user requirements.
-  void addRequirements(int64_t NewFlags) {
-    // TODO: add more elaborate check.
-    // Minimal check: only set requires flags if previous value
-    // is undefined. This ensures that only the first call to this
-    // function will set the requires flags. All subsequent calls
-    // will be checked for compatibility.
-    assert(NewFlags != OMP_REQ_UNDEFINED &&
-           "illegal undefined flag for requires directive!");
-    if (SetFlags == OMP_REQ_UNDEFINED) {
-      SetFlags = NewFlags;
-      return;
-    }
-
-    // Auto zero-copy is only valid when no other requirement has been set
-    // and it is computed at device initialization time, after the requirement
-    // flag has already been set to OMP_REQ_NONE.
-    if (SetFlags == OMP_REQ_NONE && NewFlags == OMPX_REQ_AUTO_ZERO_COPY) {
-      SetFlags = NewFlags;
-      return;
-    }
-
-    // If multiple compilation units are present enforce
-    // consistency across all of them for require clauses:
-    //  - reverse_offload
-    //  - unified_address
-    //  - unified_shared_memory
-    //  - dynamic_allocators
-    checkConsistency(NewFlags, SetFlags, OMP_REQ_REVERSE_OFFLOAD,
-                     "reverse_offload");
-    checkConsistency(NewFlags, SetFlags, OMP_REQ_UNIFIED_ADDRESS,
-                     "unified_address");
-    checkConsistency(NewFlags, SetFlags, OMP_REQ_UNIFIED_SHARED_MEMORY,
-                     "unified_shared_memory");
-    checkConsistency(NewFlags, SetFlags, OMP_REQ_DYNAMIC_ALLOCATORS,
-                     "dynamic_allocators");
-  }
-
-  /// Return the user provided requirements.
-  int64_t getRequirements() const { return SetFlags; }
-};
-
-#endif // OMPTARGET_OPENMP_DEVICE_REQUIREMENTS_H

diff --git a/libomptarget/include/Shared/SourceInfo.h b/libomptarget/include/Shared/SourceInfo.h
deleted file mode 100644
index 63488ff..0000000
--- a/libomptarget/include/Shared/SourceInfo.h
+++ /dev/null

@@ -1,114 +0,0 @@
-//===-- SharedSourceInfo.h - Target independent OpenMP target RTL - C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Methods used to describe source information in target regions
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_SHARED_SOURCE_INFO_H
-#define OMPTARGET_SHARED_SOURCE_INFO_H
-
-#include <cstdint>
-#include <string>
-
-#ifdef _WIN32
-constexpr bool OSWindows = true;
-#else
-constexpr bool OSWindows = false;
-#endif
-
-/// Type alias for source location information for variable mappings with
-/// data layout ";name;filename;row;col;;\0" from clang.
-using map_var_info_t = void *;
-
-/// The ident structure that describes a source location from kmp.h. with
-/// source location string data as ";filename;function;line;column;;\0".
-struct ident_t {
-  // Ident_t flags described in kmp.h.
-  int32_t reserved_1;
-  int32_t flags;
-  int32_t reserved_2;
-  int32_t reserved_3;
-  char const *psource;
-};
-
-/// Struct to hold source individual location information.
-class SourceInfo {
-  /// Underlying string copy of the original source information.
-  const std::string SourceStr;
-
-  /// Location fields extracted from the source information string.
-  const std::string Name;
-  const std::string Filename;
-  const int32_t Line;
-  const int32_t Column;
-
-  std::string initStr(const void *Name) {
-    if (!Name)
-      return ";unknown;unknown;0;0;;";
-
-    std::string Str = std::string(reinterpret_cast<const char *>(Name));
-    if (Str.find(';') == std::string::npos)
-      return ";" + Str + ";unknown;0;0;;";
-    return Str;
-  }
-
-  std::string initStr(const ident_t *Loc) {
-    if (!Loc)
-      return ";unknown;unknown;0;0;;";
-    return std::string(reinterpret_cast<const char *>(Loc->psource));
-  }
-
-  /// Get n-th substring in an expression separated by ;.
-  std::string getSubstring(const unsigned N) const {
-    std::size_t Begin = SourceStr.find(';');
-    std::size_t End = SourceStr.find(';', Begin + 1);
-    for (unsigned I = 0; I < N; I++) {
-      Begin = End;
-      End = SourceStr.find(';', Begin + 1);
-    }
-    return SourceStr.substr(Begin + 1, End - Begin - 1);
-  };
-
-  /// Get the filename from a full path.
-  std::string removePath(const std::string &Path) const {
-    std::size_t Pos = (OSWindows) ? Path.rfind('\\') : Path.rfind('/');
-    return Path.substr(Pos + 1);
-  };
-
-public:
-  SourceInfo(const ident_t *Loc)
-      : SourceStr(initStr(Loc)), Name(getSubstring(1)),
-        Filename(removePath(getSubstring(0))), Line(std::stoi(getSubstring(2))),
-        Column(std::stoi(getSubstring(3))) {}
-
-  SourceInfo(const map_var_info_t Name)
-      : SourceStr(initStr(Name)), Name(getSubstring(0)),
-        Filename(removePath(getSubstring(1))), Line(std::stoi(getSubstring(2))),
-        Column(std::stoi(getSubstring(3))) {}
-
-  const char *getName() const { return Name.c_str(); }
-  const char *getFilename() const { return Filename.c_str(); }
-  const char *getProfileLocation() const { return SourceStr.data(); }
-  int32_t getLine() const { return Line; }
-  int32_t getColumn() const { return Column; }
-  bool isAvailible() const { return (Line || Column); }
-};
-
-/// Standalone function for getting the variable name of a mapping.
-static inline std::string getNameFromMapping(const map_var_info_t Name) {
-  if (!Name)
-    return "unknown";
-
-  const std::string NameStr(reinterpret_cast<const char *>(Name));
-  std::size_t Begin = NameStr.find(';');
-  std::size_t End = NameStr.find(';', Begin + 1);
-  return NameStr.substr(Begin + 1, End - Begin - 1);
-}
-
-#endif // OMPTARGET_SHARED_SOURCE_INFO_H

diff --git a/libomptarget/include/Shared/Utils.h b/libomptarget/include/Shared/Utils.h
deleted file mode 100644
index fce14b5..0000000
--- a/libomptarget/include/Shared/Utils.h
+++ /dev/null

@@ -1,88 +0,0 @@
-//===-- Shared/Utils.h - Target independent OpenMP target RTL -- C++ ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Routines and classes used to provide useful functionalities like string
-// parsing and environment variables.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_SHARED_UTILS_H
-#define OMPTARGET_SHARED_UTILS_H
-
-#include "llvm/ADT/StringRef.h"
-
-#include "Debug.h"
-
-#include <atomic>
-#include <cassert>
-#include <limits>
-#include <memory>
-
-namespace llvm {
-namespace omp {
-namespace target {
-
-/// Utility class for thread-safe reference counting. Any class that needs
-/// objects' reference counting can inherit from this entity or have it as a
-/// class data member.
-template <typename Ty = uint32_t,
-          std::memory_order MemoryOrder = std::memory_order_relaxed>
-struct RefCountTy {
-  /// Create a refcount object initialized to zero.
-  RefCountTy() : Refs(0) {}
-
-  ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); }
-
-  /// Increase the reference count atomically.
-  void increase() { Refs.fetch_add(1, MemoryOrder); }
-
-  /// Decrease the reference count and return whether it became zero. Decreasing
-  /// the counter in more units than it was previously increased results in
-  /// undefined behavior.
-  bool decrease() {
-    Ty Prev = Refs.fetch_sub(1, MemoryOrder);
-    assert(Prev > 0 && "Invalid refcount");
-    return (Prev == 1);
-  }
-
-  Ty get() const { return Refs.load(MemoryOrder); }
-
-private:
-  /// The atomic reference counter.
-  std::atomic<Ty> Refs;
-};
-
-/// Return the difference (in bytes) between \p Begin and \p End.
-template <typename Ty = char>
-ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
-  return reinterpret_cast<const Ty *>(End) -
-         reinterpret_cast<const Ty *>(Begin);
-}
-
-/// Return \p Ptr advanced by \p Offset bytes.
-template <typename Ty> Ty *advanceVoidPtr(Ty *Ptr, int64_t Offset) {
-  static_assert(std::is_void<Ty>::value);
-  return const_cast<char *>(reinterpret_cast<const char *>(Ptr) + Offset);
-}
-
-/// Return \p Ptr aligned to \p Alignment bytes.
-template <typename Ty> Ty *alignPtr(Ty *Ptr, int64_t Alignment) {
-  size_t Space = std::numeric_limits<size_t>::max();
-  return std::align(Alignment, sizeof(char), Ptr, Space);
-}
-
-/// Round up \p V to a \p Boundary.
-template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
-  return (V + Boundary - 1) / Boundary * Boundary;
-}
-
-} // namespace target
-} // namespace omp
-} // namespace llvm
-
-#endif // OMPTARGET_SHARED_UTILS_H

diff --git a/libomptarget/include/Utils/ExponentialBackoff.h b/libomptarget/include/Utils/ExponentialBackoff.h
deleted file mode 100644
index 9751f92..0000000
--- a/libomptarget/include/Utils/ExponentialBackoff.h
+++ /dev/null

@@ -1,52 +0,0 @@
-//===-- Utils/ExponentialBackoff.h - Heuristic helper class ------*- C++ -*===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implement exponential backoff counting.
-// Linearly increments until given maximum, exponentially decrements based on
-// given backoff factor.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_UTILS_EXPONENTIAL_BACKOFF_H
-#define OMPTARGET_UTILS_EXPONENTIAL_BACKOFF_H
-
-#include <cassert>
-#include <cmath>
-#include <cstdint>
-
-namespace utils {
-
-class ExponentialBackoff {
-  int64_t Count = 0;
-  const int64_t MaxCount = 0;
-  const int64_t CountThreshold = 0;
-  const double BackoffFactor = 0;
-
-public:
-  ExponentialBackoff(int64_t MaxCount, int64_t CountThreshold,
-                     double BackoffFactor)
-      : MaxCount(MaxCount), CountThreshold(CountThreshold),
-        BackoffFactor(BackoffFactor) {
-    assert(MaxCount >= 0 &&
-           "ExponentialBackoff: maximum count value should be non-negative");
-    assert(CountThreshold >= 0 &&
-           "ExponentialBackoff: count threshold value should be non-negative");
-    assert(BackoffFactor >= 0 && BackoffFactor < 1 &&
-           "ExponentialBackoff: backoff factor should be in [0, 1) interval");
-  }
-
-  void increment() { Count = std::min(Count + 1, MaxCount); }
-
-  void decrement() { Count *= BackoffFactor; }
-
-  bool isAboveThreshold() const { return Count > CountThreshold; }
-};
-
-} // namespace utils
-
-#endif // OMPTARGET_UTILS_EXPONENTIAL_BACKOFF_H

diff --git a/libomptarget/include/device.h b/libomptarget/include/device.h
deleted file mode 100644
index bd28297..0000000
--- a/libomptarget/include/device.h
+++ /dev/null

@@ -1,166 +0,0 @@
-//===----------- device.h - Target independent OpenMP target RTL ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Declarations for managing devices that are handled by RTL plugins.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _OMPTARGET_DEVICE_H
-#define _OMPTARGET_DEVICE_H
-
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <cstring>
-#include <list>
-#include <map>
-#include <memory>
-#include <mutex>
-#include <set>
-
-#include "ExclusiveAccess.h"
-#include "OffloadEntry.h"
-#include "omptarget.h"
-#include "rtl.h"
-
-#include "OpenMP/Mapping.h"
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-
-// Forward declarations.
-struct PluginAdaptorTy;
-struct __tgt_bin_desc;
-struct __tgt_target_table;
-
-struct DeviceTy {
-  int32_t DeviceID;
-  PluginAdaptorTy *RTL;
-  int32_t RTLDeviceID;
-
-  DeviceTy(PluginAdaptorTy *RTL, int32_t DeviceID, int32_t RTLDeviceID);
-  // DeviceTy is not copyable
-  DeviceTy(const DeviceTy &D) = delete;
-  DeviceTy &operator=(const DeviceTy &D) = delete;
-
-  ~DeviceTy();
-
-  /// Try to initialize the device and return any failure.
-  llvm::Error init();
-
-  /// Provide access to the mapping handler.
-  MappingInfoTy &getMappingInfo() { return MappingInfo; }
-
-  llvm::Expected<__tgt_device_binary> loadBinary(__tgt_device_image *Img);
-
-  // device memory allocation/deallocation routines
-  /// Allocates \p Size bytes on the device, host or shared memory space
-  /// (depending on \p Kind) and returns the address/nullptr when
-  /// succeeds/fails. \p HstPtr is an address of the host data which the
-  /// allocated target data will be associated with. If it is unknown, the
-  /// default value of \p HstPtr is nullptr. Note: this function doesn't do
-  /// pointer association. Actually, all the __tgt_rtl_data_alloc
-  /// implementations ignore \p HstPtr. \p Kind dictates what allocator should
-  /// be used (host, shared, device).
-  void *allocData(int64_t Size, void *HstPtr = nullptr,
-                  int32_t Kind = TARGET_ALLOC_DEFAULT);
-
-  /// Deallocates memory which \p TgtPtrBegin points at and returns
-  /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. p Kind dictates what
-  /// allocator should be used (host, shared, device).
-  int32_t deleteData(void *TgtPtrBegin, int32_t Kind = TARGET_ALLOC_DEFAULT);
-
-  // Data transfer. When AsyncInfo is nullptr, the transfer will be
-  // synchronous.
-  // Copy data from host to device
-  int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
-                     AsyncInfoTy &AsyncInfo,
-                     HostDataToTargetTy *Entry = nullptr,
-                     MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr = nullptr);
-
-  // Copy data from device back to host
-  int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
-                       AsyncInfoTy &AsyncInfo,
-                       HostDataToTargetTy *Entry = nullptr,
-                       MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr = nullptr);
-
-  // Return true if data can be copied to DstDevice directly
-  bool isDataExchangable(const DeviceTy &DstDevice);
-
-  // Copy data from current device to destination device directly
-  int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
-                       int64_t Size, AsyncInfoTy &AsyncInfo);
-
-  /// Notify the plugin about a new mapping starting at the host address
-  /// \p HstPtr and \p Size bytes.
-  int32_t notifyDataMapped(void *HstPtr, int64_t Size);
-
-  /// Notify the plugin about an existing mapping being unmapped starting at
-  /// the host address \p HstPtr.
-  int32_t notifyDataUnmapped(void *HstPtr);
-
-  // Launch the kernel identified by \p TgtEntryPtr with the given arguments.
-  int32_t launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
-                       ptrdiff_t *TgtOffsets, KernelArgsTy &KernelArgs,
-                       AsyncInfoTy &AsyncInfo);
-
-  /// Synchronize device/queue/event based on \p AsyncInfo and return
-  /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
-  int32_t synchronize(AsyncInfoTy &AsyncInfo);
-
-  /// Query for device/queue/event based completion on \p AsyncInfo in a
-  /// non-blocking manner and return OFFLOAD_SUCCESS/OFFLOAD_FAIL when
-  /// succeeds/fails. Must be called multiple times until AsyncInfo is
-  /// completed and AsyncInfo.isDone() returns true.
-  int32_t queryAsync(AsyncInfoTy &AsyncInfo);
-
-  /// Calls the corresponding print device info function in the plugin.
-  bool printDeviceInfo();
-
-  /// Event related interfaces.
-  /// {
-  /// Create an event.
-  int32_t createEvent(void **Event);
-
-  /// Record the event based on status in AsyncInfo->Queue at the moment the
-  /// function is called.
-  int32_t recordEvent(void *Event, AsyncInfoTy &AsyncInfo);
-
-  /// Wait for an event. This function can be blocking or non-blocking,
-  /// depending on the implmentation. It is expected to set a dependence on the
-  /// event such that corresponding operations shall only start once the event
-  /// is fulfilled.
-  int32_t waitEvent(void *Event, AsyncInfoTy &AsyncInfo);
-
-  /// Synchronize the event. It is expected to block the thread.
-  int32_t syncEvent(void *Event);
-
-  /// Destroy the event.
-  int32_t destroyEvent(void *Event);
-  /// }
-
-  /// Print all offload entries to stderr.
-  void dumpOffloadEntries();
-
-  /// Ask the device whether the runtime should use auto zero-copy.
-  bool useAutoZeroCopy();
-
-private:
-  /// Deinitialize the device (and plugin).
-  void deinit();
-
-  /// All offload entries available on this device.
-  using DeviceOffloadEntriesMapTy =
-      llvm::DenseMap<llvm::StringRef, OffloadEntryTy>;
-  ProtectedObj<DeviceOffloadEntriesMapTy> DeviceOffloadEntries;
-
-  /// Handler to collect and organize host-2-device mapping information.
-  MappingInfoTy MappingInfo;
-};
-
-#endif

diff --git a/libomptarget/include/omptarget.h b/libomptarget/include/omptarget.h
deleted file mode 100644
index 323dee4..0000000
--- a/libomptarget/include/omptarget.h
+++ /dev/null

@@ -1,438 +0,0 @@
-//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Interface to be used by Clang during the codegen of a
-// target region.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _OMPTARGET_H_
-#define _OMPTARGET_H_
-
-#include "Shared/APITypes.h"
-#include "Shared/Environment.h"
-#include "Shared/SourceInfo.h"
-
-#include "OpenMP/InternalTypes.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <deque>
-#include <functional>
-#include <type_traits>
-
-#include "llvm/ADT/SmallVector.h"
-
-#define OFFLOAD_SUCCESS (0)
-#define OFFLOAD_FAIL (~0)
-
-#define OFFLOAD_DEVICE_DEFAULT -1
-
-// Don't format out enums and structs.
-// clang-format off
-
-/// return flags of __tgt_target_XXX public APIs
-enum __tgt_target_return_t : int {
-  /// successful offload executed on a target device
-  OMP_TGT_SUCCESS = 0,
-  /// offload may not execute on the requested target device
-  /// this scenario can be caused by the device not available or unsupported
-  /// as described in the Execution Model in the specification
-  /// this status may not be used for target device execution failure
-  /// which should be handled internally in libomptarget
-  OMP_TGT_FAIL = ~0
-};
-
-/// Data attributes for each data reference used in an OpenMP target region.
-enum tgt_map_type {
-  // No flags
-  OMP_TGT_MAPTYPE_NONE            = 0x000,
-  // copy data from host to device
-  OMP_TGT_MAPTYPE_TO              = 0x001,
-  // copy data from device to host
-  OMP_TGT_MAPTYPE_FROM            = 0x002,
-  // copy regardless of the reference count
-  OMP_TGT_MAPTYPE_ALWAYS          = 0x004,
-  // force unmapping of data
-  OMP_TGT_MAPTYPE_DELETE          = 0x008,
-  // map the pointer as well as the pointee
-  OMP_TGT_MAPTYPE_PTR_AND_OBJ     = 0x010,
-  // pass device base address to kernel
-  OMP_TGT_MAPTYPE_TARGET_PARAM    = 0x020,
-  // return base device address of mapped data
-  OMP_TGT_MAPTYPE_RETURN_PARAM    = 0x040,
-  // private variable - not mapped
-  OMP_TGT_MAPTYPE_PRIVATE         = 0x080,
-  // copy by value - not mapped
-  OMP_TGT_MAPTYPE_LITERAL         = 0x100,
-  // mapping is implicit
-  OMP_TGT_MAPTYPE_IMPLICIT        = 0x200,
-  // hint to allocate memory close to the target device
-  OMP_TGT_MAPTYPE_CLOSE           = 0x400,
-  // runtime error if not already allocated
-  OMP_TGT_MAPTYPE_PRESENT         = 0x1000,
-  // use a separate reference counter so that the data cannot be unmapped within
-  // the structured region
-  // This is an OpenMP extension for the sake of OpenACC support.
-  OMP_TGT_MAPTYPE_OMPX_HOLD       = 0x2000,
-  // descriptor for non-contiguous target-update
-  OMP_TGT_MAPTYPE_NON_CONTIG      = 0x100000000000,
-  // member of struct, member given by [16 MSBs] - 1
-  OMP_TGT_MAPTYPE_MEMBER_OF       = 0xffff000000000000
-};
-
-/// Flags for offload entries.
-enum OpenMPOffloadingDeclareTargetFlags {
-  /// Mark the entry global as having a 'link' attribute.
-  OMP_DECLARE_TARGET_LINK = 0x01,
-  /// Mark the entry global as being an indirectly callable function.
-  OMP_DECLARE_TARGET_INDIRECT = 0x08,
-  /// This is an entry corresponding to a requirement to be registered.
-  OMP_REGISTER_REQUIRES = 0x10,
-};
-
-enum TargetAllocTy : int32_t {
-  TARGET_ALLOC_DEVICE = 0,
-  TARGET_ALLOC_HOST,
-  TARGET_ALLOC_SHARED,
-  TARGET_ALLOC_DEFAULT,
-  /// The allocation will not block on other streams.
-  TARGET_ALLOC_DEVICE_NON_BLOCKING,
-};
-
-inline KernelArgsTy CTorDTorKernelArgs = {1,       0,       nullptr,   nullptr,
-	     nullptr, nullptr, nullptr,   nullptr,
-	     0,      {0,0},       {1, 0, 0}, {1, 0, 0}, 0};
-
-struct DeviceTy;
-
-/// The libomptarget wrapper around a __tgt_async_info object directly
-/// associated with a libomptarget layer device. RAII semantics to avoid
-/// mistakes.
-class AsyncInfoTy {
-public:
-  enum class SyncTy { BLOCKING, NON_BLOCKING };
-
-private:
-  /// Locations we used in (potentially) asynchronous calls which should live
-  /// as long as this AsyncInfoTy object.
-  std::deque<void *> BufferLocations;
-
-  /// Post-processing operations executed after a successful synchronization.
-  /// \note the post-processing function should return OFFLOAD_SUCCESS or
-  /// OFFLOAD_FAIL appropriately.
-  using PostProcFuncTy = std::function<int()>;
-  llvm::SmallVector<PostProcFuncTy> PostProcessingFunctions;
-
-  __tgt_async_info AsyncInfo;
-  DeviceTy &Device;
-
-public:
-  /// Synchronization method to be used.
-  SyncTy SyncType;
-
-  AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
-      : Device(Device), SyncType(SyncType) {}
-  ~AsyncInfoTy() { synchronize(); }
-
-  /// Implicit conversion to the __tgt_async_info which is used in the
-  /// plugin interface.
-  operator __tgt_async_info *() { return &AsyncInfo; }
-
-  /// Synchronize all pending actions.
-  ///
-  /// \note synchronization will be performed in a blocking or non-blocking
-  /// manner, depending on the SyncType.
-  ///
-  /// \note if the operations are completed, the registered post-processing
-  /// functions will be executed once and unregistered afterwards.
-  ///
-  /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately.
-  int synchronize();
-
-  /// Return a void* reference with a lifetime that is at least as long as this
-  /// AsyncInfoTy object. The location can be used as intermediate buffer.
-  void *&getVoidPtrLocation();
-
-  /// Check if all asynchronous operations are completed.
-  ///
-  /// \note only a lightweight check. If needed, use synchronize() to query the
-  /// status of AsyncInfo before checking.
-  ///
-  /// \returns true if there are no pending asynchronous operations, false
-  /// otherwise.
-  bool isDone() const;
-
-  /// Add a new post-processing function to be executed after synchronization.
-  ///
-  /// \param[in] Function is a templated function (e.g., function pointers,
-  /// lambdas, std::function) that is convertible to a PostProcFuncTy (i.e.,
-  /// it must have int() as its function signature).
-  template <typename FuncTy> void addPostProcessingFunction(FuncTy &&Function) {
-    static_assert(std::is_convertible_v<FuncTy, PostProcFuncTy>,
-                  "Invalid post-processing function type. Please check "
-                  "function signature!");
-    PostProcessingFunctions.emplace_back(Function);
-  }
-
-private:
-  /// Run all the post-processing functions sequentially.
-  ///
-  /// \note after a successful execution, all previously registered functions
-  /// are unregistered.
-  ///
-  /// \returns OFFLOAD_FAIL if any post-processing function failed,
-  /// OFFLOAD_SUCCESS otherwise.
-  int32_t runPostProcessing();
-
-  /// Check if the internal asynchronous info queue is empty or not.
-  ///
-  /// \returns true if empty, false otherwise.
-  bool isQueueEmpty() const;
-};
-
-// Wrapper for task stored async info objects.
-class TaskAsyncInfoWrapperTy {
-  // Invalid GTID as defined by libomp; keep in sync
-  static constexpr int KMP_GTID_DNE = -2;
-
-  const int ExecThreadID = KMP_GTID_DNE;
-  AsyncInfoTy LocalAsyncInfo;
-  AsyncInfoTy *AsyncInfo = &LocalAsyncInfo;
-  void **TaskAsyncInfoPtr = nullptr;
-
-public:
-  TaskAsyncInfoWrapperTy(DeviceTy &Device)
-      : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) {
-    // If we failed to acquire the current global thread id, we cannot
-    // re-enqueue the current task. Thus we should use the local blocking async
-    // info.
-    if (ExecThreadID == KMP_GTID_DNE)
-      return;
-
-    // Only tasks with an assigned task team can be re-enqueued and thus can
-    // use the non-blocking synchronization scheme. Thus we should use the local
-    // blocking async info, if we don't have one.
-    if (!__kmpc_omp_has_task_team(ExecThreadID))
-      return;
-
-    // Acquire a pointer to the AsyncInfo stored inside the current task being
-    // executed.
-    TaskAsyncInfoPtr = __kmpc_omp_get_target_async_handle_ptr(ExecThreadID);
-
-    // If we cannot acquire such pointer, fallback to using the local blocking
-    // async info.
-    if (!TaskAsyncInfoPtr)
-      return;
-
-    // When creating a new task async info, the task handle must always be
-    // invalid. We must never overwrite any task async handle and there should
-    // never be any valid handle stored inside the task at this point.
-    assert((*TaskAsyncInfoPtr) == nullptr &&
-           "Task async handle is not empty when dispatching new device "
-           "operations. The handle was not cleared properly or "
-           "__tgt_target_nowait_query should have been called!");
-
-    // If no valid async handle is present, a new AsyncInfo will be allocated
-    // and stored in the current task.
-    AsyncInfo = new AsyncInfoTy(Device, AsyncInfoTy::SyncTy::NON_BLOCKING);
-    *TaskAsyncInfoPtr = (void *)AsyncInfo;
-  }
-
-  ~TaskAsyncInfoWrapperTy() {
-    // Local async info destruction is automatically handled by ~AsyncInfoTy.
-    if (AsyncInfo == &LocalAsyncInfo)
-      return;
-
-    // If there are device operations still pending, return immediately without
-    // deallocating the handle.
-    if (!AsyncInfo->isDone())
-      return;
-
-    // Delete the handle and unset it from the OpenMP task data.
-    delete AsyncInfo;
-    *TaskAsyncInfoPtr = nullptr;
-  }
-
-  operator AsyncInfoTy &() { return *AsyncInfo; }
-};
-
-/// This struct is a record of non-contiguous information
-struct __tgt_target_non_contig {
-  uint64_t Offset;
-  uint64_t Count;
-  uint64_t Stride;
-};
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void ompx_dump_mapping_tables(void);
-int omp_get_num_devices(void);
-int omp_get_device_num(void);
-int omp_get_initial_device(void);
-void *omp_target_alloc(size_t Size, int DeviceNum);
-void omp_target_free(void *DevicePtr, int DeviceNum);
-int omp_target_is_present(const void *Ptr, int DeviceNum);
-int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
-                      size_t DstOffset, size_t SrcOffset, int DstDevice,
-                      int SrcDevice);
-int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
-                           int NumDims, const size_t *Volume,
-                           const size_t *DstOffsets, const size_t *SrcOffsets,
-                           const size_t *DstDimensions,
-                           const size_t *SrcDimensions, int DstDevice,
-                           int SrcDevice);
-void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
-int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
-                             size_t Size, size_t DeviceOffset, int DeviceNum);
-int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);
-
-/// Explicit target memory allocators
-/// Using the llvm_ prefix until they become part of the OpenMP standard.
-void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
-void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
-void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
-
-/// Explicit target memory deallocators
-/// Using the llvm_ prefix until they become part of the OpenMP standard.
-void llvm_omp_target_free_device(void *DevicePtr, int DeviceNum);
-void llvm_omp_target_free_host(void *DevicePtr, int DeviceNum);
-void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
-
-/// Dummy target so we have a symbol for generating host fallback.
-void *llvm_omp_target_dynamic_shared_alloc();
-
-/// add the clauses of the requires directives in a given file
-void __tgt_register_requires(int64_t Flags);
-
-/// Initializes the runtime library.
-void __tgt_rtl_init();
-
-/// Deinitializes the runtime library.
-void __tgt_rtl_deinit();
-
-/// adds a target shared library to the target execution image
-void __tgt_register_lib(__tgt_bin_desc *Desc);
-
-/// Initialize all RTLs at once
-void __tgt_init_all_rtls();
-
-/// removes a target shared library from the target execution image
-void __tgt_unregister_lib(__tgt_bin_desc *Desc);
-
-// creates the host to target data mapping, stores it in the
-// libomptarget.so internal structure (an entry in a stack of data maps) and
-// passes the data to the device;
-void __tgt_target_data_begin(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
-                             void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
-void __tgt_target_data_begin_nowait(int64_t DeviceId, int32_t ArgNum,
-                                    void **ArgsBase, void **Args,
-                                    int64_t *ArgSizes, int64_t *ArgTypes,
-                                    int32_t DepNum, void *DepList,
-                                    int32_t NoAliasDepNum,
-                                    void *NoAliasDepList);
-void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
-                                    int32_t ArgNum, void **ArgsBase,
-                                    void **Args, int64_t *ArgSizes,
-                                    int64_t *ArgTypes, map_var_info_t *ArgNames,
-                                    void **ArgMappers);
-void __tgt_target_data_begin_nowait_mapper(
-    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
-    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
-    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
-    void *NoAliasDepList);
-
-// passes data from the target, releases target memory and destroys the
-// host-target mapping (top entry from the stack of data maps) created by
-// the last __tgt_target_data_begin
-void __tgt_target_data_end(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
-                           void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
-void __tgt_target_data_end_nowait(int64_t DeviceId, int32_t ArgNum,
-                                  void **ArgsBase, void **Args,
-                                  int64_t *ArgSizes, int64_t *ArgTypes,
-                                  int32_t DepNum, void *DepList,
-                                  int32_t NoAliasDepNum, void *NoAliasDepList);
-void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
-                                  int32_t ArgNum, void **ArgsBase, void **Args,
-                                  int64_t *ArgSizes, int64_t *ArgTypes,
-                                  map_var_info_t *ArgNames, void **ArgMappers);
-void __tgt_target_data_end_nowait_mapper(
-    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
-    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
-    void **ArgMappers, int32_t depNum, void *depList, int32_t NoAliasDepNum,
-    void *NoAliasDepList);
-
-/// passes data to/from the target
-void __tgt_target_data_update(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
-                              void **Args, int64_t *ArgSizes,
-                              int64_t *ArgTypes);
-void __tgt_target_data_update_nowait(int64_t DeviceId, int32_t ArgNum,
-                                     void **ArgsBase, void **Args,
-                                     int64_t *ArgSizes, int64_t *ArgTypes,
-                                     int32_t DepNum, void *DepList,
-                                     int32_t NoAliasDepNum,
-                                     void *NoAliasDepList);
-void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
-                                     int32_t ArgNum, void **ArgsBase,
-                                     void **Args, int64_t *ArgSizes,
-                                     int64_t *ArgTypes,
-                                     map_var_info_t *ArgNames,
-                                     void **ArgMappers);
-void __tgt_target_data_update_nowait_mapper(
-    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
-    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
-    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
-    void *NoAliasDepList);
-
-// Performs the same actions as data_begin in case ArgNum is non-zero
-// and initiates run of offloaded region on target platform; if ArgNum
-// is non-zero after the region execution is done it also performs the
-// same action as data_end above. The following types are used; this
-// function returns 0 if it was able to transfer the execution to a
-// target and an int different from zero otherwise.
-int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
-                        int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args);
-
-// Non-blocking synchronization for target nowait regions. This function
-// acquires the asynchronous context from task data of the current task being
-// executed and tries to query for the completion of its operations. If the
-// operations are still pending, the function returns immediately. If the
-// operations are completed, all the post-processing procedures stored in the
-// asynchronous context are executed and the context is removed from the task
-// data.
-void __tgt_target_nowait_query(void **AsyncHandle);
-
-/// Executes a target kernel by replaying recorded kernel arguments and
-/// device memory.
-int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
-                               void *DeviceMemory, int64_t DeviceMemorySize,
-                               void **TgtArgs, ptrdiff_t *TgtOffsets,
-                               int32_t NumArgs, int32_t NumTeams,
-                               int32_t ThreadLimit, uint64_t LoopTripCount);
-
-void __tgt_set_info_flag(uint32_t);
-
-int __tgt_print_device_info(int64_t DeviceId);
-
-int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
-                                 void *VAddr, bool IsRecord, bool SaveOutput,
-                                 uint64_t &ReqPtrArgOffset);
-
-#ifdef __cplusplus
-}
-#endif
-
-#ifdef __cplusplus
-#define EXTERN extern "C"
-#else
-#define EXTERN extern
-#endif
-
-#endif // _OMPTARGET_H_

diff --git a/libomptarget/include/rtl.h b/libomptarget/include/rtl.h
deleted file mode 100644
index 5e198bd..0000000
--- a/libomptarget/include/rtl.h
+++ /dev/null

@@ -1,56 +0,0 @@
-//===------------ rtl.h - Target independent OpenMP target RTL ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Declarations for handling RTL plugins.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _OMPTARGET_RTL_H
-#define _OMPTARGET_RTL_H
-
-#include "llvm/ADT/SmallVector.h"
-
-#include "omptarget.h"
-
-#include <cstdint>
-#include <map>
-
-/// Map between the host entry begin and the translation table. Each
-/// registered library gets one TranslationTable. Use the map from
-/// __tgt_offload_entry so that we may quickly determine whether we
-/// are trying to (re)register an existing lib or really have a new one.
-struct TranslationTable {
-  __tgt_target_table HostTable;
-  llvm::SmallVector<__tgt_target_table> DeviceTables;
-
-  // Image assigned to a given device.
-  llvm::SmallVector<__tgt_device_image *>
-      TargetsImages; // One image per device ID.
-
-  // Arrays of entries active on the device.
-  llvm::SmallVector<llvm::SmallVector<__tgt_offload_entry>>
-      TargetsEntries; // One table per device ID.
-
-  // Table of entry points or NULL if it was not already computed.
-  llvm::SmallVector<__tgt_target_table *>
-      TargetsTable; // One table per device ID.
-};
-typedef std::map<__tgt_offload_entry *, TranslationTable>
-    HostEntriesBeginToTransTableTy;
-
-/// Map between the host ptr and a table index
-struct TableMap {
-  TranslationTable *Table = nullptr; // table associated with the host ptr.
-  uint32_t Index = 0; // index in which the host ptr translated entry is found.
-  TableMap() = default;
-  TableMap(TranslationTable *Table, uint32_t Index)
-      : Table(Table), Index(Index) {}
-};
-typedef std::map<void *, TableMap> HostPtrToTableMapTy;
-
-#endif

diff --git a/libomptarget/plugins-nextgen/CMakeLists.txt b/libomptarget/plugins-nextgen/CMakeLists.txt
deleted file mode 100644
index dbd82ac..0000000
--- a/libomptarget/plugins-nextgen/CMakeLists.txt
+++ /dev/null

@@ -1,78 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build plugins for the user system if available.
-#
-##===----------------------------------------------------------------------===##
-
-# Common interface to handle creating a plugin library.
-set(common_dir ${CMAKE_CURRENT_SOURCE_DIR}/common)
-add_subdirectory(common)
-function(add_target_library target_name lib_name)
-  add_llvm_library(${target_name} SHARED
-    LINK_COMPONENTS
-      ${LLVM_TARGETS_TO_BUILD}
-      AggressiveInstCombine
-      Analysis
-      BinaryFormat
-      BitReader
-      BitWriter
-      CodeGen
-      Core
-      Extensions
-      InstCombine
-      Instrumentation
-      IPO
-      IRReader
-      Linker
-      MC
-      Object
-      Passes
-      Remarks
-      ScalarOpts
-      Support
-      Target
-      TargetParser
-      TransformUtils
-      Vectorize
-
-    NO_INSTALL_RPATH
-    BUILDTREE_ONLY
-  )
-
-  llvm_update_compile_flags(${target_name})
-  target_link_libraries(${target_name} PRIVATE
-                        PluginCommon ${OPENMP_PTHREAD_LIB})
-
-  target_compile_definitions(${target_name} PRIVATE TARGET_NAME=${lib_name})
-  target_compile_definitions(${target_name} PRIVATE 
-                             DEBUG_PREFIX="TARGET ${lib_name} RTL")
-
-  if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
-    # On FreeBSD, the 'environ' symbol is undefined at link time, but resolved by
-    # the dynamic linker at runtime. Therefore, allow the symbol to be undefined
-    # when creating a shared library.
-    target_link_libraries(${target_name} PRIVATE "-Wl,--allow-shlib-undefined")
-  else()
-    target_link_libraries(${target_name} PRIVATE "-Wl,-z,defs")
-  endif()
-
-  if(LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
-    target_link_libraries(${target_name} PRIVATE
-    "-Wl,--version-script=${common_dir}/../exports")
-  endif()
-  set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET protected)
-endfunction()
-
-add_subdirectory(amdgpu)
-add_subdirectory(cuda)
-add_subdirectory(host)
-
-# Make sure the parent scope can see the plugins that will be created.
-set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
-set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)

diff --git a/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt b/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt
deleted file mode 100644
index 40df771..0000000
--- a/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt
+++ /dev/null

@@ -1,64 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is dual licensed under the MIT and the University of Illinois Open
-# Source Licenses. See LICENSE.txt for details.
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for an AMDGPU machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-################################################################################
-set(LIBOMPTARGET_BUILD_AMDGPU_PLUGIN TRUE CACHE BOOL
-  "Whether to build AMDGPU plugin")
-if (NOT LIBOMPTARGET_BUILD_AMDGPU_PLUGIN)
-  libomptarget_say("Not building AMDGPU NextGen offloading plugin: LIBOMPTARGET_BUILD_AMDGPU_PLUGIN is false")
-  return()
-endif()
-
-# as of rocm-3.7, hsa is installed with cmake packages and kmt is found via hsa
-find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
-
-if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
-  libomptarget_say("Not building AMDGPU NextGen plugin: only support AMDGPU in Linux x86_64, ppc64le, or aarch64 hosts")
-  return()
-endif()
-
-# Create the library and add the default arguments.
-add_target_library(omptarget.rtl.amdgpu AMDGPU)
-
-target_sources(omptarget.rtl.amdgpu PRIVATE src/rtl.cpp)
-target_include_directories(omptarget.rtl.amdgpu PRIVATE
-                           ${CMAKE_CURRENT_SOURCE_DIR}/utils)
-
-option(LIBOMPTARGET_FORCE_DLOPEN_LIBHSA "Build with dlopened libhsa" OFF)
-if(hsa-runtime64_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBHSA)
-  libomptarget_say("Building AMDGPU plugin linked against libhsa")
-  target_link_libraries(omptarget.rtl.amdgpu PRIVATE hsa-runtime64::hsa-runtime64)
-else()
-  libomptarget_say("Building AMDGPU plugin for dlopened libhsa")
-  target_include_directories(omptarget.rtl.amdgpu PRIVATE dynamic_hsa)
-  target_sources(omptarget.rtl.amdgpu PRIVATE dynamic_hsa/hsa.cpp)
-endif()
-
-# Configure testing for the AMDGPU plugin. We will build tests if we detect a
-# functional AMD GPU on the system, or if manually specified by the user.
-option(LIBOMPTARGET_FORCE_AMDGPU_TESTS "Build AMDGPU libomptarget tests" OFF)
-if (LIBOMPTARGET_FOUND_AMDGPU_GPU OR LIBOMPTARGET_FORCE_AMDGPU_TESTS)
-  # Report to the parent scope that we are building a plugin for amdgpu
-  set(LIBOMPTARGET_SYSTEM_TARGETS 
-      "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE)
-  list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.amdgpu")
-  set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-else()
-  libomptarget_say("Not generating AMDGPU tests, no supported devices detected."
-                   " Use 'LIBOMPTARGET_FORCE_AMDGPU_TESTS' to override.")
-endif()
-
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.amdgpu PROPERTIES
-  INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")

diff --git a/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
deleted file mode 100644
index cb82180..0000000
--- a/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
+++ /dev/null

@@ -1,118 +0,0 @@
-//===--- amdgpu/dynamic_hsa/hsa.cpp ------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implement subset of hsa api by calling into hsa library via dlopen
-// Does the dlopen/dlsym calls as part of the call to hsa_init
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/DynamicLibrary.h"
-
-#include "Shared/Debug.h"
-
-#include "DLWrap.h"
-#include "hsa.h"
-#include "hsa_ext_amd.h"
-#include <memory>
-
-DLWRAP_INITIALIZE()
-
-DLWRAP_INTERNAL(hsa_init, 0)
-
-DLWRAP(hsa_status_string, 2)
-DLWRAP(hsa_shut_down, 0)
-DLWRAP(hsa_system_get_info, 2)
-DLWRAP(hsa_agent_get_info, 3)
-DLWRAP(hsa_isa_get_info_alt, 3)
-DLWRAP(hsa_iterate_agents, 2)
-DLWRAP(hsa_agent_iterate_isas, 3)
-DLWRAP(hsa_signal_create, 4)
-DLWRAP(hsa_signal_destroy, 1)
-DLWRAP(hsa_signal_store_relaxed, 2)
-DLWRAP(hsa_signal_store_screlease, 2)
-DLWRAP(hsa_signal_wait_scacquire, 5)
-DLWRAP(hsa_signal_load_scacquire, 1)
-DLWRAP(hsa_signal_subtract_screlease, 2)
-DLWRAP(hsa_queue_create, 8)
-DLWRAP(hsa_queue_destroy, 1)
-DLWRAP(hsa_queue_load_read_index_scacquire, 1)
-DLWRAP(hsa_queue_add_write_index_relaxed, 2)
-DLWRAP(hsa_memory_copy, 3)
-DLWRAP(hsa_executable_create, 4)
-DLWRAP(hsa_executable_create_alt, 4)
-DLWRAP(hsa_executable_destroy, 1)
-DLWRAP(hsa_executable_freeze, 2)
-DLWRAP(hsa_executable_validate, 2)
-DLWRAP(hsa_executable_symbol_get_info, 3)
-DLWRAP(hsa_executable_get_symbol_by_name, 4)
-DLWRAP(hsa_executable_iterate_symbols, 3)
-DLWRAP(hsa_code_object_deserialize, 4)
-DLWRAP(hsa_executable_load_code_object, 4)
-DLWRAP(hsa_code_object_destroy, 1)
-DLWRAP(hsa_amd_agent_memory_pool_get_info, 4)
-DLWRAP(hsa_amd_agent_iterate_memory_pools, 3)
-DLWRAP(hsa_amd_memory_pool_allocate, 4)
-DLWRAP(hsa_amd_memory_pool_free, 1)
-DLWRAP(hsa_amd_memory_async_copy, 8)
-DLWRAP(hsa_amd_memory_pool_get_info, 3)
-DLWRAP(hsa_amd_agents_allow_access, 4)
-DLWRAP(hsa_amd_memory_lock, 5)
-DLWRAP(hsa_amd_memory_unlock, 1)
-DLWRAP(hsa_amd_memory_fill, 3)
-DLWRAP(hsa_amd_register_system_event_handler, 2)
-DLWRAP(hsa_amd_signal_create, 5)
-DLWRAP(hsa_amd_signal_async_handler, 5)
-DLWRAP(hsa_amd_pointer_info, 5)
-
-DLWRAP_FINALIZE()
-
-#ifndef DYNAMIC_HSA_PATH
-#define DYNAMIC_HSA_PATH "libhsa-runtime64.so"
-#endif
-
-#ifndef TARGET_NAME
-#error "Missing TARGET_NAME macro"
-#endif
-#ifndef DEBUG_PREFIX
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-#endif
-
-static bool checkForHSA() {
-  // return true if dlopen succeeded and all functions found
-
-  const char *HsaLib = DYNAMIC_HSA_PATH;
-  std::string ErrMsg;
-  auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
-      llvm::sys::DynamicLibrary::getPermanentLibrary(HsaLib, &ErrMsg));
-  if (!DynlibHandle->isValid()) {
-    DP("Unable to load library '%s': %s!\n", HsaLib, ErrMsg.c_str());
-    return false;
-  }
-
-  for (size_t I = 0; I < dlwrap::size(); I++) {
-    const char *Sym = dlwrap::symbol(I);
-
-    void *P = DynlibHandle->getAddressOfSymbol(Sym);
-    if (P == nullptr) {
-      DP("Unable to find '%s' in '%s'!\n", Sym, HsaLib);
-      return false;
-    }
-    DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
-
-    *dlwrap::pointer(I) = P;
-  }
-
-  return true;
-}
-
-hsa_status_t hsa_init() {
-  if (!checkForHSA()) {
-    return HSA_STATUS_ERROR;
-  }
-  return dlwrap_hsa_init();
-}

diff --git a/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
deleted file mode 100644
index 64a1d33..0000000
--- a/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ /dev/null

@@ -1,370 +0,0 @@
-//===--- amdgpu/dynamic_hsa/hsa.h --------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The parts of the hsa api that are presently in use by the amdgpu plugin
-//
-//===----------------------------------------------------------------------===//
-#ifndef HSA_RUNTIME_INC_HSA_H_
-#define HSA_RUNTIME_INC_HSA_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-// Detect and set large model builds.
-#undef HSA_LARGE_MODEL
-#if defined(__LP64__) || defined(_M_X64)
-#define HSA_LARGE_MODEL
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
-  HSA_STATUS_SUCCESS = 0x0,
-  HSA_STATUS_INFO_BREAK = 0x1,
-  HSA_STATUS_ERROR = 0x1000,
-  HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
-  HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
-} hsa_status_t;
-
-hsa_status_t hsa_status_string(hsa_status_t status, const char **status_string);
-
-typedef struct hsa_dim3_s {
-  uint32_t x;
-  uint32_t y;
-  uint32_t z;
-} hsa_dim3_t;
-
-hsa_status_t hsa_init();
-
-hsa_status_t hsa_shut_down();
-
-typedef struct hsa_agent_s {
-  uint64_t handle;
-} hsa_agent_t;
-
-typedef enum {
-  HSA_DEVICE_TYPE_CPU = 0,
-  HSA_DEVICE_TYPE_GPU = 1,
-  HSA_DEVICE_TYPE_DSP = 2
-} hsa_device_type_t;
-
-typedef enum {
-  HSA_ISA_INFO_NAME_LENGTH = 0,
-  HSA_ISA_INFO_NAME = 1
-} hsa_isa_info_t;
-
-typedef enum {
-  HSA_AGENT_INFO_NAME = 0,
-  HSA_AGENT_INFO_VENDOR_NAME = 1,
-  HSA_AGENT_INFO_FEATURE = 2,
-  HSA_AGENT_INFO_PROFILE = 4,
-  HSA_AGENT_INFO_WAVEFRONT_SIZE = 6,
-  HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7,
-  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8,
-  HSA_AGENT_INFO_GRID_MAX_DIM = 9,
-  HSA_AGENT_INFO_GRID_MAX_SIZE = 10,
-  HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11,
-  HSA_AGENT_INFO_QUEUES_MAX = 12,
-  HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13,
-  HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14,
-  HSA_AGENT_INFO_NODE = 16,
-  HSA_AGENT_INFO_DEVICE = 17,
-  HSA_AGENT_INFO_CACHE_SIZE = 18,
-  HSA_AGENT_INFO_FAST_F16_OPERATION = 24,
-} hsa_agent_info_t;
-
-typedef enum {
-  HSA_SYSTEM_INFO_VERSION_MAJOR = 0,
-  HSA_SYSTEM_INFO_VERSION_MINOR = 1,
-} hsa_system_info_t;
-
-typedef enum {
-  HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1,
-  HSA_AGENT_FEATURE_AGENT_DISPATCH = 2,
-} hsa_agent_feature_t;
-
-typedef struct hsa_region_s {
-  uint64_t handle;
-} hsa_region_t;
-
-typedef struct hsa_isa_s {
-  uint64_t handle;
-} hsa_isa_t;
-
-hsa_status_t hsa_system_get_info(hsa_system_info_t attribute, void *value);
-
-hsa_status_t hsa_agent_get_info(hsa_agent_t agent, hsa_agent_info_t attribute,
-                                void *value);
-
-hsa_status_t hsa_isa_get_info_alt(hsa_isa_t isa, hsa_isa_info_t attribute,
-                                  void *value);
-
-hsa_status_t hsa_iterate_agents(hsa_status_t (*callback)(hsa_agent_t agent,
-                                                         void *data),
-                                void *data);
-
-hsa_status_t hsa_agent_iterate_isas(hsa_agent_t agent,
-                                    hsa_status_t (*callback)(hsa_isa_t isa,
-                                                             void *data),
-                                    void *data);
-
-typedef struct hsa_signal_s {
-  uint64_t handle;
-} hsa_signal_t;
-
-#ifdef HSA_LARGE_MODEL
-typedef int64_t hsa_signal_value_t;
-#else
-typedef int32_t hsa_signal_value_t;
-#endif
-
-hsa_status_t hsa_signal_create(hsa_signal_value_t initial_value,
-                               uint32_t num_consumers,
-                               const hsa_agent_t *consumers,
-                               hsa_signal_t *signal);
-
-hsa_status_t hsa_amd_signal_create(hsa_signal_value_t initial_value,
-                                   uint32_t num_consumers,
-                                   const hsa_agent_t *consumers,
-                                   uint64_t attributes, hsa_signal_t *signal);
-
-hsa_status_t hsa_signal_destroy(hsa_signal_t signal);
-
-void hsa_signal_store_relaxed(hsa_signal_t signal, hsa_signal_value_t value);
-
-void hsa_signal_store_screlease(hsa_signal_t signal, hsa_signal_value_t value);
-
-hsa_signal_value_t hsa_signal_load_scacquire(hsa_signal_t signal);
-
-void hsa_signal_subtract_screlease(hsa_signal_t signal,
-                                   hsa_signal_value_t value);
-
-typedef enum {
-  HSA_SIGNAL_CONDITION_EQ = 0,
-  HSA_SIGNAL_CONDITION_NE = 1,
-} hsa_signal_condition_t;
-
-typedef enum {
-  HSA_WAIT_STATE_BLOCKED = 0,
-  HSA_WAIT_STATE_ACTIVE = 1
-} hsa_wait_state_t;
-
-hsa_signal_value_t hsa_signal_wait_scacquire(hsa_signal_t signal,
-                                             hsa_signal_condition_t condition,
-                                             hsa_signal_value_t compare_value,
-                                             uint64_t timeout_hint,
-                                             hsa_wait_state_t wait_state_hint);
-
-typedef enum {
-  HSA_QUEUE_TYPE_MULTI = 0,
-  HSA_QUEUE_TYPE_SINGLE = 1,
-} hsa_queue_type_t;
-
-typedef enum {
-  HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1,
-  HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2
-} hsa_queue_feature_t;
-
-typedef uint32_t hsa_queue_type32_t;
-
-typedef struct hsa_queue_s {
-  hsa_queue_type32_t type;
-  uint32_t features;
-
-#ifdef HSA_LARGE_MODEL
-  void *base_address;
-#elif defined HSA_LITTLE_ENDIAN
-  void *base_address;
-  uint32_t reserved0;
-#else
-  uint32_t reserved0;
-  void *base_address;
-#endif
-  hsa_signal_t doorbell_signal;
-  uint32_t size;
-  uint32_t reserved1;
-  uint64_t id;
-} hsa_queue_t;
-
-hsa_status_t hsa_queue_create(hsa_agent_t agent, uint32_t size,
-                              hsa_queue_type32_t type,
-                              void (*callback)(hsa_status_t status,
-                                               hsa_queue_t *source, void *data),
-                              void *data, uint32_t private_segment_size,
-                              uint32_t group_segment_size, hsa_queue_t **queue);
-
-hsa_status_t hsa_queue_destroy(hsa_queue_t *queue);
-
-uint64_t hsa_queue_load_read_index_scacquire(const hsa_queue_t *queue);
-
-uint64_t hsa_queue_add_write_index_relaxed(const hsa_queue_t *queue,
-                                           uint64_t value);
-
-typedef enum {
-  HSA_PACKET_TYPE_KERNEL_DISPATCH = 2,
-  HSA_PACKET_TYPE_BARRIER_AND = 3,
-} hsa_packet_type_t;
-
-typedef enum { HSA_FENCE_SCOPE_SYSTEM = 2 } hsa_fence_scope_t;
-
-typedef enum {
-  HSA_PACKET_HEADER_TYPE = 0,
-  HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9,
-  HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11
-} hsa_packet_header_t;
-
-typedef enum {
-  HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0
-} hsa_kernel_dispatch_packet_setup_t;
-
-typedef enum {
-  HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2
-} hsa_kernel_dispatch_packet_setup_width_t;
-
-typedef struct hsa_kernel_dispatch_packet_s {
-  uint16_t header;
-  uint16_t setup;
-  uint16_t workgroup_size_x;
-  uint16_t workgroup_size_y;
-  uint16_t workgroup_size_z;
-  uint16_t reserved0;
-  uint32_t grid_size_x;
-  uint32_t grid_size_y;
-  uint32_t grid_size_z;
-  uint32_t private_segment_size;
-  uint32_t group_segment_size;
-  uint64_t kernel_object;
-#ifdef HSA_LARGE_MODEL
-  void *kernarg_address;
-#elif defined HSA_LITTLE_ENDIAN
-  void *kernarg_address;
-  uint32_t reserved1;
-#else
-  uint32_t reserved1;
-  void *kernarg_address;
-#endif
-  uint64_t reserved2;
-  hsa_signal_t completion_signal;
-} hsa_kernel_dispatch_packet_t;
-
-typedef struct hsa_barrier_and_packet_s {
-  uint16_t header;
-  uint16_t reserved0;
-  uint32_t reserved1;
-  hsa_signal_t dep_signal[5];
-  uint64_t reserved2;
-  hsa_signal_t completion_signal;
-} hsa_barrier_and_packet_t;
-
-typedef enum { HSA_PROFILE_BASE = 0, HSA_PROFILE_FULL = 1 } hsa_profile_t;
-
-typedef enum {
-  HSA_EXECUTABLE_STATE_UNFROZEN = 0,
-  HSA_EXECUTABLE_STATE_FROZEN = 1
-} hsa_executable_state_t;
-
-typedef struct hsa_executable_s {
-  uint64_t handle;
-} hsa_executable_t;
-
-typedef struct hsa_executable_symbol_s {
-  uint64_t handle;
-} hsa_executable_symbol_t;
-
-typedef enum {
-  HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0,
-  HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1,
-  HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2,
-  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21,
-  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9,
-  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22,
-  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
-  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
-  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
-  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
-} hsa_executable_symbol_info_t;
-
-typedef struct hsa_code_object_s {
-  uint64_t handle;
-} hsa_code_object_t;
-
-typedef enum {
-  HSA_SYMBOL_KIND_VARIABLE = 0,
-  HSA_SYMBOL_KIND_KERNEL = 1,
-  HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2
-} hsa_symbol_kind_t;
-
-typedef enum {
-  HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0,
-  HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1,
-  HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2,
-} hsa_default_float_rounding_mode_t;
-
-hsa_status_t hsa_memory_copy(void *dst, const void *src, size_t size);
-
-hsa_status_t hsa_executable_create(hsa_profile_t profile,
-                                   hsa_executable_state_t executable_state,
-                                   const char *options,
-                                   hsa_executable_t *executable);
-
-hsa_status_t hsa_executable_create_alt(
-    hsa_profile_t profile,
-    hsa_default_float_rounding_mode_t default_float_rounding_mode,
-    const char *options, hsa_executable_t *executable);
-
-hsa_status_t hsa_executable_destroy(hsa_executable_t executable);
-
-hsa_status_t hsa_executable_freeze(hsa_executable_t executable,
-                                   const char *options);
-
-hsa_status_t hsa_executable_validate(hsa_executable_t executable,
-                                     uint32_t *result);
-
-hsa_status_t
-hsa_executable_symbol_get_info(hsa_executable_symbol_t executable_symbol,
-                               hsa_executable_symbol_info_t attribute,
-                               void *value);
-
-hsa_status_t hsa_executable_iterate_symbols(
-    hsa_executable_t executable,
-    hsa_status_t (*callback)(hsa_executable_t exec,
-                             hsa_executable_symbol_t symbol, void *data),
-    void *data);
-
-hsa_status_t hsa_executable_get_symbol_by_name(hsa_executable_t executable,
-                                               const char *symbol_name,
-                                               const hsa_agent_t *agent,
-                                               hsa_executable_symbol_t *symbol);
-
-hsa_status_t hsa_code_object_deserialize(void *serialized_code_object,
-                                         size_t serialized_code_object_size,
-                                         const char *options,
-                                         hsa_code_object_t *code_object);
-
-hsa_status_t hsa_executable_load_code_object(hsa_executable_t executable,
-                                             hsa_agent_t agent,
-                                             hsa_code_object_t code_object,
-                                             const char *options);
-
-hsa_status_t hsa_code_object_destroy(hsa_code_object_t code_object);
-
-typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void *arg);
-
-hsa_status_t hsa_amd_signal_async_handler(hsa_signal_t signal,
-                                          hsa_signal_condition_t cond,
-                                          hsa_signal_value_t value,
-                                          hsa_amd_signal_handler handler,
-                                          void *arg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif

diff --git a/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
deleted file mode 100644
index 3117763..0000000
--- a/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ /dev/null

@@ -1,174 +0,0 @@
-//===--- amdgpu/dynamic_hsa/hsa_ext_amd.h ------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The parts of the hsa api that are presently in use by the amdgpu plugin
-//
-//===----------------------------------------------------------------------===//
-#ifndef HSA_RUNTIME_EXT_AMD_H_
-#define HSA_RUNTIME_EXT_AMD_H_
-
-#include "hsa.h"
-
-/* Using this header means we do not know what version library will be linked.
-   Until such point as a CMake level override is requested, default to the
-   minimum. */
-/*
- * - 1.0 - initial version
- */
-#define HSA_AMD_INTERFACE_VERSION_MAJOR 1
-#define HSA_AMD_INTERFACE_VERSION_MINOR 0
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct hsa_amd_memory_pool_s {
-  uint64_t handle;
-} hsa_amd_memory_pool_t;
-
-typedef enum hsa_amd_memory_pool_global_flag_s {
-  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1,
-  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2,
-  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4
-} hsa_amd_memory_pool_global_flag_t;
-
-typedef enum {
-  HSA_AMD_SEGMENT_GLOBAL = 0,
-  HSA_AMD_SEGMENT_READONLY = 1,
-  HSA_AMD_SEGMENT_PRIVATE = 2,
-  HSA_AMD_SEGMENT_GROUP = 3,
-} hsa_amd_segment_t;
-
-typedef enum {
-  HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0,
-  HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1,
-  HSA_AMD_MEMORY_POOL_INFO_SIZE = 2,
-  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5,
-  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
-  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
-  HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
-} hsa_amd_memory_pool_info_t;
-
-typedef enum {
-  HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0,
-} hsa_amd_agent_memory_pool_info_t;
-
-typedef enum {
-  HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0,
-} hsa_amd_memory_pool_access_t;
-
-typedef enum hsa_amd_agent_info_s {
-  HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000,
-  HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
-  HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
-  HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
-  HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009,
-  HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A,
-  HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B,
-  HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010,
-  HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016,
-} hsa_amd_agent_info_t;
-
-hsa_status_t hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
-                                          hsa_amd_memory_pool_info_t attribute,
-                                          void *value);
-
-hsa_status_t hsa_amd_agent_iterate_memory_pools(
-    hsa_agent_t agent,
-    hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void *data),
-    void *data);
-
-hsa_status_t hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool,
-                                          size_t size, uint32_t flags,
-                                          void **ptr);
-
-hsa_status_t hsa_amd_memory_pool_free(void *ptr);
-
-hsa_status_t hsa_amd_memory_async_copy(void *dst, hsa_agent_t dst_agent,
-                                       const void *src, hsa_agent_t src_agent,
-                                       size_t size, uint32_t num_dep_signals,
-                                       const hsa_signal_t *dep_signals,
-                                       hsa_signal_t completion_signal);
-
-hsa_status_t hsa_amd_agent_memory_pool_get_info(
-    hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
-    hsa_amd_agent_memory_pool_info_t attribute, void *value);
-
-hsa_status_t hsa_amd_agents_allow_access(uint32_t num_agents,
-                                         const hsa_agent_t *agents,
-                                         const uint32_t *flags,
-                                         const void *ptr);
-
-hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
-                                hsa_agent_t* agents, int num_agent,
-                                void** agent_ptr);
-
-hsa_status_t hsa_amd_memory_unlock(void* host_ptr);
-
-hsa_status_t hsa_amd_memory_fill(void *ptr, uint32_t value, size_t count);
-
-typedef enum hsa_amd_event_type_s {
-  HSA_AMD_GPU_MEMORY_FAULT_EVENT = 0,
-} hsa_amd_event_type_t;
-
-typedef struct hsa_amd_gpu_memory_fault_info_s {
-  hsa_agent_t agent;
-  uint64_t virtual_address;
-  uint32_t fault_reason_mask;
-} hsa_amd_gpu_memory_fault_info_t;
-
-typedef struct hsa_amd_event_s {
-  hsa_amd_event_type_t event_type;
-  union {
-    hsa_amd_gpu_memory_fault_info_t memory_fault;
-  };
-} hsa_amd_event_t;
-
-typedef hsa_status_t (*hsa_amd_system_event_callback_t)(
-    const hsa_amd_event_t *event, void *data);
-
-hsa_status_t
-hsa_amd_register_system_event_handler(hsa_amd_system_event_callback_t callback,
-                                      void *data);
-
-typedef enum {
-  HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT = 1 << 0,
-  HSA_AMD_MEMORY_FAULT_READ_ONLY = 1 << 1,
-  HSA_AMD_MEMORY_FAULT_NX = 1 << 2,
-  HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3,
-  HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4,
-  HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5,
-  HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6,
-  HSA_AMD_MEMORY_FAULT_HANG = 1 << 31
-} hsa_amd_memory_fault_reason_t;
-
-typedef enum {
-  HSA_EXT_POINTER_TYPE_UNKNOWN = 0,
-  HSA_EXT_POINTER_TYPE_HSA = 1,
-  HSA_EXT_POINTER_TYPE_LOCKED = 2
-} hsa_amd_pointer_type_t;
-
-typedef struct hsa_amd_pointer_info_s {
-  uint32_t size;
-  hsa_amd_pointer_type_t type;
-  void* agentBaseAddress;
-  void* hostBaseAddress;
-  size_t sizeInBytes;
-} hsa_amd_pointer_info_t;
-
-hsa_status_t hsa_amd_pointer_info(const void* ptr,
-                                          hsa_amd_pointer_info_t* info,
-                                          void* (*alloc)(size_t),
-                                          uint32_t* num_agents_accessible,
-                                          hsa_agent_t** accessible);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif

diff --git a/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
deleted file mode 100644
index 00650b8..0000000
--- a/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ /dev/null

@@ -1,3478 +0,0 @@
-//===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL NextGen for AMDGPU machine
-//
-//===----------------------------------------------------------------------===//
-
-#include <atomic>
-#include <cassert>
-#include <cstddef>
-#include <deque>
-#include <mutex>
-#include <string>
-#include <system_error>
-#include <unistd.h>
-#include <unordered_map>
-
-#include "Shared/Debug.h"
-#include "Shared/Environment.h"
-#include "Shared/Utils.h"
-#include "Utils/ELF.h"
-
-#include "GlobalHandler.h"
-#include "OpenMP/OMPT/Callback.h"
-#include "PluginInterface.h"
-#include "UtilitiesRTL.h"
-#include "omptarget.h"
-
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/FileOutputBuffer.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Program.h"
-#include "llvm/Support/raw_ostream.h"
-
-#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
-    !defined(__ORDER_BIG_ENDIAN__)
-#error "Missing preprocessor definitions for endianness detection."
-#endif
-
-// The HSA headers require these definitions.
-#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#define LITTLEENDIAN_CPU
-#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-#define BIGENDIAN_CPU
-#endif
-
-#if defined(__has_include)
-#if __has_include("hsa/hsa.h")
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#elif __has_include("hsa.h")
-#include "hsa.h"
-#include "hsa_ext_amd.h"
-#endif
-#else
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#endif
-
-namespace llvm {
-namespace omp {
-namespace target {
-namespace plugin {
-
-/// Forward declarations for all specialized data structures.
-struct AMDGPUKernelTy;
-struct AMDGPUDeviceTy;
-struct AMDGPUPluginTy;
-struct AMDGPUStreamTy;
-struct AMDGPUEventTy;
-struct AMDGPUStreamManagerTy;
-struct AMDGPUEventManagerTy;
-struct AMDGPUDeviceImageTy;
-struct AMDGPUMemoryManagerTy;
-struct AMDGPUMemoryPoolTy;
-
-namespace utils {
-
-/// Iterate elements using an HSA iterate function. Do not use this function
-/// directly but the specialized ones below instead.
-template <typename ElemTy, typename IterFuncTy, typename CallbackTy>
-hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) {
-  auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
-    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
-    return (*Unwrapped)(Elem);
-  };
-  return Func(L, static_cast<void *>(&Cb));
-}
-
-/// Iterate elements using an HSA iterate function passing a parameter. Do not
-/// use this function directly but the specialized ones below instead.
-template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy,
-          typename CallbackTy>
-hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
-  auto L = [](ElemTy Elem, void *Data) -> hsa_status_t {
-    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
-    return (*Unwrapped)(Elem);
-  };
-  return Func(FuncArg, L, static_cast<void *>(&Cb));
-}
-
-/// Iterate elements using an HSA iterate function passing a parameter. Do not
-/// use this function directly but the specialized ones below instead.
-template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy,
-          typename IterFuncArgTy, typename CallbackTy>
-hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) {
-  auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t {
-    CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data);
-    return (*Unwrapped)(Elem1, Elem2);
-  };
-  return Func(FuncArg, L, static_cast<void *>(&Cb));
-}
-
-/// Iterate agents.
-template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) {
-  hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback);
-  return Plugin::check(Status, "Error in hsa_iterate_agents: %s");
-}
-
-/// Iterate ISAs of an agent.
-template <typename CallbackTy>
-Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) {
-  hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb);
-  return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s");
-}
-
-/// Iterate memory pools of an agent.
-template <typename CallbackTy>
-Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
-  hsa_status_t Status = iterate<hsa_amd_memory_pool_t>(
-      hsa_amd_agent_iterate_memory_pools, Agent, Cb);
-  return Plugin::check(Status,
-                       "Error in hsa_amd_agent_iterate_memory_pools: %s");
-}
-
-/// Dispatches an asynchronous memory copy.
-/// Enables different SDMA engines for the dispatch in a round-robin fashion.
-Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
-                   const void *Src, hsa_agent_t SrcAgent, size_t Size,
-                   uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
-                   hsa_signal_t CompletionSignal) {
-  if (!UseMultipleSdmaEngines) {
-    hsa_status_t S =
-        hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
-                                  NumDepSignals, DepSignals, CompletionSignal);
-    return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
-  }
-
-// This solution is probably not the best
-#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 &&                                  \
-      HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
-  return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
-#else
-  static std::atomic<int> SdmaEngine{1};
-
-  // This atomics solution is probably not the best, but should be sufficient
-  // for now.
-  // In a worst case scenario, in which threads read the same value, they will
-  // dispatch to the same SDMA engine. This may result in sub-optimal
-  // performance. However, I think the possibility to be fairly low.
-  int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire);
-  // This call is only avail in ROCm >= 5.7
-  hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
-      Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
-      CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine,
-      /*force_copy_on_sdma=*/true);
-  // Increment to use one of two SDMA engines: 0x1, 0x2
-  LocalSdmaEngine = (LocalSdmaEngine << 1) % 3;
-  SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed);
-
-  return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
-#endif
-}
-
-Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
-  std::string Target;
-  auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
-    uint32_t Length;
-    hsa_status_t Status;
-    Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
-    if (Status != HSA_STATUS_SUCCESS)
-      return Status;
-
-    llvm::SmallVector<char> ISAName(Length);
-    Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
-    if (Status != HSA_STATUS_SUCCESS)
-      return Status;
-
-    llvm::StringRef TripleTarget(ISAName.begin(), Length);
-    if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
-      Target = TripleTarget.ltrim('-').rtrim('\0').str();
-    return HSA_STATUS_SUCCESS;
-  });
-  if (Err)
-    return Err;
-  return Target;
-}
-} // namespace utils
-
-/// Utility class representing generic resource references to AMDGPU resources.
-template <typename ResourceTy>
-struct AMDGPUResourceRef : public GenericDeviceResourceRef {
-  /// The underlying handle type for resources.
-  using HandleTy = ResourceTy *;
-
-  /// Create an empty reference to an invalid resource.
-  AMDGPUResourceRef() : Resource(nullptr) {}
-
-  /// Create a reference to an existing resource.
-  AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {}
-
-  virtual ~AMDGPUResourceRef() {}
-
-  /// Create a new resource and save the reference. The reference must be empty
-  /// before calling to this function.
-  Error create(GenericDeviceTy &Device) override;
-
-  /// Destroy the referenced resource and invalidate the reference. The
-  /// reference must be to a valid resource before calling to this function.
-  Error destroy(GenericDeviceTy &Device) override {
-    if (!Resource)
-      return Plugin::error("Destroying an invalid resource");
-
-    if (auto Err = Resource->deinit())
-      return Err;
-
-    delete Resource;
-
-    Resource = nullptr;
-    return Plugin::success();
-  }
-
-  /// Get the underlying resource handle.
-  operator HandleTy() const { return Resource; }
-
-private:
-  /// The handle to the actual resource.
-  HandleTy Resource;
-};
-
-/// Class holding an HSA memory pool.
-struct AMDGPUMemoryPoolTy {
-  /// Create a memory pool from an HSA memory pool.
-  AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool)
-      : MemoryPool(MemoryPool), GlobalFlags(0) {}
-
-  /// Initialize the memory pool retrieving its properties.
-  Error init() {
-    if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment))
-      return Err;
-
-    if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
-      return Err;
-
-    return Plugin::success();
-  }
-
-  /// Getter of the HSA memory pool.
-  hsa_amd_memory_pool_t get() const { return MemoryPool; }
-
-  /// Indicate the segment which belongs to.
-  bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); }
-  bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); }
-  bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); }
-  bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); }
-
-  /// Indicate if it is fine-grained memory. Valid only for global.
-  bool isFineGrained() const {
-    assert(isGlobal() && "Not global memory");
-    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
-  }
-
-  /// Indicate if it is coarse-grained memory. Valid only for global.
-  bool isCoarseGrained() const {
-    assert(isGlobal() && "Not global memory");
-    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED);
-  }
-
-  /// Indicate if it supports storing kernel arguments. Valid only for global.
-  bool supportsKernelArgs() const {
-    assert(isGlobal() && "Not global memory");
-    return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
-  }
-
-  /// Allocate memory on the memory pool.
-  Error allocate(size_t Size, void **PtrStorage) {
-    hsa_status_t Status =
-        hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage);
-    return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s");
-  }
-
-  /// Return memory to the memory pool.
-  Error deallocate(void *Ptr) {
-    hsa_status_t Status = hsa_amd_memory_pool_free(Ptr);
-    return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s");
-  }
-
-  /// Allow the device to access a specific allocation.
-  Error enableAccess(void *Ptr, int64_t Size,
-                     const llvm::SmallVector<hsa_agent_t> &Agents) const {
-#ifdef OMPTARGET_DEBUG
-    for (hsa_agent_t Agent : Agents) {
-      hsa_amd_memory_pool_access_t Access;
-      if (auto Err =
-              getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access))
-        return Err;
-
-      // The agent is not allowed to access the memory pool in any case. Do not
-      // continue because otherwise it result in undefined behavior.
-      if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
-        return Plugin::error("An agent is not allowed to access a memory pool");
-    }
-#endif
-
-    // We can access but it is disabled by default. Enable the access then.
-    hsa_status_t Status =
-        hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr);
-    return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s");
-  }
-
-  /// Get attribute from the memory pool.
-  template <typename Ty>
-  Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
-    hsa_status_t Status;
-    Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
-    return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s");
-  }
-
-  template <typename Ty>
-  hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const {
-    return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value);
-  }
-
-  /// Get attribute from the memory pool relating to an agent.
-  template <typename Ty>
-  Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind,
-                Ty &Value) const {
-    hsa_status_t Status;
-    Status =
-        hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value);
-    return Plugin::check(Status,
-                         "Error in hsa_amd_agent_memory_pool_get_info: %s");
-  }
-
-private:
-  /// The HSA memory pool.
-  hsa_amd_memory_pool_t MemoryPool;
-
-  /// The segment where the memory pool belongs to.
-  hsa_amd_segment_t Segment;
-
-  /// The global flags of memory pool. Only valid if the memory pool belongs to
-  /// the global segment.
-  uint32_t GlobalFlags;
-};
-
-/// Class that implements a memory manager that gets memory from a specific
-/// memory pool. It wraps a MemoryManagerTy and acts as the device allocator
-/// that the wrapped manager calls back into.
-struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
-
-  /// Create an empty memory manager. init() must be called before use.
-  AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin)
-      : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {}
-
-  /// Initialize the memory manager from a memory pool.
-  Error init(AMDGPUMemoryPoolTy &MemoryPool) {
-    // Threshold value (1 << 30) passed to the MemoryManagerTy constructor.
-    const uint32_t Threshold = 1 << 30;
-    this->MemoryManager = new MemoryManagerTy(*this, Threshold);
-    this->MemoryPool = &MemoryPool;
-    return Plugin::success();
-  }
-
-  /// Deinitialize the memory manager and free its allocations.
-  Error deinit() {
-    assert(MemoryManager && "Invalid memory manager");
-
-    // Delete and invalidate the memory manager. At this point, the memory
-    // manager will deallocate all its allocations.
-    delete MemoryManager;
-    MemoryManager = nullptr;
-
-    return Plugin::success();
-  }
-
-  /// Reuse or allocate memory through the memory manager. On success the
-  /// resulting pointer is written to \p PtrStorage.
-  Error allocate(size_t Size, void **PtrStorage) {
-    assert(MemoryManager && "Invalid memory manager");
-    assert(PtrStorage && "Invalid pointer storage");
-
-    *PtrStorage = MemoryManager->allocate(Size, nullptr);
-    if (*PtrStorage == nullptr)
-      return Plugin::error("Failure to allocate from AMDGPU memory manager");
-
-    return Plugin::success();
-  }
-
-  /// Release an allocation to be reused.
-  Error deallocate(void *Ptr) {
-    // Same precondition as allocate() and deinit(): the manager is
-    // dereferenced below, so it must have been initialized.
-    assert(MemoryManager && "Invalid memory manager");
-    assert(Ptr && "Invalid pointer");
-
-    // A non-zero result from the memory manager is treated as a failure.
-    if (MemoryManager->free(Ptr))
-      return Plugin::error("Failure to deallocate from AMDGPU memory manager");
-
-    return Plugin::success();
-  }
-
-private:
-  /// Allocation callback that will be called once the memory manager does not
-  /// have more previously allocated buffers.
-  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
-
-  /// Deallocation callback that will be called by the memory manager. The
-  /// error from the memory pool is consumed and mapped to OFFLOAD_FAIL.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
-    if (auto Err = MemoryPool->deallocate(TgtPtr)) {
-      consumeError(std::move(Err));
-      return OFFLOAD_FAIL;
-    }
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// The underlying plugin that owns this memory manager.
-  AMDGPUPluginTy &Plugin;
-
-  /// The memory pool used to allocate memory.
-  AMDGPUMemoryPoolTy *MemoryPool;
-
-  /// Reference to the actual memory manager.
-  MemoryManagerTy *MemoryManager;
-};
-
-/// Device image specialization for AMDGPU. It keeps the HSA executable and
-/// code object handles together with per-kernel metadata for the image.
-struct AMDGPUDeviceImageTy : public DeviceImageTy {
-  /// Build the AMDGPU image from its id, owning device and target image.
-  AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
-                      const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, Device, TgtImage) {}
-
-  /// Prepare and load the executable corresponding to the image.
-  Error loadExecutable(const AMDGPUDeviceTy &Device);
-
-  /// Destroy the executable first and then the code object. If destroying the
-  /// executable fails, the code object is left untouched.
-  Error unloadExecutable() {
-    if (auto Err = Plugin::check(hsa_executable_destroy(Executable),
-                                 "Error in hsa_executable_destroy: %s"))
-      return Err;
-
-    return Plugin::check(hsa_code_object_destroy(CodeObject),
-                         "Error in hsa_code_object_destroy: %s");
-  }
-
-  /// Get the executable.
-  hsa_executable_t getExecutable() const { return Executable; }
-
-  /// Get the Code Object Version of the ELF.
-  uint16_t getELFABIVersion() const { return ELFABIVersion; }
-
-  /// Find an HSA device symbol by its name on the executable.
-  Expected<hsa_executable_symbol_t>
-  findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
-
-  /// Look up additional info for a kernel, e.g., register spill counts.
-  /// Returns an empty optional when \p Identifier has no entry.
-  std::optional<utils::KernelMetaDataTy>
-  getKernelInfo(StringRef Identifier) const {
-    auto Entry = KernelInfoMap.find(Identifier);
-    if (Entry != KernelInfoMap.end())
-      return Entry->second;
-
-    return std::nullopt;
-  }
-
-private:
-  /// The executable loaded on the agent.
-  hsa_executable_t Executable;
-  /// The code object destroyed together with the executable.
-  hsa_code_object_t CodeObject;
-  /// Kernel metadata indexed by the name passed to getKernelInfo().
-  StringMap<utils::KernelMetaDataTy> KernelInfoMap;
-  /// Value reported by getELFABIVersion().
-  uint16_t ELFABIVersion;
-};
-
-/// Class implementing the AMDGPU kernel functionalities which derives from the
-/// generic kernel class.
-struct AMDGPUKernelTy : public GenericKernelTy {
-  /// Create an AMDGPU kernel with a name.
-  AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}
-
-  /// Initialize the AMDGPU kernel: look up the "<name>.kd" symbol in the image
-  /// executable, read its properties into this object, and fetch the optional
-  /// kernel metadata recorded in the image.
-  Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
-    AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
-
-    // Kernel symbols have a ".kd" suffix.
-    std::string KernelName(getName());
-    KernelName += ".kd";
-
-    // Find the symbol on the device executable.
-    auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName);
-    if (!SymbolOrErr)
-      return SymbolOrErr.takeError();
-
-    hsa_executable_symbol_t Symbol = *SymbolOrErr;
-    hsa_symbol_kind_t SymbolType;
-    hsa_status_t Status;
-
-    // Retrieve different properties of the kernel symbol. Each entry pairs the
-    // attribute to query with the storage that receives its value.
-    std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
-        {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
-        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
-        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
-        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
-        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
-        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
-
-    for (auto &Info : RequiredInfos) {
-      Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
-      if (auto Err = Plugin::check(
-              Status, "Error in hsa_executable_symbol_get_info: %s"))
-        return Err;
-    }
-
-    // Make sure it is a kernel symbol. The symbol name is passed explicitly so
-    // the "%s" in the message is actually substituted.
-    if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
-      return Plugin::error("Symbol %s is not a kernel function",
-                           KernelName.c_str());
-
-    // TODO: Read the kernel descriptor for the max threads per block. May be
-    // read from the image.
-
-    ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
-    DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
-
-    // Get additional kernel info read from image. Missing info is only
-    // reported; it is not an error.
-    KernelInfo = AMDImage.getKernelInfo(getName());
-    if (!KernelInfo.has_value())
-      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
-           "Could not read extra information for kernel %s.", getName());
-
-    return Plugin::success();
-  }
-
-  /// Launch the AMDGPU kernel function.
-  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
-                   uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
-                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
-
-  /// Print more elaborate kernel launch info for AMDGPU
-  Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
-                               KernelArgsTy &KernelArgs, uint32_t NumThreads,
-                               uint64_t NumBlocks) const override;
-
-  /// Get group and private segment kernel size.
-  uint32_t getGroupSize() const { return GroupSize; }
-  uint32_t getPrivateSize() const { return PrivateSize; }
-
-  /// Get the HSA kernel object representing the kernel function.
-  uint64_t getKernelObject() const { return KernelObject; }
-
-  /// Get the size of implicitargs based on the code object version
-  /// @return 56 for cov4 and 256 for cov5
-  uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
-
-  /// Indicates whether or not we need to set up our own private segment size.
-  bool usesDynamicStack() const { return DynamicStack; }
-
-private:
-  /// The kernel object to execute.
-  uint64_t KernelObject;
-
-  /// The args, group and private segments sizes required by a kernel instance.
-  uint32_t ArgsSize;
-  uint32_t GroupSize;
-  uint32_t PrivateSize;
-  bool DynamicStack;
-
-  /// The size of implicit kernel arguments.
-  uint32_t ImplicitArgsSize;
-
-  /// Additional Info for the AMD GPU Kernel
-  std::optional<utils::KernelMetaDataTy> KernelInfo;
-};
-
-/// Class representing an HSA signal. Signals are used to define dependencies
-/// between asynchronous operations: kernel launches and memory transfers.
-struct AMDGPUSignalTy {
-  /// Create an empty signal.
-  AMDGPUSignalTy() : HSASignal({0}), UseCount() {}
-  /// Create an empty signal. The device argument is not used.
-  AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {}
-
-  /// Initialize the signal with an initial value.
-  Error init(uint32_t InitialValue = 1) {
-    hsa_status_t Status =
-        hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal);
-    // Report the function that was actually called.
-    return Plugin::check(Status, "Error in hsa_amd_signal_create: %s");
-  }
-
-  /// Deinitialize the signal.
-  Error deinit() {
-    hsa_status_t Status = hsa_signal_destroy(HSASignal);
-    return Plugin::check(Status, "Error in hsa_signal_destroy: %s");
-  }
-
-  /// Wait until the signal gets a zero value. When \p ActiveTimeout is
-  /// non-zero and no RPC server is given, an active wait bounded by that
-  /// timeout is tried first. Afterwards the wait is repeated until the signal
-  /// is zero, running the RPC server between waits if one is attached.
-  Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr,
-             GenericDeviceTy *Device = nullptr) const {
-    if (ActiveTimeout && !RPCServer) {
-      hsa_signal_value_t Got =
-          hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
-                                    ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
-      if (Got == 0)
-        return Plugin::success();
-    }
-
-    // If there is an RPC device attached to this stream we run it as a server.
-    uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
-    auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
-    while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
-                                     Timeout, WaitState) != 0) {
-      if (RPCServer && Device)
-        if (auto Err = RPCServer->runServer(*Device))
-          return Err;
-    }
-    return Plugin::success();
-  }
-
-  /// Load the value on the signal.
-  hsa_signal_value_t load() const {
-    return hsa_signal_load_scacquire(HSASignal);
-  }
-
-  /// Signal decrementing by one.
-  void signal() {
-    assert(load() > 0 && "Invalid signal value");
-    hsa_signal_subtract_screlease(HSASignal, 1);
-  }
-
-  /// Reset the signal value before reusing the signal. Do not call this
-  /// function if the signal is being currently used by any watcher, such as a
-  /// plugin thread or the HSA runtime.
-  void reset() { hsa_signal_store_screlease(HSASignal, 1); }
-
-  /// Increase the number of concurrent uses.
-  void increaseUseCount() { UseCount.increase(); }
-
-  /// Decrease the number of concurrent uses and return whether was the last.
-  bool decreaseUseCount() { return UseCount.decrease(); }
-
-  /// Get the underlying HSA signal handle.
-  hsa_signal_t get() const { return HSASignal; }
-
-private:
-  /// The underlying HSA signal.
-  hsa_signal_t HSASignal;
-
-  /// Reference counter for tracking the concurrent use count. This is mainly
-  /// used for knowing how many streams are using the signal.
-  RefCountTy<> UseCount;
-};
-
-/// Aliases for the reference holding an AMDGPU signal and for its manager.
-using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>;
-using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
-
-/// Class holding an HSA queue to submit kernel and barrier packets.
-struct AMDGPUQueueTy {
-  /// Create an empty queue.
-  AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
-
-  /// Lazily initialize a new queue belonging to a specific agent. Does nothing
-  /// if the queue was already created.
-  Error init(hsa_agent_t Agent, int32_t QueueSize) {
-    if (Queue)
-      return Plugin::success();
-    hsa_status_t Status =
-        hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
-                         nullptr, UINT32_MAX, UINT32_MAX, &Queue);
-    return Plugin::check(Status, "Error in hsa_queue_create: %s");
-  }
-
-  /// Deinitialize the queue and destroy its resources. Does nothing if the
-  /// queue was never created or was already destroyed.
-  Error deinit() {
-    std::lock_guard<std::mutex> Lock(Mutex);
-    if (!Queue)
-      return Plugin::success();
-    hsa_status_t Status = hsa_queue_destroy(Queue);
-    // Forget the handle once it is destroyed so that isInitialized() reports
-    // false and a repeated deinit() does not destroy the queue twice.
-    if (Status == HSA_STATUS_SUCCESS)
-      Queue = nullptr;
-    return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
-  }
-
-  /// Returns whether any stream is currently assigned to this queue. The user
-  /// count is converted to bool by the return type.
-  /// NOTE(review): the return type was probably meant to be uint32_t; it is
-  /// kept as bool here so existing callers are unaffected.
-  bool getUserCount() const { return NumUsers; }
-
-  /// Returns if the underlying HSA queue is initialized.
-  bool isInitialized() { return Queue != nullptr; }
-
-  /// Decrement user count of the queue object.
-  void removeUser() { --NumUsers; }
-
-  /// Increase user count of the queue object.
-  void addUser() { ++NumUsers; }
-
-  /// Push a kernel launch to the queue. The kernel launch requires an output
-  /// signal and can define an optional input signal (nullptr if none).
-  Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
-                         uint32_t NumThreads, uint64_t NumBlocks,
-                         uint32_t GroupSize, uint64_t StackSize,
-                         AMDGPUSignalTy *OutputSignal,
-                         AMDGPUSignalTy *InputSignal) {
-    assert(OutputSignal && "Invalid kernel output signal");
-
-    // Lock the queue during the packet publishing process. Notice this blocks
-    // the addition of other packets to the queue. The following piece of code
-    // should be lightweight; do not block the thread, allocate memory, etc.
-    std::lock_guard<std::mutex> Lock(Mutex);
-    assert(Queue && "Interacted with a non-initialized queue!");
-
-    // Add a barrier packet before the kernel packet in case there is a pending
-    // preceding operation. The barrier packet will delay the processing of
-    // subsequent queue's packets until the barrier input signal are satisfied.
-    // No output signal is needed because the dependency is already guaranteed
-    // by the queue barrier itself.
-    if (InputSignal && InputSignal->load())
-      if (auto Err = pushBarrierImpl(nullptr, InputSignal))
-        return Err;
-
-    // Now prepare the kernel packet.
-    uint64_t PacketId;
-    hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
-    assert(Packet && "Invalid packet");
-
-    // The first 32 bits of the packet are written after the other fields
-    uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-    Packet->workgroup_size_x = NumThreads;
-    Packet->workgroup_size_y = 1;
-    Packet->workgroup_size_z = 1;
-    Packet->reserved0 = 0;
-    Packet->grid_size_x = NumBlocks * NumThreads;
-    Packet->grid_size_y = 1;
-    Packet->grid_size_z = 1;
-    // Kernels with a dynamic call stack use the caller-provided stack size.
-    Packet->private_segment_size =
-        Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
-    Packet->group_segment_size = GroupSize;
-    Packet->kernel_object = Kernel.getKernelObject();
-    Packet->kernarg_address = KernelArgs;
-    Packet->reserved2 = 0;
-    Packet->completion_signal = OutputSignal->get();
-
-    // Publish the packet. Do not modify the packet after this point.
-    publishKernelPacket(PacketId, Setup, Packet);
-
-    return Plugin::success();
-  }
-
-  /// Push a barrier packet that will wait up to two input signals. All signals
-  /// are optional (nullptr if none).
-  Error pushBarrier(AMDGPUSignalTy *OutputSignal,
-                    const AMDGPUSignalTy *InputSignal1,
-                    const AMDGPUSignalTy *InputSignal2) {
-    // Lock the queue during the packet publishing process.
-    std::lock_guard<std::mutex> Lock(Mutex);
-    assert(Queue && "Interacted with a non-initialized queue!");
-
-    // Push the barrier with the lock acquired.
-    return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
-  }
-
-private:
-  /// Push a barrier packet that will wait up to two input signals. Assumes
-  /// the queue lock is acquired.
-  Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal,
-                        const AMDGPUSignalTy *InputSignal1,
-                        const AMDGPUSignalTy *InputSignal2 = nullptr) {
-    // Add a queue barrier waiting on both the other stream's operation and the
-    // last operation on the current stream (if any).
-    uint64_t PacketId;
-    hsa_barrier_and_packet_t *Packet =
-        (hsa_barrier_and_packet_t *)acquirePacket(PacketId);
-    assert(Packet && "Invalid packet");
-
-    // Clear every field except the header, which is written on publish.
-    Packet->reserved0 = 0;
-    Packet->reserved1 = 0;
-    Packet->dep_signal[0] = {0};
-    Packet->dep_signal[1] = {0};
-    Packet->dep_signal[2] = {0};
-    Packet->dep_signal[3] = {0};
-    Packet->dep_signal[4] = {0};
-    Packet->reserved2 = 0;
-    Packet->completion_signal = {0};
-
-    // Set input and output dependencies if needed.
-    if (OutputSignal)
-      Packet->completion_signal = OutputSignal->get();
-    if (InputSignal1)
-      Packet->dep_signal[0] = InputSignal1->get();
-    if (InputSignal2)
-      Packet->dep_signal[1] = InputSignal2->get();
-
-    // Publish the packet. Do not modify the packet after this point.
-    publishBarrierPacket(PacketId, Packet);
-
-    return Plugin::success();
-  }
-
-  /// Acquire a packet from the queue. This call may block the thread if there
-  /// is no space in the underlying HSA queue. It may need to wait until the HSA
-  /// runtime processes some packets. Assumes the queue lock is acquired.
-  hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) {
-    // Increase the queue index with relaxed memory order. Notice this will need
-    // another subsequent atomic operation with acquire order.
-    PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
-
-    // Wait for the packet to be available. Notice the atomic operation uses
-    // the acquire memory order.
-    while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size)
-      ;
-
-    // Return the packet reference.
-    const uint32_t Mask = Queue->size - 1; // The size is a power of 2.
-    return (hsa_kernel_dispatch_packet_t *)Queue->base_address +
-           (PacketId & Mask);
-  }
-
-  /// Publish the kernel packet so that the HSA runtime can start processing
-  /// the kernel launch. Do not modify the packet once this function is called.
-  /// Assumes the queue lock is acquired.
-  void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
-                           hsa_kernel_dispatch_packet_t *Packet) {
-    uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
-
-    uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
-    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
-
-    // Publish the packet. Do not modify the packet after this point. The
-    // header and setup fields are stored together as the first 32-bit word.
-    uint32_t HeaderWord = Header | (Setup << 16u);
-    __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
-
-    // Signal the doorbell about the published packet.
-    hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
-  }
-
-  /// Publish the barrier packet so that the HSA runtime can start processing
-  /// the barrier. Next packets in the queue will not be processed until all
-  /// barrier dependencies (signals) are satisfied. Assumes the queue is locked
-  void publishBarrierPacket(uint64_t PacketId,
-                            hsa_barrier_and_packet_t *Packet) {
-    uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
-    uint16_t Setup = 0;
-    uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
-    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
-    Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
-
-    // Publish the packet. Do not modify the packet after this point.
-    uint32_t HeaderWord = Header | (Setup << 16u);
-    __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
-
-    // Signal the doorbell about the published packet.
-    hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
-  }
-
-  /// Callback that will be called when an error is detected on the HSA queue.
-  static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
-    auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
-    FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
-  }
-
-  /// The HSA queue.
-  hsa_queue_t *Queue;
-
-  /// Mutex to protect the acquiring and publishing of packets. For the moment,
-  /// we need this mutex to prevent publishing packets that are not ready to be
-  /// published in a multi-thread scenario. Without a queue lock, a thread T1
-  /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could
-  /// publish its packet P+1 (signaling the queue's doorbell) before packet P
-  /// from T1 is ready to be processed. That scenario should be invalid. Thus,
-  /// we use the following mutex to make packet acquiring and publishing atomic.
-  /// TODO: There are other more advanced approaches to avoid this mutex using
-  /// atomic operations. We can further investigate it if this is a bottleneck.
-  std::mutex Mutex;
-
-  /// The number of streams, this queue is currently assigned to. A queue is
-  /// considered idle when this is zero, otherwise: busy.
-  uint32_t NumUsers;
-};
-
-/// Struct that implements a stream of asynchronous operations for AMDGPU
-/// devices. This class relies on signals to implement streams and define the
-/// dependencies between asynchronous operations.
-struct AMDGPUStreamTy {
-private:
-  /// Utility struct holding arguments for async H2H memory copies.
-  struct MemcpyArgsTy {
-    void *Dst;       // Destination handed to std::memcpy by memcpyAction.
-    const void *Src; // Source handed to std::memcpy by memcpyAction.
-    size_t Size;     // Byte count handed to std::memcpy by memcpyAction.
-  };
-
-  /// Utility struct holding arguments for freeing buffers to memory managers.
-  struct ReleaseBufferArgsTy {
-    void *Buffer; // Allocation that releaseBufferAction deallocates.
-    AMDGPUMemoryManagerTy *MemoryManager; // Manager the buffer is returned to.
-  };
-
-  /// Utility struct holding arguments for releasing signals to signal managers.
-  struct ReleaseSignalArgsTy {
-    AMDGPUSignalTy *Signal; // Signal whose use count the action decreases.
-    AMDGPUSignalManagerTy *SignalManager; // Receives the signal on last use.
-  };
-
-  /// A stream consists of N slots, and this struct describes one of them. A
-  /// slot carries a signal and, optionally, an action function. Appending an
-  /// HSA asynchronous operation to the stream consumes one slot, which then
-  /// stores the operation's information. The consumed slot's signal becomes
-  /// the operation's output signal. When the previous slot holds an earlier
-  /// asynchronous operation, that slot's signal becomes the input signal of
-  /// the new operation, which yields a chain of dependent async operations.
-  /// The action is a function executed eventually after the operation has
-  /// completed, e.g., for releasing a buffer.
-  struct StreamSlotTy {
-    /// Output signal of the stream operation. The following operation may use
-    /// it as its input signal.
-    AMDGPUSignalTy *Signal;
-
-    /// Action to run after the operation completes, or nullptr when there is
-    /// nothing to run.
-    Error (*ActionFunction)(void *);
-
-    /// Storage for the action's arguments. The action function receives a
-    /// pointer to this storage. Notice the space of arguments is limited.
-    union {
-      MemcpyArgsTy MemcpyArgs;
-      ReleaseBufferArgsTy ReleaseBufferArgs;
-      ReleaseSignalArgsTy ReleaseSignalArgs;
-    } ActionArgs;
-
-    /// Create an empty slot.
-    StreamSlotTy() : Signal(nullptr), ActionFunction(nullptr) {}
-
-    /// Record a host memory copy as this slot's action.
-    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
-      ActionArgs.MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
-      ActionFunction = memcpyAction;
-      return Plugin::success();
-    }
-
-    /// Record a buffer release as this slot's action.
-    Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
-      ActionArgs.ReleaseBufferArgs = ReleaseBufferArgsTy{Buffer, &Manager};
-      ActionFunction = releaseBufferAction;
-      return Plugin::success();
-    }
-
-    /// Record a signal release as this slot's action.
-    Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
-                             AMDGPUSignalManagerTy *SignalManager) {
-      ActionArgs.ReleaseSignalArgs =
-          ReleaseSignalArgsTy{SignalToRelease, SignalManager};
-      ActionFunction = releaseSignalAction;
-      return Plugin::success();
-    }
-
-    /// Run the scheduled action, if any, and clear it on success. The action
-    /// stays scheduled when it reports an error.
-    Error performAction() {
-      // Nothing was scheduled on this slot.
-      if (ActionFunction == nullptr)
-        return Plugin::success();
-
-      // Only the three actions defined by the stream may be executed.
-      const bool IsKnownAction = ActionFunction == memcpyAction ||
-                                 ActionFunction == releaseBufferAction ||
-                                 ActionFunction == releaseSignalAction;
-      if (!IsKnownAction)
-        return Plugin::error("Unknown action function!");
-
-      // Run the action on the stored arguments.
-      if (auto Err = ActionFunction(&ActionArgs))
-        return Err;
-
-      // Invalidate the action.
-      ActionFunction = nullptr;
-
-      return Plugin::success();
-    }
-  };
-
-  /// The device agent where the stream was created.
-  hsa_agent_t Agent;
-
-  /// The queue that the stream uses to launch kernels.
-  AMDGPUQueueTy *Queue;
-
-  /// The manager of signals to reuse signals.
-  AMDGPUSignalManagerTy &SignalManager;
-
-  /// A reference to the associated device.
-  GenericDeviceTy &Device;
-
-  /// Array of stream slots. Use std::deque because it can dynamically grow
-  /// without invalidating the already inserted elements. For instance, the
-  /// std::vector may invalidate the elements by reallocating the internal
-  /// array if there is not enough space on new insertions.
-  std::deque<StreamSlotTy> Slots;
-
-  /// The next available slot on the queue. This is reset to zero each time the
-  /// stream is synchronized. It also indicates the current number of consumed
-  /// slots at a given time.
-  uint32_t NextSlot;
-
-  /// The synchronization id. This number is increased each time the stream is
-  /// synchronized. It is useful to detect if an AMDGPUEventTy points to an
-  /// operation that was already finalized in a previous stream synchronize.
-  uint32_t SyncCycle;
-
-  /// A pointer associated with an RPC server running on the given device. If
-  /// RPC is not being used this will be a null pointer. Otherwise, this
-  /// indicates that an RPC server is expected to be run on this stream.
-  RPCServerTy *RPCServer;
-
-  /// Mutex to protect stream's management.
-  mutable std::mutex Mutex;
-
-  /// Timeout hint for HSA actively waiting for signal value to change
-  const uint64_t StreamBusyWaitMicroseconds;
-
-  /// Indicate to spread data transfers across all available SDMAs
-  bool UseMultipleSdmaEngines;
-
-  /// Return the current number of asynchronous operations on the stream.
-  uint32_t size() const { return NextSlot; }
-
-  /// Return the last valid slot; wraps to UINT32_MAX when the stream is empty.
-  uint32_t last() const { return size() - 1; }
-
-  /// Consume one slot from the stream. Since the stream uses signals on demand
-  /// and releases them once the slot is no longer used, the function requires
-  /// an idle signal for the new consumed slot.
-  /// Returns the index of the consumed slot together with the signal of the
-  /// preceding slot (nullptr when the consumed slot is the first one).
-  std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) {
-    // Double the stream size if needed. Since we use std::deque, this operation
-    // does not invalidate the already added slots. An empty deque grows to one
-    // slot, because doubling zero would leave no slot to index below.
-    if (Slots.size() == NextSlot)
-      Slots.resize(Slots.empty() ? 1 : Slots.size() * 2);
-
-    // Update the next available slot and the stream size.
-    uint32_t Curr = NextSlot++;
-
-    // Retrieve the input signal, if any, of the current operation.
-    AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr;
-
-    // Set the output signal of the current slot.
-    Slots[Curr].Signal = OutputSignal;
-
-    return std::make_pair(Curr, InputSignal);
-  }
-
-  /// Run the pending post actions of every consumed slot, drop the slots'
-  /// signals and reset the stream. Meant to be used after synchronizing or
-  /// positively querying the stream. Stops at the first failing step.
-  Error complete() {
-    for (uint32_t Idx = 0; Idx < NextSlot; ++Idx) {
-      StreamSlotTy &Slot = Slots[Idx];
-
-      // Run the operation's post action, if it has one.
-      if (auto Err = Slot.performAction())
-        return Err;
-
-      // Hand the signal back when this was its last use. Otherwise, another
-      // user will release it.
-      if (Slot.Signal->decreaseUseCount())
-        if (auto Err = SignalManager.returnResource(Slot.Signal))
-          return Err;
-
-      Slot.Signal = nullptr;
-    }
-
-    // No slot is consumed anymore.
-    NextSlot = 0;
-
-    // One more synchronization cycle has been completed.
-    SyncCycle += 1;
-
-    return Plugin::success();
-  }
-
-  /// Make this stream wait for the operation stored in slot \p Slot of
-  /// \p OtherStream. A barrier packet with two input signals is pushed: the
-  /// signal of this stream's previous slot (if any) and the signal of the other
-  /// stream's operation.
-  Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) {
-    if (!Queue)
-      return Plugin::error("Target queue was nullptr");
-
-    // Signal of the other stream's operation. Take a use on it so that it is
-    // not released while this stream depends on it.
-    AMDGPUSignalTy *DepSignal = OtherStream.Slots[Slot].Signal;
-    DepSignal->increaseUseCount();
-
-    // Obtain a signal for the barrier's output.
-    AMDGPUSignalTy *BarrierSignal = nullptr;
-    if (auto Err = SignalManager.getResource(BarrierSignal))
-      return Err;
-    BarrierSignal->reset();
-    BarrierSignal->increaseUseCount();
-
-    // Take a slot for the barrier and learn the preceding slot's signal.
-    auto [Curr, PrevSignal] = consume(BarrierSignal);
-
-    // Schedule the release of the other stream's signal as the post action.
-    if (auto Err = Slots[Curr].schedReleaseSignal(DepSignal, &SignalManager))
-      return Err;
-
-    // Submit the barrier waiting on both input signals.
-    return Queue->pushBarrier(BarrierSignal, PrevSignal, DepSignal);
-  }
-
-  /// Callback for running a specific asynchronous operation. This callback is
-  /// used for hsa_amd_signal_async_handler. The argument is the operation that
-  /// should be executed. Notice we use the post action mechanism to codify the
-  /// asynchronous operation.
-  static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) {
-    StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args);
-    assert(Slot && "Invalid slot");
-    assert(Slot->Signal && "Invalid signal");
-
-    // This thread is outside the stream mutex. Make sure the thread sees the
-    // changes on the slot.
-    std::atomic_thread_fence(std::memory_order_acquire);
-
-    // Perform the operation. A failing action aborts via FATAL_MESSAGE.
-    if (auto Err = Slot->performAction())
-      FATAL_MESSAGE(1, "Error peforming post action: %s",
-                    toString(std::move(Err)).data());
-
-    // Signal the output signal to notify the asynchronous operation finalized.
-    Slot->Signal->signal();
-
-    // Unregister callback.
-    return false;
-  }
-
-  // Action performing a host-to-host memory copy. \p Data must point to a
-  // MemcpyArgsTy. This is an asynchronous action.
-  static Error memcpyAction(void *Data) {
-    auto *CopyArgs = static_cast<MemcpyArgsTy *>(Data);
-    assert(CopyArgs && "Invalid arguments");
-    assert(CopyArgs->Dst && "Invalid destination buffer");
-    assert(CopyArgs->Src && "Invalid source buffer");
-
-    std::memcpy(CopyArgs->Dst, CopyArgs->Src, CopyArgs->Size);
-
-    return Plugin::success();
-  }
-
-  /// Releasing a memory buffer to a memory manager. This is a post completion
-  /// action. There are three kinds of memory buffers:
-  ///   1. For kernel arguments. This buffer can be freed after receiving the
-  ///   kernel completion signal.
-  ///   2. For H2D transfers that need pinned memory space for staging. This
-  ///   buffer can be freed after receiving the transfer completion signal.
-  ///   3. For D2H transfers that need pinned memory space for staging. This
-  ///   buffer cannot be freed after receiving the transfer completion signal
-  ///   because of the following asynchronous H2H callback.
-  ///      For this reason, this action can only be taken at
-  ///      AMDGPUStreamTy::complete()
-  /// Because of the case 3, all releaseBufferActions are taken at
-  /// AMDGPUStreamTy::complete() in the current implementation.
-  static Error releaseBufferAction(void *Data) {
-    ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data);
-    assert(Args && "Invalid arguments");
-    assert(Args->MemoryManager && "Invalid memory manager");
-    assert(Args->Buffer && "Invalid buffer");
-
-    // Release the allocation to the memory manager.
-    return Args->MemoryManager->deallocate(Args->Buffer);
-  }
-
-  /// Releasing a signal object back to SignalManager. This is a post completion
-  /// action. This action can only be taken at AMDGPUStreamTy::complete()
-  static Error releaseSignalAction(void *Data) {
-    ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data);
-    assert(Args && "Invalid arguments");
-    assert(Args->Signal && "Invalid signal");
-    assert(Args->SignalManager && "Invalid signal manager");
-
-    // Release the signal if needed.
-    if (Args->Signal->decreaseUseCount())
-      if (auto Err = Args->SignalManager->returnResource(Args->Signal))
-        return Err;
-
-    return Plugin::success();
-  }
-
-public:
-  /// Create an empty stream associated with a specific device.
-  AMDGPUStreamTy(AMDGPUDeviceTy &Device);
-
-  /// Intialize the stream's signals.
-  Error init() { return Plugin::success(); }
-
-  /// Deinitialize the stream's signals.
-  Error deinit() { return Plugin::success(); }
-
-  /// Attach an RPC server to this stream.
-  void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
-
-  /// Push a asynchronous kernel to the stream. The kernel arguments must be
-  /// placed in a special allocation for kernel args and must keep alive until
-  /// the kernel finalizes. Once the kernel is finished, the stream will release
-  /// the kernel args buffer to the specified memory manager.
-  Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
-                         uint32_t NumThreads, uint64_t NumBlocks,
-                         uint32_t GroupSize, uint64_t StackSize,
-                         AMDGPUMemoryManagerTy &MemoryManager) {
-    if (Queue == nullptr)
-      return Plugin::error("Target queue was nullptr");
-
-    // Retrieve an available signal for the operation's output.
-    AMDGPUSignalTy *OutputSignal = nullptr;
-    if (auto Err = SignalManager.getResource(OutputSignal))
-      return Err;
-    OutputSignal->reset();
-    OutputSignal->increaseUseCount();
-
-    std::lock_guard<std::mutex> StreamLock(Mutex);
-
-    // Consume stream slot and compute dependencies.
-    auto [Curr, InputSignal] = consume(OutputSignal);
-
-    // Setup the post action to release the kernel args buffer.
-    if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
-      return Err;
-
-    // Push the kernel with the output signal and an input signal (optional)
-    return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
-                                   GroupSize, StackSize, OutputSignal,
-                                   InputSignal);
-  }
-
-  /// Push an asynchronous memory copy between pinned memory buffers.
-  Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src,
-                                  uint64_t CopySize) {
-    // Retrieve an available signal for the operation's output.
-    AMDGPUSignalTy *OutputSignal = nullptr;
-    if (auto Err = SignalManager.getResource(OutputSignal))
-      return Err;
-    OutputSignal->reset();
-    OutputSignal->increaseUseCount();
-
-    std::lock_guard<std::mutex> Lock(Mutex);
-
-    // Consume stream slot and compute dependencies.
-    auto [Curr, InputSignal] = consume(OutputSignal);
-
-    // Issue the async memory copy.
-    if (InputSignal && InputSignal->load()) {
-      hsa_signal_t InputSignalRaw = InputSignal->get();
-      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
-                                 CopySize, 1, &InputSignalRaw,
-                                 OutputSignal->get());
-    }
-
-    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
-                               CopySize, 0, nullptr, OutputSignal->get());
-  }
-
-  /// Push an asynchronous memory copy device-to-host involving an unpinned
-  /// memory buffer. The operation consists of a two-step copy from the
-  /// device buffer to an intermediate pinned host buffer, and then, to a
-  /// unpinned host buffer. Both operations are asynchronous and dependant.
-  /// The intermediate pinned buffer will be released to the specified memory
-  /// manager once the operation completes.
-  Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
-                               uint64_t CopySize,
-                               AMDGPUMemoryManagerTy &MemoryManager) {
-    // Retrieve available signals for the operation's outputs.
-    AMDGPUSignalTy *OutputSignals[2] = {};
-    if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
-      return Err;
-    for (auto *Signal : OutputSignals) {
-      Signal->reset();
-      Signal->increaseUseCount();
-    }
-
-    std::lock_guard<std::mutex> Lock(Mutex);
-
-    // Consume stream slot and compute dependencies.
-    auto [Curr, InputSignal] = consume(OutputSignals[0]);
-
-    // Setup the post action for releasing the intermediate buffer.
-    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
-      return Err;
-
-    // Issue the first step: device to host transfer. Avoid defining the input
-    // dependency if already satisfied.
-    if (InputSignal && InputSignal->load()) {
-      hsa_signal_t InputSignalRaw = InputSignal->get();
-      if (auto Err = utils::asyncMemCopy(
-              UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
-              &InputSignalRaw, OutputSignals[0]->get()))
-        return Err;
-    } else {
-      if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent,
-                                         Src, Agent, CopySize, 0, nullptr,
-                                         OutputSignals[0]->get()))
-        return Err;
-    }
-
-    // Consume another stream slot and compute dependencies.
-    std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
-    assert(InputSignal && "Invalid input signal");
-
-    // The std::memcpy is done asynchronously using an async handler. We store
-    // the function's information in the action but it's not actually an action.
-    if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize))
-      return Err;
-
-    // Make changes on this slot visible to the async handler's thread.
-    std::atomic_thread_fence(std::memory_order_release);
-
-    // Issue the second step: host to host transfer.
-    hsa_status_t Status = hsa_amd_signal_async_handler(
-        InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
-        (void *)&Slots[Curr]);
-
-    return Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s");
-  }
-
-  /// Push an asynchronous memory copy host-to-device involving an unpinned
-  /// memory buffer. The operation consists of a two-step copy from the
-  /// unpinned host buffer to an intermediate pinned host buffer, and then, to
-  /// the pinned host buffer. Both operations are asynchronous and dependant.
-  /// The intermediate pinned buffer will be released to the specified memory
-  /// manager once the operation completes.
-  Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
-                               uint64_t CopySize,
-                               AMDGPUMemoryManagerTy &MemoryManager) {
-    // Retrieve available signals for the operation's outputs.
-    AMDGPUSignalTy *OutputSignals[2] = {};
-    if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
-      return Err;
-    for (auto *Signal : OutputSignals) {
-      Signal->reset();
-      Signal->increaseUseCount();
-    }
-
-    AMDGPUSignalTy *OutputSignal = OutputSignals[0];
-
-    std::lock_guard<std::mutex> Lock(Mutex);
-
-    // Consume stream slot and compute dependencies.
-    auto [Curr, InputSignal] = consume(OutputSignal);
-
-    // Issue the first step: host to host transfer.
-    if (InputSignal && InputSignal->load()) {
-      // The std::memcpy is done asynchronously using an async handler. We store
-      // the function's information in the action but it is not actually a
-      // post action.
-      if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
-        return Err;
-
-      // Make changes on this slot visible to the async handler's thread.
-      std::atomic_thread_fence(std::memory_order_release);
-
-      hsa_status_t Status = hsa_amd_signal_async_handler(
-          InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
-          (void *)&Slots[Curr]);
-
-      if (auto Err = Plugin::check(Status,
-                                   "Error in hsa_amd_signal_async_handler: %s"))
-        return Err;
-
-      // Let's use now the second output signal.
-      OutputSignal = OutputSignals[1];
-
-      // Consume another stream slot and compute dependencies.
-      std::tie(Curr, InputSignal) = consume(OutputSignal);
-    } else {
-      // All preceding operations completed, copy the memory synchronously.
-      std::memcpy(Inter, Src, CopySize);
-
-      // Return the second signal because it will not be used.
-      OutputSignals[1]->decreaseUseCount();
-      if (auto Err = SignalManager.returnResource(OutputSignals[1]))
-        return Err;
-    }
-
-    // Setup the post action to release the intermediate pinned buffer.
-    if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
-      return Err;
-
-    // Issue the second step: host to device transfer. Avoid defining the input
-    // dependency if already satisfied.
-    if (InputSignal && InputSignal->load()) {
-      hsa_signal_t InputSignalRaw = InputSignal->get();
-      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
-                                 Agent, CopySize, 1, &InputSignalRaw,
-                                 OutputSignal->get());
-    }
-    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
-                               CopySize, 0, nullptr, OutputSignal->get());
-  }
-
-  // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
-  Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
-                               hsa_agent_t SrcAgent, uint64_t CopySize) {
-    AMDGPUSignalTy *OutputSignal;
-    if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
-      return Err;
-    OutputSignal->reset();
-    OutputSignal->increaseUseCount();
-
-    std::lock_guard<std::mutex> Lock(Mutex);
-
-    // Consume stream slot and compute dependencies.
-    auto [Curr, InputSignal] = consume(OutputSignal);
-
-    // The agents need to have access to the corresponding memory
-    // This is presently only true if the pointers were originally
-    // allocated by this runtime or the caller made the appropriate
-    // access calls.
-
-    if (InputSignal && InputSignal->load()) {
-      hsa_signal_t InputSignalRaw = InputSignal->get();
-      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
-                                 SrcAgent, CopySize, 1, &InputSignalRaw,
-                                 OutputSignal->get());
-    }
-    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
-                               SrcAgent, CopySize, 0, nullptr,
-                               OutputSignal->get());
-  }
-
-  /// Synchronize with the stream. The current thread waits until all operations
-  /// are finalized and it performs the pending post actions (i.e., releasing
-  /// intermediate buffers).
-  Error synchronize() {
-    std::lock_guard<std::mutex> Lock(Mutex);
-
-    // No need to synchronize anything.
-    if (size() == 0)
-      return Plugin::success();
-
-    // Wait until all previous operations on the stream have completed.
-    if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds,
-                                              RPCServer, &Device))
-      return Err;
-
-    // Reset the stream and perform all pending post actions.
-    return complete();
-  }
-
-  /// Query the stream and complete pending post actions if operations finished.
-  /// Return whether all the operations completed. This operation does not block
-  /// the calling thread.
-  Expected<bool> query() {
-    std::lock_guard<std::mutex> Lock(Mutex);
-
-    // No need to query anything.
-    if (size() == 0)
-      return true;
-
-    // The last operation did not complete yet. Return directly.
-    if (Slots[last()].Signal->load())
-      return false;
-
-    // Reset the stream and perform all pending post actions.
-    if (auto Err = complete())
-      return std::move(Err);
-
-    return true;
-  }
-
-  /// Record the state of the stream on an event.
-  Error recordEvent(AMDGPUEventTy &Event) const;
-
-  /// Make the stream wait on an event.
-  Error waitEvent(const AMDGPUEventTy &Event);
-
-  friend struct AMDGPUStreamManagerTy;
-};
-
-/// Class representing an event on AMDGPU. The event basically stores some
-/// information regarding the state of the recorded stream.
-struct AMDGPUEventTy {
-  /// Create an empty event.
-  AMDGPUEventTy(AMDGPUDeviceTy &Device)
-      : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}
-
-  /// Initialize and deinitialize.
-  Error init() { return Plugin::success(); }
-  Error deinit() { return Plugin::success(); }
-
-  /// Record the state of a stream on the event.
-  Error record(AMDGPUStreamTy &Stream) {
-    std::lock_guard<std::mutex> Lock(Mutex);
-
-    // Ignore the last recorded stream.
-    RecordedStream = &Stream;
-
-    return Stream.recordEvent(*this);
-  }
-
-  /// Make a stream wait on the current event.
-  Error wait(AMDGPUStreamTy &Stream) {
-    std::lock_guard<std::mutex> Lock(Mutex);
-
-    if (!RecordedStream)
-      return Plugin::error("Event does not have any recorded stream");
-
-    // Synchronizing the same stream. Do nothing.
-    if (RecordedStream == &Stream)
-      return Plugin::success();
-
-    // No need to wait anything, the recorded stream already finished the
-    // corresponding operation.
-    if (RecordedSlot < 0)
-      return Plugin::success();
-
-    return Stream.waitEvent(*this);
-  }
-
-protected:
-  /// The stream registered in this event.
-  AMDGPUStreamTy *RecordedStream;
-
-  /// The recordered operation on the recorded stream.
-  int64_t RecordedSlot;
-
-  /// The sync cycle when the stream was recorded. Used to detect stale events.
-  int64_t RecordedSyncCycle;
-
-  /// Mutex to safely access event fields.
-  mutable std::mutex Mutex;
-
-  friend struct AMDGPUStreamTy;
-};
-
-Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
-  std::lock_guard<std::mutex> Lock(Mutex);
-
-  if (size() > 0) {
-    // Record the synchronize identifier (to detect stale recordings) and
-    // the last valid stream's operation.
-    Event.RecordedSyncCycle = SyncCycle;
-    Event.RecordedSlot = last();
-
-    assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
-    assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
-  } else {
-    // The stream is empty, everything already completed, record nothing.
-    Event.RecordedSyncCycle = -1;
-    Event.RecordedSlot = -1;
-  }
-  return Plugin::success();
-}
-
-Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) {
-  // Retrieve the recorded stream on the event.
-  AMDGPUStreamTy &RecordedStream = *Event.RecordedStream;
-
-  std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex);
-
-  // The recorded stream already completed the operation because the synchronize
-  // identifier is already outdated.
-  if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle)
-    return Plugin::success();
-
-  // Again, the recorded stream already completed the operation, the last
-  // operation's output signal is satisfied.
-  if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load())
-    return Plugin::success();
-
-  // Otherwise, make the current stream wait on the other stream's operation.
-  return waitOnStreamOperation(RecordedStream, Event.RecordedSlot);
-}
-
-struct AMDGPUStreamManagerTy final
-    : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> {
-  using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>;
-  using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
-
-  AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
-      : GenericDeviceResourceManagerTy(Device),
-        OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
-        NextQueue(0), Agent(HSAAgent) {}
-
-  Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
-    Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
-    QueueSize = HSAQueueSize;
-    MaxNumQueues = NumHSAQueues;
-    // Initialize one queue eagerly
-    if (auto Err = Queues.front().init(Agent, QueueSize))
-      return Err;
-
-    return GenericDeviceResourceManagerTy::init(InitialSize);
-  }
-
-  /// Deinitialize the resource pool and delete all resources. This function
-  /// must be called before the destructor.
-  Error deinit() override {
-    // De-init all queues
-    for (AMDGPUQueueTy &Queue : Queues) {
-      if (auto Err = Queue.deinit())
-        return Err;
-    }
-
-    return GenericDeviceResourceManagerTy::deinit();
-  }
-
-  /// Get a single stream from the pool or create new resources.
-  virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override {
-    return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) {
-      return assignNextQueue(Handle);
-    });
-  }
-
-  /// Return stream to the pool.
-  virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override {
-    return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) {
-      Handle->Queue->removeUser();
-      return Plugin::success();
-    });
-  }
-
-private:
-  /// Search for and assign an prefereably idle queue to the given Stream. If
-  /// there is no queue without current users, choose the queue with the lowest
-  /// user count. If utilization is ignored: use round robin selection.
-  inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
-    // Start from zero when tracking utilization, otherwise: round robin policy.
-    uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;
-
-    if (OMPX_QueueTracking) {
-      // Find the least used queue.
-      for (uint32_t I = 0; I < MaxNumQueues; ++I) {
-        // Early exit when an initialized queue is idle.
-        if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
-          Index = I;
-          break;
-        }
-
-        // Update the least used queue.
-        if (Queues[Index].getUserCount() > Queues[I].getUserCount())
-          Index = I;
-      }
-    }
-
-    // Make sure the queue is initialized, then add user & assign.
-    if (auto Err = Queues[Index].init(Agent, QueueSize))
-      return Err;
-    Queues[Index].addUser();
-    Stream->Queue = &Queues[Index];
-
-    return Plugin::success();
-  }
-
-  /// Envar for controlling the tracking of busy HSA queues.
-  BoolEnvar OMPX_QueueTracking;
-
-  /// The next queue index to use for round robin selection.
-  uint32_t NextQueue;
-
-  /// The queues which are assigned to requested streams.
-  std::vector<AMDGPUQueueTy> Queues;
-
-  /// The corresponding device as HSA agent.
-  hsa_agent_t Agent;
-
-  /// The maximum number of queues.
-  int MaxNumQueues;
-
-  /// The size of created queues.
-  int QueueSize;
-};
-
-/// Abstract class that holds the common members of the actual kernel devices
-/// and the host device. Both types should inherit from this class.
-struct AMDGenericDeviceTy {
-  AMDGenericDeviceTy() {}
-
-  virtual ~AMDGenericDeviceTy() {}
-
-  /// Create all memory pools which the device has access to and classify them.
-  Error initMemoryPools() {
-    // Retrieve all memory pools from the device agent(s).
-    Error Err = retrieveAllMemoryPools();
-    if (Err)
-      return Err;
-
-    for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) {
-      // Initialize the memory pool and retrieve some basic info.
-      Error Err = MemoryPool->init();
-      if (Err)
-        return Err;
-
-      if (!MemoryPool->isGlobal())
-        continue;
-
-      // Classify the memory pools depending on their properties.
-      if (MemoryPool->isFineGrained()) {
-        FineGrainedMemoryPools.push_back(MemoryPool);
-        if (MemoryPool->supportsKernelArgs())
-          ArgsMemoryPools.push_back(MemoryPool);
-      } else if (MemoryPool->isCoarseGrained()) {
-        CoarseGrainedMemoryPools.push_back(MemoryPool);
-      }
-    }
-    return Plugin::success();
-  }
-
-  /// Destroy all memory pools.
-  Error deinitMemoryPools() {
-    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools)
-      delete Pool;
-
-    AllMemoryPools.clear();
-    FineGrainedMemoryPools.clear();
-    CoarseGrainedMemoryPools.clear();
-    ArgsMemoryPools.clear();
-
-    return Plugin::success();
-  }
-
-  /// Retrieve and construct all memory pools from the device agent(s).
-  virtual Error retrieveAllMemoryPools() = 0;
-
-  /// Get the device agent.
-  virtual hsa_agent_t getAgent() const = 0;
-
-protected:
-  /// Array of all memory pools available to the host agents.
-  llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools;
-
-  /// Array of fine-grained memory pools available to the host agents.
-  llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools;
-
-  /// Array of coarse-grained memory pools available to the host agents.
-  llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools;
-
-  /// Array of kernel args memory pools available to the host agents.
-  llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools;
-};
-
-/// Class representing the host device. This host device may have more than one
-/// HSA host agent. We aggregate all its resources into the same instance.
-struct AMDHostDeviceTy : public AMDGenericDeviceTy {
-  /// Create a host device from an array of host agents.
-  AMDHostDeviceTy(AMDGPUPluginTy &Plugin,
-                  const llvm::SmallVector<hsa_agent_t> &HostAgents)
-      : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin),
-        PinnedMemoryManager(Plugin) {
-    assert(HostAgents.size() && "No host agent found");
-  }
-
-  /// Initialize the host device memory pools and the memory managers for
-  /// kernel args and host pinned memory allocations.
-  Error init() {
-    if (auto Err = initMemoryPools())
-      return Err;
-
-    if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool()))
-      return Err;
-
-    if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool()))
-      return Err;
-
-    return Plugin::success();
-  }
-
-  /// Deinitialize memory pools and managers.
-  Error deinit() {
-    if (auto Err = deinitMemoryPools())
-      return Err;
-
-    if (auto Err = ArgsMemoryManager.deinit())
-      return Err;
-
-    if (auto Err = PinnedMemoryManager.deinit())
-      return Err;
-
-    return Plugin::success();
-  }
-
-  /// Retrieve and construct all memory pools from the host agents.
-  Error retrieveAllMemoryPools() override {
-    // Iterate through the available pools across the host agents.
-    for (hsa_agent_t Agent : Agents) {
-      Error Err = utils::iterateAgentMemoryPools(
-          Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
-            AMDGPUMemoryPoolTy *MemoryPool =
-                new AMDGPUMemoryPoolTy(HSAMemoryPool);
-            AllMemoryPools.push_back(MemoryPool);
-            return HSA_STATUS_SUCCESS;
-          });
-      if (Err)
-        return Err;
-    }
-    return Plugin::success();
-  }
-
-  /// Get one of the host agents. Return always the first agent.
-  hsa_agent_t getAgent() const override { return Agents[0]; }
-
-  /// Get a memory pool for fine-grained allocations.
-  AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() {
-    assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool");
-    // Retrive any memory pool.
-    return *FineGrainedMemoryPools[0];
-  }
-
-  AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
-    assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool");
-    // Retrive any memory pool.
-    return *CoarseGrainedMemoryPools[0];
-  }
-
-  /// Get a memory pool for kernel args allocations.
-  AMDGPUMemoryPoolTy &getArgsMemoryPool() {
-    assert(!ArgsMemoryPools.empty() && "No kernelargs mempool");
-    // Retrieve any memory pool.
-    return *ArgsMemoryPools[0];
-  }
-
-  /// Getters for kernel args and host pinned memory managers.
-  AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; }
-  AMDGPUMemoryManagerTy &getPinnedMemoryManager() {
-    return PinnedMemoryManager;
-  }
-
-private:
-  /// Array of agents on the host side.
-  const llvm::SmallVector<hsa_agent_t> Agents;
-
-  // Memory manager for kernel arguments.
-  AMDGPUMemoryManagerTy ArgsMemoryManager;
-
-  // Memory manager for pinned memory.
-  AMDGPUMemoryManagerTy PinnedMemoryManager;
-};
-
-/// Class implementing the AMDGPU device functionalities which derives from the
-/// generic device class.
-struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
-  // Create an AMDGPU device with a device id and default AMDGPU grid values.
-  AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
-                 AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
-      : GenericDeviceTy(Plugin, DeviceId, NumDevices, {0}),
-        AMDGenericDeviceTy(),
-        OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
-        OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
-        OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
-        OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
-                               1 * 1024 * 1024), // 1MB
-        OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
-                               64),
-        OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
-        OMPX_UseMultipleSdmaEngines(
-            "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
-        OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
-        AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
-        HostDevice(HostDevice) {}
-
-  ~AMDGPUDeviceTy() {}
-
-  /// Initialize the device, its resources and get its properties.
-  Error initImpl(GenericPluginTy &Plugin) override {
-    // First setup all the memory pools.
-    if (auto Err = initMemoryPools())
-      return Err;
-
-    char GPUName[64];
-    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
-      return Err;
-    ComputeUnitKind = GPUName;
-
-    // Get the wavefront size.
-    uint32_t WavefrontSize = 0;
-    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize))
-      return Err;
-    GridValues.GV_Warp_Size = WavefrontSize;
-
-    // Get the frequency of the steady clock. If the attribute is missing
-    // assume running on an older libhsa and default to 0, omp_get_wtime
-    // will be inaccurate but otherwise programs can still run.
-    if (getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
-                         ClockFrequency) != HSA_STATUS_SUCCESS)
-      ClockFrequency = 0;
-
-    // Load the grid values dependending on the wavefront.
-    if (WavefrontSize == 32)
-      GridValues = getAMDGPUGridValues<32>();
-    else if (WavefrontSize == 64)
-      GridValues = getAMDGPUGridValues<64>();
-    else
-      return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);
-
-    // Get maximum number of workitems per workgroup.
-    uint16_t WorkgroupMaxDim[3];
-    if (auto Err =
-            getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim))
-      return Err;
-    GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0];
-
-    // Get maximum number of workgroups.
-    hsa_dim3_t GridMaxDim;
-    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim))
-      return Err;
-
-    GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size;
-    if (GridValues.GV_Max_Teams == 0)
-      return Plugin::error("Maximum number of teams cannot be zero");
-
-    // Compute the default number of teams.
-    uint32_t ComputeUnits = 0;
-    if (auto Err =
-            getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits))
-      return Err;
-    GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
-
-    uint32_t WavesPerCU = 0;
-    if (auto Err =
-            getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
-      return Err;
-    HardwareParallelism = ComputeUnits * WavesPerCU;
-
-    // Get maximum size of any device queues and maximum number of queues.
-    uint32_t MaxQueueSize;
-    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
-      return Err;
-
-    uint32_t MaxQueues;
-    if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues))
-      return Err;
-
-    // Compute the number of queues and their size.
-    OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
-    OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
-
-    // Initialize stream pool.
-    if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams,
-                                            OMPX_NumQueues, OMPX_QueueSize))
-      return Err;
-
-    // Initialize event pool.
-    if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents))
-      return Err;
-
-    // Initialize signal pool.
-    if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
-      return Err;
-
-    // Detect if XNACK is enabled
-    auto TargeTripleAndFeaturesOrError =
-        utils::getTargetTripleAndFeatures(Agent);
-    if (!TargeTripleAndFeaturesOrError)
-      return TargeTripleAndFeaturesOrError.takeError();
-    if (static_cast<StringRef>(*TargeTripleAndFeaturesOrError)
-            .contains("xnack+"))
-      IsXnackEnabled = true;
-
-    // detect if device is an APU.
-    if (auto Err = checkIfAPU())
-      return Err;
-
-    return Plugin::success();
-  }
-
-  /// Deinitialize the device and release its resources.
-  Error deinitImpl() override {
-    // Deinitialize the stream and event pools.
-    if (auto Err = AMDGPUStreamManager.deinit())
-      return Err;
-
-    if (auto Err = AMDGPUEventManager.deinit())
-      return Err;
-
-    if (auto Err = AMDGPUSignalManager.deinit())
-      return Err;
-
-    // Close modules if necessary.
-    if (!LoadedImages.empty()) {
-      // Each image has its own module.
-      for (DeviceImageTy *Image : LoadedImages) {
-        AMDGPUDeviceImageTy &AMDImage =
-            static_cast<AMDGPUDeviceImageTy &>(*Image);
-
-        // Unload the executable of the image.
-        if (auto Err = AMDImage.unloadExecutable())
-          return Err;
-      }
-    }
-
-    // Invalidate agent reference.
-    Agent = {0};
-
-    return Plugin::success();
-  }
-
-  /// Run the global constructors of \p Image. When the image also contains the
-  /// "amdgcn.device.fini" symbol it is first marked as having pending global
-  /// destructors, which callGlobalDestructors() checks before running them.
-  virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
-                                       DeviceImageTy &Image) override {
-    if (Plugin.getGlobalHandler().isSymbolInImage(*this, Image,
-                                                  "amdgcn.device.fini"))
-      Image.setPendingGlobalDtors();
-
-    return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
-  }
-
-  /// Run the global destructors of \p Image, but only if the image was marked
-  /// as having pending global destructors; otherwise succeed without work.
-  virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
-                                      DeviceImageTy &Image) override {
-    if (!Image.hasPendingGlobalDtors())
-      return Plugin::success();
-
-    return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
-  }
-
-  /// Value of the OMPX_StreamBusyWait envar: the time, in microseconds, to
-  /// busy-wait before switching to a blocking wait. Returned by value, so the
-  /// return type carries no top-level `const` (it would have no effect).
-  uint64_t getStreamBusyWaitMicroseconds() const {
-    return OMPX_StreamBusyWait;
-  }
-
-  /// Link the JIT-compiled object held in \p MB into a shared object by
-  /// invoking `lld` (looked up on the PATH) and return the linked image.
-  ///
-  /// The input is materialized in a temporary file, linked into a second
-  /// temporary file, and the result is read back into memory. Both temporary
-  /// files are removed before returning, on success as well as on every
-  /// failure path.
-  Expected<std::unique_ptr<MemoryBuffer>>
-  doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
-
-    // TODO: We should try to avoid materialization but there seems to be no
-    // good linker interface w/o file i/o.
-    SmallString<128> LinkerInputFilePath;
-    SmallString<128> LinkerOutputFilePath;
-
-    // Best-effort removal of whatever temporary files exist so far. Used on
-    // the failure paths, where the original error is the one worth reporting,
-    // so the result of the removal itself is deliberately ignored.
-    auto RemoveTempFiles = [&]() {
-      if (!LinkerInputFilePath.empty())
-        (void)sys::fs::remove(LinkerInputFilePath);
-      if (!LinkerOutputFilePath.empty())
-        (void)sys::fs::remove(LinkerOutputFilePath);
-    };
-
-    std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit",
-                                                      "o", LinkerInputFilePath);
-    if (EC) {
-      RemoveTempFiles();
-      return Plugin::error("Failed to create temporary file for linker");
-    }
-
-    // Write the file's contents to the output file.
-    Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
-        FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size());
-    if (!OutputOrErr) {
-      RemoveTempFiles();
-      return OutputOrErr.takeError();
-    }
-    std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
-    llvm::copy(MB->getBuffer(), Output->getBufferStart());
-    if (Error E = Output->commit()) {
-      RemoveTempFiles();
-      return std::move(E);
-    }
-
-    EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit", "so",
-                                      LinkerOutputFilePath);
-    if (EC) {
-      RemoveTempFiles();
-      return Plugin::error("Failed to create temporary file for linker");
-    }
-
-    const auto &ErrorOrPath = sys::findProgramByName("lld");
-    if (!ErrorOrPath) {
-      RemoveTempFiles();
-      return createStringError(inconvertibleErrorCode(),
-                               "Failed to find `lld` on the PATH.");
-    }
-
-    std::string LLDPath = ErrorOrPath.get();
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
-         "Using `%s` to link JITed amdgcn ouput.", LLDPath.c_str());
-
-    std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();
-    // Pass the paths as length-carrying StringRefs instead of going through
-    // data(), which would depend on the buffers being NUL-terminated.
-    StringRef Args[] = {LLDPath,
-                        "-flavor",
-                        "gnu",
-                        "--no-undefined",
-                        "-shared",
-                        MCPU,
-                        "-o",
-                        LinkerOutputFilePath.str(),
-                        LinkerInputFilePath.str()};
-
-    // Named ErrMsg so that it does not shadow the llvm::Error type.
-    std::string ErrMsg;
-    int RC =
-        sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &ErrMsg);
-    if (RC) {
-      RemoveTempFiles();
-      return Plugin::error("Linking optimized bitcode failed: %s",
-                           ErrMsg.c_str());
-    }
-
-    auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath);
-    if (!BufferOrErr) {
-      RemoveTempFiles();
-      return Plugin::error("Failed to open temporary file for lld");
-    }
-
-    // Clean up the temporary files afterwards. Attempt both removals before
-    // reporting, so a failure on the output file does not leak the input file.
-    std::error_code OutputRemoveEC = sys::fs::remove(LinkerOutputFilePath);
-    std::error_code InputRemoveEC = sys::fs::remove(LinkerInputFilePath);
-    if (OutputRemoveEC)
-      return Plugin::error("Failed to remove temporary output file for lld");
-    if (InputRemoveEC)
-      return Plugin::error("Failed to remove temporary input file for lld");
-
-    return std::move(*BufferOrErr);
-  }
-
-  /// See GenericDeviceTy::getComputeUnitKind(). Returns a copy of the stored
-  /// GPU architecture string (ComputeUnitKind).
-  std::string getComputeUnitKind() const override { return ComputeUnitKind; }
-
-  /// Returns the clock frequency stored for this AMDGPU device
-  /// (ClockFrequency).
-  uint64_t getClockFrequency() const override { return ClockFrequency; }
-
-  /// Create an AMDGPU kernel object named \p Name in storage obtained from the
-  /// plugin allocator. Fails if the storage cannot be allocated.
-  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
-    // Get raw storage first; the kernel is constructed in place below.
-    auto *KernelStorage = Plugin.allocate<AMDGPUKernelTy>();
-    if (KernelStorage == nullptr)
-      return Plugin::error("Failed to allocate memory for AMDGPU kernel");
-
-    AMDGPUKernelTy *Kernel = new (KernelStorage) AMDGPUKernelTy(Name);
-    return *Kernel;
-  }
-
-  /// Set the current context to this device's context. This is a no-op that
-  /// always succeeds, since AMDGPU devices do not have the concept of
-  /// contexts.
-  Error setContext() override { return Plugin::success(); }
-
-  /// AMDGPU returns the product of the number of compute units and the waves
-  /// per compute unit. The value is read from the HardwareParallelism member;
-  /// it is not recomputed here.
-  uint64_t getHardwareParallelism() const override {
-    return HardwareParallelism;
-  }
-
-  /// We want to set up the RPC server for host services to the GPU if it is
-  /// available, i.e. whenever libomptargetSupportsRPC() reports support.
-  bool shouldSetupRPCServer() const override {
-    return libomptargetSupportsRPC();
-  }
-
-  /// The RPC interface should have enough space for all available parallelism,
-  /// so the requested port count equals getHardwareParallelism().
-  uint64_t requestedRPCPortCount() const override {
-    return getHardwareParallelism();
-  }
-
-  /// Get the stream of the asynchronous info structure or get a new one.
-  /// On success \p Stream is the stream already attached to
-  /// \p AsyncInfoWrapper, or a stream taken from the stream manager that has
-  /// just been attached to it.
-  Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
-                  AMDGPUStreamTy *&Stream) {
-    // Reuse the stream already recorded in the async info, if there is one.
-    Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
-    if (Stream)
-      return Plugin::success();
-
-    // There was no stream; get an idle one.
-    if (auto AcquireErr = AMDGPUStreamManager.getResource(Stream))
-      return AcquireErr;
-
-    // Remember the stream in the async info for subsequent operations.
-    AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
-    return Plugin::success();
-  }
-
-  /// Load the binary image into the device and allocate an image object.
-  ///
-  /// The image object is constructed in storage obtained from the plugin
-  /// allocator and its HSA executable is loaded for this device. Fails if the
-  /// storage cannot be allocated or the executable cannot be loaded.
-  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
-                                           int32_t ImageId) override {
-    // Allocate and initialize the image object. As in constructKernel(), do
-    // not placement-construct into a failed allocation.
-    AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>();
-    if (!AMDImage)
-      return Plugin::error("Failed to allocate memory for AMDGPU device image");
-
-    new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);
-
-    // Load the HSA executable.
-    if (Error Err = AMDImage->loadExecutable(*this))
-      return std::move(Err);
-
-    return AMDImage;
-  }
-
-  /// Allocate memory on the device or related to the device.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
-
-  /// Deallocate memory on the device or related to the device.
-  ///
-  /// \p Kind selects the memory pool the pointer is returned to: device kinds
-  /// use the first coarse-grained pool of this device, host and shared kinds
-  /// use the host device's fine-grained pool. Returns OFFLOAD_SUCCESS for a
-  /// null \p TgtPtr and OFFLOAD_FAIL when no pool matches or the deallocation
-  /// fails.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
-    // Freeing a null pointer is a successful no-op.
-    if (!TgtPtr)
-      return OFFLOAD_SUCCESS;
-
-    AMDGPUMemoryPoolTy *Pool = nullptr;
-    switch (Kind) {
-    case TARGET_ALLOC_DEFAULT:
-    case TARGET_ALLOC_DEVICE:
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
-      Pool = CoarseGrainedMemoryPools[0];
-      break;
-    case TARGET_ALLOC_HOST:
-    case TARGET_ALLOC_SHARED:
-      // Host and shared allocations both live in the host fine-grained pool.
-      Pool = &HostDevice.getFineGrainedMemoryPool();
-      break;
-    }
-
-    if (Pool == nullptr) {
-      REPORT("No memory pool for the specified allocation kind\n");
-      return OFFLOAD_FAIL;
-    }
-
-    if (Error DeallocErr = Pool->deallocate(TgtPtr)) {
-      REPORT("%s\n", toString(std::move(DeallocErr)).data());
-      return OFFLOAD_FAIL;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// Block the current thread until the operations pending on the stream of
-  /// \p AsyncInfo have completed, then detach that stream from \p AsyncInfo
-  /// and hand it back to the stream manager.
-  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
-    auto *PendingStream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
-    assert(PendingStream && "Invalid stream");
-
-    if (auto SyncErr = PendingStream->synchronize())
-      return SyncErr;
-
-    // The stream is idle now: reset AsyncInfo and return the stream to the
-    // pool, so that this synchronization only ever covers its own tasks.
-    AsyncInfo.Queue = nullptr;
-    return AMDGPUStreamManager.returnResource(PendingStream);
-  }
-
-  /// Non-blocking check of the operations pending on the stream of
-  /// \p AsyncInfo. If they have all completed, the stream is detached from
-  /// \p AsyncInfo and handed back to the stream manager; otherwise
-  /// \p AsyncInfo is left untouched. Succeeds in both cases.
-  Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
-    auto *PendingStream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
-    assert(PendingStream && "Invalid stream");
-
-    auto DoneOrErr = PendingStream->query();
-    if (!DoneOrErr)
-      return DoneOrErr.takeError();
-
-    // Nothing more to do while the stream is still busy.
-    if (!(*DoneOrErr))
-      return Plugin::success();
-
-    // The stream completed: reset AsyncInfo and return the stream to the
-    // pool, so that the synchronization only ever covers its own tasks.
-    AsyncInfo.Queue = nullptr;
-    return AMDGPUStreamManager.returnResource(PendingStream);
-  }
-
-  /// Pin \p Size bytes of the host buffer \p HstPtr through
-  /// hsa_amd_memory_lock and return the pointer it produces, which is the one
-  /// to use for device transfers.
-  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
-    void *DevAccessiblePtr = nullptr;
-
-    hsa_status_t LockStatus =
-        hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &DevAccessiblePtr);
-    if (auto LockErr =
-            Plugin::check(LockStatus, "Error in hsa_amd_memory_lock: %s\n"))
-      return std::move(LockErr);
-
-    return DevAccessiblePtr;
-  }
-
-  /// Unpin the host buffer \p HstPtr through hsa_amd_memory_unlock and turn
-  /// the resulting HSA status into an Error.
-  Error dataUnlockImpl(void *HstPtr) override {
-    return Plugin::check(hsa_amd_memory_unlock(HstPtr),
-                         "Error in hsa_amd_memory_unlock: %s\n");
-  }
-
-  /// Ask the HSA runtime whether the \p HstPtr buffer is pinned.
-  ///
-  /// Returns false, leaving the output parameters untouched, when the runtime
-  /// reports neither a locked nor an HSA-allocated pointer. Otherwise returns
-  /// true and stores the allocation's host base address, agent base address
-  /// and size in \p BaseHstPtr, \p BaseDevAccessiblePtr and \p BaseSize.
-  Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
-                                 void *&BaseDevAccessiblePtr,
-                                 size_t &BaseSize) const override {
-    hsa_amd_pointer_info_t PtrInfo;
-    PtrInfo.size = sizeof(hsa_amd_pointer_info_t);
-
-    hsa_status_t QueryStatus = hsa_amd_pointer_info(
-        HstPtr, &PtrInfo, /*Allocator=*/nullptr,
-        /*num_agents_accessible=*/nullptr, /*accessible=*/nullptr);
-    if (auto QueryErr =
-            Plugin::check(QueryStatus, "Error in hsa_amd_pointer_info: %s"))
-      return std::move(QueryErr);
-
-    // The buffer may be locked or allocated through HSA allocators. Assume that
-    // the buffer is host pinned if the runtime reports a HSA type.
-    const bool IsLockedOrHSA = PtrInfo.type == HSA_EXT_POINTER_TYPE_LOCKED ||
-                               PtrInfo.type == HSA_EXT_POINTER_TYPE_HSA;
-    if (!IsLockedOrHSA)
-      return false;
-
-    assert(PtrInfo.hostBaseAddress && "Invalid host pinned address");
-    assert(PtrInfo.agentBaseAddress && "Invalid agent pinned address");
-    assert(PtrInfo.sizeInBytes > 0 && "Invalid pinned allocation size");
-
-    // Report the allocation through the output parameters.
-    BaseHstPtr = PtrInfo.hostBaseAddress;
-    BaseDevAccessiblePtr = PtrInfo.agentBaseAddress;
-    BaseSize = PtrInfo.sizeInBytes;
-
-    return true;
-  }
-
-  /// Submit data to the device (host to device transfer).
-  ///
-  /// Three strategies, tried in this order:
-  ///  1. \p HstPtr is already pinned: push one asynchronous copy to the stream.
-  ///  2. \p Size >= OMPX_MaxAsyncCopyBytes: synchronize any pending queue, lock
-  ///     the host buffer, copy synchronously and unlock the buffer again.
-  ///  3. Otherwise: push an asynchronous two-step copy that goes through an
-  ///     intermediate buffer from the host's pinned memory manager.
-  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
-                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    AMDGPUStreamTy *Stream = nullptr;
-
-    // Use one-step asynchronous operation when host memory is already pinned.
-    if (void *DevAccessiblePtr =
-            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
-      if (auto Err = getStream(AsyncInfoWrapper, Stream))
-        return Err;
-      return Stream->pushPinnedMemoryCopyAsync(TgtPtr, DevAccessiblePtr, Size);
-    }
-
-    void *PinnedPtr = nullptr;
-
-    // For large transfers use synchronous behavior.
-    if (Size >= OMPX_MaxAsyncCopyBytes) {
-      if (AsyncInfoWrapper.hasQueue())
-        if (auto Err = synchronize(AsyncInfoWrapper))
-          return Err;
-
-      hsa_status_t Status;
-      Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0,
-                                   &PinnedPtr);
-      if (auto Err =
-              Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
-        return Err;
-
-      AMDGPUSignalTy Signal;
-      if (auto Err = Signal.init()) {
-        // No copy was started; do not leave the host buffer locked.
-        (void)hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
-        return Err;
-      }
-
-      if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
-                                         Agent, PinnedPtr, Agent, Size, 0,
-                                         nullptr, Signal.get())) {
-        // The copy request was rejected: release the signal and the lock
-        // before reporting the copy error (cleanup failures are secondary).
-        consumeError(Signal.deinit());
-        (void)hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
-        return Err;
-      }
-
-      // NOTE(review): if the wait itself fails the copy may still be in
-      // flight, so the signal and the lock are intentionally left alone here.
-      if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
-        return Err;
-
-      // The copy has completed; always unlock, even if the signal cannot be
-      // destroyed, and report the first failure.
-      Error DeinitErr = Signal.deinit();
-      Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr));
-      if (DeinitErr)
-        return DeinitErr;
-      return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
-    }
-
-    // Otherwise, use two-step copy with an intermediate pinned host buffer.
-    AMDGPUMemoryManagerTy &PinnedMemoryManager =
-        HostDevice.getPinnedMemoryManager();
-    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
-      return Err;
-
-    if (auto Err = getStream(AsyncInfoWrapper, Stream))
-      return Err;
-
-    return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
-                                          PinnedMemoryManager);
-  }
-
-  /// Retrieve data from the device (device to host transfer).
-  ///
-  /// Three strategies, tried in this order:
-  ///  1. \p HstPtr is already pinned: push one asynchronous copy to the stream.
-  ///  2. \p Size >= OMPX_MaxAsyncCopyBytes: synchronize any pending queue, lock
-  ///     the host buffer, copy synchronously and unlock the buffer again.
-  ///  3. Otherwise: push an asynchronous two-step copy that goes through an
-  ///     intermediate buffer from the host's pinned memory manager.
-  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
-                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    AMDGPUStreamTy *Stream = nullptr;
-
-    // Use one-step asynchronous operation when host memory is already pinned.
-    if (void *DevAccessiblePtr =
-            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
-      if (auto Err = getStream(AsyncInfoWrapper, Stream))
-        return Err;
-
-      return Stream->pushPinnedMemoryCopyAsync(DevAccessiblePtr, TgtPtr, Size);
-    }
-
-    void *PinnedPtr = nullptr;
-
-    // For large transfers use synchronous behavior.
-    if (Size >= OMPX_MaxAsyncCopyBytes) {
-      if (AsyncInfoWrapper.hasQueue())
-        if (auto Err = synchronize(AsyncInfoWrapper))
-          return Err;
-
-      hsa_status_t Status;
-      Status = hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
-      if (auto Err =
-              Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
-        return Err;
-
-      AMDGPUSignalTy Signal;
-      if (auto Err = Signal.init()) {
-        // No copy was started; do not leave the host buffer locked.
-        (void)hsa_amd_memory_unlock(HstPtr);
-        return Err;
-      }
-
-      if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr,
-                                         Agent, TgtPtr, Agent, Size, 0, nullptr,
-                                         Signal.get())) {
-        // The copy request was rejected: release the signal and the lock
-        // before reporting the copy error (cleanup failures are secondary).
-        consumeError(Signal.deinit());
-        (void)hsa_amd_memory_unlock(HstPtr);
-        return Err;
-      }
-
-      // NOTE(review): if the wait itself fails the copy may still be in
-      // flight, so the signal and the lock are intentionally left alone here.
-      if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
-        return Err;
-
-      // The copy has completed; always unlock, even if the signal cannot be
-      // destroyed, and report the first failure.
-      Error DeinitErr = Signal.deinit();
-      Status = hsa_amd_memory_unlock(HstPtr);
-      if (DeinitErr)
-        return DeinitErr;
-      return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
-    }
-
-    // Otherwise, use two-step copy with an intermediate pinned host buffer.
-    AMDGPUMemoryManagerTy &PinnedMemoryManager =
-        HostDevice.getPinnedMemoryManager();
-    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
-      return Err;
-
-    if (auto Err = getStream(AsyncInfoWrapper, Stream))
-      return Err;
-
-    return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
-                                          PinnedMemoryManager);
-  }
-
-  /// Exchange data between two devices within the plugin: copy \p Size bytes
-  /// from \p SrcPtr on this device to \p DstPtr on \p DstGenericDevice.
-  ///
-  /// Transfers of at least OMPX_MaxAsyncCopyBytes are performed synchronously
-  /// (after synchronizing any pending queue); smaller ones are pushed to the
-  /// stream as an asynchronous device-to-device copy. A non-positive \p Size
-  /// on the asynchronous path succeeds without pushing a copy.
-  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
-                         void *DstPtr, int64_t Size,
-                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
-
-    // For large transfers use synchronous behavior.
-    if (Size >= OMPX_MaxAsyncCopyBytes) {
-      if (AsyncInfoWrapper.hasQueue())
-        if (auto Err = synchronize(AsyncInfoWrapper))
-          return Err;
-
-      AMDGPUSignalTy Signal;
-      if (auto Err = Signal.init())
-        return Err;
-
-      if (auto Err = utils::asyncMemCopy(
-              useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
-              getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) {
-        // The copy request was rejected: release the signal before reporting
-        // the copy error (a failure to release it is secondary).
-        consumeError(Signal.deinit());
-        return Err;
-      }
-
-      // NOTE(review): if the wait itself fails the copy may still be in
-      // flight, so the signal is intentionally left alone here.
-      if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
-        return Err;
-
-      return Signal.deinit();
-    }
-
-    AMDGPUStreamTy *Stream = nullptr;
-    if (auto Err = getStream(AsyncInfoWrapper, Stream))
-      return Err;
-    if (Size <= 0)
-      return Plugin::success();
-
-    return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
-                                          getAgent(), (uint64_t)Size);
-  }
-
-  /// Initialize the async info for interoperability purposes. Currently a
-  /// placeholder: \p AsyncInfoWrapper is not touched and the call always
-  /// succeeds.
-  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    // TODO: Implement this function.
-    return Plugin::success();
-  }
-
-  /// Initialize the device info for interoperability purposes. The context is
-  /// always reset to null; the device entry is filled with this device's HSA
-  /// agent handle only when the caller has not already provided one.
-  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
-    DeviceInfo->Context = nullptr;
-
-    if (!DeviceInfo->Device)
-      DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle);
-
-    return Plugin::success();
-  }
-
-  /// Create an event: take one from the event manager and store its pointer
-  /// in \p EventPtrStorage.
-  Error createEventImpl(void **EventPtrStorage) override {
-    AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
-    return AMDGPUEventManager.getResource(*Event);
-  }
-
-  /// Destroy a previously created event by handing it back to the event
-  /// manager.
-  Error destroyEventImpl(void *EventPtr) override {
-    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
-    return AMDGPUEventManager.returnResource(Event);
-  }
-
-  /// Record the event \p EventPtr on the stream of \p AsyncInfoWrapper; a
-  /// stream is acquired first when the async info has none yet.
-  Error recordEventImpl(void *EventPtr,
-                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
-    assert(Event && "Invalid event");
-
-    AMDGPUStreamTy *Stream = nullptr;
-    if (auto Err = getStream(AsyncInfoWrapper, Stream))
-      return Err;
-
-    return Event->record(*Stream);
-  }
-
-  /// Make the stream of \p AsyncInfoWrapper wait on the event \p EventPtr; a
-  /// stream is acquired first when the async info has none yet.
-  Error waitEventImpl(void *EventPtr,
-                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
-    // Same precondition as recordEventImpl(): the event is dereferenced below.
-    assert(Event && "Invalid event");
-
-    AMDGPUStreamTy *Stream = nullptr;
-    if (auto Err = getStream(AsyncInfoWrapper, Stream))
-      return Err;
-
-    return Event->wait(*Stream);
-  }
-
-  /// Synchronize the current thread with the event. Not implemented for
-  /// AMDGPU: always returns an error and ignores \p EventPtr.
-  Error syncEventImpl(void *EventPtr) override {
-    return Plugin::error("Synchronize event not implemented");
-  }
-
-  /// Collect printable information about the device into \p Info.
-  ///
-  /// Queries the HSA runtime version, a series of agent attributes, every
-  /// memory pool of the device and the ISAs of the agent. Every query is
-  /// best-effort: an attribute whose query fails is simply left out, and the
-  /// function always returns success.
-  Error obtainInfoImpl(InfoQueueTy &Info) override {
-    char TmpChar[1000];
-    const char *TmpCharPtr = "Unknown";
-    uint16_t Major, Minor;
-    uint32_t TmpUInt;
-    // The wavefront size is reused further down to derive the work-items per
-    // CU, so remember whether its query actually succeeded.
-    uint32_t WavefrontSize = 0;
-    bool HasWavefrontSize = false;
-    uint32_t CacheSize[4];
-    size_t TmpSt;
-    bool TmpBool;
-    uint16_t WorkgrpMaxDim[3];
-    hsa_dim3_t GridMaxDim;
-    hsa_status_t Status, Status2;
-
-    Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
-    Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
-    if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS)
-      Info.add("HSA Runtime Version",
-               std::to_string(Major) + "." + std::to_string(Minor));
-
-    Info.add("HSA OpenMP Device Number", DeviceId);
-
-    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Product Name", TmpChar);
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Device Name", TmpChar);
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Vendor Name", TmpChar);
-
-    hsa_device_type_t DevType;
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType);
-    if (Status == HSA_STATUS_SUCCESS) {
-      // Any device type not listed here keeps the "Unknown" default.
-      switch (DevType) {
-      case HSA_DEVICE_TYPE_CPU:
-        TmpCharPtr = "CPU";
-        break;
-      case HSA_DEVICE_TYPE_GPU:
-        TmpCharPtr = "GPU";
-        break;
-      case HSA_DEVICE_TYPE_DSP:
-        TmpCharPtr = "DSP";
-        break;
-      }
-      Info.add("Device Type", TmpCharPtr);
-    }
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Max Queues", TmpUInt);
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Queue Min Size", TmpUInt);
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Queue Max Size", TmpUInt);
-
-    // FIXME: This is deprecated according to HSA documentation. But using
-    // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
-    // runtime.
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
-    if (Status == HSA_STATUS_SUCCESS) {
-      Info.add("Cache");
-
-      // Only cache levels with a non-zero size are listed.
-      for (int I = 0; I < 4; I++)
-        if (CacheSize[I])
-          Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
-    }
-
-    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Cacheline Size", TmpUInt);
-
-    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Max Clock Freq", TmpUInt, "MHz");
-
-    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Compute Units", TmpUInt);
-
-    Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("SIMD per CU", TmpUInt);
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Fast F16 Operation", TmpBool);
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize);
-    if (Status == HSA_STATUS_SUCCESS) {
-      HasWavefrontSize = true;
-      Info.add("Wavefront Size", WavefrontSize);
-    }
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Workgroup Max Size", TmpUInt);
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
-    if (Status == HSA_STATUS_SUCCESS) {
-      Info.add("Workgroup Max Size per Dimension");
-      Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
-      Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
-      Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
-    }
-
-    Status = getDeviceAttrRaw(
-        (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt);
-    if (Status == HSA_STATUS_SUCCESS) {
-      Info.add("Max Waves Per CU", TmpUInt);
-      // The product needs a valid wavefront size; skip the entry rather than
-      // report a value derived from an unqueried size.
-      if (HasWavefrontSize)
-        Info.add("Max Work-item Per CU", TmpUInt * WavefrontSize);
-    }
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Grid Max Size", TmpUInt);
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
-    if (Status == HSA_STATUS_SUCCESS) {
-      Info.add("Grid Max Size per Dimension");
-      Info.add<InfoLevel2>("x", GridMaxDim.x);
-      Info.add<InfoLevel2>("y", GridMaxDim.y);
-      Info.add<InfoLevel2>("z", GridMaxDim.z);
-    }
-
-    Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
-    if (Status == HSA_STATUS_SUCCESS)
-      Info.add("Max fbarriers/Workgrp", TmpUInt);
-
-    Info.add("Memory Pools");
-    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
-      std::string TmpStr, TmpStr2;
-
-      if (Pool->isGlobal())
-        TmpStr = "Global";
-      else if (Pool->isReadOnly())
-        TmpStr = "ReadOnly";
-      else if (Pool->isPrivate())
-        TmpStr = "Private";
-      else if (Pool->isGroup())
-        TmpStr = "Group";
-      else
-        TmpStr = "Unknown";
-
-      Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);
-
-      // The flags are only reported for global pools.
-      if (Pool->isGlobal()) {
-        if (Pool->isFineGrained())
-          TmpStr2 += "Fine Grained ";
-        if (Pool->isCoarseGrained())
-          TmpStr2 += "Coarse Grained ";
-        if (Pool->supportsKernelArgs())
-          TmpStr2 += "Kernarg ";
-
-        Info.add<InfoLevel3>("Flags", TmpStr2);
-      }
-
-      Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
-      if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Size", TmpSt, "bytes");
-
-      Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
-                                TmpBool);
-      if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Allocatable", TmpBool);
-
-      Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
-                                TmpSt);
-      if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");
-
-      Status = Pool->getAttrRaw(
-          HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
-      if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");
-
-      Status =
-          Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
-      if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Accessable by all", TmpBool);
-    }
-
-    Info.add("ISAs");
-    auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
-      Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
-      if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel2>("Name", TmpChar);
-
-      return Status;
-    });
-
-    // Silently consume the error.
-    if (Err)
-      consumeError(std::move(Err));
-
-    return Plugin::success();
-  }
-
-  /// Returns true if automatic zero-copy is the best configuration for the
-  /// current arch.
-  /// On AMDGPUs, automatic zero-copy is turned on when running on an APU with
-  /// XNACK (unified memory) support enabled. On discrete GPUs, automatic
-  /// zero-copy is triggered if the user sets the environment variable
-  /// OMPX_APU_MAPS=1 and if XNACK is enabled. The rationale is that zero-copy
-  /// is the best configuration (performance, memory footprint) on APUs, while
-  /// it is often not the best on discrete GPUs.
-  /// XNACK can be enabled with a kernel boot parameter or with the HSA_XNACK
-  /// environment variable.
-  bool useAutoZeroCopyImpl() override {
-    // Either an APU or an explicit user request, and XNACK in both cases.
-    const bool WantsZeroCopy = IsAPU || OMPX_ApuMaps;
-    return WantsZeroCopy && IsXnackEnabled;
-  }
-
-  /// Getters and setters for stack and heap sizes.
-  /// Store the currently configured stack size (StackSize) in \p Value.
-  Error getDeviceStackSize(uint64_t &Value) override {
-    Value = StackSize;
-    return Plugin::success();
-  }
-  /// Record \p Value as the new stack size. Only the stored value changes
-  /// here; nothing is pushed to the device by this call.
-  Error setDeviceStackSize(uint64_t Value) override {
-    StackSize = Value;
-    return Plugin::success();
-  }
-  /// Store the current size of the device memory pool (DeviceMemoryPoolSize)
-  /// in \p Value.
-  Error getDeviceHeapSize(uint64_t &Value) override {
-    Value = DeviceMemoryPoolSize;
-    return Plugin::success();
-  }
-  /// Set up a device memory pool of \p Value bytes for every loaded image and
-  /// then record \p Value as the current pool size. If the setup fails for an
-  /// image, that error is returned and the recorded size is left unchanged.
-  Error setDeviceHeapSize(uint64_t Value) override {
-    for (DeviceImageTy *Image : LoadedImages)
-      if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
-        return Err;
-    DeviceMemoryPoolSize = Value;
-    return Plugin::success();
-  }
-  /// Store the size of the first global memory pool of the device in
-  /// \p Value. Fails if the size query fails or the device has no global pool.
-  Error getDeviceMemorySize(uint64_t &Value) override {
-    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
-      if (!Pool->isGlobal())
-        continue;
-
-      // The first global pool decides the result, whether or not the query
-      // succeeds.
-      hsa_status_t QueryStatus =
-          Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
-      return Plugin::check(QueryStatus,
-                           "Error in getting device memory size: %s");
-    }
-    return Plugin::error("getDeviceMemorySize:: no global pool");
-  }
-
-  /// AMDGPU-specific function to get device attributes: query attribute
-  /// \p Kind of the agent into \p Value and report a failure as an Error.
-  template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
-    const auto AttrKind = static_cast<hsa_agent_info_t>(Kind);
-    return Plugin::check(hsa_agent_get_info(Agent, AttrKind, &Value),
-                         "Error in hsa_agent_get_info: %s");
-  }
-
-  /// Same query as getDeviceAttr(), but hands back the raw HSA status so that
-  /// callers can treat a failed query as non-fatal.
-  template <typename Ty>
-  hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) {
-    const auto AttrKind = static_cast<hsa_agent_info_t>(Kind);
-    return hsa_agent_get_info(Agent, AttrKind, &Value);
-  }
-
-  /// Get the device agent (returned by value).
-  hsa_agent_t getAgent() const override { return Agent; }
-
-  /// Get the signal manager of this device, by reference.
-  AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; }
-
-  /// Retrieve and construct all memory pools of the device agent.
-  ///
-  /// For every pool reported for the agent, a pool object is constructed in
-  /// storage from the plugin allocator and appended to AllMemoryPools. A
-  /// failed allocation is reported through the iteration callback as
-  /// HSA_STATUS_ERROR_OUT_OF_RESOURCES.
-  Error retrieveAllMemoryPools() override {
-    // Iterate through the available pools of the device agent.
-    return utils::iterateAgentMemoryPools(
-        Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
-          AMDGPUMemoryPoolTy *MemoryPool =
-              Plugin.allocate<AMDGPUMemoryPoolTy>();
-          // Never placement-construct into a failed allocation.
-          // NOTE(review): presumably a non-success status makes the iteration
-          // stop and surface as an Error; confirm in iterateAgentMemoryPools.
-          if (!MemoryPool)
-            return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-
-          new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
-          AllMemoryPools.push_back(MemoryPool);
-          return HSA_STATUS_SUCCESS;
-        });
-  }
-
-  /// Value of the OMPX_UseMultipleSdmaEngines envar; forwarded as the first
-  /// argument of utils::asyncMemCopy by the data transfer functions.
-  bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }
-
-private:
-  using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
-  using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
-
-  /// Common method to invoke a single threaded constructor or destructor
-  /// kernel by name.
-  ///
-  /// \p IsCtor selects "amdgcn.device.init" (true) or "amdgcn.device.fini"
-  /// (false). For constructors, an image without the kernel is not an error:
-  /// the call succeeds without launching anything. For destructors no lookup
-  /// is done here; callGlobalDestructors() only calls this for images that
-  /// callGlobalConstructors() marked as containing "amdgcn.device.fini".
-  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
-                                 bool IsCtor) {
-    const char *KernelName =
-        IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini";
-    // Perform a quick check for the named kernel in the image. The kernel
-    // should be created by the 'amdgpu-lower-ctor-dtor' pass.
-    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
-      return Plugin::success();
-
-    // Construct a temporary AMDGPU kernel object on the stack and initialize
-    // it from the image.
-    AMDGPUKernelTy AMDGPUKernel(KernelName);
-    if (auto Err = AMDGPUKernel.init(*this, Image))
-      return Err;
-
-    // No user-provided async info: the wrapper is created with a null one.
-    AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
-
-    // Launch with one thread in one block and no kernel arguments.
-    // NOTE(review): on a launch failure the function returns without calling
-    // AsyncInfoWrapper.finalize(); confirm the wrapper tolerates that.
-    KernelArgsTy KernelArgs = {};
-    if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
-                                           /*NumBlocks=*/1ul, KernelArgs,
-                                           /*Args=*/nullptr, AsyncInfoWrapper))
-      return Err;
-
-    // Finalize the wrapper and return whatever error state it leaves in Err.
-    Error Err = Plugin::success();
-    AsyncInfoWrapper.finalize(Err);
-
-    return Err;
-  }
-
-  /// Detect if current architecture is an APU and record the answer in IsAPU.
-  /// "gfx940" is always treated as an APU. "gfx942" is treated as an APU only
-  /// when the lowest bit of the agent's chip ID is clear. Every other
-  /// architecture is not an APU. Fails only if the chip ID query fails.
-  Error checkIfAPU() {
-    // TODO: replace with ROCr API once it becomes available.
-    llvm::StringRef GfxName(ComputeUnitKind);
-
-    // Architectures that are APUs regardless of the chip ID.
-    IsAPU = llvm::StringSwitch<bool>(GfxName)
-                .Case("gfx940", true)
-                .Default(false);
-    if (IsAPU)
-      return Plugin::success();
-
-    // Architectures that need the chip ID to decide.
-    const bool NeedsChipID = llvm::StringSwitch<bool>(GfxName)
-                                 .Case("gfx942", true)
-                                 .Default(false);
-    if (!NeedsChipID)
-      return Plugin::success();
-
-    // can be MI300A or MI300X
-    uint32_t ChipID = 0;
-    if (auto QueryErr = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
-      return QueryErr;
-
-    // IsAPU is still false at this point, so assigning the test result is the
-    // same as setting it only when the lowest chip ID bit is clear.
-    IsAPU = (ChipID & 0x1) == 0;
-    return Plugin::success();
-  }
-
-  /// Envar for controlling the number of HSA queues per device. High number of
-  /// queues may degrade performance.
-  UInt32Envar OMPX_NumQueues;
-
-  /// Envar for controlling the size of each HSA queue. The size is the number
-  /// of HSA packets a queue is expected to hold. It is also the number of HSA
-  /// packets that can be pushed into each queue without waiting the driver to
-  /// process them.
-  UInt32Envar OMPX_QueueSize;
-
-  /// Envar for controlling the default number of teams relative to the number
-  /// of compute units (CUs) the device has:
-  ///   #default_teams = OMPX_DefaultTeamsPerCU * #CUs.
-  UInt32Envar OMPX_DefaultTeamsPerCU;
-
-  /// Envar specifying the maximum size in bytes where the memory copies are
-  /// asynchronous operations. Up to this transfer size, the memory copies are
-  /// asynchronous operations pushed to the corresponding stream. For larger
-  /// transfers, they are synchronous transfers.
-  UInt32Envar OMPX_MaxAsyncCopyBytes;
-
-  /// Envar controlling the initial number of HSA signals per device. There is
-  /// one manager of signals per device managing several pre-allocated signals.
-  /// These signals are mainly used by AMDGPU streams. If needed, more signals
-  /// will be created.
-  UInt32Envar OMPX_InitialNumSignals;
-
-  /// Environment variables to set the time to wait in active state before
-  /// switching to blocked state. The default 2000000 busywaits for 2 seconds
-  /// before going into a blocking HSA wait state. The unit for these variables
-  /// is microseconds.
-  UInt32Envar OMPX_StreamBusyWait;
-
-  /// Use ROCm 5.7 interface for multiple SDMA engines
-  BoolEnvar OMPX_UseMultipleSdmaEngines;
-
-  /// Value of OMPX_APU_MAPS env var used to force
-  /// automatic zero-copy behavior on non-APU GPUs.
-  BoolEnvar OMPX_ApuMaps;
-
-  /// Stream manager for AMDGPU streams.
-  AMDGPUStreamManagerTy AMDGPUStreamManager;
-
-  /// Event manager for AMDGPU events.
-  AMDGPUEventManagerTy AMDGPUEventManager;
-
-  /// Signal manager for AMDGPU signals.
-  AMDGPUSignalManagerTy AMDGPUSignalManager;
-
-  /// The agent handler corresponding to the device.
-  hsa_agent_t Agent;
-
-  /// The GPU architecture.
-  std::string ComputeUnitKind;
-
-  /// The frequency of the steady clock inside the device.
-  uint64_t ClockFrequency;
-
-  /// The total number of concurrent work items that can be running on the GPU.
-  uint64_t HardwareParallelism;
-
-  /// Reference to the host device.
-  AMDHostDeviceTy &HostDevice;
-
-  /// The current size of the global device memory pool (managed by us).
-  uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/;
-
-  /// The current size of the stack that will be used in cases where it could
-  /// not be statically determined.
-  uint64_t StackSize = 16 * 1024 /* 16 KB */;
-
-  /// Is the plugin associated with an APU?
-  bool IsAPU = false;
-
-  /// True if the system is configured with XNACK enabled.
-  /// False otherwise.
-  bool IsXnackEnabled = false;
-};
-
-/// Turn the image into a frozen, validated HSA executable for \p Device and
-/// then read the per-kernel metadata and the ELF ABI version from the image.
-/// Every HSA step is checked; the first failing step is returned as an error.
-Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
-  // Deserialize the raw image bytes into an HSA code object.
-  hsa_status_t Rc =
-      hsa_code_object_deserialize(getStart(), getSize(), "", &CodeObject);
-  if (auto Err = Plugin::check(Rc, "Error in hsa_code_object_deserialize: %s"))
-    return Err;
-
-  // Create an empty executable to load the code object into.
-  Rc = hsa_executable_create_alt(
-      HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable);
-  if (auto Err = Plugin::check(Rc, "Error in hsa_executable_create_alt: %s"))
-    return Err;
-
-  // Load the code object for the device's agent.
-  Rc = hsa_executable_load_code_object(Executable, Device.getAgent(),
-                                       CodeObject, "");
-  if (auto Err =
-          Plugin::check(Rc, "Error in hsa_executable_load_code_object: %s"))
-    return Err;
-
-  Rc = hsa_executable_freeze(Executable, "");
-  if (auto Err = Plugin::check(Rc, "Error in hsa_executable_freeze: %s"))
-    return Err;
-
-  // A non-zero validation result means the executable is not usable.
-  uint32_t ValidationResult;
-  Rc = hsa_executable_validate(Executable, &ValidationResult);
-  if (auto Err = Plugin::check(Rc, "Error in hsa_executable_validate: %s"))
-    return Err;
-  if (ValidationResult)
-    return Plugin::error("Loaded HSA executable does not validate");
-
-  // Populate KernelInfoMap and ELFABIVersion from the image's ELF notes.
-  if (auto Err = utils::readAMDGPUMetaDataFromImage(
-          getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
-    return Err;
-
-  return Plugin::success();
-}
-
-/// Look up \p SymbolName in this image's executable for the agent of
-/// \p Device. Returns the HSA symbol handle, or an error naming the symbol if
-/// the lookup fails.
-Expected<hsa_executable_symbol_t>
-AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device,
-                                      StringRef SymbolName) const {
-
-  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
-  hsa_agent_t Agent = AMDGPUDevice.getAgent();
-
-  // StringRef::data() is not guaranteed to be null-terminated, but both the
-  // HSA lookup and the "%s" in the error message require a C string. Take an
-  // owned, terminated copy once and use it for both.
-  const std::string Name = SymbolName.str();
-
-  hsa_executable_symbol_t Symbol;
-  hsa_status_t Status = hsa_executable_get_symbol_by_name(
-      Executable, Name.c_str(), &Agent, &Symbol);
-  if (auto Err = Plugin::check(
-          Status, "Error in hsa_executable_get_symbol_by_name(%s): %s",
-          Name.c_str()))
-    return std::move(Err);
-
-  return Symbol;
-}
-
-/// Create the resource behind this reference. Fails if the reference already
-/// holds a resource; otherwise allocates a new ResourceTy constructed from the
-/// AMDGPU device and returns the result of its init().
-template <typename ResourceTy>
-Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) {
-  if (Resource)
-    return Plugin::error("Creating an existing resource");
-
-  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device);
-
-  // NOTE(review): if init() fails, Resource still points at the new object;
-  // presumably the caller is expected to destroy it -- confirm.
-  Resource = new ResourceTy(AMDGPUDevice);
-
-  return Resource->init();
-}
-
-/// Construct a stream bound to \p Device. The queue and the RPC server start
-/// out as nullptr; the agent, signal manager, busy-wait time and SDMA engine
-/// setting are read from the device.
-AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
-    : Agent(Device.getAgent()), Queue(nullptr),
-      SignalManager(Device.getSignalManager()), Device(Device),
-      // Initialize the std::deque with some empty positions.
-      Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
-      StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
-      UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
-
-/// Class implementing the AMDGPU-specific functionalities of the global
-/// handler.
-struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
-  /// Get the metadata of a global from the device. The name and size of the
-  /// global is read from DeviceGlobal and the address of the global is written
-  /// to DeviceGlobal.
-  ///
-  /// Returns an error if the symbol cannot be found, if any of its attributes
-  /// cannot be queried, or if the size reported by HSA differs from the size
-  /// recorded in DeviceGlobal.
-  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
-                                    DeviceImageTy &Image,
-                                    GlobalTy &DeviceGlobal) override {
-    AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image);
-
-    // Find the symbol on the device executable.
-    auto SymbolOrErr =
-        AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName());
-    if (!SymbolOrErr)
-      return SymbolOrErr.takeError();
-
-    hsa_executable_symbol_t Symbol = *SymbolOrErr;
-    hsa_symbol_kind_t SymbolType;
-    hsa_status_t Status;
-    uint64_t SymbolAddr;
-    uint32_t SymbolSize;
-
-    // Retrieve the type, address and size of the symbol.
-    std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = {
-        {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType},
-        {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr},
-        {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}};
-
-    for (auto &Info : RequiredInfos) {
-      Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second);
-      if (auto Err = Plugin::check(
-              Status, "Error in hsa_executable_symbol_get_info: %s"))
-        return Err;
-    }
-
-    // Check the size of the symbol. Both values are widened to size_t so that
-    // the argument types match the "%zu" conversions in the format string;
-    // passing the raw uint32_t would be a varargs type mismatch.
-    if (SymbolSize != DeviceGlobal.getSize())
-      return Plugin::error(
-          "Failed to load global '%s' due to size mismatch (%zu != %zu)",
-          DeviceGlobal.getName().data(), static_cast<size_t>(SymbolSize),
-          static_cast<size_t>(DeviceGlobal.getSize()));
-
-    // Store the symbol address on the device global metadata.
-    DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr));
-
-    return Plugin::success();
-  }
-};
-
-/// Class implementing the AMDGPU-specific functionalities of the plugin.
-struct AMDGPUPluginTy final : public GenericPluginTy {
-  /// Create an AMDGPU plugin. The HSA runtime itself is initialized later, in
-  /// initImpl().
-  AMDGPUPluginTy()
-      : GenericPluginTy(getTripleArch()), Initialized(false),
-        HostDevice(nullptr) {}
-
-  /// This class should not be copied.
-  AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
-  AMDGPUPluginTy(AMDGPUPluginTy &&) = delete;
-
-  /// Initialize the plugin and return the number of devices. Returns 0 (not an
-  /// error) when HSA cannot be initialized or when no kernel agent exists.
-  Expected<int32_t> initImpl() override {
-    hsa_status_t Status = hsa_init();
-    if (Status != HSA_STATUS_SUCCESS) {
-      // Cannot call hsa_success_string.
-      DP("Failed to initialize AMDGPU's HSA library\n");
-      return 0;
-    }
-
-    // The initialization of HSA was successful. It should be safe to call
-    // HSA functions from now on, e.g., hsa_shut_down.
-    Initialized = true;
-
-#ifdef OMPT_SUPPORT
-    ompt::connectLibrary();
-#endif
-
-    // Register event handler to detect memory errors on the devices.
-    Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
-    if (auto Err = Plugin::check(
-            Status, "Error in hsa_amd_register_system_event_handler: %s"))
-      return std::move(Err);
-
-    // List of host (CPU) agents.
-    llvm::SmallVector<hsa_agent_t> HostAgents;
-
-    // Count the number of available agents.
-    auto Err = utils::iterateAgents([&](hsa_agent_t Agent) {
-      // Get the device type of the agent.
-      hsa_device_type_t DeviceType;
-      hsa_status_t Status =
-          hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
-      if (Status != HSA_STATUS_SUCCESS)
-        return Status;
-
-      // Classify the agents into kernel (GPU) and host (CPU) agents.
-      if (DeviceType == HSA_DEVICE_TYPE_GPU) {
-        // Ensure that the GPU agent supports kernel dispatch packets. The
-        // query status must be checked before Features is read; otherwise a
-        // failed query would leave Features uninitialized.
-        hsa_agent_feature_t Features;
-        Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features);
-        if (Status != HSA_STATUS_SUCCESS)
-          return Status;
-        if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
-          KernelAgents.push_back(Agent);
-      } else if (DeviceType == HSA_DEVICE_TYPE_CPU) {
-        HostAgents.push_back(Agent);
-      }
-      return HSA_STATUS_SUCCESS;
-    });
-
-    if (Err)
-      return std::move(Err);
-
-    int32_t NumDevices = KernelAgents.size();
-    if (NumDevices == 0) {
-      // Do not initialize if there are no devices.
-      DP("There are no devices supporting AMDGPU.\n");
-      return 0;
-    }
-
-    // There are kernel agents but there is no host agent. That should be
-    // treated as an error.
-    if (HostAgents.empty())
-      return Plugin::error("No AMDGPU host agents");
-
-    // Initialize the host device using host agents.
-    HostDevice = allocate<AMDHostDeviceTy>();
-    new (HostDevice) AMDHostDeviceTy(*this, HostAgents);
-
-    // Set up the memory pools available for the host.
-    if (auto Err = HostDevice->init())
-      return std::move(Err);
-
-    return NumDevices;
-  }
-
-  /// Deinitialize the plugin.
-  Error deinitImpl() override {
-    // The HSA runtime was not initialized, so nothing from the plugin was
-    // actually initialized.
-    if (!Initialized)
-      return Plugin::success();
-
-    if (HostDevice)
-      if (auto Err = HostDevice->deinit())
-        return Err;
-
-    // Finalize the HSA runtime.
-    hsa_status_t Status = hsa_shut_down();
-    return Plugin::check(Status, "Error in hsa_shut_down: %s");
-  }
-
-  /// Creates an AMDGPU device.
-  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
-                                int32_t NumDevices) override {
-    return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(),
-                              getKernelAgent(DeviceId));
-  }
-
-  /// Creates an AMDGPU global handler.
-  GenericGlobalHandlerTy *createGlobalHandler() override {
-    return new AMDGPUGlobalHandlerTy();
-  }
-
-  /// Get the target architecture handled by this plugin.
-  Triple::ArchType getTripleArch() const override { return Triple::amdgcn; }
-
-  /// Get the ELF code for recognizing the compatible image binary.
-  uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; }
-
-  /// Check whether the image is compatible with an AMDGPU device. The image
-  /// is accepted only if it is compatible with every cached kernel agent.
-  Expected<bool> isELFCompatible(StringRef Image) const override {
-    // Get the associated architecture and flags from the ELF.
-    auto ElfOrErr = ELF64LEObjectFile::create(
-        MemoryBufferRef(Image, /*Identifier=*/""), /*InitContent=*/false);
-    if (!ElfOrErr)
-      return ElfOrErr.takeError();
-    std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();
-
-    for (hsa_agent_t Agent : KernelAgents) {
-      auto TargeTripleAndFeaturesOrError =
-          utils::getTargetTripleAndFeatures(Agent);
-      if (!TargeTripleAndFeaturesOrError)
-        return TargeTripleAndFeaturesOrError.takeError();
-      if (!utils::isImageCompatibleWithEnv(Processor ? *Processor : "",
-                                           ElfOrErr->getPlatformFlags(),
-                                           *TargeTripleAndFeaturesOrError))
-        return false;
-    }
-    return true;
-  }
-
-  /// Data exchange is reported as possible between any pair of devices.
-  bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
-    return true;
-  }
-
-  /// Get the host device instance.
-  AMDHostDeviceTy &getHostDevice() {
-    assert(HostDevice && "Host device not initialized");
-    return *HostDevice;
-  }
-
-  /// Get the kernel agent with the corresponding agent id.
-  hsa_agent_t getKernelAgent(int32_t AgentId) const {
-    assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id");
-    return KernelAgents[AgentId];
-  }
-
-  /// Get the list of the available kernel agents.
-  const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const {
-    return KernelAgents;
-  }
-
-private:
-  /// Event handler that will be called by ROCr if an event is detected. Only
-  /// GPU memory fault events are handled; they abort the program with a
-  /// message listing the decoded fault reasons.
-  static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
-    if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
-      return HSA_STATUS_SUCCESS;
-
-    SmallVector<std::string> Reasons;
-    uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask;
-    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT)
-      Reasons.emplace_back("Page not present or supervisor privilege");
-    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY)
-      Reasons.emplace_back("Write access to a read-only page");
-    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX)
-      Reasons.emplace_back("Execute access to a page marked NX");
-    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY)
-      Reasons.emplace_back("GPU attempted access to a host only page");
-    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC)
-      Reasons.emplace_back("DRAM ECC failure");
-    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE)
-      Reasons.emplace_back("Can't determine the exact fault address");
-    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC)
-      Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)");
-    if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG)
-      Reasons.emplace_back("GPU reset following unspecified hang");
-
-    // If no known reason bit is set, report the raw mask instead. The reasons
-    // are joined with ", " when the message is printed below.
-    if (Reasons.empty())
-      Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")");
-
-    // Node keeps the value -1 (all bits set) if the query below fails.
-    uint32_t Node = -1;
-    hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
-
-    // Abort the execution since we do not recover from this error.
-    FATAL_MESSAGE(1,
-                  "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
-                  ") at virtual address %p. Reasons: %s",
-                  Node, Event->memory_fault.agent.handle,
-                  (void *)Event->memory_fault.virtual_address,
-                  llvm::join(Reasons, ", ").c_str());
-
-    return HSA_STATUS_ERROR;
-  }
-
-  /// Indicate whether the HSA runtime was correctly initialized. Even if there
-  /// is no available devices this boolean will be true. It indicates whether
-  /// we can safely call HSA functions (e.g., hsa_shut_down).
-  bool Initialized;
-
-  /// Arrays of the available GPU and CPU agents. These arrays of handles should
-  /// not be here but in the AMDGPUDeviceTy structures directly. However, the
-  /// HSA standard does not provide API functions to retrieve agents directly,
-  /// only iterating functions. We cache the agents here for convenience.
-  llvm::SmallVector<hsa_agent_t> KernelAgents;
-
-  /// The device representing all HSA host agents.
-  AMDHostDeviceTy *HostDevice;
-};
-
-/// Launch this kernel on \p GenericDevice with \p NumThreads threads per block
-/// and \p NumBlocks blocks. The explicit arguments are copied into a buffer
-/// obtained from the host device's args memory manager, followed by the
-/// zero-initialized implicit arguments, and the launch is pushed into the
-/// stream associated with \p AsyncInfoWrapper.
-Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
-                                 uint32_t NumThreads, uint64_t NumBlocks,
-                                 KernelArgsTy &KernelArgs, void *Args,
-                                 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
-  // Each explicit argument occupies one pointer-sized slot.
-  const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
-
-  if (ArgsSize < KernelArgsSize)
-    return Plugin::error("Mismatch of kernel arguments size");
-
-  // The args size reported by HSA may or may not contain the implicit args.
-  // For now, assume that HSA does not consider the implicit arguments when
-  // reporting the arguments of a kernel. In the worst case, we can waste
-  // 56 bytes per allocation.
-  uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize;
-
-  AMDGPUPluginTy &AMDGPUPlugin =
-      static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
-  AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
-  AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();
-
-  // NOTE(review): AllArgs is not released on the early error returns below;
-  // confirm whether the memory manager reclaims it some other way.
-  void *AllArgs = nullptr;
-  if (auto Err = ArgsMemoryManager.allocate(AllArgsSize, &AllArgs))
-    return Err;
-
-  // Account for user requested dynamic shared memory.
-  uint32_t GroupSize = getGroupSize();
-  if (uint32_t MaxDynCGroupMem = std::max(
-          KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
-    GroupSize += MaxDynCGroupMem;
-  }
-
-  uint64_t StackSize;
-  if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
-    return Err;
-
-  // Initialize implicit arguments. They live directly after the explicit
-  // arguments in the AllArgs buffer.
-  utils::AMDGPUImplicitArgsTy *ImplArgs =
-      reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
-          advanceVoidPtr(AllArgs, KernelArgsSize));
-
-  // Initialize the implicit arguments to zero.
-  std::memset(ImplArgs, 0, ImplicitArgsSize);
-
-  // Copy the explicit arguments.
-  // TODO: We should expose the args memory manager alloc to the common part as
-  // 	   alternative to copying them twice.
-  if (KernelArgs.NumArgs)
-    std::memcpy(AllArgs, *static_cast<void **>(Args),
-                sizeof(void *) * KernelArgs.NumArgs);
-
-  AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
-
-  AMDGPUStreamTy *Stream = nullptr;
-  if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
-    return Err;
-
-  // If this kernel requires an RPC server we attach its pointer to the stream.
-  if (GenericDevice.getRPCServer())
-    Stream->setRPCServer(GenericDevice.getRPCServer());
-
-  // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
-  // The launch is one-dimensional: the Y and Z extents are fixed to 1.
-  if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
-    ImplArgs->BlockCountX = NumBlocks;
-    ImplArgs->BlockCountY = 1;
-    ImplArgs->BlockCountZ = 1;
-    ImplArgs->GroupSizeX = NumThreads;
-    ImplArgs->GroupSizeY = 1;
-    ImplArgs->GroupSizeZ = 1;
-    ImplArgs->GridDims = 1;
-    ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
-  }
-
-  // Push the kernel launch into the stream.
-  return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
-                                  GroupSize, StackSize, ArgsMemoryManager);
-}
-
-/// Emit one informational line describing this kernel launch. Nothing is
-/// printed unless the OMP_INFOTYPE_PLUGIN_KERNEL info level is enabled and
-/// kernel metadata was read from the image; neither case is an error.
-Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
-                                             KernelArgsTy &KernelArgs,
-                                             uint32_t NumThreads,
-                                             uint64_t NumBlocks) const {
-  // Skip all work when the output is not requested, or when there is no
-  // metadata to report (which is not a hard error).
-  const bool InfoRequested = getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL;
-  if (!InfoRequested || !KernelInfo.has_value())
-    return Plugin::success();
-
-  // The printed fields, in order:
-  //   #Args                 number of kernel arguments
-  //   Teams x Thrds         number of teams and threads actually running
-  //   MaxFlatWorkGroupSize  maximum flat work-group size supported by the
-  //                         kernel, in work-items
-  //   LDS Usage             bytes used in LDS storage
-  //   #SGPRs/VGPRs          S/V GPRs occupied by the kernel
-  //   #SGPR/VGPR Spills     S/V GPRs spilled by the kernel
-  //   Tripcount             loop tripcount for the kernel
-  // The per-kernel values come from the image metadata, see
-  // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
-       "#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS "
-       "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
-       "%lu\n",
-       KernelArgs.NumArgs, NumBlocks, NumThreads,
-       KernelInfo->MaxFlatWorkgroupSize, KernelInfo->GroupSegmentList,
-       KernelInfo->SGPRCount, KernelInfo->VGPRCount,
-       KernelInfo->SGPRSpillCount, KernelInfo->VGPRSpillCount,
-       KernelArgs.Tripcount);
-
-  return Plugin::success();
-}
-
-/// Factory hook: returns a newly allocated AMDGPU plugin instance.
-GenericPluginTy *PluginTy::createPlugin() { return new AMDGPUPluginTy(); }
-
-/// Convert an HSA status code into an llvm::Error. HSA_STATUS_SUCCESS and
-/// HSA_STATUS_INFO_BREAK both map to success. For any other code an error is
-/// built from \p ErrFmt and \p Args, with the HSA status description appended
-/// as the final format argument (so \p ErrFmt needs a trailing "%s").
-template <typename... ArgsTy>
-static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
-  hsa_status_t ResultCode = static_cast<hsa_status_t>(Code);
-  if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK)
-    return Error::success();
-
-  // Desc keeps this fallback text if HSA does not recognize the status code.
-  const char *Desc = "Unknown error";
-  hsa_status_t Ret = hsa_status_string(ResultCode, &Desc);
-  if (Ret != HSA_STATUS_SUCCESS)
-    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
-
-  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
-                                                    ErrFmt, Args..., Desc);
-}
-
-/// Allocate \p Size bytes from the manager's memory pool and make the buffer
-/// accessible to all kernel agents of the plugin. Returns nullptr on failure.
-/// \p HstPtr and \p Kind are not used by this implementation.
-void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
-                                      TargetAllocTy Kind) {
-  // Allocate memory from the pool.
-  void *Ptr = nullptr;
-  if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
-    // The allocation error is dropped silently; the caller only sees nullptr.
-    consumeError(std::move(Err));
-    return nullptr;
-  }
-  assert(Ptr && "Invalid pointer");
-
-  auto &KernelAgents = Plugin.getKernelAgents();
-
-  // Allow all kernel agents to access the allocation.
-  // NOTE(review): Ptr is not returned to the pool if enabling access fails;
-  // confirm whether that is intentional.
-  if (auto Err = MemoryPool->enableAccess(Ptr, Size, KernelAgents)) {
-    REPORT("%s\n", toString(std::move(Err)).data());
-    return nullptr;
-  }
-  return Ptr;
-}
-
-/// Allocate \p Size bytes of the requested \p Kind. Device kinds (and the
-/// default) use the first coarse-grained device pool; host and shared kinds
-/// use the host device's fine-grained pool. The buffer is made accessible to
-/// all kernel agents. Returns nullptr for a zero-sized request or on failure.
-void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
-  if (Size == 0)
-    return nullptr;
-
-  // Pick the memory pool that backs this allocation kind.
-  AMDGPUMemoryPoolTy *Pool = nullptr;
-  switch (Kind) {
-  case TARGET_ALLOC_DEFAULT:
-  case TARGET_ALLOC_DEVICE:
-  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
-    Pool = CoarseGrainedMemoryPools[0];
-    break;
-  case TARGET_ALLOC_HOST:
-  case TARGET_ALLOC_SHARED:
-    Pool = &HostDevice.getFineGrainedMemoryPool();
-    break;
-  }
-
-  if (!Pool) {
-    REPORT("No memory pool for the specified allocation kind\n");
-    return nullptr;
-  }
-
-  // Allocate from the corresponding memory pool.
-  void *Buffer = nullptr;
-  if (Error Err = Pool->allocate(Size, &Buffer)) {
-    REPORT("%s\n", toString(std::move(Err)).data());
-    return nullptr;
-  }
-
-  if (!Buffer)
-    return nullptr;
-
-  // Enable all kernel agents to access the buffer. This is inherently
-  // necessary for host or shared allocations, and is also done for device
-  // memory to allow device to device memcpy.
-  auto &Agents = static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents();
-  if (auto Err = Pool->enableAccess(Buffer, Size, Agents)) {
-    REPORT("%s\n", toString(std::move(Err)).data());
-    return nullptr;
-  }
-
-  return Buffer;
-}
-
-} // namespace plugin
-} // namespace target
-} // namespace omp
-} // namespace llvm

diff --git a/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
deleted file mode 100644
index 58a3b5d..0000000
--- a/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ /dev/null

@@ -1,317 +0,0 @@
-//===----RTLs/amdgpu/utils/UtilitiesRTL.h ------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL Utilities for AMDGPU plugins
-//
-//===----------------------------------------------------------------------===//
-
-#include <cstdint>
-
-#include "Shared/Debug.h"
-#include "Utils/ELF.h"
-
-#include "omptarget.h"
-
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Error.h"
-
-#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/BinaryFormat/MsgPackDocument.h"
-#include "llvm/Support/MemoryBufferRef.h"
-#include "llvm/Support/YAMLTraits.h"
-
-using namespace llvm::ELF;
-
-namespace llvm {
-namespace omp {
-namespace target {
-namespace plugin {
-namespace utils {
-
-// The implicit arguments of COV5 AMDGPU kernels. Only the named fields are
-// written by the plugin; the UnusedN arrays are padding that keeps the named
-// fields at their expected offsets. The offsets noted below assume natural
-// alignment of the members, which gives a total size of 256 bytes.
-struct AMDGPUImplicitArgsTy {
-  uint32_t BlockCountX; // offset 0
-  uint32_t BlockCountY; // offset 4
-  uint32_t BlockCountZ; // offset 8
-  uint16_t GroupSizeX; // offset 12
-  uint16_t GroupSizeY; // offset 14
-  uint16_t GroupSizeZ; // offset 16
-  uint8_t Unused0[46]; // 46 bytes of padding, offsets 18..63.
-  uint16_t GridDims; // offset 64
-  uint8_t Unused1[54]; // 54 bytes of padding, offsets 66..119.
-  uint32_t DynamicLdsSize; // offset 120
-  uint8_t Unused2[132]; // 132 bytes of padding, offsets 124..255.
-};
-
-// Dummy struct for COV4 implicitargs. It exists only so that sizeof() yields
-// the 56-byte size used for pre-COV5 images; no field is ever accessed.
-struct AMDGPUImplicitArgsTyCOV4 {
-  uint8_t Unused[56];
-};
-
-/// Size in bytes of the implicit kernel arguments for the given ELF ABI
-/// version: the COV5 layout for HSA V5 and newer, the COV4 dummy otherwise.
-inline uint32_t getImplicitArgsSize(uint16_t Version) {
-  if (Version >= ELF::ELFABIVERSION_AMDGPU_HSA_V5)
-    return sizeof(AMDGPUImplicitArgsTy);
-  return sizeof(AMDGPUImplicitArgsTyCOV4);
-}
-
-/// Decide whether an image can run in the current system environment. The
-/// environment is described by a 'target-id' of the form:
-///
-/// <target-id> := <processor> ( ":" <target-feature> ( "+" | "-" ) )*
-///
-/// The processor of the image must equal the processor of the target-id. For
-/// the xnack and sramecc features, an image that explicitly requests '+' or
-/// '-' requires the same setting to appear in the target-id; an image whose
-/// setting is 'any' or 'unsupported' is compatible with either. The HSA
-/// runtime reports the environment as a target-id, while the image's settings
-/// come from its ELF header flags.
-inline bool isImageCompatibleWithEnv(StringRef ImageArch, uint32_t ImageFlags,
-                                     StringRef EnvTargetID) {
-  // The processor is everything before the first ':' of the target-id.
-  if (EnvTargetID.split(":").first != ImageArch)
-    return false;
-
-  // xnack: an explicit on/off request must be matched by the environment.
-  const auto XnackMode = ImageFlags & EF_AMDGPU_FEATURE_XNACK_V4;
-  if (XnackMode == EF_AMDGPU_FEATURE_XNACK_OFF_V4 &&
-      !EnvTargetID.contains("xnack-"))
-    return false;
-  if (XnackMode == EF_AMDGPU_FEATURE_XNACK_ON_V4 &&
-      !EnvTargetID.contains("xnack+"))
-    return false;
-
-  // sramecc: same rule as for xnack.
-  const auto SrameccMode = ImageFlags & EF_AMDGPU_FEATURE_SRAMECC_V4;
-  if (SrameccMode == EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 &&
-      !EnvTargetID.contains("sramecc-"))
-    return false;
-  if (SrameccMode == EF_AMDGPU_FEATURE_SRAMECC_ON_V4 &&
-      !EnvTargetID.contains("sramecc+"))
-    return false;
-
-  return true;
-}
-
-/// Per-kernel metadata collected from the AMDGPU notes of an image. Every
-/// field is zero-initialized because the reader only assigns the fields whose
-/// keys are actually present in the metadata; without initializers the
-/// remaining fields would hold indeterminate values when read later.
-struct KernelMetaDataTy {
-  uint64_t KernelObject = 0;
-  uint32_t GroupSegmentList = 0;
-  uint32_t PrivateSegmentSize = 0;
-  uint32_t SGPRCount = 0;
-  uint32_t VGPRCount = 0;
-  uint32_t SGPRSpillCount = 0;
-  uint32_t VGPRSpillCount = 0;
-  uint32_t KernelSegmentSize = 0;
-  uint32_t ExplicitArgumentCount = 0;
-  uint32_t ImplicitArgumentCount = 0;
-  uint32_t RequestedWorkgroupSize[3] = {0, 0, 0};
-  uint32_t WorkgroupSizeHint[3] = {0, 0, 0};
-  uint32_t WavefronSize = 0;
-  uint32_t MaxFlatWorkgroupSize = 0;
-};
-namespace {
-
-/// Reads the AMDGPU specific per-kernel-metadata from an image and stores it
-/// in the StringMap handed to the constructor, keyed by kernel name.
-class KernelInfoReader {
-public:
-  KernelInfoReader(StringMap<KernelMetaDataTy> &KIM) : KernelInfoMap(KIM) {}
-
-  /// Process ELF note to read AMDGPU metadata from respective information
-  /// fields. Notes whose name is not "AMDGPU", and notes whose payload cannot
-  /// be parsed or verified as metadata, are skipped without an error.
-  Error processNote(const object::ELF64LE::Note &Note, size_t Align) {
-    if (Note.getName() != "AMDGPU")
-      return Error::success(); // We are not interested in other things
-
-    assert(Note.getType() == ELF::NT_AMDGPU_METADATA &&
-           "Parse AMDGPU MetaData");
-    auto Desc = Note.getDesc(Align);
-    StringRef MsgPackString =
-        StringRef(reinterpret_cast<const char *>(Desc.data()), Desc.size());
-    msgpack::Document MsgPackDoc;
-    if (!MsgPackDoc.readFromBlob(MsgPackString, /*Multi=*/false))
-      return Error::success();
-
-    AMDGPU::HSAMD::V3::MetadataVerifier Verifier(true);
-    if (!Verifier.verify(MsgPackDoc.getRoot()))
-      return Error::success();
-
-    auto RootMap = MsgPackDoc.getRoot().getMap(true);
-
-    if (auto Err = iterateAMDKernels(RootMap))
-      return Err;
-
-    return Error::success();
-  }
-
-private:
-  /// Extracts the relevant information via simple string look-up in the msgpack
-  /// document elements. Entries with a non-string key or an unrecognized key
-  /// are ignored.
-  Error extractKernelData(msgpack::MapDocNode::MapTy::value_type V,
-                          std::string &KernelName,
-                          KernelMetaDataTy &KernelData) {
-    if (!V.first.isString())
-      return Error::success();
-
-    const auto IsKey = [](const msgpack::DocNode &DK, StringRef SK) {
-      return DK.getString() == SK;
-    };
-
-    // Copies up to three integers from an array node into Vals. The loop is
-    // bounded explicitly: the asserts vanish in release builds, and Vals only
-    // has room for three elements.
-    const auto GetSequenceOfThreeInts = [](msgpack::DocNode &DN,
-                                           uint32_t *Vals) {
-      assert(DN.isArray() && "MsgPack DocNode is an array node");
-      auto DNA = DN.getArray();
-      assert(DNA.size() == 3 && "ArrayNode has exactly three elements");
-
-      unsigned I = 0;
-      for (auto DNABegin = DNA.begin(), DNAEnd = DNA.end();
-           DNABegin != DNAEnd && I < 3; ++DNABegin) {
-        Vals[I++] = DNABegin->getUInt();
-      }
-    };
-
-    if (IsKey(V.first, ".name")) {
-      KernelName = V.second.toString();
-    } else if (IsKey(V.first, ".sgpr_count")) {
-      KernelData.SGPRCount = V.second.getUInt();
-    } else if (IsKey(V.first, ".sgpr_spill_count")) {
-      KernelData.SGPRSpillCount = V.second.getUInt();
-    } else if (IsKey(V.first, ".vgpr_count")) {
-      KernelData.VGPRCount = V.second.getUInt();
-    } else if (IsKey(V.first, ".vgpr_spill_count")) {
-      KernelData.VGPRSpillCount = V.second.getUInt();
-    } else if (IsKey(V.first, ".private_segment_fixed_size")) {
-      KernelData.PrivateSegmentSize = V.second.getUInt();
-    } else if (IsKey(V.first, ".group_segment_fixed_size")) {
-      KernelData.GroupSegmentList = V.second.getUInt();
-    } else if (IsKey(V.first, ".reqd_workgroup_size")) {
-      GetSequenceOfThreeInts(V.second, KernelData.RequestedWorkgroupSize);
-    } else if (IsKey(V.first, ".workgroup_size_hint")) {
-      GetSequenceOfThreeInts(V.second, KernelData.WorkgroupSizeHint);
-    } else if (IsKey(V.first, ".wavefront_size")) {
-      KernelData.WavefronSize = V.second.getUInt();
-    } else if (IsKey(V.first, ".max_flat_workgroup_size")) {
-      KernelData.MaxFlatWorkgroupSize = V.second.getUInt();
-    }
-
-    return Error::success();
-  }
-
-  /// Get the "amdhsa.kernels" element from the msgpack Document
-  Expected<msgpack::ArrayDocNode> getAMDKernelsArray(msgpack::MapDocNode &MDN) {
-    auto Res = MDN.find("amdhsa.kernels");
-    if (Res == MDN.end())
-      return createStringError(inconvertibleErrorCode(),
-                               "Could not find amdhsa.kernels key");
-
-    auto Pair = *Res;
-    assert(Pair.second.isArray() &&
-           "AMDGPU kernel entries are arrays of entries");
-
-    return Pair.second.getArray();
-  }
-
-  /// Iterate all entries for one "amdhsa.kernels" entry. Each entry is a
-  /// MapDocNode that either maps a string to a single value (most of them) or
-  /// to another array of things. Currently, we only handle the case that maps
-  /// to scalar value.
-  Error generateKernelInfo(msgpack::ArrayDocNode::ArrayTy::iterator It) {
-    // Value-initialize so that fields whose keys are absent from the metadata
-    // are zero instead of indeterminate.
-    KernelMetaDataTy KernelData = {};
-    std::string KernelName;
-    auto Entry = (*It).getMap();
-    for (auto MI = Entry.begin(), E = Entry.end(); MI != E; ++MI)
-      if (auto Err = extractKernelData(*MI, KernelName, KernelData))
-        return Err;
-
-    KernelInfoMap.insert({KernelName, KernelData});
-    return Error::success();
-  }
-
-  /// Go over the list of AMD kernels in the "amdhsa.kernels" entry
-  Error iterateAMDKernels(msgpack::MapDocNode &MDN) {
-    auto KernelsOrErr = getAMDKernelsArray(MDN);
-    if (auto Err = KernelsOrErr.takeError())
-      return Err;
-
-    auto KernelsArr = *KernelsOrErr;
-    for (auto It = KernelsArr.begin(), E = KernelsArr.end(); It != E; ++It) {
-      if (!It->isMap())
-        continue; // we expect <key,value> pairs
-
-      // Obtain the value for the different entries. Each array entry is a
-      // MapDocNode
-      if (auto Err = generateKernelInfo(It))
-        return Err;
-    }
-    return Error::success();
-  }
-
-  // Kernel names are the keys
-  StringMap<KernelMetaDataTy> &KernelInfoMap;
-};
-} // namespace
-
-/// Reads the AMDGPU specific metadata from the ELF file and populates the
-/// KernelInfoMap. Also stores the ELF ABI version byte of the image header in
-/// \p ELFABIVersion.
-///
-/// Returns an error if the buffer is not a valid 64-bit little-endian ELF, if
-/// its section table or a note section cannot be read, or if processing a
-/// note fails.
-inline Error
-readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-                            StringMap<KernelMetaDataTy> &KernelInfoMap,
-                            uint16_t &ELFABIVersion) {
-  auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
-  if (auto Err = ELFOrError.takeError())
-    return Err;
-
-  const object::ELF64LEFile ELFObj = ELFOrError.get();
-
-  // Propagate a malformed section table instead of aborting via cantFail.
-  auto SectionsOrErr = ELFObj.sections();
-  if (!SectionsOrErr)
-    return SectionsOrErr.takeError();
-  ArrayRef<object::ELF64LE::Shdr> Sections = *SectionsOrErr;
-
-  KernelInfoReader Reader(KernelInfoMap);
-
-  // Read the code object version from ELF image header
-  auto Header = ELFObj.getHeader();
-  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
-  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
-
-  for (const auto &S : Sections) {
-    if (S.sh_type != ELF::SHT_NOTE)
-      continue;
-
-    // Out-parameter of the fallible note iterator. It is declared per section
-    // so that it is always checked before it goes out of scope.
-    Error Err = Error::success();
-    for (const auto N : ELFObj.notes(S, Err)) {
-      if (Err)
-        return Err;
-      // Fills the KernelInfoMap entries in the reader
-      if ((Err = Reader.processNote(N, S.sh_addralign)))
-        return Err;
-    }
-
-    // The iterator reports a failure by ending the loop with Err set, so the
-    // error has to be checked after the loop as well.
-    if (Err)
-      return Err;
-  }
-
-  return Error::success();
-}
-
-} // namespace utils
-} // namespace plugin
-} // namespace target
-} // namespace omp
-} // namespace llvm

diff --git a/libomptarget/plugins-nextgen/common/CMakeLists.txt b/libomptarget/plugins-nextgen/common/CMakeLists.txt
deleted file mode 100644
index a7350e6..0000000
--- a/libomptarget/plugins-nextgen/common/CMakeLists.txt
+++ /dev/null

@@ -1,70 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Common parts which can be used by all plugins
-#
-##===----------------------------------------------------------------------===##
-
-# NOTE: Don't try to build `PluginInterface` using `add_llvm_library` because we
-# don't want to export `PluginInterface` while `add_llvm_library` requires that.
-add_library(PluginCommon OBJECT
-  src/PluginInterface.cpp
-  src/GlobalHandler.cpp
-  src/JIT.cpp
-  src/RPC.cpp
-  src/Utils/ELF.cpp
-)
-add_dependencies(PluginCommon intrinsics_gen)
-
-# Only enable JIT for those targets that LLVM can support.
-string(TOUPPER "${LLVM_TARGETS_TO_BUILD}" TargetsSupported)
-foreach(Target ${TargetsSupported})
-	target_compile_definitions(PluginCommon PRIVATE "LIBOMPTARGET_JIT_${Target}")
-endforeach()
-
-# Include the RPC server from the `libc` project if availible.
-if(TARGET llvmlibc_rpc_server AND ${LIBOMPTARGET_GPU_LIBC_SUPPORT})
-	target_link_libraries(PluginCommon PRIVATE llvmlibc_rpc_server)
-	target_compile_definitions(PluginCommon PRIVATE LIBOMPTARGET_RPC_SUPPORT)
-elseif(${LIBOMPTARGET_GPU_LIBC_SUPPORT})
-  find_library(llvmlibc_rpc_server NAMES llvmlibc_rpc_server
-               PATHS ${LIBOMPTARGET_LLVM_LIBRARY_DIR} NO_DEFAULT_PATH)
-  if(llvmlibc_rpc_server)
-		target_link_libraries(PluginCommon PRIVATE ${llvmlibc_rpc_server})
-		target_compile_definitions(PluginCommon PRIVATE LIBOMPTARGET_RPC_SUPPORT)
-    # We may need to get the headers directly from the 'libc' source directory.
-    target_include_directories(PluginCommon PRIVATE
-                               ${CMAKE_SOURCE_DIR}/../libc/utils/gpu/server
-                               ${CMAKE_SOURCE_DIR}/../libc/include)
-  endif()
-endif()
-
-# If we have OMPT enabled include it in the list of sources.
-if (OMPT_TARGET_DEFAULT AND LIBOMPTARGET_OMPT_SUPPORT)
-  target_sources(PluginCommon PRIVATE OMPT/OmptCallback.cpp)
-  target_include_directories(PluginCommon PRIVATE OMPT)
-endif()
-
-# Define the TARGET_NAME and DEBUG_PREFIX.
-target_compile_definitions(PluginCommon PRIVATE
-  TARGET_NAME="PluginInterface"
-  DEBUG_PREFIX="PluginInterface"
-)
-
-target_compile_options(PluginCommon PUBLIC ${offload_compile_flags})
-target_link_options(PluginCommon PUBLIC ${offload_link_flags})
-
-target_include_directories(PluginCommon PUBLIC 
-  ${CMAKE_CURRENT_SOURCE_DIR}/include
-  ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
-  ${LIBOMPTARGET_INCLUDE_DIR}
-)
-
-set_target_properties(PluginCommon PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-  CXX_VISIBILITY_PRESET protected)

diff --git a/libomptarget/plugins-nextgen/common/OMPT/OmptCallback.cpp b/libomptarget/plugins-nextgen/common/OMPT/OmptCallback.cpp
deleted file mode 100644
index fb8a156..0000000
--- a/libomptarget/plugins-nextgen/common/OMPT/OmptCallback.cpp
+++ /dev/null

@@ -1,75 +0,0 @@
-//===---------- OmptCallback.cpp - Generic OMPT callbacks --------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// OMPT support for PluginInterface
-//
-//===----------------------------------------------------------------------===//
-
-#ifdef OMPT_SUPPORT
-
-#include "llvm/Support/DynamicLibrary.h"
-
-#include <cstdlib>
-#include <cstring>
-#include <memory>
-
-#include "Shared/Debug.h"
-
-#include "OpenMP/OMPT/Callback.h"
-#include "OpenMP/OMPT/Connector.h"
-
-using namespace llvm::omp::target::ompt;
-
-bool llvm::omp::target::ompt::Initialized = false;
-
-ompt_get_callback_t llvm::omp::target::ompt::lookupCallbackByCode = nullptr;
-ompt_function_lookup_t llvm::omp::target::ompt::lookupCallbackByName = nullptr;
-
-int llvm::omp::target::ompt::initializeLibrary(ompt_function_lookup_t lookup,
-                                               int initial_device_num,
-                                               ompt_data_t *tool_data) {
-  DP("OMPT: Executing initializeLibrary (libomptarget)\n");
-#define bindOmptFunctionName(OmptFunction, DestinationFunction)                \
-  if (lookup)                                                                  \
-    DestinationFunction = (OmptFunction##_t)lookup(#OmptFunction);             \
-  DP("OMPT: initializeLibrary (libomptarget) bound %s=%p\n",                   \
-     #DestinationFunction, ((void *)(uint64_t)DestinationFunction));
-
-  bindOmptFunctionName(ompt_get_callback, lookupCallbackByCode);
-#undef bindOmptFunctionName
-
-  // Store pointer of 'ompt_libomp_target_fn_lookup' for use by the plugin
-  lookupCallbackByName = lookup;
-
-  Initialized = true;
-
-  return 0;
-}
-
-void llvm::omp::target::ompt::finalizeLibrary(ompt_data_t *tool_data) {
-  DP("OMPT: Executing finalizeLibrary (libomptarget)\n");
-}
-
-void llvm::omp::target::ompt::connectLibrary() {
-  DP("OMPT: Entering connectLibrary (libomptarget)\n");
-  /// Connect plugin instance with libomptarget
-  OmptLibraryConnectorTy LibomptargetConnector("libomptarget");
-  ompt_start_tool_result_t OmptResult;
-
-  // Initialize OmptResult with the init and fini functions that will be
-  // called by the connector
-  OmptResult.initialize = ompt::initializeLibrary;
-  OmptResult.finalize = ompt::finalizeLibrary;
-  OmptResult.tool_data.value = 0;
-
-  // Now call connect that causes the above init/fini functions to be called
-  LibomptargetConnector.connect(&OmptResult);
-  DP("OMPT: Exiting connectLibrary (libomptarget)\n");
-}
-
-#endif

diff --git a/libomptarget/plugins-nextgen/common/include/DLWrap.h b/libomptarget/plugins-nextgen/common/include/DLWrap.h
deleted file mode 100644
index 8934e7e..0000000
--- a/libomptarget/plugins-nextgen/common/include/DLWrap.h
+++ /dev/null

@@ -1,286 +0,0 @@
-//===-- Shared/DLWrap.h - Convenience wrapper for dlopen/dlsym --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The openmp plugins depend on extern libraries. These can be used via:
-//  - bitcode file statically linked
-//  - (relocatable) object file statically linked
-//  - static library
-//  - dynamic library, linked at build time
-//  - dynamic library, loaded at application run time by dlopen
-//
-// This file factors out most boilerplate for using a dlopened library.
-// - Function symbols are generated that are statically linked against
-// - The dlopen can be done implicitly when initializing the library
-// - dlsym lookups are done once and cached
-// - The abstraction is very thin to permit varied uses of the library
-//
-// Given int foo(char, double, void*);, writing DLWRAP(foo, 3) will expand to:
-// int foo(char x0, double x1, void* x2) {
-//   constexpr size_t index = id();
-//   void * dlsymResult = pointer(index);
-//   return ((int (*)(char, double, void*))dlsymResult)(x0, x1, x2);
-// }
-//
-// Multiple calls to DLWRAP(symbol_name, arity) with bespoke
-// initialization code that can use the thin abstraction:
-// namespace dlwrap {
-//   static size_t size();
-//   static const char *symbol(size_t);
-//   static void **pointer(size_t);
-// }
-// will compile to an object file that only exposes the symbols that the
-// dynamic library would do, with the right function types.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_SHARED_DLWRAP_H
-#define OMPTARGET_SHARED_DLWRAP_H
-
-#include <array>
-#include <cstddef>
-#include <tuple>
-#include <type_traits>
-
-// Where symbol is a function, these expand to some book keeping and an
-// implementation of that function
-#define DLWRAP(SYMBOL, ARITY) DLWRAP_IMPL(SYMBOL, ARITY)
-#define DLWRAP_INTERNAL(SYMBOL, ARITY) DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY)
-
-// For example, given a prototype:
-// int foo(char, double);
-//
-// DLWRAP(foo, 2) expands to:
-//
-// namespace dlwrap {
-// struct foo_Trait : public dlwrap::trait<decltype(&foo)> {
-//   using T = dlwrap::trait<decltype(&foo)>;
-//   static T::FunctionType get() {
-//     constexpr size_t Index = getIndex();
-//     void *P = *dlwrap::pointer(Index);
-//     return reinterpret_cast<T::FunctionType>(P);
-//   }
-// };
-// }
-// int foo(char x0, double x1) { return dlwrap::foo_Trait::get()(x0, x1); }
-//
-// DLWRAP_INTERNAL is similar, except the function it expands to is:
-// static int dlwrap_foo(char x0, double x1) { ... }
-// so that the function pointer call can be wrapped in library-specific code
-//
-// DLWRAP_INITIALIZE() declares static functions:
-#define DLWRAP_INITIALIZE()                                                    \
-  namespace dlwrap {                                                           \
-  static size_t size();                                                        \
-  static const char *symbol(size_t); /* get symbol name in [0, size()) */      \
-  static void **                                                               \
-      pointer(size_t); /* get pointer to function pointer in [0, size()) */    \
-  }
-
-// DLWRAP_FINALIZE() implements the functions from DLWRAP_INITIALIZE
-#define DLWRAP_FINALIZE() DLWRAP_FINALIZE_IMPL()
-
-// Implementation details follow.
-
-namespace dlwrap {
-
-// Extract return / argument types from address of function symbol
-template <typename F> struct trait;
-template <typename R, typename... Ts> struct trait<R (*)(Ts...)> {
-  constexpr static const size_t nargs = sizeof...(Ts);
-  typedef R ReturnType;
-  template <size_t i> struct arg {
-    typedef typename std::tuple_element<i, std::tuple<Ts...>>::type type;
-  };
-
-  typedef R (*FunctionType)(Ts...);
-};
-
-namespace type {
-// Book keeping is by type specialization
-
-template <size_t S> struct count {
-  static constexpr size_t N = count<S - 1>::N;
-};
-
-template <> struct count<0> { static constexpr size_t N = 0; };
-
-// Get a constexpr size_t ID, starts at zero
-#define DLWRAP_ID() (dlwrap::type::count<__LINE__>::N)
-
-// Increment value returned by DLWRAP_ID
-#define DLWRAP_INC()                                                           \
-  template <> struct dlwrap::type::count<__LINE__> {                           \
-    static constexpr size_t N = 1 + dlwrap::type::count<__LINE__ - 1>::N;      \
-  }
-
-template <size_t N> struct symbol;
-#define DLWRAP_SYMBOL(SYMBOL, ID)                                              \
-  template <> struct dlwrap::type::symbol<ID> {                                \
-    static constexpr const char *call() { return #SYMBOL; }                    \
-  }
-} // namespace type
-
-template <size_t N, size_t... Is>
-constexpr std::array<const char *, N> static getSymbolArray(
-    std::index_sequence<Is...>) {
-  return {{dlwrap::type::symbol<Is>::call()...}};
-}
-
-template <size_t Requested, size_t Required> constexpr void verboseAssert() {
-  static_assert(Requested == Required, "Arity Error");
-}
-
-} // namespace dlwrap
-
-#define DLWRAP_INSTANTIATE(SYM_DEF, SYM_USE, ARITY)                            \
-  DLWRAP_INSTANTIATE_##ARITY(SYM_DEF, SYM_USE,                                 \
-                             dlwrap::trait<decltype(&SYM_USE)>)
-
-#define DLWRAP_FINALIZE_IMPL()                                                 \
-  static size_t dlwrap::size() { return DLWRAP_ID(); }                         \
-  static const char *dlwrap::symbol(size_t i) {                                \
-    static constexpr const std::array<const char *, DLWRAP_ID()>               \
-        dlwrap_symbols = getSymbolArray<DLWRAP_ID()>(                          \
-            std::make_index_sequence<DLWRAP_ID()>());                          \
-    return dlwrap_symbols[i];                                                  \
-  }                                                                            \
-  static void **dlwrap::pointer(size_t i) {                                    \
-    static std::array<void *, DLWRAP_ID()> dlwrap_pointers;                    \
-    return &dlwrap_pointers.data()[i];                                         \
-  }
-
-#define DLWRAP_COMMON(SYMBOL, ARITY)                                           \
-  DLWRAP_INC();                                                                \
-  DLWRAP_SYMBOL(SYMBOL, DLWRAP_ID() - 1);                                      \
-  namespace dlwrap {                                                           \
-  struct SYMBOL##_Trait : public dlwrap::trait<decltype(&SYMBOL)> {            \
-    using T = dlwrap::trait<decltype(&SYMBOL)>;                                \
-    static T::FunctionType get() {                                             \
-      verboseAssert<ARITY, trait<decltype(&SYMBOL)>::nargs>();                 \
-      constexpr size_t Index = DLWRAP_ID() - 1;                                \
-      void *P = *dlwrap::pointer(Index);                                       \
-      return reinterpret_cast<T::FunctionType>(P);                             \
-    }                                                                          \
-  };                                                                           \
-  }
-
-#define DLWRAP_IMPL(SYMBOL, ARITY)                                             \
-  DLWRAP_COMMON(SYMBOL, ARITY)                                                 \
-  DLWRAP_INSTANTIATE(SYMBOL, SYMBOL, ARITY)
-
-#define DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY)                                    \
-  DLWRAP_COMMON(SYMBOL, ARITY)                                                 \
-  static DLWRAP_INSTANTIATE(dlwrap_##SYMBOL, SYMBOL, ARITY)
-
-#define DLWRAP_INSTANTIATE_0(SYM_DEF, SYM_USE, T)                              \
-  T::ReturnType SYM_DEF() { return dlwrap::SYM_USE##_Trait::get()(); }
-#define DLWRAP_INSTANTIATE_1(SYM_DEF, SYM_USE, T)                              \
-  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0) {                \
-    return dlwrap::SYM_USE##_Trait::get()(x0);                                 \
-  }
-#define DLWRAP_INSTANTIATE_2(SYM_DEF, SYM_USE, T)                              \
-  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
-                        typename T::template arg<1>::type x1) {                \
-    return dlwrap::SYM_USE##_Trait::get()(x0, x1);                             \
-  }
-#define DLWRAP_INSTANTIATE_3(SYM_DEF, SYM_USE, T)                              \
-  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
-                        typename T::template arg<1>::type x1,                  \
-                        typename T::template arg<2>::type x2) {                \
-    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2);                         \
-  }
-#define DLWRAP_INSTANTIATE_4(SYM_DEF, SYM_USE, T)                              \
-  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
-                        typename T::template arg<1>::type x1,                  \
-                        typename T::template arg<2>::type x2,                  \
-                        typename T::template arg<3>::type x3) {                \
-    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3);                     \
-  }
-#define DLWRAP_INSTANTIATE_5(SYM_DEF, SYM_USE, T)                              \
-  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
-                        typename T::template arg<1>::type x1,                  \
-                        typename T::template arg<2>::type x2,                  \
-                        typename T::template arg<3>::type x3,                  \
-                        typename T::template arg<4>::type x4) {                \
-    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4);                 \
-  }
-#define DLWRAP_INSTANTIATE_6(SYM_DEF, SYM_USE, T)                              \
-  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
-                        typename T::template arg<1>::type x1,                  \
-                        typename T::template arg<2>::type x2,                  \
-                        typename T::template arg<3>::type x3,                  \
-                        typename T::template arg<4>::type x4,                  \
-                        typename T::template arg<5>::type x5) {                \
-    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5);             \
-  }
-
-#define DLWRAP_INSTANTIATE_7(SYM_DEF, SYM_USE, T)                              \
-  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
-                        typename T::template arg<1>::type x1,                  \
-                        typename T::template arg<2>::type x2,                  \
-                        typename T::template arg<3>::type x3,                  \
-                        typename T::template arg<4>::type x4,                  \
-                        typename T::template arg<5>::type x5,                  \
-                        typename T::template arg<6>::type x6) {                \
-    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6);         \
-  }
-
-#define DLWRAP_INSTANTIATE_8(SYM_DEF, SYM_USE, T)                              \
-  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
-                        typename T::template arg<1>::type x1,                  \
-                        typename T::template arg<2>::type x2,                  \
-                        typename T::template arg<3>::type x3,                  \
-                        typename T::template arg<4>::type x4,                  \
-                        typename T::template arg<5>::type x5,                  \
-                        typename T::template arg<6>::type x6,                  \
-                        typename T::template arg<7>::type x7) {                \
-    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7);     \
-  }
-#define DLWRAP_INSTANTIATE_9(SYM_DEF, SYM_USE, T)                              \
-  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
-                        typename T::template arg<1>::type x1,                  \
-                        typename T::template arg<2>::type x2,                  \
-                        typename T::template arg<3>::type x3,                  \
-                        typename T::template arg<4>::type x4,                  \
-                        typename T::template arg<5>::type x5,                  \
-                        typename T::template arg<6>::type x6,                  \
-                        typename T::template arg<7>::type x7,                  \
-                        typename T::template arg<8>::type x8) {                \
-    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8); \
-  }
-#define DLWRAP_INSTANTIATE_10(SYM_DEF, SYM_USE, T)                             \
-  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
-                        typename T::template arg<1>::type x1,                  \
-                        typename T::template arg<2>::type x2,                  \
-                        typename T::template arg<3>::type x3,                  \
-                        typename T::template arg<4>::type x4,                  \
-                        typename T::template arg<5>::type x5,                  \
-                        typename T::template arg<6>::type x6,                  \
-                        typename T::template arg<7>::type x7,                  \
-                        typename T::template arg<8>::type x8,                  \
-                        typename T::template arg<9>::type x9) {                \
-    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8,  \
-                                          x9);                                 \
-  }
-#define DLWRAP_INSTANTIATE_11(SYM_DEF, SYM_USE, T)                             \
-  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
-                        typename T::template arg<1>::type x1,                  \
-                        typename T::template arg<2>::type x2,                  \
-                        typename T::template arg<3>::type x3,                  \
-                        typename T::template arg<4>::type x4,                  \
-                        typename T::template arg<5>::type x5,                  \
-                        typename T::template arg<6>::type x6,                  \
-                        typename T::template arg<7>::type x7,                  \
-                        typename T::template arg<8>::type x8,                  \
-                        typename T::template arg<9>::type x9,                  \
-                        typename T::template arg<10>::type x10) {              \
-    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8,  \
-                                          x9, x10);                            \
-  }
-
-#endif // OMPTARGET_SHARED_DLWRAP_H

diff --git a/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
deleted file mode 100644
index 829b4b7..0000000
--- a/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
+++ /dev/null

@@ -1,174 +0,0 @@
-//===- GlobalHandler.h - Target independent global & enviroment handling --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Target independent global handler and environment manager.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
-#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
-
-#include <string>
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Object/ELFObjectFile.h"
-
-#include "Shared/Debug.h"
-#include "Shared/Utils.h"
-
-#include "omptarget.h"
-
-namespace llvm {
-namespace omp {
-namespace target {
-namespace plugin {
-
-class DeviceImageTy;
-struct GenericDeviceTy;
-
-using namespace llvm::object;
-
-/// Common abstraction for globals that live on the host and device.
-/// It simply encapsulates the symbol name, symbol size, and symbol address
-/// (which might be host or device depending on the context).
-class GlobalTy {
-  // NOTE: Maybe we can have a pointer to the offload entry name instead of
-  // holding a private copy of the name as a std::string.
-  std::string Name;
-  uint32_t Size;
-  void *Ptr;
-
-public:
-  GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr)
-      : Name(Name), Size(Size), Ptr(Ptr) {}
-
-  const std::string &getName() const { return Name; }
-  uint32_t getSize() const { return Size; }
-  void *getPtr() const { return Ptr; }
-
-  void setSize(int32_t S) { Size = S; }
-  void setPtr(void *P) { Ptr = P; }
-};
-
-/// Subclass of GlobalTy that holds the memory for a global of \p Ty.
-template <typename Ty> class StaticGlobalTy : public GlobalTy {
-  Ty Data;
-
-public:
-  template <typename... Args>
-  StaticGlobalTy(const std::string &Name, Args &&...args)
-      : GlobalTy(Name, sizeof(Ty), &Data),
-        Data(Ty{std::forward<Args>(args)...}) {}
-
-  template <typename... Args>
-  StaticGlobalTy(const char *Name, Args &&...args)
-      : GlobalTy(Name, sizeof(Ty), &Data),
-        Data(Ty{std::forward<Args>(args)...}) {}
-
-  template <typename... Args>
-  StaticGlobalTy(const char *Name, const char *Suffix, Args &&...args)
-      : GlobalTy(std::string(Name) + Suffix, sizeof(Ty), &Data),
-        Data(Ty{std::forward<Args>(args)...}) {}
-
-  Ty &getValue() { return Data; }
-  const Ty &getValue() const { return Data; }
-  void setValue(const Ty &V) { Data = V; }
-};
-
-/// Helper class to do the heavy lifting when it comes to moving globals between
-/// host and device. Through the GenericDeviceTy we access memcpy DtoH and HtoD,
-/// which means the only things specialized by the subclass is the retrival of
-/// global metadata (size, addr) from the device.
-/// \see getGlobalMetadataFromDevice
-class GenericGlobalHandlerTy {
-  /// Actually move memory between host and device. See readGlobalFromDevice and
-  /// writeGlobalToDevice for the interface description.
-  Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device,
-                                       DeviceImageTy &Image,
-                                       const GlobalTy &HostGlobal,
-                                       bool Device2Host);
-
-  /// Actually move memory between host and device. See readGlobalFromDevice and
-  /// writeGlobalToDevice for the interface description.
-  Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device,
-                                       const GlobalTy &HostGlobal,
-                                       const GlobalTy &DeviceGlobal,
-                                       bool Device2Host);
-
-public:
-  virtual ~GenericGlobalHandlerTy() {}
-
-  /// Helper function for getting an ELF from a device image.
-  Expected<std::unique_ptr<ObjectFile>> getELFObjectFile(DeviceImageTy &Image);
-
-  /// Returns whether the symbol named \p SymName is present in the given \p
-  /// Image.
-  bool isSymbolInImage(GenericDeviceTy &Device, DeviceImageTy &Image,
-                       StringRef SymName);
-
-  /// Get the address and size of a global in the image. Address and size are
-  /// return in \p ImageGlobal, the global name is passed in \p ImageGlobal.
-  Error getGlobalMetadataFromImage(GenericDeviceTy &Device,
-                                   DeviceImageTy &Image, GlobalTy &ImageGlobal);
-
-  /// Read the memory associated with a global from the image and store it on
-  /// the host. The name, size, and destination are defined by \p HostGlobal.
-  Error readGlobalFromImage(GenericDeviceTy &Device, DeviceImageTy &Image,
-                            const GlobalTy &HostGlobal);
-
-  /// Get the address and size of a global from the device. Address is return in
-  /// \p DeviceGlobal, the global name and expected size are passed in
-  /// \p DeviceGlobal.
-  virtual Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
-                                            DeviceImageTy &Image,
-                                            GlobalTy &DeviceGlobal) = 0;
-
-  /// Copy the memory associated with a global from the device to its
-  /// counterpart on the host. The name, size, and destination are defined by
-  /// \p HostGlobal. The origin is defined by \p DeviceGlobal.
-  Error readGlobalFromDevice(GenericDeviceTy &Device,
-                             const GlobalTy &HostGlobal,
-                             const GlobalTy &DeviceGlobal) {
-    return moveGlobalBetweenDeviceAndHost(Device, HostGlobal, DeviceGlobal,
-                                          /*D2H=*/true);
-  }
-
-  /// Copy the memory associated with a global from the device to its
-  /// counterpart on the host. The name, size, and destination are defined by
-  /// \p HostGlobal. The origin is automatically resolved.
-  Error readGlobalFromDevice(GenericDeviceTy &Device, DeviceImageTy &Image,
-                             const GlobalTy &HostGlobal) {
-    return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
-                                          /*D2H=*/true);
-  }
-
-  /// Copy the memory associated with a global from the host to its counterpart
-  /// on the device. The name, size, and origin are defined by \p HostGlobal.
-  /// The destination is defined by \p DeviceGlobal.
-  Error writeGlobalToDevice(GenericDeviceTy &Device, const GlobalTy &HostGlobal,
-                            const GlobalTy &DeviceGlobal) {
-    return moveGlobalBetweenDeviceAndHost(Device, HostGlobal, DeviceGlobal,
-                                          /*D2H=*/false);
-  }
-
-  /// Copy the memory associated with a global from the host to its counterpart
-  /// on the device. The name, size, and origin are defined by \p HostGlobal.
-  /// The destination is automatically resolved.
-  Error writeGlobalToDevice(GenericDeviceTy &Device, DeviceImageTy &Image,
-                            const GlobalTy &HostGlobal) {
-    return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
-                                          /*D2H=*/false);
-  }
-};
-
-} // namespace plugin
-} // namespace target
-} // namespace omp
-} // namespace llvm
-
-#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H

diff --git a/libomptarget/plugins-nextgen/common/include/JIT.h b/libomptarget/plugins-nextgen/common/include/JIT.h
deleted file mode 100644
index b22197b..0000000
--- a/libomptarget/plugins-nextgen/common/include/JIT.h
+++ /dev/null

@@ -1,125 +0,0 @@
-//===- JIT.h - Target independent JIT infrastructure ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H
-#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H
-
-#include "Shared/EnvironmentVar.h"
-#include "Shared/Utils.h"
-
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/TargetParser/Triple.h"
-
-#include <functional>
-#include <memory>
-#include <string>
-
-struct __tgt_device_image;
-
-namespace llvm {
-class MemoryBuffer;
-
-namespace omp {
-namespace target {
-namespace plugin {
-struct GenericDeviceTy;
-} // namespace plugin
-
-/// The JIT infrastructure and caching mechanism.
-struct JITEngine {
-  /// Function type for a callback that will be called after the backend is
-  /// called.
-  using PostProcessingFn =
-      std::function<Expected<std::unique_ptr<MemoryBuffer>>(
-          std::unique_ptr<MemoryBuffer>)>;
-
-  JITEngine(Triple::ArchType TA);
-
-  /// Run jit compilation if \p Image is a bitcode image, otherwise simply
-  /// return \p Image. On success the result is a pointer to a device image
-  /// that is expected to be loadable to the device directly.
-  Expected<const __tgt_device_image *>
-  process(const __tgt_device_image &Image,
-          target::plugin::GenericDeviceTy &Device);
-
-  /// Return true if \p Buffer is a bitcode image that can be JITed for the
-  /// given architecture.
-  Expected<bool> checkBitcodeImage(StringRef Buffer) const;
-
-private:
-  /// Compile the bitcode image \p Image and generate the binary image that can
-  /// be loaded to the target device for the compute unit kind
-  /// \p ComputeUnitKind. \p PostProcessing will be called after codegen to
-  /// handle cases such as an assembler run as an external tool.
-  Expected<const __tgt_device_image *>
-  compile(const __tgt_device_image &Image, const std::string &ComputeUnitKind,
-          PostProcessingFn PostProcessing);
-
-  /// Create or retrieve the object image file from the file system or via
-  /// compilation of the \p Image.
-  Expected<std::unique_ptr<MemoryBuffer>>
-  getOrCreateObjFile(const __tgt_device_image &Image, LLVMContext &Ctx,
-                     const std::string &ComputeUnitKind);
-
-  /// Run backend, which contains optimization and code generation.
-  Expected<std::unique_ptr<MemoryBuffer>>
-  backend(Module &M, const std::string &ComputeUnitKind, unsigned OptLevel);
-
-  /// Run optimization pipeline.
-  void opt(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M,
-           unsigned OptLevel);
-
-  /// Run code generation.
-  void codegen(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M,
-               raw_pwrite_stream &OS);
-
-  /// The target triple used by the JIT.
-  const Triple TT;
-
-  struct ComputeUnitInfo {
-    /// LLVM Context in which the modules will be constructed.
-    LLVMContext Context;
-
-    /// Output images generated from LLVM backend.
-    SmallVector<std::unique_ptr<MemoryBuffer>, 4> JITImages;
-
-    /// A map of embedded IR images to JITed images.
-    DenseMap<const __tgt_device_image *, __tgt_device_image *> TgtImageMap;
-  };
-
-  /// Map from (march) "CPUs" (e.g., sm_80, or gfx90a), which we call compute
-  /// units as they are not CPUs, to the image information we cached for them.
-  StringMap<ComputeUnitInfo> ComputeUnitMap;
-  std::mutex ComputeUnitMapMutex;
-
-  /// Control environment variables.
-  StringEnvar ReplacementObjectFileName =
-      StringEnvar("LIBOMPTARGET_JIT_REPLACEMENT_OBJECT");
-  StringEnvar ReplacementModuleFileName =
-      StringEnvar("LIBOMPTARGET_JIT_REPLACEMENT_MODULE");
-  StringEnvar PreOptIRModuleFileName =
-      StringEnvar("LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE");
-  StringEnvar PostOptIRModuleFileName =
-      StringEnvar("LIBOMPTARGET_JIT_POST_OPT_IR_MODULE");
-  UInt32Envar JITOptLevel = UInt32Envar("LIBOMPTARGET_JIT_OPT_LEVEL", 3);
-  BoolEnvar JITSkipOpt = BoolEnvar("LIBOMPTARGET_JIT_SKIP_OPT", false);
-};
-
-} // namespace target
-} // namespace omp
-} // namespace llvm
-
-#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_JIT_H

diff --git a/libomptarget/plugins-nextgen/common/include/MemoryManager.h b/libomptarget/plugins-nextgen/common/include/MemoryManager.h
deleted file mode 100644
index fe19899..0000000
--- a/libomptarget/plugins-nextgen/common/include/MemoryManager.h
+++ /dev/null

@@ -1,347 +0,0 @@
-//===----------- MemoryManager.h - Target independent memory manager ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Target independent memory manager.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H
-#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H
-
-#include <cassert>
-#include <functional>
-#include <list>
-#include <mutex>
-#include <set>
-#include <unordered_map>
-#include <vector>
-
-#include "Shared/Debug.h"
-#include "Shared/Utils.h"
-#include "omptarget.h"
-
-/// Base class of per-device allocator.
-class DeviceAllocatorTy {
-public:
-  virtual ~DeviceAllocatorTy() = default;
-
-  /// Allocate memory of size \p Size. \p HstPtr is used to assist the
-  /// allocation.
-  virtual void *allocate(size_t Size, void *HstPtr,
-                         TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
-
-  /// Delete the pointer \p TgtPtr on the device.
-  virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
-};
-
-/// Class of memory manager. The memory manager is per-device by using
-/// per-device allocator. Therefore, each plugin using memory manager should
-/// have an allocator for each device.
-class MemoryManagerTy {
-  static constexpr const size_t BucketSize[] = {
-      0,       1U << 2, 1U << 3,  1U << 4,  1U << 5,  1U << 6, 1U << 7,
-      1U << 8, 1U << 9, 1U << 10, 1U << 11, 1U << 12, 1U << 13};
-
-  static constexpr const int NumBuckets =
-      sizeof(BucketSize) / sizeof(BucketSize[0]);
-
-  /// Return the largest power of 2 that is less than or equal to \p Num
-  /// (a power of 2 is returned unchanged, and 0 yields 0).
-  static size_t floorToPowerOfTwo(size_t Num) {
-    Num |= Num >> 1;
-    Num |= Num >> 2;
-    Num |= Num >> 4;
-    Num |= Num >> 8;
-    Num |= Num >> 16;
-#if INTPTR_MAX == INT64_MAX
-    Num |= Num >> 32;
-#elif INTPTR_MAX == INT32_MAX
-    // Do nothing with 32-bit
-#else
-#error Unsupported architecture
-#endif
-    Num += 1;
-    return Num >> 1;
-  }
-
-  /// Find a suitable bucket
-  static int findBucket(size_t Size) {
-    const size_t F = floorToPowerOfTwo(Size);
-
-    DP("findBucket: Size %zu is floored to %zu.\n", Size, F);
-
-    int L = 0, H = NumBuckets - 1;
-    while (H - L > 1) {
-      int M = (L + H) >> 1;
-      if (BucketSize[M] == F)
-        return M;
-      if (BucketSize[M] > F)
-        H = M - 1;
-      else
-        L = M;
-    }
-
-    assert(L >= 0 && L < NumBuckets && "L is out of range");
-
-    DP("findBucket: Size %zu goes to bucket %d\n", Size, L);
-
-    return L;
-  }
-
-  /// A structure stores the meta data of a target pointer
-  struct NodeTy {
-    /// Memory size
-    const size_t Size;
-    /// Target pointer
-    void *Ptr;
-
-    /// Constructor
-    NodeTy(size_t Size, void *Ptr) : Size(Size), Ptr(Ptr) {}
-  };
-
-  /// Orders \p NodeTy entries by size when they're put into \p std::multiset.
-  struct NodeCmpTy {
-    bool operator()(const NodeTy &LHS, const NodeTy &RHS) const {
-      return LHS.Size < RHS.Size;
-    }
-  };
-
-  /// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make
-  /// the look up procedure more efficient.
-  using FreeListTy = std::multiset<std::reference_wrapper<NodeTy>, NodeCmpTy>;
-
-  /// A list of \p FreeListTy entries, each of which is a \p std::multiset of
-  /// Nodes whose size is less or equal to a specific bucket size.
-  std::vector<FreeListTy> FreeLists;
-  /// A list of mutexes, one for each \p FreeListTy entry
-  std::vector<std::mutex> FreeListLocks;
-  /// A table to map from a target pointer to its node
-  std::unordered_map<void *, NodeTy> PtrToNodeTable;
-  /// The mutex for the table \p PtrToNodeTable
-  std::mutex MapTableLock;
-
-  /// The reference to a device allocator
-  DeviceAllocatorTy &DeviceAllocator;
-
-  /// The threshold to manage memory using memory manager. If the request size
-  /// is larger than \p SizeThreshold, the allocation will not be managed by the
-  /// memory manager.
-  size_t SizeThreshold = 1U << 13;
-
-  /// Request memory from target device
-  void *allocateOnDevice(size_t Size, void *HstPtr) const {
-    return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE);
-  }
-
-  /// Deallocate data on device
-  int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); }
-
-  /// This function is called when it tries to allocate memory on device but the
-  /// device returns out of memory. It will first free all memory in the
-  /// FreeList and try to allocate again.
-  void *freeAndAllocate(size_t Size, void *HstPtr) {
-    std::vector<void *> RemoveList;
-
-    // Deallocate all memory in FreeList
-    for (int I = 0; I < NumBuckets; ++I) {
-      FreeListTy &List = FreeLists[I];
-      std::lock_guard<std::mutex> Lock(FreeListLocks[I]);
-      if (List.empty())
-        continue;
-      for (const NodeTy &N : List) {
-        deleteOnDevice(N.Ptr);
-        RemoveList.push_back(N.Ptr);
-      }
-      FreeLists[I].clear();
-    }
-
-    // Remove all nodes in the map table which have been released
-    if (!RemoveList.empty()) {
-      std::lock_guard<std::mutex> LG(MapTableLock);
-      for (void *P : RemoveList)
-        PtrToNodeTable.erase(P);
-    }
-
-    // Try allocate memory again
-    return allocateOnDevice(Size, HstPtr);
-  }
-
-  /// The goal is to allocate memory on the device. It first tries to
-  /// allocate directly on the device. If a \p nullptr is returned, it might
-  /// be because the device is OOM. In that case, it will free all unused
-  /// memory and then try again.
-  void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) {
-    void *TgtPtr = allocateOnDevice(Size, HstPtr);
-    // We cannot get memory from the device. It might be due to OOM. Let's
-    // free all memory in FreeLists and try again.
-    if (TgtPtr == nullptr) {
-      DP("Failed to get memory on device. Free all memory in FreeLists and "
-         "try again.\n");
-      TgtPtr = freeAndAllocate(Size, HstPtr);
-    }
-
-    if (TgtPtr == nullptr)
-      DP("Still cannot get memory on device probably because the device is "
-         "OOM.\n");
-
-    return TgtPtr;
-  }
-
-public:
-  /// Constructor. If \p Threshold is non-zero, then the default threshold will
-  /// be overwritten by \p Threshold.
-  MemoryManagerTy(DeviceAllocatorTy &DeviceAllocator, size_t Threshold = 0)
-      : FreeLists(NumBuckets), FreeListLocks(NumBuckets),
-        DeviceAllocator(DeviceAllocator) {
-    if (Threshold)
-      SizeThreshold = Threshold;
-  }
-
-  /// Destructor
-  ~MemoryManagerTy() {
-    for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end();
-         ++Itr) {
-      assert(Itr->second.Ptr && "nullptr in map table");
-      deleteOnDevice(Itr->second.Ptr);
-    }
-  }
-
-  /// Allocate memory of size \p Size from target device. \p HstPtr is used to
-  /// assist the allocation.
-  void *allocate(size_t Size, void *HstPtr) {
-    // If the size is zero, we will not bother the target device. Just return
-    // nullptr directly.
-    if (Size == 0)
-      return nullptr;
-
-    DP("MemoryManagerTy::allocate: size %zu with host pointer " DPxMOD ".\n",
-       Size, DPxPTR(HstPtr));
-
-    // If the size is greater than the threshold, allocate it directly from
-    // device.
-    if (Size > SizeThreshold) {
-      DP("%zu is greater than the threshold %zu. Allocate it directly from "
-         "device\n",
-         Size, SizeThreshold);
-      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
-
-      DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr));
-
-      return TgtPtr;
-    }
-
-    NodeTy *NodePtr = nullptr;
-
-    // Try to get a node from FreeList
-    {
-      const int B = findBucket(Size);
-      FreeListTy &List = FreeLists[B];
-
-      NodeTy TempNode(Size, nullptr);
-      std::lock_guard<std::mutex> LG(FreeListLocks[B]);
-      const auto Itr = List.find(TempNode);
-
-      if (Itr != List.end()) {
-        NodePtr = &Itr->get();
-        List.erase(Itr);
-      }
-    }
-
-    if (NodePtr != nullptr)
-      DP("Find one node " DPxMOD " in the bucket.\n", DPxPTR(NodePtr));
-
-    // We cannot find a valid node in FreeLists. Let's allocate on device and
-    // create a node for it.
-    if (NodePtr == nullptr) {
-      DP("Cannot find a node in the FreeLists. Allocate on device.\n");
-      // Allocate one on device
-      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
-
-      if (TgtPtr == nullptr)
-        return nullptr;
-
-      // Create a new node and add it into the map table
-      {
-        std::lock_guard<std::mutex> Guard(MapTableLock);
-        auto Itr = PtrToNodeTable.emplace(TgtPtr, NodeTy(Size, TgtPtr));
-        NodePtr = &Itr.first->second;
-      }
-
-      DP("Node address " DPxMOD ", target pointer " DPxMOD ", size %zu\n",
-         DPxPTR(NodePtr), DPxPTR(TgtPtr), Size);
-    }
-
-    assert(NodePtr && "NodePtr should not be nullptr at this point");
-
-    return NodePtr->Ptr;
-  }
-
-  /// Deallocate memory pointed by \p TgtPtr
-  int free(void *TgtPtr) {
-    DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr));
-
-    NodeTy *P = nullptr;
-
-    // Look it up into the table
-    {
-      std::lock_guard<std::mutex> G(MapTableLock);
-      auto Itr = PtrToNodeTable.find(TgtPtr);
-
-      // We don't remove the node from the map table because the map does not
-      // change.
-      if (Itr != PtrToNodeTable.end())
-        P = &Itr->second;
-    }
-
-    // The memory is not managed by the manager
-    if (P == nullptr) {
-      DP("Cannot find its node. Delete it on device directly.\n");
-      return deleteOnDevice(TgtPtr);
-    }
-
-    // Insert the node to the free list
-    const int B = findBucket(P->Size);
-
-    DP("Found its node " DPxMOD ". Insert it to bucket %d.\n", DPxPTR(P), B);
-
-    {
-      std::lock_guard<std::mutex> G(FreeListLocks[B]);
-      FreeLists[B].insert(*P);
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// Get the size threshold from the environment variable
-  /// \p LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD . Returns a <tt>
-  /// std::pair<size_t, bool> </tt> where the first element represents the
-  /// threshold and the second element represents whether user disables memory
-  /// manager explicitly by setting the var to 0. If user doesn't specify
-  /// anything, returns <0, true>.
-  static std::pair<size_t, bool> getSizeThresholdFromEnv() {
-    static UInt32Envar MemoryManagerThreshold(
-        "LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD", 0);
-
-    size_t Threshold = MemoryManagerThreshold.get();
-
-    if (MemoryManagerThreshold.isPresent() && Threshold == 0) {
-      DP("Disabled memory manager as user set "
-         "LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=0.\n");
-      return std::make_pair(0, false);
-    }
-
-    return std::make_pair(Threshold, true);
-  }
-};
-
-// Out-of-class definitions of the constexpr static data members. GCC still
-// cannot handle these static data members like Clang, so they are kept.
-constexpr const size_t MemoryManagerTy::BucketSize[];
-constexpr const int MemoryManagerTy::NumBuckets;
-
-#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H

diff --git a/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/libomptarget/plugins-nextgen/common/include/PluginInterface.h
deleted file mode 100644
index 79e8464..0000000
--- a/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ /dev/null

@@ -1,1537 +0,0 @@
-//===- PluginInterface.h - Target independent plugin device interface -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
-#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
-
-#include <cstddef>
-#include <cstdint>
-#include <deque>
-#include <list>
-#include <map>
-#include <shared_mutex>
-#include <vector>
-
-#include "Shared/Debug.h"
-#include "Shared/Environment.h"
-#include "Shared/EnvironmentVar.h"
-#include "Shared/Requirements.h"
-#include "Shared/Utils.h"
-
-#include "GlobalHandler.h"
-#include "JIT.h"
-#include "MemoryManager.h"
-#include "RPC.h"
-#include "omptarget.h"
-
-#ifdef OMPT_SUPPORT
-#include "omp-tools.h"
-#endif
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MemoryBufferRef.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/TargetParser/Triple.h"
-
-namespace llvm {
-namespace omp {
-namespace target {
-
-namespace plugin {
-
-struct GenericPluginTy;
-struct GenericKernelTy;
-struct GenericDeviceTy;
-
-/// Class that wraps the __tgt_async_info to simplify its usage. In case the
-/// object is constructed without a valid __tgt_async_info, the object will use
-/// an internal one and will synchronize the current thread with the pending
-/// operations when calling AsyncInfoWrapperTy::finalize(). This latter function
-/// must be called before destroying the wrapper object.
-struct AsyncInfoWrapperTy {
-  AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr);
-
-  ~AsyncInfoWrapperTy() {
-    assert(!AsyncInfoPtr && "AsyncInfoWrapperTy not finalized");
-  }
-
-  /// Get the raw __tgt_async_info pointer.
-  operator __tgt_async_info *() const { return AsyncInfoPtr; }
-
-  /// Indicate whether there is a queue.
-  bool hasQueue() const { return (AsyncInfoPtr->Queue != nullptr); }
-
-  /// Get the queue.
-  template <typename Ty> Ty getQueueAs() {
-    static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
-                  "Queue is not of the same size as target type");
-    return static_cast<Ty>(AsyncInfoPtr->Queue);
-  }
-
-  /// Set the queue.
-  template <typename Ty> void setQueueAs(Ty Queue) {
-    static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
-                  "Queue is not of the same size as target type");
-    assert(!AsyncInfoPtr->Queue && "Overwriting queue");
-    AsyncInfoPtr->Queue = Queue;
-  }
-
-  /// Synchronize with the __tgt_async_info's pending operations if it's the
-  /// internal async info. The error associated to the asynchronous operations
-  /// issued in this queue must be provided in \p Err. This function will update
-  /// the error parameter with the result of the synchronization if it was
-  /// actually executed. This function must be called before destroying the
-  /// object and only once.
-  void finalize(Error &Err);
-
-  /// Register \p Ptr as an associated allocation that is freed after
-  /// finalization.
-  void freeAllocationAfterSynchronization(void *Ptr) {
-    AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
-  }
-
-private:
-  GenericDeviceTy &Device;
-  __tgt_async_info LocalAsyncInfo;
-  __tgt_async_info *AsyncInfoPtr;
-};
-
-/// The information level represents the level of a key-value property in the
-/// info tree print (i.e. indentation). The first level should be the default.
-enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 };
-
-/// Class for storing device information and later be printed. An object of this
-/// type acts as a queue of key-value properties. Each property has a key,
-/// a value, and an optional unit for the value. For printing purposes, the
-/// information can be classified into several levels. These levels are useful
-/// for defining sections and subsections. Thus, each key-value property also
-/// has an additional field indicating which level it belongs to. Notice that
-/// we use the level to determine the indentation of the key-value property at
-/// printing time. See the enum InfoLevelKind for the list of accepted levels.
-class InfoQueueTy {
-  struct InfoQueueEntryTy {
-    std::string Key;
-    std::string Value;
-    std::string Units;
-    uint64_t Level;
-  };
-
-  std::deque<InfoQueueEntryTy> Queue;
-
-public:
-  /// Add a new info entry to the queue. The entry requires at least a key
-  /// string in \p Key. The value in \p Value is optional and can be any type
-  /// that is representable as a string. The units in \p Units is optional and
-  /// must be a string. The info level is a template parameter that defaults to
-  /// the first level (top level).
-  template <InfoLevelKind L = InfoLevel1, typename T = std::string>
-  void add(const std::string &Key, T Value = T(),
-           const std::string &Units = std::string()) {
-    assert(!Key.empty() && "Invalid info key");
-
-    // Convert the value to a string depending on its type.
-    if constexpr (std::is_same_v<T, bool>)
-      Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
-    else if constexpr (std::is_arithmetic_v<T>)
-      Queue.push_back({Key, std::to_string(Value), Units, L});
-    else
-      Queue.push_back({Key, Value, Units, L});
-  }
-
-  /// Print all info entries added to the queue.
-  void print() const {
-    // We print four spaces for each level.
-    constexpr uint64_t IndentSize = 4;
-
-    // Find the maximum key length (level + key) to compute the individual
-    // indentation of each entry.
-    uint64_t MaxKeySize = 0;
-    for (const auto &Entry : Queue) {
-      uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize;
-      if (KeySize > MaxKeySize)
-        MaxKeySize = KeySize;
-    }
-
-    // Print all info entries.
-    for (const auto &Entry : Queue) {
-      // Compute the indentations for the current entry.
-      uint64_t KeyIndentSize = Entry.Level * IndentSize;
-      uint64_t ValIndentSize =
-          MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize;
-
-      llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
-                   << std::string(ValIndentSize, ' ') << Entry.Value
-                   << (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n";
-    }
-  }
-};
-
-/// Class wrapping a __tgt_device_image and its offload entry table on a
-/// specific device. This class is responsible for storing and managing
-/// the offload entries for an image on a device.
-class DeviceImageTy {
-  /// Image identifier within the corresponding device. Notice that this id is
-  /// not unique between different devices; they may overlap.
-  int32_t ImageId;
-
-  /// The pointer to the raw __tgt_device_image.
-  const __tgt_device_image *TgtImage;
-  const __tgt_device_image *TgtImageBitcode;
-
-  /// Reference to the device this image is loaded on.
-  GenericDeviceTy &Device;
-
-  /// If this image has any global destructors that must be called.
-  /// FIXME: This is only required because we currently have no invariants
-  ///        towards the lifetime of the underlying image. We should either copy
-  ///        the image into memory locally or erase the pointers after init.
-  bool PendingGlobalDtors;
-
-public:
-  DeviceImageTy(int32_t Id, GenericDeviceTy &Device,
-                const __tgt_device_image *Image)
-      : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device),
-        PendingGlobalDtors(false) {
-    assert(TgtImage && "Invalid target image");
-  }
-
-  /// Get the image identifier within the device.
-  int32_t getId() const { return ImageId; }
-
-  /// Get the device that this image is loaded onto.
-  GenericDeviceTy &getDevice() const { return Device; }
-
-  /// Get the pointer to the raw __tgt_device_image.
-  const __tgt_device_image *getTgtImage() const { return TgtImage; }
-
-  void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) {
-    this->TgtImageBitcode = TgtImageBitcode;
-  }
-
-  const __tgt_device_image *getTgtImageBitcode() const {
-    return TgtImageBitcode;
-  }
-
-  /// Get the image starting address.
-  void *getStart() const { return TgtImage->ImageStart; }
-
-  /// Get the image size.
-  size_t getSize() const {
-    return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
-  }
-
-  /// Get a memory buffer reference to the whole image.
-  MemoryBufferRef getMemoryBuffer() const {
-    return MemoryBufferRef(StringRef((const char *)getStart(), getSize()),
-                           "Image");
-  }
-  /// Accessors to the boolean value
-  bool setPendingGlobalDtors() { return PendingGlobalDtors = true; }
-  bool hasPendingGlobalDtors() const { return PendingGlobalDtors; }
-};
-
-/// Class implementing common functionalities of offload kernels. Each plugin
-/// should define the specific kernel class, derive from this generic one, and
-/// implement the necessary virtual function members.
-struct GenericKernelTy {
-  /// Construct a kernel with a name; the thread counts start at zero.
-  GenericKernelTy(const char *Name)
-      : Name(Name), PreferredNumThreads(0), MaxNumThreads(0) {}
-
-  virtual ~GenericKernelTy() {}
-
-  /// Initialize the kernel object from a specific device.
-  Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image);
-  virtual Error initImpl(GenericDeviceTy &GenericDevice,
-                         DeviceImageTy &Image) = 0;
-
-  /// Launch the kernel on the specific device. The device must be the same
-  /// one used to initialize the kernel.
-  Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
-               ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
-               AsyncInfoWrapperTy &AsyncInfoWrapper) const;
-  virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
-                           uint64_t NumBlocks, KernelArgsTy &KernelArgs,
-                           void *Args,
-                           AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
-
-  /// Get the kernel name.
-  const char *getName() const { return Name; }
-
-  /// Return true if this kernel is a constructor or destructor.
-  bool isCtorOrDtor() const {
-    // TODO: This is not a great solution and should be revisited.
-    return StringRef(Name).ends_with("tor");
-  }
-
-  /// Get the kernel image.
-  DeviceImageTy &getImage() const {
-    assert(ImagePtr && "Kernel is not initialized!");
-    return *ImagePtr;
-  }
-
-  /// Return the kernel environment object for kernel \p Name.
-  const KernelEnvironmentTy &getKernelEnvironmentForKernel() {
-    return KernelEnvironment;
-  }
-
-  /// Return a device pointer to a new kernel launch environment.
-  Expected<KernelLaunchEnvironmentTy *>
-  getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
-                             AsyncInfoWrapperTy &AsyncInfo) const;
-
-  /// Indicate whether an execution mode is valid.
-  static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
-    switch (ExecutionMode) {
-    case OMP_TGT_EXEC_MODE_SPMD:
-    case OMP_TGT_EXEC_MODE_GENERIC:
-    case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
-      return true;
-    }
-    return false;
-  }
-
-protected:
-  /// Get the execution mode name of the kernel.
-  const char *getExecutionModeName() const {
-    switch (KernelEnvironment.Configuration.ExecMode) {
-    case OMP_TGT_EXEC_MODE_SPMD:
-      return "SPMD";
-    case OMP_TGT_EXEC_MODE_GENERIC:
-      return "Generic";
-    case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
-      return "Generic-SPMD";
-    }
-    llvm_unreachable("Unknown execution mode!");
-  }
-
-  /// Prints generic kernel launch information.
-  Error printLaunchInfo(GenericDeviceTy &GenericDevice,
-                        KernelArgsTy &KernelArgs, uint32_t NumThreads,
-                        uint64_t NumBlocks) const;
-
-  /// Prints plugin-specific kernel launch information after generic kernel
-  /// launch information
-  virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
-                                       KernelArgsTy &KernelArgs,
-                                       uint32_t NumThreads,
-                                       uint64_t NumBlocks) const;
-
-private:
-  /// Prepare the arguments before launching the kernel.
-  void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
-                    ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
-                    llvm::SmallVectorImpl<void *> &Args,
-                    llvm::SmallVectorImpl<void *> &Ptrs,
-                    KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
-
-  /// Get the number of threads and blocks for the kernel based on the
-  /// user-defined threads and block clauses.
-  uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
-                         uint32_t ThreadLimitClause[3]) const;
-
-  /// The number of threads \p NumThreads can be adjusted by this method.
-  /// \p IsNumThreadsFromUser is true if \p NumThreads is defined by user via
-  /// thread_limit clause.
-  uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
-                        uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
-                        uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
-
-  /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
-  bool isGenericSPMDMode() const {
-    return KernelEnvironment.Configuration.ExecMode ==
-           OMP_TGT_EXEC_MODE_GENERIC_SPMD;
-  }
-  bool isGenericMode() const {
-    return KernelEnvironment.Configuration.ExecMode ==
-           OMP_TGT_EXEC_MODE_GENERIC;
-  }
-  bool isSPMDMode() const {
-    return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_SPMD;
-  }
-
-  /// The kernel name.
-  const char *Name;
-
-  /// The image that contains this kernel.
-  DeviceImageTy *ImagePtr = nullptr;
-
-protected:
-  /// The preferred number of threads to run the kernel.
-  uint32_t PreferredNumThreads;
-
-  /// The maximum number of threads which the kernel could leverage.
-  uint32_t MaxNumThreads;
-
-  /// The kernel environment, including execution flags.
-  KernelEnvironmentTy KernelEnvironment;
-
-  /// The prototype kernel launch environment.
-  KernelLaunchEnvironmentTy KernelLaunchEnvironment;
-
-  /// If the kernel is a bare kernel.
-  bool IsBareKernel = false;
-};
-
-/// Class representing a map of host pinned allocations. We track these pinned
-/// allocations, so memory transfers involving these buffers can be optimized.
-class PinnedAllocationMapTy {
-
-  /// Struct representing a map entry.
-  struct EntryTy {
-    /// The host pointer of the pinned allocation.
-    void *HstPtr;
-
-    /// The pointer that devices' driver should use to transfer data from/to the
-    /// pinned allocation. In most plugins, this pointer will be the same as the
-    /// host pointer above.
-    void *DevAccessiblePtr;
-
-    /// The size of the pinned allocation.
-    size_t Size;
-
-    /// Indicate whether the allocation was locked from outside the plugin, for
-    /// instance, from the application. The externally locked allocations are
-    /// not unlocked by the plugin when unregistering the last user.
-    bool ExternallyLocked;
-
-    /// The number of references to the pinned allocation. The allocation should
-    /// remain pinned and registered to the map until the number of references
-    /// becomes zero.
-    mutable size_t References;
-
-    /// Create an entry with the host and device acessible pointers, the buffer
-    /// size, and a boolean indicating whether the buffer was locked externally.
-    EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size,
-            bool ExternallyLocked)
-        : HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size),
-          ExternallyLocked(ExternallyLocked), References(1) {}
-
-    /// Utility constructor used for std::set searches.
-    EntryTy(void *HstPtr)
-        : HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0),
-          ExternallyLocked(false), References(0) {}
-  };
-
-  /// Comparator of mep entries. Use the host pointer to enforce an order
-  /// between entries.
-  struct EntryCmpTy {
-    bool operator()(const EntryTy &Left, const EntryTy &Right) const {
-      return Left.HstPtr < Right.HstPtr;
-    }
-  };
-
-  typedef std::set<EntryTy, EntryCmpTy> PinnedAllocSetTy;
-
-  /// The map of host pinned allocations.
-  PinnedAllocSetTy Allocs;
-
-  /// The mutex to protect accesses to the map.
-  mutable std::shared_mutex Mutex;
-
-  /// Reference to the corresponding device.
-  GenericDeviceTy &Device;
-
-  /// Indicate whether mapped host buffers should be locked automatically.
-  bool LockMappedBuffers;
-
-  /// Indicate whether failures when locking mapped buffers should be ingored.
-  bool IgnoreLockMappedFailures;
-
-  /// Find an allocation that intersects with \p HstPtr pointer. Assume the
-  /// map's mutex is acquired.
-  const EntryTy *findIntersecting(const void *HstPtr) const {
-    if (Allocs.empty())
-      return nullptr;
-
-    // Search the first allocation with starting address that is not less than
-    // the buffer address.
-    auto It = Allocs.lower_bound({const_cast<void *>(HstPtr)});
-
-    // Direct match of starting addresses.
-    if (It != Allocs.end() && It->HstPtr == HstPtr)
-      return &(*It);
-
-    // Not direct match but may be a previous pinned allocation in the map which
-    // contains the buffer. Return false if there is no such a previous
-    // allocation.
-    if (It == Allocs.begin())
-      return nullptr;
-
-    // Move to the previous pinned allocation.
-    --It;
-
-    // The buffer is not contained in the pinned allocation.
-    if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr)
-      return &(*It);
-
-    // None found.
-    return nullptr;
-  }
-
-  /// Insert an entry to the map representing a locked buffer. The number of
-  /// references is set to one.
-  Error insertEntry(void *HstPtr, void *DevAccessiblePtr, size_t Size,
-                    bool ExternallyLocked = false);
-
-  /// Erase an existing entry from the map.
-  Error eraseEntry(const EntryTy &Entry);
-
-  /// Register a new user into an entry that represents a locked buffer. Check
-  /// also that the registered buffer with \p HstPtr address and \p Size is
-  /// actually contained into the entry.
-  Error registerEntryUse(const EntryTy &Entry, void *HstPtr, size_t Size);
-
-  /// Unregister a user from the entry and return whether it is the last user.
-  /// If it is the last user, the entry will have to be removed from the map
-  /// and unlock the entry's host buffer (if necessary).
-  Expected<bool> unregisterEntryUse(const EntryTy &Entry);
-
-  /// Indicate whether the first range A fully contains the second range B.
-  static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
-    void *EndA = advanceVoidPtr(PtrA, SizeA);
-    void *EndB = advanceVoidPtr(PtrB, SizeB);
-    return (PtrB >= PtrA && EndB <= EndA);
-  }
-
-  /// Indicate whether the first range A intersects with the second range B.
-  static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
-    void *EndA = advanceVoidPtr(PtrA, SizeA);
-    void *EndB = advanceVoidPtr(PtrB, SizeB);
-    return (PtrA < EndB && PtrB < EndA);
-  }
-
-public:
-  /// Create the map of pinned allocations corresponding to a specific device.
-  PinnedAllocationMapTy(GenericDeviceTy &Device) : Device(Device) {
-
-    // Envar that indicates whether mapped host buffers should be locked
-    // automatically. The possible values are boolean (on/off) and a special:
-    //   off:       Mapped host buffers are not locked.
-    //   on:        Mapped host buffers are locked in a best-effort approach.
-    //              Failure to lock the buffers are silent.
-    //   mandatory: Mapped host buffers are always locked and failures to lock
-    //              a buffer results in a fatal error.
-    StringEnvar OMPX_LockMappedBuffers("LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS",
-                                       "off");
-
-    bool Enabled;
-    if (StringParser::parse(OMPX_LockMappedBuffers.get().data(), Enabled)) {
-      // Parsed as a boolean value. Enable the feature if necessary.
-      LockMappedBuffers = Enabled;
-      IgnoreLockMappedFailures = true;
-    } else if (OMPX_LockMappedBuffers.get() == "mandatory") {
-      // Enable the feature and failures are fatal.
-      LockMappedBuffers = true;
-      IgnoreLockMappedFailures = false;
-    } else {
-      // Disable by default.
-      DP("Invalid value LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS=%s\n",
-         OMPX_LockMappedBuffers.get().data());
-      LockMappedBuffers = false;
-    }
-  }
-
-  /// Register a buffer that was recently allocated as a locked host buffer.
-  /// None of the already registered pinned allocations should intersect with
-  /// this new one. The registration requires the host pointer in \p HstPtr,
-  /// the device accessible pointer in \p DevAccessiblePtr, and the size of the
-  /// allocation in \p Size. The allocation must be unregistered using the
-  /// unregisterHostBuffer function.
-  Error registerHostBuffer(void *HstPtr, void *DevAccessiblePtr, size_t Size);
-
-  /// Unregister a host pinned allocation passing the host pointer which was
-  /// previously registered using the registerHostBuffer function. When calling
-  /// this function, the pinned allocation cannot have any other user and will
-  /// not be unlocked by this function.
-  Error unregisterHostBuffer(void *HstPtr);
-
-  /// Lock the host buffer at \p HstPtr or register a new user if it intersects
-  /// with an already existing one. A partial overlapping with extension is not
-  /// allowed. The function returns the device accessible pointer of the pinned
-  /// buffer. The buffer must be unlocked using the unlockHostBuffer function.
-  Expected<void *> lockHostBuffer(void *HstPtr, size_t Size);
-
-  /// Unlock the host buffer at \p HstPtr or unregister a user if other users
-  /// are still using the pinned allocation. If this was the last user, the
-  /// pinned allocation is removed from the map and the memory is unlocked.
-  Error unlockHostBuffer(void *HstPtr);
-
-  /// Lock or register a host buffer that was recently mapped by libomptarget.
-  /// This behavior is applied if LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS is
-  /// enabled. Even if not enabled, externally locked buffers are registered
-  /// in order to optimize their transfers.
-  Error lockMappedHostBuffer(void *HstPtr, size_t Size);
-
-  /// Unlock or unregister a host buffer that was unmapped by libomptarget.
-  Error unlockUnmappedHostBuffer(void *HstPtr);
-
-  /// Return the device accessible pointer associated to the host pinned
-  /// allocation which the \p HstPtr belongs, if any. Return null in case the
-  /// \p HstPtr does not belong to any host pinned allocation. The device
-  /// accessible pointer is the one that devices should use for data transfers
-  /// that involve a host pinned buffer.
-  void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const {
-    std::shared_lock<std::shared_mutex> Lock(Mutex);
-
-    // Find the intersecting allocation if any.
-    const EntryTy *Entry = findIntersecting(HstPtr);
-    if (!Entry)
-      return nullptr;
-
-    return advanceVoidPtr(Entry->DevAccessiblePtr,
-                          getPtrDiff(HstPtr, Entry->HstPtr));
-  }
-
-  /// Check whether a buffer belongs to a registered host pinned allocation.
-  bool isHostPinnedBuffer(const void *HstPtr) const {
-    std::shared_lock<std::shared_mutex> Lock(Mutex);
-
-    // Return whether there is an intersecting allocation.
-    return (findIntersecting(const_cast<void *>(HstPtr)) != nullptr);
-  }
-};
-
-/// Class implementing common functionalities of offload devices. Each plugin
-/// should define the specific device class, derive from this generic one, and
-/// implement the necessary virtual function members.
-struct GenericDeviceTy : public DeviceAllocatorTy {
-  /// Construct a device with its device id within the plugin, the number of
-  /// devices in the plugin and the grid values for that kind of device.
-  GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
-                  const llvm::omp::GV &GridValues);
-
-  /// Get the device identifier within the corresponding plugin. Notice that
-  /// this id is not unique between different plugins; they may overlap.
-  int32_t getDeviceId() const { return DeviceId; }
-
-  /// Set the context of the device if needed, before calling device-specific
-  /// functions. Plugins may implement this function as a no-op if not needed.
-  virtual Error setContext() = 0;
-
-  /// Initialize the device. After this call, the device should be already
-  /// working and ready to accept queries or modifications.
-  Error init(GenericPluginTy &Plugin);
-  virtual Error initImpl(GenericPluginTy &Plugin) = 0;
-
-  /// Deinitialize the device and free all its resources. After this call, the
-  /// device is no longer considered ready, so no queries or modifications are
-  /// allowed.
-  Error deinit(GenericPluginTy &Plugin);
-  virtual Error deinitImpl() = 0;
-
-  /// Load the binary image into the device and return the target table.
-  Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin,
-                                       const __tgt_device_image *TgtImage);
-  virtual Expected<DeviceImageTy *>
-  loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
-
-  /// Setup the device environment if needed. Notice this setup may not be run
-  /// on some plugins. By default, it will be executed, but plugins can change
-  /// this behavior by overriding the shouldSetupDeviceEnvironment function.
-  Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
-
-  /// Setup the global device memory pool, if the plugin requires one.
-  Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
-                              uint64_t PoolSize);
-
-  // Setup the RPC server for this device if needed. This may not run on some
-  // plugins like the CPU targets. By default, it will not be executed so it is
-  // up to the target to override this using the shouldSetupRPCServer function.
-  Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
-
-  /// Synchronize the current thread with the pending operations on the
-  /// __tgt_async_info structure.
-  Error synchronize(__tgt_async_info *AsyncInfo);
-  virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
-
-  /// Invokes any global constructors on the device if present and is required
-  /// by the target.
-  virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
-                                       DeviceImageTy &Image) {
-    return Error::success();
-  }
-
-  /// Invokes any global destructors on the device if present and is required
-  /// by the target.
-  virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
-                                      DeviceImageTy &Image) {
-    return Error::success();
-  }
-
-  /// Query for the completion of the pending operations on the __tgt_async_info
-  /// structure in a non-blocking manner.
-  Error queryAsync(__tgt_async_info *AsyncInfo);
-  virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;
-
-  /// Check whether the architecture supports VA management
-  virtual bool supportVAManagement() const { return false; }
-
-  /// Get the total device memory size
-  virtual Error getDeviceMemorySize(uint64_t &DSize);
-
-  /// Allocates \p RSize bytes (rounded up to page size) and hints the driver to
-  /// map it to \p VAddr. The obtained address is stored in \p Addr. At return
-  /// \p RSize contains the actual size which can be equal or larger than the
-  /// requested size.
-  virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);
-
-  /// De-allocates device memory and unmaps the virtual address \p VAddr
-  virtual Error memoryVAUnMap(void *VAddr, size_t Size);
-
-  /// Allocate data on the device or involving the device.
-  Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
-
-  /// Deallocate data from the device or involving the device.
-  Error dataDelete(void *TgtPtr, TargetAllocTy Kind);
-
-  /// Pin host memory to optimize transfers and return the device accessible
-  /// pointer that devices should use for memory transfers involving the host
-  /// pinned allocation.
-  Expected<void *> dataLock(void *HstPtr, int64_t Size) {
-    return PinnedAllocs.lockHostBuffer(HstPtr, Size);
-  }
-
-  /// Unpin a host memory buffer that was previously pinned.
-  Error dataUnlock(void *HstPtr) {
-    return PinnedAllocs.unlockHostBuffer(HstPtr);
-  }
-
-  /// Lock the host buffer \p HstPtr with \p Size bytes with the vendor-specific
-  /// API and return the device accessible pointer.
-  virtual Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) = 0;
-
-  /// Unlock a previously locked host buffer starting at \p HstPtr.
-  virtual Error dataUnlockImpl(void *HstPtr) = 0;
-
-  /// Mark the host buffer with address \p HstPtr and \p Size bytes as a mapped
-  /// buffer. This means that libomptarget created a new mapping of that host
-  /// buffer (e.g., because a user OpenMP target map) and the buffer may be used
-  /// as source/destination of memory transfers. We can use this information to
-  /// lock the host buffer and optimize its memory transfers.
-  Error notifyDataMapped(void *HstPtr, int64_t Size) {
-    return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size);
-  }
-
-  /// Mark the host buffer with address \p HstPtr as unmapped. This means that
-  /// libomptarget removed an existing mapping. If the plugin locked the buffer
-  /// in notifyDataMapped, this function should unlock it.
-  Error notifyDataUnmapped(void *HstPtr) {
-    return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr);
-  }
-
-  /// Check whether the host buffer with address \p HstPtr is pinned by the
-  /// underlying vendor-specific runtime (if any). Retrieve the host pointer,
-  /// the device accessible pointer and the size of the original pinned buffer.
-  virtual Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
-                                         void *&BaseDevAccessiblePtr,
-                                         size_t &BaseSize) const = 0;
-
-  /// Submit data to the device (host to device transfer).
-  Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size,
-                   __tgt_async_info *AsyncInfo);
-  virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
-                               AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
-
-  /// Retrieve data from the device (device to host transfer).
-  Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
-                     __tgt_async_info *AsyncInfo);
-  virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
-                                 AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
-
-  /// Exchange data between devices (device to device transfer). Calling this
-  /// function is only valid if GenericPlugin::isDataExchangable() passing the
-  /// two devices returns true.
-  Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr,
-                     int64_t Size, __tgt_async_info *AsyncInfo);
-  virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
-                                 void *DstPtr, int64_t Size,
-                                 AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
-
-  /// Run the kernel associated with \p EntryPtr
-  Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
-                     KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
-
-  /// Initialize a __tgt_async_info structure. Related to interop features.
-  Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
-  virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
-
-  /// Initialize a __tgt_device_info structure. Related to interop features.
-  Error initDeviceInfo(__tgt_device_info *DeviceInfo);
-  virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;
-
-  /// Create an event.
-  Error createEvent(void **EventPtrStorage);
-  virtual Error createEventImpl(void **EventPtrStorage) = 0;
-
-  /// Destroy an event.
-  Error destroyEvent(void *Event);
-  virtual Error destroyEventImpl(void *EventPtr) = 0;
-
-  /// Start the recording of the event.
-  Error recordEvent(void *Event, __tgt_async_info *AsyncInfo);
-  virtual Error recordEventImpl(void *EventPtr,
-                                AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
-
-  /// Wait for an event to finish. Notice this wait is asynchronous if the
-  /// __tgt_async_info is not nullptr.
-  Error waitEvent(void *Event, __tgt_async_info *AsyncInfo);
-  virtual Error waitEventImpl(void *EventPtr,
-                              AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
-
-  /// Synchronize the current thread with the event.
-  Error syncEvent(void *EventPtr);
-  virtual Error syncEventImpl(void *EventPtr) = 0;
-
-  /// Print information about the device.
-  Error printInfo();
-  virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;
-
-  /// Getters of the grid values.
-  uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
-  uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; }
-  uint32_t getBlockLimit() const { return GridValues.GV_Max_Teams; }
-  uint32_t getDefaultNumThreads() const {
-    return GridValues.GV_Default_WG_Size;
-  }
-  uint32_t getDefaultNumBlocks() const {
-    return GridValues.GV_Default_Num_Teams;
-  }
-  uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
-  virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
-
-  /// Get target compute unit kind (e.g., sm_80, or gfx908).
-  virtual std::string getComputeUnitKind() const { return "unknown"; }
-
-  /// Post processing after jit backend. The ownership of \p MB will be taken.
-  virtual Expected<std::unique_ptr<MemoryBuffer>>
-  doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const {
-    return std::move(MB);
-  }
-
-  /// The minimum number of threads we use for a low-trip count combined loop.
-  /// Instead of using more threads we increase the outer (block/team)
-  /// parallelism.
-  /// @see OMPX_MinThreadsForLowTripCount
-  virtual uint32_t getMinThreadsForLowTripCountLoop() {
-    return OMPX_MinThreadsForLowTripCount;
-  }
-
-  /// Get the total amount of hardware parallelism supported by the target
-  /// device. This is the total amount of warps or wavefronts that can be
-  /// resident on the device simultaneously.
-  virtual uint64_t getHardwareParallelism() const { return 0; }
-
-  /// Get the RPC server running on this device.
-  RPCServerTy *getRPCServer() const { return RPCServer; }
-
-  /// The number of parallel RPC ports to use on the device. In general, this
-  /// should be roughly equivalent to the amount of hardware parallelism the
-  /// device can support. This is because GPUs in general do not have forward
-  /// progress guarantees, so we minimize thread level dependencies by
-  /// allocating enough space such that each device thread can have a port. This
-  /// is likely overly pessimistic in the average case, but guarantees no
-  /// deadlocks at the cost of memory. This must be overloaded by targets
-  /// expecting to use the RPC server.
-  virtual uint64_t requestedRPCPortCount() const {
-    assert(!shouldSetupRPCServer() && "Default implementation cannot be used");
-    return 0;
-  }
-
-  virtual Error getDeviceStackSize(uint64_t &V) = 0;
-
-  /// Returns true if current plugin architecture is an APU
-  /// and unified_shared_memory was not requested by the program.
-  bool useAutoZeroCopy();
-  virtual bool useAutoZeroCopyImpl() { return false; }
-
-  /// Allocate and construct a kernel object.
-  virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
-
-  /// Reference to the underlying plugin that created this device.
-  GenericPluginTy &Plugin;
-
-private:
-  /// Get and set the stack size and heap size for the device. If not used, the
-  /// plugin can implement the setters as no-op and setting the output
-  /// value to zero for the getters.
-  virtual Error setDeviceStackSize(uint64_t V) = 0;
-  virtual Error getDeviceHeapSize(uint64_t &V) = 0;
-  virtual Error setDeviceHeapSize(uint64_t V) = 0;
-
-  /// Indicate whether the device should setup the device environment. Notice
-  /// that returning false in this function will change the behavior of the
-  /// setupDeviceEnvironment() function.
-  virtual bool shouldSetupDeviceEnvironment() const { return true; }
-
-  /// Indicate whether the device should setup the global device memory pool. If
-  /// false is return the value on the device will be uninitialized.
-  virtual bool shouldSetupDeviceMemoryPool() const { return true; }
-
-  /// Indicate whether or not the device should setup the RPC server. This is
-  /// only necessary for unhosted targets like the GPU.
-  virtual bool shouldSetupRPCServer() const { return false; }
-
-  /// Pointer to the memory manager or nullptr if not available.
-  MemoryManagerTy *MemoryManager;
-
-  /// Environment variables defined by the OpenMP standard.
-  Int32Envar OMP_TeamLimit;
-  Int32Envar OMP_NumTeams;
-  Int32Envar OMP_TeamsThreadLimit;
-
-  /// Environment variables defined by the LLVM OpenMP implementation.
-  Int32Envar OMPX_DebugKind;
-  UInt32Envar OMPX_SharedMemorySize;
-  UInt64Envar OMPX_TargetStackSize;
-  UInt64Envar OMPX_TargetHeapSize;
-
-  /// Environment flag to set the minimum number of threads we use for a
-  /// low-trip count combined loop. Instead of using more threads we increase
-  /// the outer (block/team) parallelism.
-  UInt32Envar OMPX_MinThreadsForLowTripCount =
-      UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
-
-protected:
-  /// Environment variables defined by the LLVM OpenMP implementation
-  /// regarding the initial number of streams and events.
-  UInt32Envar OMPX_InitialNumStreams;
-  UInt32Envar OMPX_InitialNumEvents;
-
-  /// Array of images loaded into the device. Images are automatically
-  /// deallocated by the allocator.
-  llvm::SmallVector<DeviceImageTy *> LoadedImages;
-
-  /// The identifier of the device within the plugin. Notice this is not a
-  /// global device id and is not the device id visible to the OpenMP user.
-  const int32_t DeviceId;
-
-  /// The default grid values used for this device.
-  llvm::omp::GV GridValues;
-
-  /// Enumeration used for representing the current state between two devices
-  /// two devices (both under the same plugin) for the peer access between them.
-  /// The states can be a) PENDING when the state has not been queried and needs
-  /// to be queried, b) AVAILABLE when the peer access is available to be used,
-  /// and c) UNAVAILABLE if the system does not allow it.
-  enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
-
-  /// Array of peer access states with the rest of devices. This means that if
-  /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
-  /// the device I can access device J's memory directly. However, notice this
-  /// does not mean that device J can access device I's memory directly.
-  llvm::SmallVector<PeerAccessState> PeerAccesses;
-  std::mutex PeerAccessesLock;
-
-  /// Map of host pinned allocations used for optimize device transfers.
-  PinnedAllocationMapTy PinnedAllocs;
-
-  /// A pointer to an RPC server instance attached to this device if present.
-  /// This is used to run the RPC server during task synchronization.
-  RPCServerTy *RPCServer;
-
-#ifdef OMPT_SUPPORT
-  /// OMPT callback functions
-#define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr;
-  FOREACH_OMPT_DEVICE_EVENT(defineOmptCallback)
-#undef defineOmptCallback
-
-  /// Internal representation for OMPT device (initialize & finalize)
-  std::atomic<bool> OmptInitialized;
-#endif
-
-private:
-  DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
-  DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
-};
-
-/// Class implementing common functionalities of offload plugins. Each plugin
-/// should define the specific plugin class, derive from this generic one, and
-/// implement the necessary virtual function members.
-struct GenericPluginTy {
-
-  /// Construct a plugin instance.
-  GenericPluginTy(Triple::ArchType TA)
-      : RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr), JIT(TA),
-        RPCServer(nullptr) {}
-
-  virtual ~GenericPluginTy() {}
-
-  /// Initialize the plugin.
-  Error init();
-
-  /// Initialize the plugin and return the number of available devices.
-  virtual Expected<int32_t> initImpl() = 0;
-
-  /// Deinitialize the plugin and release the resources.
-  Error deinit();
-  virtual Error deinitImpl() = 0;
-
-  /// Create a new device for the underlying plugin.
-  virtual GenericDeviceTy *createDevice(GenericPluginTy &Plugin,
-                                        int32_t DeviceID,
-                                        int32_t NumDevices) = 0;
-
-  /// Create a new global handler for the underlying plugin.
-  virtual GenericGlobalHandlerTy *createGlobalHandler() = 0;
-
-  /// Get the reference to the device with a certain device id.
-  GenericDeviceTy &getDevice(int32_t DeviceId) {
-    assert(isValidDeviceId(DeviceId) && "Invalid device id");
-    assert(Devices[DeviceId] && "Device is unitialized");
-
-    return *Devices[DeviceId];
-  }
-
-  /// Get the number of active devices.
-  int32_t getNumDevices() const { return NumDevices; }
-
-  /// Get the plugin-specific device identifier offset.
-  int32_t getDeviceIdStartIndex() const { return DeviceIdStartIndex; }
-
-  /// Set the plugin-specific device identifier offset.
-  void setDeviceIdStartIndex(int32_t Offset) { DeviceIdStartIndex = Offset; }
-
-  /// Get the ELF code to recognize the binary image of this plugin.
-  virtual uint16_t getMagicElfBits() const = 0;
-
-  /// Get the target triple of this plugin.
-  virtual Triple::ArchType getTripleArch() const = 0;
-
-  /// Allocate a structure using the internal allocator.
-  template <typename Ty> Ty *allocate() {
-    return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty)));
-  }
-
-  /// Get the reference to the global handler of this plugin.
-  GenericGlobalHandlerTy &getGlobalHandler() {
-    assert(GlobalHandler && "Global handler not initialized");
-    return *GlobalHandler;
-  }
-
-  /// Get the reference to the JIT used for all devices connected to this
-  /// plugin.
-  JITEngine &getJIT() { return JIT; }
-
-  /// Get a reference to the RPC server used to provide host services.
-  RPCServerTy &getRPCServer() {
-    assert(RPCServer && "RPC server not initialized");
-    return *RPCServer;
-  }
-
-  /// Get the OpenMP requires flags set for this plugin.
-  int64_t getRequiresFlags() const { return RequiresFlags; }
-
-  /// Set the OpenMP requires flags for this plugin.
-  void setRequiresFlag(int64_t Flags) { RequiresFlags = Flags; }
-
-  /// Initialize a device within the plugin.
-  Error initDevice(int32_t DeviceId);
-
-  /// Deinitialize a device within the plugin and release its resources.
-  Error deinitDevice(int32_t DeviceId);
-
-  /// Indicate whether data can be exchanged directly between two devices under
-  /// this same plugin. If this function returns true, it's safe to call the
-  /// GenericDeviceTy::exchangeData() function on the source device.
-  virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) {
-    return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId);
-  }
-
-  /// Top level interface to verify if a given ELF image can be executed on a
-  /// given target. Returns true if the \p Image is compatible with the plugin.
-  Expected<bool> checkELFImage(StringRef Image) const;
-
-  /// Indicate if an image is compatible with the plugin devices. Notice that
-  /// this function may be called before actually initializing the devices. So
-  /// we could not move this function into GenericDeviceTy.
-  virtual Expected<bool> isELFCompatible(StringRef Image) const = 0;
-
-protected:
-  /// Indicate whether a device id is valid.
-  bool isValidDeviceId(int32_t DeviceId) const {
-    return (DeviceId >= 0 && DeviceId < getNumDevices());
-  }
-
-public:
-  // TODO: This plugin interface needs to be cleaned up.
-
-  /// Returns non-zero if the provided \p Image can be executed by the runtime.
-  int32_t is_valid_binary(__tgt_device_image *Image);
-
-  /// Initialize the device inside of the plugin.
-  int32_t init_device(int32_t DeviceId);
-
-  /// Return the number of devices this plugin can support.
-  int32_t number_of_devices();
-
-  /// Initializes the OpenMP register requires information.
-  int64_t init_requires(int64_t RequiresFlags);
-
-  /// Returns non-zero if the data can be exchanged between the two devices.
-  int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId);
-
-  /// Initializes the record and replay mechanism inside the plugin.
-  int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
-                                   void *VAddr, bool isRecord, bool SaveOutput,
-                                   uint64_t &ReqPtrArgOffset);
-
-  /// Loads the associated binary into the plugin and returns a handle to it.
-  int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
-                      __tgt_device_binary *Binary);
-
-  /// Allocates memory that is accessively to the given device.
-  void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, int32_t Kind);
-
-  /// Deallocates memory on the given device.
-  int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind);
-
-  /// Locks / pins host memory using the plugin runtime.
-  int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
-                    void **LockedPtr);
-
-  /// Unlocks / unpins host memory using the plugin runtime.
-  int32_t data_unlock(int32_t DeviceId, void *Ptr);
-
-  /// Notify the runtime about a new mapping that has been created outside.
-  int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size);
-
-  /// Notify t he runtime about a mapping that has been deleted.
-  int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr);
-
-  /// Copy data to the given device.
-  int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                      int64_t Size);
-
-  /// Copy data to the given device asynchronously.
-  int32_t data_submit_async(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                            int64_t Size, __tgt_async_info *AsyncInfoPtr);
-
-  /// Copy data from the given device.
-  int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                        int64_t Size);
-
-  /// Copy data from the given device asynchornously.
-  int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                              int64_t Size, __tgt_async_info *AsyncInfoPtr);
-
-  /// Exchange memory addresses between two devices.
-  int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr, int32_t DstDeviceId,
-                        void *DstPtr, int64_t Size);
-
-  /// Exchange memory addresses between two devices asynchronously.
-  int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
-                              int DstDeviceId, void *DstPtr, int64_t Size,
-                              __tgt_async_info *AsyncInfo);
-
-  /// Begin executing a kernel on the given device.
-  int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
-                        ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs,
-                        __tgt_async_info *AsyncInfoPtr);
-
-  /// Synchronize an asyncrhonous queue with the plugin runtime.
-  int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
-
-  /// Query the current state of an asynchronous queue.
-  int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
-
-  /// Prints information about the given devices supported by the plugin.
-  void print_device_info(int32_t DeviceId);
-
-  /// Creates an event in the given plugin if supported.
-  int32_t create_event(int32_t DeviceId, void **EventPtr);
-
-  /// Records an event that has occurred.
-  int32_t record_event(int32_t DeviceId, void *EventPtr,
-                       __tgt_async_info *AsyncInfoPtr);
-
-  /// Wait until an event has occurred.
-  int32_t wait_event(int32_t DeviceId, void *EventPtr,
-                     __tgt_async_info *AsyncInfoPtr);
-
-  /// Syncrhonize execution until an event is done.
-  int32_t sync_event(int32_t DeviceId, void *EventPtr);
-
-  /// Remove the event from the plugin.
-  int32_t destroy_event(int32_t DeviceId, void *EventPtr);
-
-  /// Remove the event from the plugin.
-  void set_info_flag(uint32_t NewInfoLevel);
-
-  /// Creates an asynchronous queue for the given plugin.
-  int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr);
-
-  /// Creates device information to be used for diagnostics.
-  int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo,
-                           const char **ErrStr);
-
-  /// Sets the offset into the devices for use by OMPT.
-  int32_t set_device_offset(int32_t DeviceIdOffset);
-
-  /// Returns if the plugin can support auotmatic copy.
-  int32_t use_auto_zero_copy(int32_t DeviceId);
-
-  /// Look up a global symbol in the given binary.
-  int32_t get_global(__tgt_device_binary Binary, uint64_t Size,
-                     const char *Name, void **DevicePtr);
-
-  /// Look up a kernel function in the given binary.
-  int32_t get_function(__tgt_device_binary Binary, const char *Name,
-                       void **KernelPtr);
-
-private:
-  /// Number of devices available for the plugin.
-  int32_t NumDevices = 0;
-
-  /// Index offset, which when added to a DeviceId, will yield a unique
-  /// user-observable device identifier. This is especially important when
-  /// DeviceIds of multiple plugins / RTLs need to be distinguishable.
-  int32_t DeviceIdStartIndex = 0;
-
-  /// Array of pointers to the devices. Initially, they are all set to nullptr.
-  /// Once a device is initialized, the pointer is stored in the position given
-  /// by its device id. A position with nullptr means that the corresponding
-  /// device was not initialized yet.
-  llvm::SmallVector<GenericDeviceTy *> Devices;
-
-  /// OpenMP requires flags.
-  int64_t RequiresFlags;
-
-  /// Pointer to the global handler for this plugin.
-  GenericGlobalHandlerTy *GlobalHandler;
-
-  /// Internal allocator for different structures.
-  BumpPtrAllocator Allocator;
-
-  /// The JIT engine shared by all devices connected to this plugin.
-  JITEngine JIT;
-
-  /// The interface between the plugin and the GPU for host services.
-  RPCServerTy *RPCServer;
-};
-
-namespace Plugin {
-/// Create a success error. This is the same as calling Error::success(), but
-/// it is recommended to use this one for consistency with Plugin::error() and
-/// Plugin::check().
-static Error success() { return Error::success(); }
-
-/// Create a string error.
-template <typename... ArgsTy>
-static Error error(const char *ErrFmt, ArgsTy... Args) {
-  return createStringError(inconvertibleErrorCode(), ErrFmt, Args...);
-}
-
-/// Check the plugin-specific error code and return an error or success
-/// accordingly. In case of an error, create a string error with the error
-/// description. The ErrFmt should follow the format:
-///     "Error in <function name>[<optional info>]: %s"
-/// The last format specifier "%s" is mandatory and will be used to place the
-/// error code's description. Notice this function should be only called from
-/// the plugin-specific code.
-/// TODO: Refactor this, must be defined individually by each plugin.
-template <typename... ArgsTy>
-static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args);
-} // namespace Plugin
-
-/// Class for simplifying the getter operation of the plugin. Anywhere on the
-/// code, the current plugin can be retrieved by Plugin::get(). The class also
-/// declares functions to create plugin-specific object instances. The check(),
-/// createPlugin(), createDevice() and createGlobalHandler() functions should be
-/// defined by each plugin implementation.
-class PluginTy {
-  // Reference to the plugin instance.
-  static GenericPluginTy *SpecificPlugin;
-
-  PluginTy() {
-    if (auto Err = init())
-      REPORT("Failed to initialize plugin: %s\n",
-             toString(std::move(Err)).data());
-  }
-
-  ~PluginTy() {
-    if (auto Err = deinit())
-      REPORT("Failed to deinitialize plugin: %s\n",
-             toString(std::move(Err)).data());
-  }
-
-  PluginTy(const PluginTy &) = delete;
-  void operator=(const PluginTy &) = delete;
-
-  /// Create and intialize the plugin instance.
-  static Error init() {
-    assert(!SpecificPlugin && "Plugin already created");
-
-    // Create the specific plugin.
-    SpecificPlugin = createPlugin();
-    assert(SpecificPlugin && "Plugin was not created");
-
-    // Initialize the plugin.
-    return SpecificPlugin->init();
-  }
-
-  // Deinitialize and destroy the plugin instance.
-  static Error deinit() {
-    assert(SpecificPlugin && "Plugin no longer valid");
-
-    for (int32_t DevNo = 0, NumDev = SpecificPlugin->getNumDevices();
-         DevNo < NumDev; ++DevNo)
-      if (auto Err = SpecificPlugin->deinitDevice(DevNo))
-        return Err;
-
-    // Deinitialize the plugin.
-    if (auto Err = SpecificPlugin->deinit())
-      return Err;
-
-    // Delete the plugin instance.
-    delete SpecificPlugin;
-
-    // Invalidate the plugin reference.
-    SpecificPlugin = nullptr;
-
-    return Plugin::success();
-  }
-
-public:
-  /// Initialize the plugin if needed. The plugin could have been initialized by
-  /// a previous call to Plugin::get().
-  static Error initIfNeeded() {
-    // Trigger the initialization if needed.
-    get();
-
-    return Error::success();
-  }
-
-  /// Get a reference (or create if it was not created) to the plugin instance.
-  static GenericPluginTy &get() {
-    // This static variable will initialize the underlying plugin instance in
-    // case there was no previous explicit initialization. The initialization is
-    // thread safe.
-    static PluginTy Plugin;
-
-    assert(SpecificPlugin && "Plugin is not active");
-    return *SpecificPlugin;
-  }
-
-  /// Get a reference to the plugin with a specific plugin-specific type.
-  template <typename Ty> static Ty &get() { return static_cast<Ty &>(get()); }
-
-  /// Indicate whether the plugin is active.
-  static bool isActive() { return SpecificPlugin != nullptr; }
-
-  /// Create a plugin instance.
-  static GenericPluginTy *createPlugin();
-};
-
-/// Auxiliary interface class for GenericDeviceResourceManagerTy. This class
-/// acts as a reference to a device resource, such as a stream, and requires
-/// some basic functions to be implemented. The derived class should define an
-/// empty constructor that creates an empty and invalid resource reference. Do
-/// not create a new resource on the ctor, but on the create() function instead.
-///
-/// The derived class should also define the type HandleTy as the underlying
-/// resource handle type. For instance, in a CUDA stream it would be:
-///   using HandleTy = CUstream;
-struct GenericDeviceResourceRef {
-  /// Create a new resource and stores a reference.
-  virtual Error create(GenericDeviceTy &Device) = 0;
-
-  /// Destroy and release the resources pointed by the reference.
-  virtual Error destroy(GenericDeviceTy &Device) = 0;
-
-protected:
-  ~GenericDeviceResourceRef() = default;
-};
-
-/// Class that implements a resource pool belonging to a device. This class
-/// operates with references to the actual resources. These reference must
-/// derive from the GenericDeviceResourceRef class and implement the create
-/// and destroy virtual functions.
-template <typename ResourceRef> class GenericDeviceResourceManagerTy {
-  using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
-  using ResourceHandleTy = typename ResourceRef::HandleTy;
-
-public:
-  /// Create an empty resource pool for a specific device.
-  GenericDeviceResourceManagerTy(GenericDeviceTy &Device)
-      : Device(Device), NextAvailable(0) {}
-
-  /// Destroy the resource pool. At this point, the deinit() function should
-  /// already have been executed so the resource pool should be empty.
-  virtual ~GenericDeviceResourceManagerTy() {
-    assert(ResourcePool.empty() && "Resource pool not empty");
-  }
-
-  /// Initialize the resource pool.
-  Error init(uint32_t InitialSize) {
-    assert(ResourcePool.empty() && "Resource pool already initialized");
-    return ResourcePoolTy::resizeResourcePool(InitialSize);
-  }
-
-  /// Deinitialize the resource pool and delete all resources. This function
-  /// must be called before the destructor.
-  virtual Error deinit() {
-    if (NextAvailable)
-      DP("Missing %d resources to be returned\n", NextAvailable);
-
-    // TODO: This prevents a bug on libomptarget to make the plugins fail. There
-    // may be some resources not returned. Do not destroy these ones.
-    if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable))
-      return Err;
-
-    ResourcePool.clear();
-
-    return Plugin::success();
-  }
-
-  /// Get a resource from the pool or create new ones. If the function
-  /// succeeds, the handle to the resource is saved in \p Handle.
-  virtual Error getResource(ResourceHandleTy &Handle) {
-    // Get a resource with an empty resource processor.
-    return getResourcesImpl(1, &Handle,
-                            [](ResourceHandleTy) { return Plugin::success(); });
-  }
-
-  /// Get multiple resources from the pool or create new ones. If the function
-  /// succeeds, the handles to the resources are saved in \p Handles.
-  virtual Error getResources(uint32_t Num, ResourceHandleTy *Handles) {
-    // Get resources with an empty resource processor.
-    return getResourcesImpl(Num, Handles,
-                            [](ResourceHandleTy) { return Plugin::success(); });
-  }
-
-  /// Return resource to the pool.
-  virtual Error returnResource(ResourceHandleTy Handle) {
-    // Return a resource with an empty resource processor.
-    return returnResourceImpl(
-        Handle, [](ResourceHandleTy) { return Plugin::success(); });
-  }
-
-protected:
-  /// Get multiple resources from the pool or create new ones. If the function
-  /// succeeds, the handles to the resources are saved in \p Handles. Also
-  /// process each of the obtained resources with \p Processor.
-  template <typename FuncTy>
-  Error getResourcesImpl(uint32_t Num, ResourceHandleTy *Handles,
-                         FuncTy Processor) {
-    const std::lock_guard<std::mutex> Lock(Mutex);
-
-    assert(NextAvailable <= ResourcePool.size() &&
-           "Resource pool is corrupted");
-
-    if (NextAvailable + Num > ResourcePool.size())
-      // Double the resource pool or resize it to provide the requested ones.
-      if (auto Err = ResourcePoolTy::resizeResourcePool(
-              std::max(NextAvailable * 2, NextAvailable + Num)))
-        return Err;
-
-    // Save the handles in the output array parameter.
-    for (uint32_t r = 0; r < Num; ++r)
-      Handles[r] = ResourcePool[NextAvailable + r];
-
-    // Process all obtained resources.
-    for (uint32_t r = 0; r < Num; ++r)
-      if (auto Err = Processor(Handles[r]))
-        return Err;
-
-    NextAvailable += Num;
-
-    return Plugin::success();
-  }
-
-  /// Return resource to the pool and process the resource with \p Processor.
-  template <typename FuncTy>
-  Error returnResourceImpl(ResourceHandleTy Handle, FuncTy Processor) {
-    const std::lock_guard<std::mutex> Lock(Mutex);
-
-    // Process the returned resource.
-    if (auto Err = Processor(Handle))
-      return Err;
-
-    assert(NextAvailable > 0 && "Resource pool is corrupted");
-    ResourcePool[--NextAvailable] = Handle;
-
-    return Plugin::success();
-  }
-
-protected:
-  /// The resources between \p OldSize and \p NewSize need to be created or
-  /// destroyed. The mutex is locked when this function is called.
-  Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) {
-    assert(OldSize != NewSize && "Resizing to the same size");
-
-    if (auto Err = Device.setContext())
-      return Err;
-
-    if (OldSize < NewSize) {
-      // Create new resources.
-      for (uint32_t I = OldSize; I < NewSize; ++I) {
-        if (auto Err = ResourcePool[I].create(Device))
-          return Err;
-      }
-    } else {
-      // Destroy the obsolete resources.
-      for (uint32_t I = NewSize; I < OldSize; ++I) {
-        if (auto Err = ResourcePool[I].destroy(Device))
-          return Err;
-      }
-    }
-    return Plugin::success();
-  }
-
-  /// Increase or decrease the number of resources. This function should
-  /// be called with the mutex acquired.
-  Error resizeResourcePool(uint32_t NewSize) {
-    uint32_t OldSize = ResourcePool.size();
-
-    // Nothing to do.
-    if (OldSize == NewSize)
-      return Plugin::success();
-
-    if (OldSize < NewSize) {
-      // Increase the number of resources.
-      ResourcePool.resize(NewSize);
-      return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
-    }
-
-    // Decrease the number of resources otherwise.
-    auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
-    ResourcePool.resize(NewSize);
-
-    return Err;
-  }
-
-  /// The device to which the resources belong
-  GenericDeviceTy &Device;
-
-  /// Mutex for the resource pool.
-  std::mutex Mutex;
-
-  /// The next available resource in the pool.
-  uint32_t NextAvailable;
-
-  /// The actual resource pool.
-  std::deque<ResourceRef> ResourcePool;
-};
-
-/// A static check on whether or not we support RPC in libomptarget.
-bool libomptargetSupportsRPC();
-
-} // namespace plugin
-} // namespace target
-} // namespace omp
-} // namespace llvm
-
-#endif // OPENMP_LIBOMPTARGET_PLUGINS_COMMON_PLUGININTERFACE_H

diff --git a/libomptarget/plugins-nextgen/common/include/RPC.h b/libomptarget/plugins-nextgen/common/include/RPC.h
deleted file mode 100644
index 01bf539..0000000
--- a/libomptarget/plugins-nextgen/common/include/RPC.h
+++ /dev/null

@@ -1,69 +0,0 @@
-//===- RPC.h - Interface for remote procedure calls from the GPU ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides the interface to support remote procedure calls (RPC) from
-// the GPU. This is required to implement host services like printf or malloc.
-// The interface to the RPC server is provided by the 'libc' project in LLVM.
-// For more information visit https://libc.llvm.org/gpu/.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H
-#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Support/Error.h"
-
-#include <cstdint>
-
-namespace llvm::omp::target {
-namespace plugin {
-struct GenericPluginTy;
-struct GenericDeviceTy;
-class GenericGlobalHandlerTy;
-class DeviceImageTy;
-} // namespace plugin
-
-/// A generic class implementing the interface between the RPC server provided
-/// by the 'libc' project and 'libomptarget'. If the RPC server is not availible
-/// these routines will perform no action.
-struct RPCServerTy {
-public:
-  /// Initializes the handles to the number of devices we may need to service.
-  RPCServerTy(plugin::GenericPluginTy &Plugin);
-
-  /// Check if this device image is using an RPC server. This checks for the
-  /// precense of an externally visible symbol in the device image that will
-  /// be present whenever RPC code is called.
-  llvm::Expected<bool> isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
-                                        plugin::GenericGlobalHandlerTy &Handler,
-                                        plugin::DeviceImageTy &Image);
-
-  /// Initialize the RPC server for the given device. This will allocate host
-  /// memory for the internal server and copy the data to the client on the
-  /// device. The device must be loaded before this is valid.
-  llvm::Error initDevice(plugin::GenericDeviceTy &Device,
-                         plugin::GenericGlobalHandlerTy &Handler,
-                         plugin::DeviceImageTy &Image);
-
-  /// Runs the RPC server associated with the \p Device until the pending work
-  /// is cleared.
-  llvm::Error runServer(plugin::GenericDeviceTy &Device);
-
-  /// Deinitialize the RPC server for the given device. This will free the
-  /// memory associated with the k
-  llvm::Error deinitDevice(plugin::GenericDeviceTy &Device);
-
-private:
-  /// Array from this device's identifier to its attached devices.
-  llvm::SmallVector<uintptr_t> Handles;
-};
-
-} // namespace llvm::omp::target
-
-#endif

diff --git a/libomptarget/plugins-nextgen/common/include/Utils/ELF.h b/libomptarget/plugins-nextgen/common/include/Utils/ELF.h
deleted file mode 100644
index 88c83d3..0000000
--- a/libomptarget/plugins-nextgen/common/include/Utils/ELF.h
+++ /dev/null

@@ -1,44 +0,0 @@
-//===-- Utils/ELF.h - Common ELF functionality ------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Common ELF functionality for target plugins.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_ELF_UTILS_H
-#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_ELF_UTILS_H
-
-#include "Shared/PluginAPI.h"
-
-#include "llvm/Object/ELF.h"
-#include "llvm/Object/ELFObjectFile.h"
-
-namespace utils {
-namespace elf {
-
-/// Returns true or false if the \p Buffer is an ELF file.
-bool isELF(llvm::StringRef Buffer);
-
-/// Checks if the given \p Object is a valid ELF matching the e_machine value.
-llvm::Expected<bool> checkMachine(llvm::StringRef Object, uint16_t EMachine);
-
-/// Returns a pointer to the given \p Symbol inside of an ELF object.
-llvm::Expected<const void *>
-getSymbolAddress(const llvm::object::ELFSymbolRef &Symbol);
-
-/// Returns the symbol associated with the \p Name in the \p ELFObj. It will
-/// first search for the hash sections to identify symbols from the hash table.
-/// If that fails it will fall back to a linear search in the case of an
-/// executable file without a hash table.
-llvm::Expected<std::optional<llvm::object::ELFSymbolRef>>
-getSymbol(const llvm::object::ObjectFile &ELFObj, llvm::StringRef Name);
-
-} // namespace elf
-} // namespace utils
-
-#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_ELF_UTILS_H

diff --git a/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
deleted file mode 100644
index ba0aa47..0000000
--- a/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ /dev/null

@@ -1,163 +0,0 @@
-//===- GlobalHandler.cpp - Target independent global & env. var handling --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Target independent global handler and environment manager.
-//
-//===----------------------------------------------------------------------===//
-
-#include "GlobalHandler.h"
-#include "PluginInterface.h"
-#include "Utils/ELF.h"
-
-#include "Shared/Utils.h"
-
-#include "llvm/Support/Error.h"
-
-#include <cstring>
-
-using namespace llvm;
-using namespace omp;
-using namespace target;
-using namespace plugin;
-
-Expected<std::unique_ptr<ObjectFile>>
-GenericGlobalHandlerTy::getELFObjectFile(DeviceImageTy &Image) {
-  assert(utils::elf::isELF(Image.getMemoryBuffer().getBuffer()) &&
-         "Input is not an ELF file");
-
-  return ELFObjectFileBase::createELFObjectFile(Image.getMemoryBuffer());
-}
-
-Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost(
-    GenericDeviceTy &Device, DeviceImageTy &Image, const GlobalTy &HostGlobal,
-    bool Device2Host) {
-
-  GlobalTy DeviceGlobal(HostGlobal.getName(), HostGlobal.getSize());
-
-  // Get the metadata from the global on the device.
-  if (auto Err = getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
-    return Err;
-
-  // Perform the actual transfer.
-  return moveGlobalBetweenDeviceAndHost(Device, HostGlobal, DeviceGlobal,
-                                        Device2Host);
-}
-
-/// Actually move memory between host and device. See readGlobalFromDevice and
-/// writeGlobalToDevice for the interface description.
-Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost(
-    GenericDeviceTy &Device, const GlobalTy &HostGlobal,
-    const GlobalTy &DeviceGlobal, bool Device2Host) {
-
-  // Transfer the data from the source to the destination.
-  if (Device2Host) {
-    if (auto Err =
-            Device.dataRetrieve(HostGlobal.getPtr(), DeviceGlobal.getPtr(),
-                                HostGlobal.getSize(), nullptr))
-      return Err;
-  } else {
-    if (auto Err = Device.dataSubmit(DeviceGlobal.getPtr(), HostGlobal.getPtr(),
-                                     HostGlobal.getSize(), nullptr))
-      return Err;
-  }
-
-  DP("Succesfully %s %u bytes associated with global symbol '%s' %s the "
-     "device "
-     "(%p -> %p).\n",
-     Device2Host ? "read" : "write", HostGlobal.getSize(),
-     HostGlobal.getName().data(), Device2Host ? "from" : "to",
-     DeviceGlobal.getPtr(), HostGlobal.getPtr());
-
-  return Plugin::success();
-}
-
-bool GenericGlobalHandlerTy::isSymbolInImage(GenericDeviceTy &Device,
-                                             DeviceImageTy &Image,
-                                             StringRef SymName) {
-  // Get the ELF object file for the image. Notice the ELF object may already
-  // be created in previous calls, so we can reuse it. If this is unsuccessful
-  // just return false as we couldn't find it.
-  auto ELFObjOrErr = getELFObjectFile(Image);
-  if (!ELFObjOrErr) {
-    consumeError(ELFObjOrErr.takeError());
-    return false;
-  }
-
-  // Search the ELF symbol using the symbol name.
-  auto SymOrErr = utils::elf::getSymbol(**ELFObjOrErr, SymName);
-  if (!SymOrErr) {
-    consumeError(SymOrErr.takeError());
-    return false;
-  }
-
-  return SymOrErr->has_value();
-}
-
-Error GenericGlobalHandlerTy::getGlobalMetadataFromImage(
-    GenericDeviceTy &Device, DeviceImageTy &Image, GlobalTy &ImageGlobal) {
-
-  // Get the ELF object file for the image. Notice the ELF object may already
-  // be created in previous calls, so we can reuse it.
-  auto ELFObj = getELFObjectFile(Image);
-  if (!ELFObj)
-    return ELFObj.takeError();
-
-  // Search the ELF symbol using the symbol name.
-  auto SymOrErr = utils::elf::getSymbol(**ELFObj, ImageGlobal.getName());
-  if (!SymOrErr)
-    return Plugin::error("Failed ELF lookup of global '%s': %s",
-                         ImageGlobal.getName().data(),
-                         toString(SymOrErr.takeError()).data());
-
-  if (!SymOrErr->has_value())
-    return Plugin::error("Failed to find global symbol '%s' in the ELF image",
-                         ImageGlobal.getName().data());
-
-  auto AddrOrErr = utils::elf::getSymbolAddress(**SymOrErr);
-  // Get the section to which the symbol belongs.
-  if (!AddrOrErr)
-    return Plugin::error("Failed to get ELF symbol from global '%s': %s",
-                         ImageGlobal.getName().data(),
-                         toString(AddrOrErr.takeError()).data());
-
-  // Setup the global symbol's address and size.
-  ImageGlobal.setPtr(const_cast<void *>(*AddrOrErr));
-  ImageGlobal.setSize((*SymOrErr)->getSize());
-
-  return Plugin::success();
-}
-
-Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
-                                                  DeviceImageTy &Image,
-                                                  const GlobalTy &HostGlobal) {
-
-  GlobalTy ImageGlobal(HostGlobal.getName(), -1);
-  if (auto Err = getGlobalMetadataFromImage(Device, Image, ImageGlobal))
-    return Err;
-
-  if (ImageGlobal.getSize() != HostGlobal.getSize())
-    return Plugin::error("Transfer failed because global symbol '%s' has "
-                         "%u bytes in the ELF image but %u bytes on the host",
-                         HostGlobal.getName().data(), ImageGlobal.getSize(),
-                         HostGlobal.getSize());
-
-  DP("Global symbol '%s' was found in the ELF image and %u bytes will copied "
-     "from %p to %p.\n",
-     HostGlobal.getName().data(), HostGlobal.getSize(), ImageGlobal.getPtr(),
-     HostGlobal.getPtr());
-
-  assert(Image.getStart() <= ImageGlobal.getPtr() &&
-         advanceVoidPtr(ImageGlobal.getPtr(), ImageGlobal.getSize()) <
-             advanceVoidPtr(Image.getStart(), Image.getSize()) &&
-         "Attempting to read outside the image!");
-
-  // Perform the copy from the image to the host memory.
-  std::memcpy(HostGlobal.getPtr(), ImageGlobal.getPtr(), HostGlobal.getSize());
-
-  return Plugin::success();
-}

diff --git a/libomptarget/plugins-nextgen/common/src/JIT.cpp b/libomptarget/plugins-nextgen/common/src/JIT.cpp
deleted file mode 100644
index 9eb610c..0000000
--- a/libomptarget/plugins-nextgen/common/src/JIT.cpp
+++ /dev/null

@@ -1,347 +0,0 @@
-//===- JIT.cpp - Target independent JIT infrastructure --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "JIT.h"
-
-#include "Shared/Debug.h"
-#include "Shared/Utils.h"
-
-#include "PluginInterface.h"
-#include "omptarget.h"
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/CommandFlags.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LLVMRemarkStreamer.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Object/IRObjectFile.h"
-#include "llvm/Passes/OptimizationLevel.h"
-#include "llvm/Passes/PassBuilder.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/TimeProfiler.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/TargetParser/SubtargetFeature.h"
-
-#include <mutex>
-#include <shared_mutex>
-#include <system_error>
-
-using namespace llvm;
-using namespace llvm::object;
-using namespace omp;
-using namespace omp::target;
-
-namespace {
-
-bool isImageBitcode(const __tgt_device_image &Image) {
-  StringRef Binary(reinterpret_cast<const char *>(Image.ImageStart),
-                   target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
-
-  return identify_magic(Binary) == file_magic::bitcode;
-}
-
-std::once_flag InitFlag;
-
-void init(Triple TT) {
-  codegen::RegisterCodeGenFlags();
-#ifdef LIBOMPTARGET_JIT_NVPTX
-  if (TT.isNVPTX()) {
-    LLVMInitializeNVPTXTargetInfo();
-    LLVMInitializeNVPTXTarget();
-    LLVMInitializeNVPTXTargetMC();
-    LLVMInitializeNVPTXAsmPrinter();
-  }
-#endif
-#ifdef LIBOMPTARGET_JIT_AMDGPU
-  if (TT.isAMDGPU()) {
-    LLVMInitializeAMDGPUTargetInfo();
-    LLVMInitializeAMDGPUTarget();
-    LLVMInitializeAMDGPUTargetMC();
-    LLVMInitializeAMDGPUAsmPrinter();
-  }
-#endif
-}
-
-Expected<std::unique_ptr<Module>>
-createModuleFromMemoryBuffer(std::unique_ptr<MemoryBuffer> &MB,
-                             LLVMContext &Context) {
-  SMDiagnostic Err;
-  auto Mod = parseIR(*MB, Err, Context);
-  if (!Mod)
-    return make_error<StringError>("Failed to create module",
-                                   inconvertibleErrorCode());
-  return std::move(Mod);
-}
-Expected<std::unique_ptr<Module>>
-createModuleFromImage(const __tgt_device_image &Image, LLVMContext &Context) {
-  StringRef Data((const char *)Image.ImageStart,
-                 target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
-  std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer(
-      Data, /*BufferName=*/"", /*RequiresNullTerminator=*/false);
-  return createModuleFromMemoryBuffer(MB, Context);
-}
-
-OptimizationLevel getOptLevel(unsigned OptLevel) {
-  switch (OptLevel) {
-  case 0:
-    return OptimizationLevel::O0;
-  case 1:
-    return OptimizationLevel::O1;
-  case 2:
-    return OptimizationLevel::O2;
-  case 3:
-    return OptimizationLevel::O3;
-  }
-  llvm_unreachable("Invalid optimization level");
-}
-
-Expected<std::unique_ptr<TargetMachine>>
-createTargetMachine(Module &M, std::string CPU, unsigned OptLevel) {
-  Triple TT(M.getTargetTriple());
-  std::optional<CodeGenOptLevel> CGOptLevelOrNone =
-      CodeGenOpt::getLevel(OptLevel);
-  assert(CGOptLevelOrNone && "Invalid optimization level");
-  CodeGenOptLevel CGOptLevel = *CGOptLevelOrNone;
-
-  std::string Msg;
-  const Target *T = TargetRegistry::lookupTarget(M.getTargetTriple(), Msg);
-  if (!T)
-    return make_error<StringError>(Msg, inconvertibleErrorCode());
-
-  SubtargetFeatures Features;
-  Features.getDefaultSubtargetFeatures(TT);
-
-  std::optional<Reloc::Model> RelocModel;
-  if (M.getModuleFlag("PIC Level"))
-    RelocModel =
-        M.getPICLevel() == PICLevel::NotPIC ? Reloc::Static : Reloc::PIC_;
-
-  std::optional<CodeModel::Model> CodeModel = M.getCodeModel();
-
-  TargetOptions Options = codegen::InitTargetOptionsFromCodeGenFlags(TT);
-
-  std::unique_ptr<TargetMachine> TM(
-      T->createTargetMachine(M.getTargetTriple(), CPU, Features.getString(),
-                             Options, RelocModel, CodeModel, CGOptLevel));
-  if (!TM)
-    return make_error<StringError>("Failed to create target machine",
-                                   inconvertibleErrorCode());
-  return std::move(TM);
-}
-
-} // namespace
-
-JITEngine::JITEngine(Triple::ArchType TA) : TT(Triple::getArchTypeName(TA)) {
-  std::call_once(InitFlag, init, TT);
-}
-
-void JITEngine::opt(TargetMachine *TM, TargetLibraryInfoImpl *TLII, Module &M,
-                    unsigned OptLevel) {
-  PipelineTuningOptions PTO;
-  std::optional<PGOOptions> PGOOpt;
-
-  LoopAnalysisManager LAM;
-  FunctionAnalysisManager FAM;
-  CGSCCAnalysisManager CGAM;
-  ModuleAnalysisManager MAM;
-  ModulePassManager MPM;
-
-  PassBuilder PB(TM, PTO, PGOOpt, nullptr);
-
-  FAM.registerPass([&] { return TargetLibraryAnalysis(*TLII); });
-
-  // Register all the basic analyses with the managers.
-  PB.registerModuleAnalyses(MAM);
-  PB.registerCGSCCAnalyses(CGAM);
-  PB.registerFunctionAnalyses(FAM);
-  PB.registerLoopAnalyses(LAM);
-  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
-
-  MPM.addPass(PB.buildPerModuleDefaultPipeline(getOptLevel(OptLevel)));
-  MPM.run(M, MAM);
-}
-
-void JITEngine::codegen(TargetMachine *TM, TargetLibraryInfoImpl *TLII,
-                        Module &M, raw_pwrite_stream &OS) {
-  legacy::PassManager PM;
-  PM.add(new TargetLibraryInfoWrapperPass(*TLII));
-  MachineModuleInfoWrapperPass *MMIWP = new MachineModuleInfoWrapperPass(
-      reinterpret_cast<LLVMTargetMachine *>(TM));
-  TM->addPassesToEmitFile(PM, OS, nullptr,
-                          TT.isNVPTX() ? CodeGenFileType::AssemblyFile
-                                       : CodeGenFileType::ObjectFile,
-                          /*DisableVerify=*/false, MMIWP);
-
-  PM.run(M);
-}
-
-Expected<std::unique_ptr<MemoryBuffer>>
-JITEngine::backend(Module &M, const std::string &ComputeUnitKind,
-                   unsigned OptLevel) {
-
-  auto RemarksFileOrErr = setupLLVMOptimizationRemarks(
-      M.getContext(), /*RemarksFilename=*/"", /*RemarksPasses=*/"",
-      /*RemarksFormat=*/"", /*RemarksWithHotness=*/false);
-  if (Error E = RemarksFileOrErr.takeError())
-    return std::move(E);
-  if (*RemarksFileOrErr)
-    (*RemarksFileOrErr)->keep();
-
-  auto TMOrErr = createTargetMachine(M, ComputeUnitKind, OptLevel);
-  if (!TMOrErr)
-    return TMOrErr.takeError();
-
-  std::unique_ptr<TargetMachine> TM = std::move(*TMOrErr);
-  TargetLibraryInfoImpl TLII(TT);
-
-  if (PreOptIRModuleFileName.isPresent()) {
-    std::error_code EC;
-    raw_fd_stream FD(PreOptIRModuleFileName.get(), EC);
-    if (EC)
-      return createStringError(
-          EC, "Could not open %s to write the pre-opt IR module\n",
-          PreOptIRModuleFileName.get().c_str());
-    M.print(FD, nullptr);
-  }
-
-  if (!JITSkipOpt)
-    opt(TM.get(), &TLII, M, OptLevel);
-
-  if (PostOptIRModuleFileName.isPresent()) {
-    std::error_code EC;
-    raw_fd_stream FD(PostOptIRModuleFileName.get(), EC);
-    if (EC)
-      return createStringError(
-          EC, "Could not open %s to write the post-opt IR module\n",
-          PreOptIRModuleFileName.get().c_str());
-    M.print(FD, nullptr);
-  }
-
-  // Prepare the output buffer and stream for codegen.
-  SmallVector<char> CGOutputBuffer;
-  raw_svector_ostream OS(CGOutputBuffer);
-
-  codegen(TM.get(), &TLII, M, OS);
-
-  return MemoryBuffer::getMemBufferCopy(OS.str());
-}
-
-Expected<std::unique_ptr<MemoryBuffer>>
-JITEngine::getOrCreateObjFile(const __tgt_device_image &Image, LLVMContext &Ctx,
-                              const std::string &ComputeUnitKind) {
-
-  // Check if the user replaces the module at runtime with a finished object.
-  if (ReplacementObjectFileName.isPresent()) {
-    auto MBOrErr =
-        MemoryBuffer::getFileOrSTDIN(ReplacementObjectFileName.get());
-    if (!MBOrErr)
-      return createStringError(MBOrErr.getError(),
-                               "Could not read replacement obj from %s\n",
-                               ReplacementModuleFileName.get().c_str());
-    return std::move(*MBOrErr);
-  }
-
-  Module *Mod = nullptr;
-  // Check if the user replaces the module at runtime or we read it from the
-  // image.
-  // TODO: Allow the user to specify images per device (Arch + ComputeUnitKind).
-  if (!ReplacementModuleFileName.isPresent()) {
-    auto ModOrErr = createModuleFromImage(Image, Ctx);
-    if (!ModOrErr)
-      return ModOrErr.takeError();
-    Mod = ModOrErr->release();
-  } else {
-    auto MBOrErr =
-        MemoryBuffer::getFileOrSTDIN(ReplacementModuleFileName.get());
-    if (!MBOrErr)
-      return createStringError(MBOrErr.getError(),
-                               "Could not read replacement module from %s\n",
-                               ReplacementModuleFileName.get().c_str());
-    auto ModOrErr = createModuleFromMemoryBuffer(MBOrErr.get(), Ctx);
-    if (!ModOrErr)
-      return ModOrErr.takeError();
-    Mod = ModOrErr->release();
-  }
-
-  return backend(*Mod, ComputeUnitKind, JITOptLevel);
-}
-
-Expected<const __tgt_device_image *>
-JITEngine::compile(const __tgt_device_image &Image,
-                   const std::string &ComputeUnitKind,
-                   PostProcessingFn PostProcessing) {
-  std::lock_guard<std::mutex> Lock(ComputeUnitMapMutex);
-
-  // Check if we JITed this image for the given compute unit kind before.
-  ComputeUnitInfo &CUI = ComputeUnitMap[ComputeUnitKind];
-  if (__tgt_device_image *JITedImage = CUI.TgtImageMap.lookup(&Image))
-    return JITedImage;
-
-  auto ObjMBOrErr = getOrCreateObjFile(Image, CUI.Context, ComputeUnitKind);
-  if (!ObjMBOrErr)
-    return ObjMBOrErr.takeError();
-
-  auto ImageMBOrErr = PostProcessing(std::move(*ObjMBOrErr));
-  if (!ImageMBOrErr)
-    return ImageMBOrErr.takeError();
-
-  CUI.JITImages.push_back(std::move(*ImageMBOrErr));
-  __tgt_device_image *&JITedImage = CUI.TgtImageMap[&Image];
-  JITedImage = new __tgt_device_image();
-  *JITedImage = Image;
-
-  auto &ImageMB = CUI.JITImages.back();
-
-  JITedImage->ImageStart = const_cast<char *>(ImageMB->getBufferStart());
-  JITedImage->ImageEnd = const_cast<char *>(ImageMB->getBufferEnd());
-
-  return JITedImage;
-}
-
-Expected<const __tgt_device_image *>
-JITEngine::process(const __tgt_device_image &Image,
-                   target::plugin::GenericDeviceTy &Device) {
-  const std::string &ComputeUnitKind = Device.getComputeUnitKind();
-
-  PostProcessingFn PostProcessing = [&Device](std::unique_ptr<MemoryBuffer> MB)
-      -> Expected<std::unique_ptr<MemoryBuffer>> {
-    return Device.doJITPostProcessing(std::move(MB));
-  };
-
-  if (isImageBitcode(Image))
-    return compile(Image, ComputeUnitKind, PostProcessing);
-
-  return &Image;
-}
-
-Expected<bool> JITEngine::checkBitcodeImage(StringRef Buffer) const {
-  TimeTraceScope TimeScope("Check bitcode image");
-
-  assert(identify_magic(Buffer) == file_magic::bitcode &&
-         "Input is not bitcode");
-
-  LLVMContext Context;
-  auto ModuleOrErr = getLazyBitcodeModule(MemoryBufferRef(Buffer, ""), Context,
-                                          /*ShouldLazyLoadMetadata=*/true);
-  if (!ModuleOrErr)
-    return ModuleOrErr.takeError();
-  Module &M = **ModuleOrErr;
-
-  return Triple(M.getTargetTriple()).getArch() == TT.getArch();
-}

diff --git a/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
deleted file mode 100644
index b5f3c45..0000000
--- a/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ /dev/null

@@ -1,2225 +0,0 @@
-//===- PluginInterface.cpp - Target independent plugin device interface ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "PluginInterface.h"
-
-#include "Shared/APITypes.h"
-#include "Shared/Debug.h"
-#include "Shared/Environment.h"
-#include "Shared/PluginAPI.h"
-
-#include "GlobalHandler.h"
-#include "JIT.h"
-#include "Utils/ELF.h"
-#include "omptarget.h"
-
-#ifdef OMPT_SUPPORT
-#include "OpenMP/OMPT/Callback.h"
-#include "omp-tools.h"
-#endif
-
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/JSON.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/MemoryBuffer.h"
-
-#include <cstdint>
-#include <limits>
-
-using namespace llvm;
-using namespace omp;
-using namespace target;
-using namespace plugin;
-
-GenericPluginTy *PluginTy::SpecificPlugin = nullptr;
-
-// TODO: Fix any thread safety issues for multi-threaded kernel recording.
-struct RecordReplayTy {
-
-  // Describes the state of the record replay mechanism.
-  enum RRStatusTy { RRDeactivated = 0, RRRecording, RRReplaying };
-
-private:
-  // Memory pointers for recording, replaying memory.
-  void *MemoryStart = nullptr;
-  void *MemoryPtr = nullptr;
-  size_t MemorySize = 0;
-  size_t TotalSize = 0;
-  GenericDeviceTy *Device = nullptr;
-  std::mutex AllocationLock;
-
-  RRStatusTy Status = RRDeactivated;
-  bool ReplaySaveOutput = false;
-  bool UsedVAMap = false;
-  uintptr_t MemoryOffset = 0;
-
-  // A list of all globals mapped to the device.
-  struct GlobalEntry {
-    const char *Name;
-    uint64_t Size;
-    void *Addr;
-  };
-  llvm::SmallVector<GlobalEntry> GlobalEntries{};
-
-  void *suggestAddress(uint64_t MaxMemoryAllocation) {
-    // Get a valid pointer address for this system
-    void *Addr =
-        Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
-    Device->free(Addr);
-    // Align Address to MaxMemoryAllocation
-    Addr = (void *)alignPtr((Addr), MaxMemoryAllocation);
-    return Addr;
-  }
-
-  Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
-    size_t ASize = MaxMemoryAllocation;
-
-    if (!VAddr && isRecording())
-      VAddr = suggestAddress(MaxMemoryAllocation);
-
-    DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation, VAddr);
-
-    if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
-      return Err;
-
-    if (isReplaying() && VAddr != MemoryStart) {
-      return Plugin::error("Record-Replay cannot assign the"
-                           "requested recorded address (%p, %p)",
-                           VAddr, MemoryStart);
-    }
-
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
-         "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart);
-
-    MemoryPtr = MemoryStart;
-    MemorySize = 0;
-    TotalSize = ASize;
-    UsedVAMap = true;
-    return Plugin::success();
-  }
-
-  Error preAllocateHeuristic(uint64_t MaxMemoryAllocation,
-                             uint64_t RequiredMemoryAllocation, void *VAddr) {
-    const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
-    constexpr size_t STEP = 1024 * 1024 * 1024ULL;
-    MemoryStart = nullptr;
-    for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) {
-      MemoryStart =
-          Device->allocate(TotalSize, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT);
-      if (MemoryStart)
-        break;
-    }
-    if (!MemoryStart)
-      return Plugin::error("Allocating record/replay memory");
-
-    if (VAddr && VAddr != MemoryStart)
-      MemoryOffset = uintptr_t(VAddr) - uintptr_t(MemoryStart);
-
-    MemoryPtr = MemoryStart;
-    MemorySize = 0;
-
-    // Check if we need adjustment.
-    if (MemoryOffset > 0 &&
-        TotalSize >= RequiredMemoryAllocation + MemoryOffset) {
-      // If we are off but "before" the required address and with enough space,
-      // we just "allocate" the offset to match the required address.
-      MemoryPtr = (char *)MemoryPtr + MemoryOffset;
-      MemorySize += MemoryOffset;
-      MemoryOffset = 0;
-      assert(MemoryPtr == VAddr && "Expected offset adjustment to work");
-    } else if (MemoryOffset) {
-      // If we are off and in a situation we cannot just "waste" memory to force
-      // a match, we hope adjusting the arguments is sufficient.
-      REPORT(
-          "WARNING Failed to allocate replay memory at required location %p, "
-          "got %p, trying to offset argument pointers by %" PRIi64 "\n",
-          VAddr, MemoryStart, MemoryOffset);
-    }
-
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
-         "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
-         MemoryStart);
-
-    return Plugin::success();
-  }
-
-  Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
-    if (Device->supportVAManagement()) {
-      auto Err = preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
-      if (Err) {
-        REPORT("WARNING VA mapping failed, fallback to heuristic: "
-               "(Error: %s)\n",
-               toString(std::move(Err)).data());
-      }
-    }
-
-    uint64_t DevMemSize;
-    if (Device->getDeviceMemorySize(DevMemSize))
-      return Plugin::error("Cannot determine Device Memory Size");
-
-    return preAllocateHeuristic(DevMemSize, DeviceMemorySize, ReqVAddr);
-  }
-
-  void dumpDeviceMemory(StringRef Filename) {
-    ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
-        WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
-    if (!DeviceMemoryMB)
-      report_fatal_error("Error creating MemoryBuffer for device memory");
-
-    auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
-                                    MemoryStart, MemorySize, nullptr);
-    if (Err)
-      report_fatal_error("Error retrieving data for target pointer");
-
-    StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize);
-    std::error_code EC;
-    raw_fd_ostream OS(Filename, EC);
-    if (EC)
-      report_fatal_error("Error dumping memory to file " + Filename + " :" +
-                         EC.message());
-    OS << DeviceMemory;
-    OS.close();
-  }
-
-public:
-  bool isRecording() const { return Status == RRStatusTy::RRRecording; }
-  bool isReplaying() const { return Status == RRStatusTy::RRReplaying; }
-  bool isRecordingOrReplaying() const {
-    return (Status != RRStatusTy::RRDeactivated);
-  }
-  void setStatus(RRStatusTy Status) { this->Status = Status; }
-  bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
-  void addEntry(const char *Name, uint64_t Size, void *Addr) {
-    GlobalEntries.emplace_back(GlobalEntry{Name, Size, Addr});
-  }
-
-  void saveImage(const char *Name, const DeviceImageTy &Image) {
-    SmallString<128> ImageName = {Name, ".image"};
-    std::error_code EC;
-    raw_fd_ostream OS(ImageName, EC);
-    if (EC)
-      report_fatal_error("Error saving image : " + StringRef(EC.message()));
-    if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) {
-      size_t Size =
-          getPtrDiff(TgtImageBitcode->ImageEnd, TgtImageBitcode->ImageStart);
-      MemoryBufferRef MBR = MemoryBufferRef(
-          StringRef((const char *)TgtImageBitcode->ImageStart, Size), "");
-      OS << MBR.getBuffer();
-    } else {
-      OS << Image.getMemoryBuffer().getBuffer();
-    }
-    OS.close();
-  }
-
-  void dumpGlobals(StringRef Filename, DeviceImageTy &Image) {
-    int32_t Size = 0;
-
-    for (auto &OffloadEntry : GlobalEntries) {
-      if (!OffloadEntry.Size)
-        continue;
-      // Get the total size of the string and entry including the null byte.
-      Size += std::strlen(OffloadEntry.Name) + 1 + sizeof(uint32_t) +
-              OffloadEntry.Size;
-    }
-
-    ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
-        WritableMemoryBuffer::getNewUninitMemBuffer(Size);
-    if (!GlobalsMB)
-      report_fatal_error("Error creating MemoryBuffer for globals memory");
-
-    void *BufferPtr = GlobalsMB.get()->getBufferStart();
-    for (auto &OffloadEntry : GlobalEntries) {
-      if (!OffloadEntry.Size)
-        continue;
-
-      int32_t NameLength = std::strlen(OffloadEntry.Name) + 1;
-      memcpy(BufferPtr, OffloadEntry.Name, NameLength);
-      BufferPtr = advanceVoidPtr(BufferPtr, NameLength);
-
-      *((uint32_t *)(BufferPtr)) = OffloadEntry.Size;
-      BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t));
-
-      auto Err = Plugin::success();
-      {
-        if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.Addr,
-                                            OffloadEntry.Size, nullptr))
-          report_fatal_error("Error retrieving data for global");
-      }
-      if (Err)
-        report_fatal_error("Error retrieving data for global");
-      BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size);
-    }
-    assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
-           "Buffer over/under-filled.");
-    assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) &&
-           "Buffer size mismatch");
-
-    StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size);
-    std::error_code EC;
-    raw_fd_ostream OS(Filename, EC);
-    OS << GlobalsMemory;
-    OS.close();
-  }
-
-  void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs,
-                       uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
-                       uint64_t LoopTripCount) {
-    json::Object JsonKernelInfo;
-    JsonKernelInfo["Name"] = Name;
-    JsonKernelInfo["NumArgs"] = NumArgs;
-    JsonKernelInfo["NumTeamsClause"] = NumTeamsClause;
-    JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause;
-    JsonKernelInfo["LoopTripCount"] = LoopTripCount;
-    JsonKernelInfo["DeviceMemorySize"] = MemorySize;
-    JsonKernelInfo["DeviceId"] = Device->getDeviceId();
-    JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;
-
-    json::Array JsonArgPtrs;
-    for (int I = 0; I < NumArgs; ++I)
-      JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]);
-    JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
-
-    json::Array JsonArgOffsets;
-    for (int I = 0; I < NumArgs; ++I)
-      JsonArgOffsets.push_back(0);
-    JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
-
-    SmallString<128> JsonFilename = {Name, ".json"};
-    std::error_code EC;
-    raw_fd_ostream JsonOS(JsonFilename.str(), EC);
-    if (EC)
-      report_fatal_error("Error saving kernel json file : " +
-                         StringRef(EC.message()));
-    JsonOS << json::Value(std::move(JsonKernelInfo));
-    JsonOS.close();
-  }
-
-  void saveKernelInput(const char *Name, DeviceImageTy &Image) {
-    SmallString<128> GlobalsFilename = {Name, ".globals"};
-    dumpGlobals(GlobalsFilename, Image);
-
-    SmallString<128> MemoryFilename = {Name, ".memory"};
-    dumpDeviceMemory(MemoryFilename);
-  }
-
-  void saveKernelOutputInfo(const char *Name) {
-    SmallString<128> OutputFilename = {
-        Name, (isRecording() ? ".original.output" : ".replay.output")};
-    dumpDeviceMemory(OutputFilename);
-  }
-
-  void *alloc(uint64_t Size) {
-    assert(MemoryStart && "Expected memory has been pre-allocated");
-    void *Alloc = nullptr;
-    constexpr int Alignment = 16;
-    // Assumes alignment is a power of 2.
-    int64_t AlignedSize = (Size + (Alignment - 1)) & (~(Alignment - 1));
-    std::lock_guard<std::mutex> LG(AllocationLock);
-    Alloc = MemoryPtr;
-    MemoryPtr = (char *)MemoryPtr + AlignedSize;
-    MemorySize += AlignedSize;
-    DP("Memory Allocator return " DPxMOD "\n", DPxPTR(Alloc));
-    return Alloc;
-  }
-
-  Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
-             RRStatusTy Status, bool SaveOutput, uint64_t &ReqPtrArgOffset) {
-    this->Device = Device;
-    this->Status = Status;
-    this->ReplaySaveOutput = SaveOutput;
-
-    if (auto Err = preallocateDeviceMemory(MemSize, VAddr))
-      return Err;
-
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
-         "Record Replay Initialized (%p)"
-         " as starting address, %lu Memory Size"
-         " and set on status %s\n",
-         MemoryStart, TotalSize,
-         Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
-
-    // Tell the user to offset pointer arguments as the memory allocation does
-    // not match.
-    ReqPtrArgOffset = MemoryOffset;
-    return Plugin::success();
-  }
-
-  void deinit() {
-    if (UsedVAMap) {
-      if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
-        report_fatal_error("Error on releasing virtual memory space");
-    } else {
-      Device->free(MemoryStart);
-    }
-  }
-};
-
-static RecordReplayTy RecordReplay;
-
-// Extract the mapping of host function pointers to device function pointers
-// from the entry table. Functions marked as 'indirect' in OpenMP will have
-// offloading entries generated for them which map the host's function pointer
-// to a global containing the corresponding function pointer on the device.
-static Expected<std::pair<void *, uint64_t>>
-setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device,
-                       DeviceImageTy &Image) {
-  GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-
-  llvm::ArrayRef<__tgt_offload_entry> Entries(Image.getTgtImage()->EntriesBegin,
-                                              Image.getTgtImage()->EntriesEnd);
-  llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
-  for (const auto &Entry : Entries) {
-    if (Entry.size == 0 || !(Entry.flags & OMP_DECLARE_TARGET_INDIRECT))
-      continue;
-
-    assert(Entry.size == sizeof(void *) && "Global not a function pointer?");
-    auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
-
-    GlobalTy DeviceGlobal(Entry.name, Entry.size);
-    if (auto Err =
-            Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
-      return std::move(Err);
-
-    HstPtr = Entry.addr;
-    if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(),
-                                       Entry.size, nullptr))
-      return std::move(Err);
-  }
-
-  // If we do not have any indirect globals we exit early.
-  if (IndirectCallTable.empty())
-    return std::pair{nullptr, 0};
-
-  // Sort the array to allow for more efficient lookup of device pointers.
-  llvm::sort(IndirectCallTable,
-             [](const auto &x, const auto &y) { return x.first < y.first; });
-
-  uint64_t TableSize =
-      IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
-  void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE);
-  if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(),
-                                   TableSize, nullptr))
-    return std::move(Err);
-  return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
-}
-
-AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
-                                       __tgt_async_info *AsyncInfoPtr)
-    : Device(Device),
-      AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {}
-
-void AsyncInfoWrapperTy::finalize(Error &Err) {
-  assert(AsyncInfoPtr && "AsyncInfoWrapperTy already finalized");
-
-  // If we used a local async info object we want synchronous behavior. In that
-  // case, and assuming the current status code is correct, we will synchronize
-  // explicitly when the object is deleted. Update the error with the result of
-  // the synchronize operation.
-  if (AsyncInfoPtr == &LocalAsyncInfo && LocalAsyncInfo.Queue && !Err)
-    Err = Device.synchronize(&LocalAsyncInfo);
-
-  // Invalidate the wrapper object.
-  AsyncInfoPtr = nullptr;
-}
-
-Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
-                            DeviceImageTy &Image) {
-
-  ImagePtr = &Image;
-
-  // Retrieve kernel environment object for the kernel.
-  GlobalTy KernelEnv(std::string(Name) + "_kernel_environment",
-                     sizeof(KernelEnvironment), &KernelEnvironment);
-  GenericGlobalHandlerTy &GHandler = GenericDevice.Plugin.getGlobalHandler();
-  if (auto Err =
-          GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) {
-    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
-    DP("Failed to read kernel environment for '%s': %s\n"
-       "Using default SPMD (2) execution mode\n",
-       Name, ErrStr.data());
-    assert(KernelEnvironment.Configuration.ReductionDataSize == 0 &&
-           "Default initialization failed.");
-    IsBareKernel = true;
-  }
-
-  // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
-  MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0
-                      ? std::min(KernelEnvironment.Configuration.MaxThreads,
-                                 int32_t(GenericDevice.getThreadLimit()))
-                      : GenericDevice.getThreadLimit();
-
-  // Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref;
-  PreferredNumThreads =
-      KernelEnvironment.Configuration.MinThreads > 0
-          ? std::max(KernelEnvironment.Configuration.MinThreads,
-                     int32_t(GenericDevice.getDefaultNumThreads()))
-          : GenericDevice.getDefaultNumThreads();
-
-  return initImpl(GenericDevice, Image);
-}
-
-Expected<KernelLaunchEnvironmentTy *>
-GenericKernelTy::getKernelLaunchEnvironment(
-    GenericDeviceTy &GenericDevice, uint32_t Version,
-    AsyncInfoWrapperTy &AsyncInfoWrapper) const {
-  // Ctor/Dtor have no arguments, replaying uses the original kernel launch
-  // environment. Older versions of the compiler do not generate a kernel
-  // launch environment.
-  if (isCtorOrDtor() || RecordReplay.isReplaying() ||
-      Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
-    return nullptr;
-
-  if (!KernelEnvironment.Configuration.ReductionDataSize ||
-      !KernelEnvironment.Configuration.ReductionBufferLength)
-    return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
-
-  // TODO: Check if the kernel needs a launch environment.
-  auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
-                                            /*HostPtr=*/nullptr,
-                                            TargetAllocTy::TARGET_ALLOC_DEVICE);
-  if (!AllocOrErr)
-    return AllocOrErr.takeError();
-
-  // Remember to free the memory later.
-  AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
-
-  /// Use the KLE in the __tgt_async_info to ensure a stable address for the
-  /// async data transfer.
-  auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
-  LocalKLE = KernelLaunchEnvironment;
-  {
-    auto AllocOrErr = GenericDevice.dataAlloc(
-        KernelEnvironment.Configuration.ReductionDataSize *
-            KernelEnvironment.Configuration.ReductionBufferLength,
-        /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
-    if (!AllocOrErr)
-      return AllocOrErr.takeError();
-    LocalKLE.ReductionBuffer = *AllocOrErr;
-    // Remember to free the memory later.
-    AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
-  }
-
-  INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
-       "Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
-       ", Size=%" PRId64 ", Name=KernelLaunchEnv\n",
-       DPxPTR(&LocalKLE), DPxPTR(*AllocOrErr),
-       sizeof(KernelLaunchEnvironmentTy));
-
-  auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE,
-                                      sizeof(KernelLaunchEnvironmentTy),
-                                      AsyncInfoWrapper);
-  if (Err)
-    return Err;
-  return static_cast<KernelLaunchEnvironmentTy *>(*AllocOrErr);
-}
-
-Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice,
-                                       KernelArgsTy &KernelArgs,
-                                       uint32_t NumThreads,
-                                       uint64_t NumBlocks) const {
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
-       "Launching kernel %s with %" PRIu64
-       " blocks and %d threads in %s mode\n",
-       getName(), NumBlocks, NumThreads, getExecutionModeName());
-  return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads,
-                                NumBlocks);
-}
-
-Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
-                                              KernelArgsTy &KernelArgs,
-                                              uint32_t NumThreads,
-                                              uint64_t NumBlocks) const {
-  return Plugin::success();
-}
-
-Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
-                              ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
-                              AsyncInfoWrapperTy &AsyncInfoWrapper) const {
-  llvm::SmallVector<void *, 16> Args;
-  llvm::SmallVector<void *, 16> Ptrs;
-
-  auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
-      GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
-  if (!KernelLaunchEnvOrErr)
-    return KernelLaunchEnvOrErr.takeError();
-
-  void *KernelArgsPtr =
-      prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args,
-                  Ptrs, *KernelLaunchEnvOrErr);
-
-  uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit);
-  uint64_t NumBlocks =
-      getNumBlocks(GenericDevice, KernelArgs.NumTeams, KernelArgs.Tripcount,
-                   NumThreads, KernelArgs.ThreadLimit[0] > 0);
-
-  // Record the kernel description after we modified the argument count and num
-  // blocks/threads.
-  if (RecordReplay.isRecording()) {
-    RecordReplay.saveImage(getName(), getImage());
-    RecordReplay.saveKernelInput(getName(), getImage());
-    RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs,
-                                 NumBlocks, NumThreads, KernelArgs.Tripcount);
-  }
-
-  if (auto Err =
-          printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
-    return Err;
-
-  return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
-                    KernelArgsPtr, AsyncInfoWrapper);
-}
-
-void *GenericKernelTy::prepareArgs(
-    GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
-    uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
-    llvm::SmallVectorImpl<void *> &Ptrs,
-    KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const {
-  if (isCtorOrDtor())
-    return nullptr;
-
-  uint32_t KLEOffset = !!KernelLaunchEnvironment;
-  NumArgs += KLEOffset;
-
-  if (NumArgs == 0)
-    return nullptr;
-
-  Args.resize(NumArgs);
-  Ptrs.resize(NumArgs);
-
-  if (KernelLaunchEnvironment) {
-    Ptrs[0] = KernelLaunchEnvironment;
-    Args[0] = &Ptrs[0];
-  }
-
-  for (int I = KLEOffset; I < NumArgs; ++I) {
-    Ptrs[I] =
-        (void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
-    Args[I] = &Ptrs[I];
-  }
-  return &Args[0];
-}
-
-uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
-                                        uint32_t ThreadLimitClause[3]) const {
-  assert(ThreadLimitClause[1] == 0 && ThreadLimitClause[2] == 0 &&
-         "Multi dimensional launch not supported yet.");
-
-  if (IsBareKernel && ThreadLimitClause[0] > 0)
-    return ThreadLimitClause[0];
-
-  if (ThreadLimitClause[0] > 0 && isGenericMode())
-    ThreadLimitClause[0] += GenericDevice.getWarpSize();
-
-  return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0)
-                                     ? ThreadLimitClause[0]
-                                     : PreferredNumThreads);
-}
-
-uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
-                                       uint32_t NumTeamsClause[3],
-                                       uint64_t LoopTripCount,
-                                       uint32_t &NumThreads,
-                                       bool IsNumThreadsFromUser) const {
-  assert(NumTeamsClause[1] == 0 && NumTeamsClause[2] == 0 &&
-         "Multi dimensional launch not supported yet.");
-
-  if (IsBareKernel && NumTeamsClause[0] > 0)
-    return NumTeamsClause[0];
-
-  if (NumTeamsClause[0] > 0) {
-    // TODO: We need to honor any value and consequently allow more than the
-    // block limit. For this we might need to start multiple kernels or let the
-    // blocks start again until the requested number has been started.
-    return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
-  }
-
-  uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
-  uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
-  if (LoopTripCount > 0) {
-    if (isSPMDMode()) {
-      // We have a combined construct, i.e. `target teams distribute
-      // parallel for [simd]`. We launch so many teams so that each thread
-      // will execute one iteration of the loop; rounded up to the nearest
-      // integer. However, if that results in too few teams, we artificially
-      // reduce the thread count per team to increase the outer parallelism.
-      auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop();
-      MinThreads = std::min(MinThreads, NumThreads);
-
-      // Honor the thread_limit clause; only lower the number of threads.
-      [[maybe_unused]] auto OldNumThreads = NumThreads;
-      if (LoopTripCount >= DefaultNumBlocks * NumThreads ||
-          IsNumThreadsFromUser) {
-        // Enough parallelism for teams and threads.
-        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
-        assert(IsNumThreadsFromUser ||
-               TripCountNumBlocks >= DefaultNumBlocks &&
-                   "Expected sufficient outer parallelism.");
-      } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) {
-        // Enough parallelism for teams, limit threads.
-
-        // This case is hard; for now, we force "full warps":
-        // First, compute a thread count assuming DefaultNumBlocks.
-        auto NumThreadsDefaultBlocks =
-            (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks;
-        // Now get a power of two that is larger or equal.
-        auto NumThreadsDefaultBlocksP2 =
-            llvm::PowerOf2Ceil(NumThreadsDefaultBlocks);
-        // Do not increase a thread limit given be the user.
-        NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2));
-        assert(NumThreads >= MinThreads &&
-               "Expected sufficient inner parallelism.");
-        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
-      } else {
-        // Not enough parallelism for teams and threads, limit both.
-        NumThreads = std::min(NumThreads, MinThreads);
-        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
-      }
-
-      assert(NumThreads * TripCountNumBlocks >= LoopTripCount &&
-             "Expected sufficient parallelism");
-      assert(OldNumThreads >= NumThreads &&
-             "Number of threads cannot be increased!");
-    } else {
-      assert((isGenericMode() || isGenericSPMDMode()) &&
-             "Unexpected execution mode!");
-      // If we reach this point, then we have a non-combined construct, i.e.
-      // `teams distribute` with a nested `parallel for` and each team is
-      // assigned one iteration of the `distribute` loop. E.g.:
-      //
-      // #pragma omp target teams distribute
-      // for(...loop_tripcount...) {
-      //   #pragma omp parallel for
-      //   for(...) {}
-      // }
-      //
-      // Threads within a team will execute the iterations of the `parallel`
-      // loop.
-      TripCountNumBlocks = LoopTripCount;
-    }
-  }
-  // If the loops are long running we rather reuse blocks than spawn too many.
-  uint32_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
-  return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
-}
-
-GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
-                                 int32_t NumDevices,
-                                 const llvm::omp::GV &OMPGridValues)
-    : Plugin(Plugin), MemoryManager(nullptr), OMP_TeamLimit("OMP_TEAM_LIMIT"),
-      OMP_NumTeams("OMP_NUM_TEAMS"),
-      OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
-      OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
-      OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
-      // Do not initialize the following two envars since they depend on the
-      // device initialization. These cannot be consulted until the device is
-      // initialized correctly. We intialize them in GenericDeviceTy::init().
-      OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
-      // By default, the initial number of streams and events is 1.
-      OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
-      OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
-      DeviceId(DeviceId), GridValues(OMPGridValues),
-      PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
-      PinnedAllocs(*this), RPCServer(nullptr) {
-#ifdef OMPT_SUPPORT
-  OmptInitialized.store(false);
-  // Bind the callbacks to this device's member functions
-#define bindOmptCallback(Name, Type, Code)                                     \
-  if (ompt::Initialized && ompt::lookupCallbackByCode) {                       \
-    ompt::lookupCallbackByCode((ompt_callbacks_t)(Code),                       \
-                               ((ompt_callback_t *)&(Name##_fn)));             \
-    DP("OMPT: class bound %s=%p\n", #Name, ((void *)(uint64_t)Name##_fn));     \
-  }
-
-  FOREACH_OMPT_DEVICE_EVENT(bindOmptCallback);
-#undef bindOmptCallback
-
-#endif
-}
-
-Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
-  if (auto Err = initImpl(Plugin))
-    return Err;
-
-#ifdef OMPT_SUPPORT
-  if (ompt::Initialized) {
-    bool ExpectedStatus = false;
-    if (OmptInitialized.compare_exchange_strong(ExpectedStatus, true))
-      performOmptCallback(device_initialize, /*device_num=*/DeviceId +
-                                                 Plugin.getDeviceIdStartIndex(),
-                          /*type=*/getComputeUnitKind().c_str(),
-                          /*device=*/reinterpret_cast<ompt_device_t *>(this),
-                          /*lookup=*/ompt::lookupCallbackByName,
-                          /*documentation=*/nullptr);
-  }
-#endif
-
-  // Read and reinitialize the envars that depend on the device initialization.
-  // Notice these two envars may change the stack size and heap size of the
-  // device, so they need the device properly initialized.
-  auto StackSizeEnvarOrErr = UInt64Envar::create(
-      "LIBOMPTARGET_STACK_SIZE",
-      [this](uint64_t &V) -> Error { return getDeviceStackSize(V); },
-      [this](uint64_t V) -> Error { return setDeviceStackSize(V); });
-  if (!StackSizeEnvarOrErr)
-    return StackSizeEnvarOrErr.takeError();
-  OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);
-
-  auto HeapSizeEnvarOrErr = UInt64Envar::create(
-      "LIBOMPTARGET_HEAP_SIZE",
-      [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
-      [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
-  if (!HeapSizeEnvarOrErr)
-    return HeapSizeEnvarOrErr.takeError();
-  OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
-
-  // Update the maximum number of teams and threads after the device
-  // initialization sets the corresponding hardware limit.
-  if (OMP_NumTeams > 0)
-    GridValues.GV_Max_Teams =
-        std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams));
-
-  if (OMP_TeamsThreadLimit > 0)
-    GridValues.GV_Max_WG_Size =
-        std::min(GridValues.GV_Max_WG_Size, uint32_t(OMP_TeamsThreadLimit));
-
-  // Enable the memory manager if required.
-  auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
-  if (EnableMM)
-    MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
-
-  return Plugin::success();
-}
-
-Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
-  for (DeviceImageTy *Image : LoadedImages)
-    if (auto Err = callGlobalDestructors(Plugin, *Image))
-      return Err;
-
-  if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
-    GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-    for (auto *Image : LoadedImages) {
-      DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
-      GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
-                             sizeof(DeviceMemoryPoolTrackingTy),
-                             &ImageDeviceMemoryPoolTracking);
-      if (auto Err =
-              GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
-        consumeError(std::move(Err));
-        continue;
-      }
-      DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
-    }
-
-    // TODO: Write this by default into a file.
-    printf("\n\n|-----------------------\n"
-           "| Device memory tracker:\n"
-           "|-----------------------\n"
-           "| #Allocations: %lu\n"
-           "| Byes allocated: %lu\n"
-           "| Minimal allocation: %lu\n"
-           "| Maximal allocation: %lu\n"
-           "|-----------------------\n\n\n",
-           DeviceMemoryPoolTracking.NumAllocations,
-           DeviceMemoryPoolTracking.AllocationTotal,
-           DeviceMemoryPoolTracking.AllocationMin,
-           DeviceMemoryPoolTracking.AllocationMax);
-  }
-
-  // Delete the memory manager before deinitializing the device. Otherwise,
-  // we may delete device allocations after the device is deinitialized.
-  if (MemoryManager)
-    delete MemoryManager;
-  MemoryManager = nullptr;
-
-  if (RecordReplay.isRecordingOrReplaying())
-    RecordReplay.deinit();
-
-  if (RPCServer)
-    if (auto Err = RPCServer->deinitDevice(*this))
-      return Err;
-
-#ifdef OMPT_SUPPORT
-  if (ompt::Initialized) {
-    bool ExpectedStatus = true;
-    if (OmptInitialized.compare_exchange_strong(ExpectedStatus, false))
-      performOmptCallback(device_finalize,
-                          /*device_num=*/DeviceId +
-                              Plugin.getDeviceIdStartIndex());
-  }
-#endif
-
-  return deinitImpl();
-}
-Expected<DeviceImageTy *>
-GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
-                            const __tgt_device_image *InputTgtImage) {
-  assert(InputTgtImage && "Expected non-null target image");
-  DP("Load data from image " DPxMOD "\n", DPxPTR(InputTgtImage->ImageStart));
-
-  auto PostJITImageOrErr = Plugin.getJIT().process(*InputTgtImage, *this);
-  if (!PostJITImageOrErr) {
-    auto Err = PostJITImageOrErr.takeError();
-    REPORT("Failure to jit IR image %p on device %d: %s\n", InputTgtImage,
-           DeviceId, toString(std::move(Err)).data());
-    return nullptr;
-  }
-
-  // Load the binary and allocate the image object. Use the next available id
-  // for the image id, which is the number of previously loaded images.
-  auto ImageOrErr =
-      loadBinaryImpl(PostJITImageOrErr.get(), LoadedImages.size());
-  if (!ImageOrErr)
-    return ImageOrErr.takeError();
-
-  DeviceImageTy *Image = *ImageOrErr;
-  assert(Image != nullptr && "Invalid image");
-  if (InputTgtImage != PostJITImageOrErr.get())
-    Image->setTgtImageBitcode(InputTgtImage);
-
-  // Add the image to list.
-  LoadedImages.push_back(Image);
-
-  // Setup the device environment if needed.
-  if (auto Err = setupDeviceEnvironment(Plugin, *Image))
-    return std::move(Err);
-
-  // Setup the global device memory pool if needed.
-  if (!RecordReplay.isReplaying() && shouldSetupDeviceMemoryPool()) {
-    uint64_t HeapSize;
-    auto SizeOrErr = getDeviceHeapSize(HeapSize);
-    if (SizeOrErr) {
-      REPORT("No global device memory pool due to error: %s\n",
-             toString(std::move(SizeOrErr)).data());
-    } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
-      return std::move(Err);
-  }
-
-  if (auto Err = setupRPCServer(Plugin, *Image))
-    return std::move(Err);
-
-#ifdef OMPT_SUPPORT
-  if (ompt::Initialized) {
-    size_t Bytes =
-        getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
-    performOmptCallback(
-        device_load, /*device_num=*/DeviceId + Plugin.getDeviceIdStartIndex(),
-        /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr,
-        /*ImgSize=*/Bytes, /*HostAddr=*/InputTgtImage->ImageStart,
-        /*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0);
-  }
-#endif
-
-  // Call any global constructors present on the device.
-  if (auto Err = callGlobalConstructors(Plugin, *Image))
-    return std::move(Err);
-
-  // Return the pointer to the table of entries.
-  return Image;
-}
-
-Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
-                                              DeviceImageTy &Image) {
-  // There are some plugins that do not need this step.
-  if (!shouldSetupDeviceEnvironment())
-    return Plugin::success();
-
-  // Obtain a table mapping host function pointers to device function pointers.
-  auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image);
-  if (!CallTablePairOrErr)
-    return CallTablePairOrErr.takeError();
-
-  DeviceEnvironmentTy DeviceEnvironment;
-  DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind;
-  DeviceEnvironment.NumDevices = Plugin.getNumDevices();
-  // TODO: The device ID used here is not the real device ID used by OpenMP.
-  DeviceEnvironment.DeviceNum = DeviceId;
-  DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
-  DeviceEnvironment.ClockFrequency = getClockFrequency();
-  DeviceEnvironment.IndirectCallTable =
-      reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
-  DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
-  DeviceEnvironment.HardwareParallelism = getHardwareParallelism();
-
-  // Create the metainfo of the device environment global.
-  GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
-                        sizeof(DeviceEnvironmentTy), &DeviceEnvironment);
-
-  // Write device environment values to the device.
-  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) {
-    DP("Missing symbol %s, continue execution anyway.\n",
-       DevEnvGlobal.getName().data());
-    consumeError(std::move(Err));
-  }
-  return Plugin::success();
-}
-
-Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
-                                             DeviceImageTy &Image,
-                                             uint64_t PoolSize) {
-  // Free the old pool, if any.
-  if (DeviceMemoryPool.Ptr) {
-    if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
-                              TargetAllocTy::TARGET_ALLOC_DEVICE))
-      return Err;
-  }
-
-  DeviceMemoryPool.Size = PoolSize;
-  auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
-                              TargetAllocTy::TARGET_ALLOC_DEVICE);
-  if (AllocOrErr) {
-    DeviceMemoryPool.Ptr = *AllocOrErr;
-  } else {
-    auto Err = AllocOrErr.takeError();
-    REPORT("Failure to allocate device memory for global memory pool: %s\n",
-           toString(std::move(Err)).data());
-    DeviceMemoryPool.Ptr = nullptr;
-    DeviceMemoryPool.Size = 0;
-  }
-
-  // Create the metainfo of the device environment global.
-  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-  if (!GHandler.isSymbolInImage(*this, Image,
-                                "__omp_rtl_device_memory_pool_tracker")) {
-    DP("Skip the memory pool as there is no tracker symbol in the image.");
-    return Error::success();
-  }
-
-  GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
-                         sizeof(DeviceMemoryPoolTrackingTy),
-                         &DeviceMemoryPoolTracking);
-  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
-    return Err;
-
-  // Create the metainfo of the device environment global.
-  GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
-                        sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
-
-  // Write device environment values to the device.
-  return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
-}
-
-Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
-                                      DeviceImageTy &Image) {
-  // The plugin either does not need an RPC server or it is unavailible.
-  if (!shouldSetupRPCServer())
-    return Plugin::success();
-
-  // Check if this device needs to run an RPC server.
-  RPCServerTy &Server = Plugin.getRPCServer();
-  auto UsingOrErr =
-      Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image);
-  if (!UsingOrErr)
-    return UsingOrErr.takeError();
-
-  if (!UsingOrErr.get())
-    return Plugin::success();
-
-  if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
-    return Err;
-
-  RPCServer = &Server;
-  DP("Running an RPC server on device %d\n", getDeviceId());
-  return Plugin::success();
-}
-
-Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
-                                         size_t Size, bool ExternallyLocked) {
-  // Insert the new entry into the map.
-  auto Res = Allocs.insert({HstPtr, DevAccessiblePtr, Size, ExternallyLocked});
-  if (!Res.second)
-    return Plugin::error("Cannot insert locked buffer entry");
-
-  // Check whether the next entry overlaps with the inserted entry.
-  auto It = std::next(Res.first);
-  if (It == Allocs.end())
-    return Plugin::success();
-
-  const EntryTy *NextEntry = &(*It);
-  if (intersects(NextEntry->HstPtr, NextEntry->Size, HstPtr, Size))
-    return Plugin::error("Partial overlapping not allowed in locked buffers");
-
-  return Plugin::success();
-}
-
-Error PinnedAllocationMapTy::eraseEntry(const EntryTy &Entry) {
-  // Erase the existing entry. Notice this requires an additional map lookup,
-  // but this should not be a performance issue. Using iterators would make
-  // the code more difficult to read.
-  size_t Erased = Allocs.erase({Entry.HstPtr});
-  if (!Erased)
-    return Plugin::error("Cannot erase locked buffer entry");
-  return Plugin::success();
-}
-
-Error PinnedAllocationMapTy::registerEntryUse(const EntryTy &Entry,
-                                              void *HstPtr, size_t Size) {
-  if (!contains(Entry.HstPtr, Entry.Size, HstPtr, Size))
-    return Plugin::error("Partial overlapping not allowed in locked buffers");
-
-  ++Entry.References;
-  return Plugin::success();
-}
-
-Expected<bool> PinnedAllocationMapTy::unregisterEntryUse(const EntryTy &Entry) {
-  if (Entry.References == 0)
-    return Plugin::error("Invalid number of references");
-
-  // Return whether this was the last user.
-  return (--Entry.References == 0);
-}
-
-Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr,
-                                                void *DevAccessiblePtr,
-                                                size_t Size) {
-  assert(HstPtr && "Invalid pointer");
-  assert(DevAccessiblePtr && "Invalid pointer");
-  assert(Size && "Invalid size");
-
-  std::lock_guard<std::shared_mutex> Lock(Mutex);
-
-  // No pinned allocation should intersect.
-  const EntryTy *Entry = findIntersecting(HstPtr);
-  if (Entry)
-    return Plugin::error("Cannot insert entry due to an existing one");
-
-  // Now insert the new entry.
-  return insertEntry(HstPtr, DevAccessiblePtr, Size);
-}
-
-Error PinnedAllocationMapTy::unregisterHostBuffer(void *HstPtr) {
-  assert(HstPtr && "Invalid pointer");
-
-  std::lock_guard<std::shared_mutex> Lock(Mutex);
-
-  const EntryTy *Entry = findIntersecting(HstPtr);
-  if (!Entry)
-    return Plugin::error("Cannot find locked buffer");
-
-  // The address in the entry should be the same we are unregistering.
-  if (Entry->HstPtr != HstPtr)
-    return Plugin::error("Unexpected host pointer in locked buffer entry");
-
-  // Unregister from the entry.
-  auto LastUseOrErr = unregisterEntryUse(*Entry);
-  if (!LastUseOrErr)
-    return LastUseOrErr.takeError();
-
-  // There should be no other references to the pinned allocation.
-  if (!(*LastUseOrErr))
-    return Plugin::error("The locked buffer is still being used");
-
-  // Erase the entry from the map.
-  return eraseEntry(*Entry);
-}
-
-Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr,
-                                                       size_t Size) {
-  assert(HstPtr && "Invalid pointer");
-  assert(Size && "Invalid size");
-
-  std::lock_guard<std::shared_mutex> Lock(Mutex);
-
-  const EntryTy *Entry = findIntersecting(HstPtr);
-
-  if (Entry) {
-    // An already registered intersecting buffer was found. Register a new use.
-    if (auto Err = registerEntryUse(*Entry, HstPtr, Size))
-      return std::move(Err);
-
-    // Return the device accessible pointer with the correct offset.
-    return advanceVoidPtr(Entry->DevAccessiblePtr,
-                          getPtrDiff(HstPtr, Entry->HstPtr));
-  }
-
-  // No intersecting registered allocation found in the map. First, lock the
-  // host buffer and retrieve the device accessible pointer.
-  auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
-  if (!DevAccessiblePtrOrErr)
-    return DevAccessiblePtrOrErr.takeError();
-
-  // Now insert the new entry into the map.
-  if (auto Err = insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size))
-    return std::move(Err);
-
-  // Return the device accessible pointer.
-  return *DevAccessiblePtrOrErr;
-}
-
-Error PinnedAllocationMapTy::unlockHostBuffer(void *HstPtr) {
-  assert(HstPtr && "Invalid pointer");
-
-  std::lock_guard<std::shared_mutex> Lock(Mutex);
-
-  const EntryTy *Entry = findIntersecting(HstPtr);
-  if (!Entry)
-    return Plugin::error("Cannot find locked buffer");
-
-  // Unregister from the locked buffer. No need to do anything if there are
-  // others using the allocation.
-  auto LastUseOrErr = unregisterEntryUse(*Entry);
-  if (!LastUseOrErr)
-    return LastUseOrErr.takeError();
-
-  // No need to do anything if there are others using the allocation.
-  if (!(*LastUseOrErr))
-    return Plugin::success();
-
-  // This was the last user of the allocation. Unlock the original locked buffer
-  // if it was locked by the plugin. Do not unlock it if it was locked by an
-  // external entity. Unlock the buffer using the host pointer of the entry.
-  if (!Entry->ExternallyLocked)
-    if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
-      return Err;
-
-  // Erase the entry from the map.
-  return eraseEntry(*Entry);
-}
-
-Error PinnedAllocationMapTy::lockMappedHostBuffer(void *HstPtr, size_t Size) {
-  assert(HstPtr && "Invalid pointer");
-  assert(Size && "Invalid size");
-
-  std::lock_guard<std::shared_mutex> Lock(Mutex);
-
-  // If previously registered, just register a new user on the entry.
-  const EntryTy *Entry = findIntersecting(HstPtr);
-  if (Entry)
-    return registerEntryUse(*Entry, HstPtr, Size);
-
-  size_t BaseSize;
-  void *BaseHstPtr, *BaseDevAccessiblePtr;
-
-  // Check if it was externally pinned by a vendor-specific API.
-  auto IsPinnedOrErr = Device.isPinnedPtrImpl(HstPtr, BaseHstPtr,
-                                              BaseDevAccessiblePtr, BaseSize);
-  if (!IsPinnedOrErr)
-    return IsPinnedOrErr.takeError();
-
-  // If pinned, just insert the entry representing the whole pinned buffer.
-  if (*IsPinnedOrErr)
-    return insertEntry(BaseHstPtr, BaseDevAccessiblePtr, BaseSize,
-                       /*Externallylocked=*/true);
-
-  // Not externally pinned. Do nothing if locking of mapped buffers is disabled.
-  if (!LockMappedBuffers)
-    return Plugin::success();
-
-  // Otherwise, lock the buffer and insert the new entry.
-  auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
-  if (!DevAccessiblePtrOrErr) {
-    // Errors may be tolerated.
-    if (!IgnoreLockMappedFailures)
-      return DevAccessiblePtrOrErr.takeError();
-
-    consumeError(DevAccessiblePtrOrErr.takeError());
-    return Plugin::success();
-  }
-
-  return insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size);
-}
-
-Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
-  assert(HstPtr && "Invalid pointer");
-
-  std::lock_guard<std::shared_mutex> Lock(Mutex);
-
-  // Check whether there is any intersecting entry.
-  const EntryTy *Entry = findIntersecting(HstPtr);
-
-  // No entry but automatic locking of mapped buffers is disabled, so
-  // nothing to do.
-  if (!Entry && !LockMappedBuffers)
-    return Plugin::success();
-
-  // No entry, automatic locking is enabled, but the locking may have failed, so
-  // do nothing.
-  if (!Entry && IgnoreLockMappedFailures)
-    return Plugin::success();
-
-  // No entry, but the automatic locking is enabled, so this is an error.
-  if (!Entry)
-    return Plugin::error("Locked buffer not found");
-
-  // There is entry, so unregister a user and check whether it was the last one.
-  auto LastUseOrErr = unregisterEntryUse(*Entry);
-  if (!LastUseOrErr)
-    return LastUseOrErr.takeError();
-
-  // If it is not the last one, there is nothing to do.
-  if (!(*LastUseOrErr))
-    return Plugin::success();
-
-  // Otherwise, if it was the last and the buffer was locked by the plugin,
-  // unlock it.
-  if (!Entry->ExternallyLocked)
-    if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
-      return Err;
-
-  // Finally erase the entry from the map.
-  return eraseEntry(*Entry);
-}
-
-Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) {
-  if (!AsyncInfo || !AsyncInfo->Queue)
-    return Plugin::error("Invalid async info queue");
-
-  if (auto Err = synchronizeImpl(*AsyncInfo))
-    return Err;
-
-  for (auto *Ptr : AsyncInfo->AssociatedAllocations)
-    if (auto Err = dataDelete(Ptr, TargetAllocTy::TARGET_ALLOC_DEVICE))
-      return Err;
-  AsyncInfo->AssociatedAllocations.clear();
-
-  return Plugin::success();
-}
-
-Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
-  if (!AsyncInfo || !AsyncInfo->Queue)
-    return Plugin::error("Invalid async info queue");
-
-  return queryAsyncImpl(*AsyncInfo);
-}
-
-Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
-  return Plugin::error("Device does not suppport VA Management");
-}
-
-Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
-  return Plugin::error("Device does not suppport VA Management");
-}
-
-Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
-  return Plugin::error(
-      "Mising getDeviceMemorySize impelmentation (required by RR-heuristic");
-}
-
-Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
-                                            TargetAllocTy Kind) {
-  void *Alloc = nullptr;
-
-  if (RecordReplay.isRecordingOrReplaying())
-    return RecordReplay.alloc(Size);
-
-  switch (Kind) {
-  case TARGET_ALLOC_DEFAULT:
-  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
-  case TARGET_ALLOC_DEVICE:
-    if (MemoryManager) {
-      Alloc = MemoryManager->allocate(Size, HostPtr);
-      if (!Alloc)
-        return Plugin::error("Failed to allocate from memory manager");
-      break;
-    }
-    [[fallthrough]];
-  case TARGET_ALLOC_HOST:
-  case TARGET_ALLOC_SHARED:
-    Alloc = allocate(Size, HostPtr, Kind);
-    if (!Alloc)
-      return Plugin::error("Failed to allocate from device allocator");
-  }
-
-  // Report error if the memory manager or the device allocator did not return
-  // any memory buffer.
-  if (!Alloc)
-    return Plugin::error("Invalid target data allocation kind or requested "
-                         "allocator not implemented yet");
-
-  // Register allocated buffer as pinned memory if the type is host memory.
-  if (Kind == TARGET_ALLOC_HOST)
-    if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
-      return std::move(Err);
-
-  return Alloc;
-}
-
-Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
-  // Free is a noop when recording or replaying.
-  if (RecordReplay.isRecordingOrReplaying())
-    return Plugin::success();
-
-  int Res;
-  if (MemoryManager)
-    Res = MemoryManager->free(TgtPtr);
-  else
-    Res = free(TgtPtr, Kind);
-
-  if (Res)
-    return Plugin::error("Failure to deallocate device pointer %p", TgtPtr);
-
-  // Unregister deallocated pinned memory buffer if the type is host memory.
-  if (Kind == TARGET_ALLOC_HOST)
-    if (auto Err = PinnedAllocs.unregisterHostBuffer(TgtPtr))
-      return Err;
-
-  return Plugin::success();
-}
-
-Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
-                                  int64_t Size, __tgt_async_info *AsyncInfo) {
-  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
-
-  auto Err = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
-  AsyncInfoWrapper.finalize(Err);
-  return Err;
-}
-
-Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
-                                    int64_t Size, __tgt_async_info *AsyncInfo) {
-  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
-
-  auto Err = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
-  AsyncInfoWrapper.finalize(Err);
-  return Err;
-}
-
-Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
-                                    void *DstPtr, int64_t Size,
-                                    __tgt_async_info *AsyncInfo) {
-  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
-
-  auto Err = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper);
-  AsyncInfoWrapper.finalize(Err);
-  return Err;
-}
-
-Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
-                                    ptrdiff_t *ArgOffsets,
-                                    KernelArgsTy &KernelArgs,
-                                    __tgt_async_info *AsyncInfo) {
-  AsyncInfoWrapperTy AsyncInfoWrapper(
-      *this, RecordReplay.isRecordingOrReplaying() ? nullptr : AsyncInfo);
-
-  GenericKernelTy &GenericKernel =
-      *reinterpret_cast<GenericKernelTy *>(EntryPtr);
-
-  auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
-                                  AsyncInfoWrapper);
-
-  // 'finalize' here to guarantee next record-replay actions are in-sync
-  AsyncInfoWrapper.finalize(Err);
-
-  if (RecordReplay.isRecordingOrReplaying() &&
-      RecordReplay.isSaveOutputEnabled())
-    RecordReplay.saveKernelOutputInfo(GenericKernel.getName());
-
-  return Err;
-}
-
-Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
-  assert(AsyncInfoPtr && "Invalid async info");
-
-  *AsyncInfoPtr = new __tgt_async_info();
-
-  AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);
-
-  auto Err = initAsyncInfoImpl(AsyncInfoWrapper);
-  AsyncInfoWrapper.finalize(Err);
-  return Err;
-}
-
-Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
-  assert(DeviceInfo && "Invalid device info");
-
-  return initDeviceInfoImpl(DeviceInfo);
-}
-
-Error GenericDeviceTy::printInfo() {
-  InfoQueueTy InfoQueue;
-
-  // Get the vendor-specific info entries describing the device properties.
-  if (auto Err = obtainInfoImpl(InfoQueue))
-    return Err;
-
-  // Print all info entries.
-  InfoQueue.print();
-
-  return Plugin::success();
-}
-
-Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
-  return createEventImpl(EventPtrStorage);
-}
-
-Error GenericDeviceTy::destroyEvent(void *EventPtr) {
-  return destroyEventImpl(EventPtr);
-}
-
-Error GenericDeviceTy::recordEvent(void *EventPtr,
-                                   __tgt_async_info *AsyncInfo) {
-  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
-
-  auto Err = recordEventImpl(EventPtr, AsyncInfoWrapper);
-  AsyncInfoWrapper.finalize(Err);
-  return Err;
-}
-
-Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
-  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
-
-  auto Err = waitEventImpl(EventPtr, AsyncInfoWrapper);
-  AsyncInfoWrapper.finalize(Err);
-  return Err;
-}
-
-Error GenericDeviceTy::syncEvent(void *EventPtr) {
-  return syncEventImpl(EventPtr);
-}
-
-bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
-
-Error GenericPluginTy::init() {
-  auto NumDevicesOrErr = initImpl();
-  if (!NumDevicesOrErr)
-    return NumDevicesOrErr.takeError();
-
-  NumDevices = *NumDevicesOrErr;
-  if (NumDevices == 0)
-    return Plugin::success();
-
-  assert(Devices.size() == 0 && "Plugin already initialized");
-  Devices.resize(NumDevices, nullptr);
-
-  GlobalHandler = createGlobalHandler();
-  assert(GlobalHandler && "Invalid global handler");
-
-  RPCServer = new RPCServerTy(*this);
-  assert(RPCServer && "Invalid RPC server");
-
-  return Plugin::success();
-}
-
-Error GenericPluginTy::deinit() {
-  // Deinitialize all active devices.
-  for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
-    if (Devices[DeviceId]) {
-      if (auto Err = deinitDevice(DeviceId))
-        return Err;
-    }
-    assert(!Devices[DeviceId] && "Device was not deinitialized");
-  }
-
-  // There is no global handler if no device is available.
-  if (GlobalHandler)
-    delete GlobalHandler;
-
-  if (RPCServer)
-    delete RPCServer;
-
-  // Perform last deinitializations on the plugin.
-  return deinitImpl();
-}
-
-Error GenericPluginTy::initDevice(int32_t DeviceId) {
-  assert(!Devices[DeviceId] && "Device already initialized");
-
-  // Create the device and save the reference.
-  GenericDeviceTy *Device = createDevice(*this, DeviceId, NumDevices);
-  assert(Device && "Invalid device");
-
-  // Save the device reference into the list.
-  Devices[DeviceId] = Device;
-
-  // Initialize the device and its resources.
-  return Device->init(*this);
-}
-
-Error GenericPluginTy::deinitDevice(int32_t DeviceId) {
-  // The device may be already deinitialized.
-  if (Devices[DeviceId] == nullptr)
-    return Plugin::success();
-
-  // Deinitialize the device and release its resources.
-  if (auto Err = Devices[DeviceId]->deinit(*this))
-    return Err;
-
-  // Delete the device and invalidate its reference.
-  delete Devices[DeviceId];
-  Devices[DeviceId] = nullptr;
-
-  return Plugin::success();
-}
-
-Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const {
-  // First check if this image is a regular ELF file.
-  if (!utils::elf::isELF(Image))
-    return false;
-
-  // Check if this image is an ELF with a matching machine value.
-  auto MachineOrErr = utils::elf::checkMachine(Image, getMagicElfBits());
-  if (!MachineOrErr)
-    return MachineOrErr.takeError();
-
-  if (!*MachineOrErr)
-    return false;
-
-  // Perform plugin-dependent checks for the specific architecture if needed.
-  return isELFCompatible(Image);
-}
-
-int32_t GenericPluginTy::is_valid_binary(__tgt_device_image *Image) {
-  StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
-                   target::getPtrDiff(Image->ImageEnd, Image->ImageStart));
-
-  auto HandleError = [&](Error Err) -> bool {
-    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
-    DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
-    return false;
-  };
-  switch (identify_magic(Buffer)) {
-  case file_magic::elf:
-  case file_magic::elf_relocatable:
-  case file_magic::elf_executable:
-  case file_magic::elf_shared_object:
-  case file_magic::elf_core: {
-    auto MatchOrErr = checkELFImage(Buffer);
-    if (Error Err = MatchOrErr.takeError())
-      return HandleError(std::move(Err));
-    return *MatchOrErr;
-  }
-  case file_magic::bitcode: {
-    auto MatchOrErr = getJIT().checkBitcodeImage(Buffer);
-    if (Error Err = MatchOrErr.takeError())
-      return HandleError(std::move(Err));
-    return *MatchOrErr;
-  }
-  default:
-    return false;
-  }
-}
-
-int32_t GenericPluginTy::init_device(int32_t DeviceId) {
-  auto Err = initDevice(DeviceId);
-  if (Err) {
-    REPORT("Failure to initialize device %d: %s\n", DeviceId,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::number_of_devices() { return getNumDevices(); }
-
-int64_t GenericPluginTy::init_requires(int64_t RequiresFlags) {
-  setRequiresFlag(RequiresFlags);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::is_data_exchangable(int32_t SrcDeviceId,
-                                             int32_t DstDeviceId) {
-  return isDataExchangable(SrcDeviceId, DstDeviceId);
-}
-
-int32_t GenericPluginTy::initialize_record_replay(int32_t DeviceId,
-                                                  int64_t MemorySize,
-                                                  void *VAddr, bool isRecord,
-                                                  bool SaveOutput,
-                                                  uint64_t &ReqPtrArgOffset) {
-  GenericDeviceTy &Device = getDevice(DeviceId);
-  RecordReplayTy::RRStatusTy Status =
-      isRecord ? RecordReplayTy::RRStatusTy::RRRecording
-               : RecordReplayTy::RRStatusTy::RRReplaying;
-
-  if (auto Err = RecordReplay.init(&Device, MemorySize, VAddr, Status,
-                                   SaveOutput, ReqPtrArgOffset)) {
-    REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
-           "(Error: %s)\n",
-           MemorySize, toString(std::move(Err)).data());
-    RecordReplay.setStatus(RecordReplayTy::RRStatusTy::RRDeactivated);
-
-    if (!isRecord) {
-      return OFFLOAD_FAIL;
-    }
-  }
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::load_binary(int32_t DeviceId,
-                                     __tgt_device_image *TgtImage,
-                                     __tgt_device_binary *Binary) {
-  GenericDeviceTy &Device = getDevice(DeviceId);
-
-  auto ImageOrErr = Device.loadBinary(*this, TgtImage);
-  if (!ImageOrErr) {
-    auto Err = ImageOrErr.takeError();
-    REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
-           DeviceId, toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  DeviceImageTy *Image = *ImageOrErr;
-  assert(Image != nullptr && "Invalid Image");
-
-  *Binary = __tgt_device_binary{reinterpret_cast<uint64_t>(Image)};
-
-  return OFFLOAD_SUCCESS;
-}
-
-void *GenericPluginTy::data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
-                                  int32_t Kind) {
-  auto AllocOrErr =
-      getDevice(DeviceId).dataAlloc(Size, HostPtr, (TargetAllocTy)Kind);
-  if (!AllocOrErr) {
-    auto Err = AllocOrErr.takeError();
-    REPORT("Failure to allocate device memory: %s\n",
-           toString(std::move(Err)).data());
-    return nullptr;
-  }
-  assert(*AllocOrErr && "Null pointer upon successful allocation");
-
-  return *AllocOrErr;
-}
-
-int32_t GenericPluginTy::data_delete(int32_t DeviceId, void *TgtPtr,
-                                     int32_t Kind) {
-  auto Err =
-      getDevice(DeviceId).dataDelete(TgtPtr, static_cast<TargetAllocTy>(Kind));
-  if (Err) {
-    REPORT("Failure to deallocate device pointer %p: %s\n", TgtPtr,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
-                                   void **LockedPtr) {
-  auto LockedPtrOrErr = getDevice(DeviceId).dataLock(Ptr, Size);
-  if (!LockedPtrOrErr) {
-    auto Err = LockedPtrOrErr.takeError();
-    REPORT("Failure to lock memory %p: %s\n", Ptr,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  if (!(*LockedPtrOrErr)) {
-    REPORT("Failure to lock memory %p: obtained a null locked pointer\n", Ptr);
-    return OFFLOAD_FAIL;
-  }
-  *LockedPtr = *LockedPtrOrErr;
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::data_unlock(int32_t DeviceId, void *Ptr) {
-  auto Err = getDevice(DeviceId).dataUnlock(Ptr);
-  if (Err) {
-    REPORT("Failure to unlock memory %p: %s\n", Ptr,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::data_notify_mapped(int32_t DeviceId, void *HstPtr,
-                                            int64_t Size) {
-  auto Err = getDevice(DeviceId).notifyDataMapped(HstPtr, Size);
-  if (Err) {
-    REPORT("Failure to notify data mapped %p: %s\n", HstPtr,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::data_notify_unmapped(int32_t DeviceId, void *HstPtr) {
-  auto Err = getDevice(DeviceId).notifyDataUnmapped(HstPtr);
-  if (Err) {
-    REPORT("Failure to notify data unmapped %p: %s\n", HstPtr,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::data_submit(int32_t DeviceId, void *TgtPtr,
-                                     void *HstPtr, int64_t Size) {
-  return data_submit_async(DeviceId, TgtPtr, HstPtr, Size,
-                           /*AsyncInfoPtr=*/nullptr);
-}
-
-int32_t GenericPluginTy::data_submit_async(int32_t DeviceId, void *TgtPtr,
-                                           void *HstPtr, int64_t Size,
-                                           __tgt_async_info *AsyncInfoPtr) {
-  auto Err = getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size, AsyncInfoPtr);
-  if (Err) {
-    REPORT("Failure to copy data from host to device. Pointers: host "
-           "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
-           DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::data_retrieve(int32_t DeviceId, void *HstPtr,
-                                       void *TgtPtr, int64_t Size) {
-  return data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size,
-                             /*AsyncInfoPtr=*/nullptr);
-}
-
-int32_t GenericPluginTy::data_retrieve_async(int32_t DeviceId, void *HstPtr,
-                                             void *TgtPtr, int64_t Size,
-                                             __tgt_async_info *AsyncInfoPtr) {
-  auto Err =
-      getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr, Size, AsyncInfoPtr);
-  if (Err) {
-    REPORT("Faliure to copy data from device to host. Pointers: host "
-           "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
-           DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::data_exchange(int32_t SrcDeviceId, void *SrcPtr,
-                                       int32_t DstDeviceId, void *DstPtr,
-                                       int64_t Size) {
-  return data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr, Size,
-                             /*AsyncInfoPtr=*/nullptr);
-}
-
-int32_t GenericPluginTy::data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
-                                             int DstDeviceId, void *DstPtr,
-                                             int64_t Size,
-                                             __tgt_async_info *AsyncInfo) {
-  GenericDeviceTy &SrcDevice = getDevice(SrcDeviceId);
-  GenericDeviceTy &DstDevice = getDevice(DstDeviceId);
-  auto Err = SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo);
-  if (Err) {
-    REPORT("Failure to copy data from device (%d) to device (%d). Pointers: "
-           "host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
-           SrcDeviceId, DstDeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                       void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                       KernelArgsTy *KernelArgs,
-                                       __tgt_async_info *AsyncInfoPtr) {
-  auto Err = getDevice(DeviceId).launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets,
-                                              *KernelArgs, AsyncInfoPtr);
-  if (Err) {
-    REPORT("Failure to run target region " DPxMOD " in device %d: %s\n",
-           DPxPTR(TgtEntryPtr), DeviceId, toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::synchronize(int32_t DeviceId,
-                                     __tgt_async_info *AsyncInfoPtr) {
-  auto Err = getDevice(DeviceId).synchronize(AsyncInfoPtr);
-  if (Err) {
-    REPORT("Failure to synchronize stream %p: %s\n", AsyncInfoPtr->Queue,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::query_async(int32_t DeviceId,
-                                     __tgt_async_info *AsyncInfoPtr) {
-  auto Err = getDevice(DeviceId).queryAsync(AsyncInfoPtr);
-  if (Err) {
-    REPORT("Failure to query stream %p: %s\n", AsyncInfoPtr->Queue,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-void GenericPluginTy::print_device_info(int32_t DeviceId) {
-  if (auto Err = getDevice(DeviceId).printInfo())
-    REPORT("Failure to print device %d info: %s\n", DeviceId,
-           toString(std::move(Err)).data());
-}
-
-int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) {
-  auto Err = getDevice(DeviceId).createEvent(EventPtr);
-  if (Err) {
-    REPORT("Failure to create event: %s\n", toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::record_event(int32_t DeviceId, void *EventPtr,
-                                      __tgt_async_info *AsyncInfoPtr) {
-  auto Err = getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr);
-  if (Err) {
-    REPORT("Failure to record event %p: %s\n", EventPtr,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::wait_event(int32_t DeviceId, void *EventPtr,
-                                    __tgt_async_info *AsyncInfoPtr) {
-  auto Err = getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr);
-  if (Err) {
-    REPORT("Failure to wait event %p: %s\n", EventPtr,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::sync_event(int32_t DeviceId, void *EventPtr) {
-  auto Err = getDevice(DeviceId).syncEvent(EventPtr);
-  if (Err) {
-    REPORT("Failure to synchronize event %p: %s\n", EventPtr,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::destroy_event(int32_t DeviceId, void *EventPtr) {
-  auto Err = getDevice(DeviceId).destroyEvent(EventPtr);
-  if (Err) {
-    REPORT("Failure to destroy event %p: %s\n", EventPtr,
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-void GenericPluginTy::set_info_flag(uint32_t NewInfoLevel) {
-  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
-  InfoLevel.store(NewInfoLevel);
-}
-
-int32_t GenericPluginTy::init_async_info(int32_t DeviceId,
-                                         __tgt_async_info **AsyncInfoPtr) {
-  assert(AsyncInfoPtr && "Invalid async info");
-
-  auto Err = getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr);
-  if (Err) {
-    REPORT("Failure to initialize async info at " DPxMOD " on device %d: %s\n",
-           DPxPTR(*AsyncInfoPtr), DeviceId, toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::init_device_info(int32_t DeviceId,
-                                          __tgt_device_info *DeviceInfo,
-                                          const char **ErrStr) {
-  *ErrStr = "";
-
-  auto Err = getDevice(DeviceId).initDeviceInfo(DeviceInfo);
-  if (Err) {
-    REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n",
-           DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::set_device_offset(int32_t DeviceIdOffset) {
-  setDeviceIdStartIndex(DeviceIdOffset);
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::use_auto_zero_copy(int32_t DeviceId) {
-  // Automatic zero-copy only applies to programs that did
-  // not request unified_shared_memory and are deployed on an
-  // APU with XNACK enabled.
-  if (getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY)
-    return false;
-  return getDevice(DeviceId).useAutoZeroCopy();
-}
-
-int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size,
-                                    const char *Name, void **DevicePtr) {
-  assert(Binary.handle && "Invalid device binary handle");
-  DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
-
-  GenericDeviceTy &Device = Image.getDevice();
-
-  GlobalTy DeviceGlobal(Name, Size);
-  GenericGlobalHandlerTy &GHandler = getGlobalHandler();
-  if (auto Err =
-          GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
-    REPORT("Failure to look up global address: %s\n",
-           toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  *DevicePtr = DeviceGlobal.getPtr();
-  assert(DevicePtr && "Invalid device global's address");
-
-  // Save the loaded globals if we are recording.
-  if (RecordReplay.isRecording())
-    RecordReplay.addEntry(Name, Size, *DevicePtr);
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t GenericPluginTy::get_function(__tgt_device_binary Binary,
-                                      const char *Name, void **KernelPtr) {
-  assert(Binary.handle && "Invalid device binary handle");
-  DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
-
-  GenericDeviceTy &Device = Image.getDevice();
-
-  auto KernelOrErr = Device.constructKernel(Name);
-  if (Error Err = KernelOrErr.takeError()) {
-    REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  GenericKernelTy &Kernel = *KernelOrErr;
-  if (auto Err = Kernel.init(Device, Image)) {
-    REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
-    return OFFLOAD_FAIL;
-  }
-
-  // Note that this is not the kernel's device address.
-  *KernelPtr = &Kernel;
-  return OFFLOAD_SUCCESS;
-}
-
-bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
-#ifdef LIBOMPTARGET_RPC_SUPPORT
-  return true;
-#else
-  return false;
-#endif
-}
-
-/// Exposed library API function, basically wrappers around the GenericDeviceTy
-/// functionality with the same name. All non-async functions are redirected
-/// to the async versions right away with a NULL AsyncInfoPtr.
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int32_t __tgt_rtl_init_plugin() {
-  auto Err = PluginTy::initIfNeeded();
-  if (Err) {
-    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
-    DP("Failed to init plugin: %s", ErrStr.c_str());
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
-  if (!PluginTy::isActive())
-    return false;
-
-  return PluginTy::get().is_valid_binary(Image);
-}
-
-int32_t __tgt_rtl_init_device(int32_t DeviceId) {
-  return PluginTy::get().init_device(DeviceId);
-}
-
-int32_t __tgt_rtl_number_of_devices() {
-  return PluginTy::get().number_of_devices();
-}
-
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
-  return PluginTy::get().init_requires(RequiresFlags);
-}
-
-int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
-                                      int32_t DstDeviceId) {
-  return PluginTy::get().is_data_exchangable(SrcDeviceId, DstDeviceId);
-}
-
-int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
-                                           void *VAddr, bool isRecord,
-                                           bool SaveOutput,
-                                           uint64_t &ReqPtrArgOffset) {
-  return PluginTy::get().initialize_record_replay(
-      DeviceId, MemorySize, VAddr, isRecord, SaveOutput, ReqPtrArgOffset);
-}
-
-int32_t __tgt_rtl_load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
-                              __tgt_device_binary *Binary) {
-  return PluginTy::get().load_binary(DeviceId, TgtImage, Binary);
-}
-
-void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
-                           int32_t Kind) {
-  return PluginTy::get().data_alloc(DeviceId, Size, HostPtr, Kind);
-}
-
-int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) {
-  return PluginTy::get().data_delete(DeviceId, TgtPtr, Kind);
-}
-
-int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
-                            void **LockedPtr) {
-  return PluginTy::get().data_lock(DeviceId, Ptr, Size, LockedPtr);
-}
-
-int32_t __tgt_rtl_data_unlock(int32_t DeviceId, void *Ptr) {
-  return PluginTy::get().data_unlock(DeviceId, Ptr);
-}
-
-int32_t __tgt_rtl_data_notify_mapped(int32_t DeviceId, void *HstPtr,
-                                     int64_t Size) {
-  return PluginTy::get().data_notify_mapped(DeviceId, HstPtr, Size);
-}
-
-int32_t __tgt_rtl_data_notify_unmapped(int32_t DeviceId, void *HstPtr) {
-  return PluginTy::get().data_notify_unmapped(DeviceId, HstPtr);
-}
-
-int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                              int64_t Size) {
-  return PluginTy::get().data_submit(DeviceId, TgtPtr, HstPtr, Size);
-}
-
-int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
-                                    void *HstPtr, int64_t Size,
-                                    __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().data_submit_async(DeviceId, TgtPtr, HstPtr, Size,
-                                           AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                                int64_t Size) {
-  return PluginTy::get().data_retrieve(DeviceId, HstPtr, TgtPtr, Size);
-}
-
-int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
-                                      void *TgtPtr, int64_t Size,
-                                      __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size,
-                                             AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_exchange(int32_t SrcDeviceId, void *SrcPtr,
-                                int32_t DstDeviceId, void *DstPtr,
-                                int64_t Size) {
-  return PluginTy::get().data_exchange(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr,
-                                       Size);
-}
-
-int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
-                                      int DstDeviceId, void *DstPtr,
-                                      int64_t Size,
-                                      __tgt_async_info *AsyncInfo) {
-  return PluginTy::get().data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId,
-                                             DstPtr, Size, AsyncInfo);
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                KernelArgsTy *KernelArgs,
-                                __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().launch_kernel(DeviceId, TgtEntryPtr, TgtArgs,
-                                       TgtOffsets, KernelArgs, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_synchronize(int32_t DeviceId,
-                              __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().synchronize(DeviceId, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_query_async(int32_t DeviceId,
-                              __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().query_async(DeviceId, AsyncInfoPtr);
-}
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
-  PluginTy::get().print_device_info(DeviceId);
-}
-
-int32_t __tgt_rtl_create_event(int32_t DeviceId, void **EventPtr) {
-  return PluginTy::get().create_event(DeviceId, EventPtr);
-}
-
-int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
-                               __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().record_event(DeviceId, EventPtr, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
-                             __tgt_async_info *AsyncInfoPtr) {
-  return PluginTy::get().wait_event(DeviceId, EventPtr, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
-  return PluginTy::get().sync_event(DeviceId, EventPtr);
-}
-
-int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
-  return PluginTy::get().destroy_event(DeviceId, EventPtr);
-}
-
-void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
-  return PluginTy::get().set_info_flag(NewInfoLevel);
-}
-
-int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
-                                  __tgt_async_info **AsyncInfoPtr) {
-  return PluginTy::get().init_async_info(DeviceId, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
-                                   __tgt_device_info *DeviceInfo,
-                                   const char **ErrStr) {
-  return PluginTy::get().init_device_info(DeviceId, DeviceInfo, ErrStr);
-}
-
-int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) {
-  return PluginTy::get().set_device_offset(DeviceIdOffset);
-}
-
-int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) {
-  return PluginTy::get().use_auto_zero_copy(DeviceId);
-}
-
-int32_t __tgt_rtl_get_global(__tgt_device_binary Binary, uint64_t Size,
-                             const char *Name, void **DevicePtr) {
-  return PluginTy::get().get_global(Binary, Size, Name, DevicePtr);
-}
-
-int32_t __tgt_rtl_get_function(__tgt_device_binary Binary, const char *Name,
-                               void **KernelPtr) {
-  return PluginTy::get().get_function(Binary, Name, KernelPtr);
-}
-
-#ifdef __cplusplus
-}
-#endif

diff --git a/libomptarget/plugins-nextgen/common/src/RPC.cpp b/libomptarget/plugins-nextgen/common/src/RPC.cpp
deleted file mode 100644
index faa2cbd..0000000
--- a/libomptarget/plugins-nextgen/common/src/RPC.cpp
+++ /dev/null

@@ -1,137 +0,0 @@
-//===- RPC.h - Interface for remote procedure calls from the GPU ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "RPC.h"
-
-#include "Shared/Debug.h"
-
-#include "PluginInterface.h"
-
-#if defined(LIBOMPTARGET_RPC_SUPPORT)
-#include "llvm-libc-types/rpc_opcodes_t.h"
-#include "llvmlibc_rpc_server.h"
-#endif
-
-using namespace llvm;
-using namespace omp;
-using namespace target;
-
-RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
-    : Handles(Plugin.getNumDevices()) {}
-
-llvm::Expected<bool>
-RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
-                              plugin::GenericGlobalHandlerTy &Handler,
-                              plugin::DeviceImageTy &Image) {
-#ifdef LIBOMPTARGET_RPC_SUPPORT
-  return Handler.isSymbolInImage(Device, Image, rpc_client_symbol_name);
-#else
-  return false;
-#endif
-}
-
-Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
-                              plugin::GenericGlobalHandlerTy &Handler,
-                              plugin::DeviceImageTy &Image) {
-#ifdef LIBOMPTARGET_RPC_SUPPORT
-  auto Alloc = [](uint64_t Size, void *Data) {
-    plugin::GenericDeviceTy &Device =
-        *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-    return Device.allocate(Size, nullptr, TARGET_ALLOC_HOST);
-  };
-  uint64_t NumPorts =
-      std::min(Device.requestedRPCPortCount(), RPC_MAXIMUM_PORT_COUNT);
-  rpc_device_t RPCDevice;
-  if (rpc_status_t Err = rpc_server_init(&RPCDevice, NumPorts,
-                                         Device.getWarpSize(), Alloc, &Device))
-    return plugin::Plugin::error(
-        "Failed to initialize RPC server for device %d: %d",
-        Device.getDeviceId(), Err);
-
-  // Register a custom opcode handler to perform plugin specific allocation.
-  auto MallocHandler = [](rpc_port_t Port, void *Data) {
-    rpc_recv_and_send(
-        Port,
-        [](rpc_buffer_t *Buffer, void *Data) {
-          plugin::GenericDeviceTy &Device =
-              *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-          Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
-              Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
-        },
-        Data);
-  };
-  if (rpc_status_t Err =
-          rpc_register_callback(RPCDevice, RPC_MALLOC, MallocHandler, &Device))
-    return plugin::Plugin::error(
-        "Failed to register RPC malloc handler for device %d: %d\n",
-        Device.getDeviceId(), Err);
-
-  // Register a custom opcode handler to perform plugin specific deallocation.
-  auto FreeHandler = [](rpc_port_t Port, void *Data) {
-    rpc_recv(
-        Port,
-        [](rpc_buffer_t *Buffer, void *Data) {
-          plugin::GenericDeviceTy &Device =
-              *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-          Device.free(reinterpret_cast<void *>(Buffer->data[0]),
-                      TARGET_ALLOC_DEVICE_NON_BLOCKING);
-        },
-        Data);
-  };
-  if (rpc_status_t Err =
-          rpc_register_callback(RPCDevice, RPC_FREE, FreeHandler, &Device))
-    return plugin::Plugin::error(
-        "Failed to register RPC free handler for device %d: %d\n",
-        Device.getDeviceId(), Err);
-
-  // Get the address of the RPC client from the device.
-  void *ClientPtr;
-  plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *));
-  if (auto Err =
-          Handler.getGlobalMetadataFromDevice(Device, Image, ClientGlobal))
-    return Err;
-
-  if (auto Err = Device.dataRetrieve(&ClientPtr, ClientGlobal.getPtr(),
-                                     sizeof(void *), nullptr))
-    return Err;
-
-  const void *ClientBuffer = rpc_get_client_buffer(RPCDevice);
-  if (auto Err = Device.dataSubmit(ClientPtr, ClientBuffer,
-                                   rpc_get_client_size(), nullptr))
-    return Err;
-  Handles[Device.getDeviceId()] = RPCDevice.handle;
-#endif
-  return Error::success();
-}
-
-Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) {
-#ifdef LIBOMPTARGET_RPC_SUPPORT
-  rpc_device_t RPCDevice{Handles[Device.getDeviceId()]};
-  if (rpc_status_t Err = rpc_handle_server(RPCDevice))
-    return plugin::Plugin::error(
-        "Error while running RPC server on device %d: %d", Device.getDeviceId(),
-        Err);
-#endif
-  return Error::success();
-}
-
-Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
-#ifdef LIBOMPTARGET_RPC_SUPPORT
-  rpc_device_t RPCDevice{Handles[Device.getDeviceId()]};
-  auto Dealloc = [](void *Ptr, void *Data) {
-    plugin::GenericDeviceTy &Device =
-        *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-    Device.free(Ptr, TARGET_ALLOC_HOST);
-  };
-  if (rpc_status_t Err = rpc_server_shutdown(RPCDevice, Dealloc, &Device))
-    return plugin::Plugin::error(
-        "Failed to shut down RPC server for device %d: %d",
-        Device.getDeviceId(), Err);
-#endif
-  return Error::success();
-}

diff --git a/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp b/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp
deleted file mode 100644
index 2ae97f0..0000000
--- a/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp
+++ /dev/null

@@ -1,333 +0,0 @@
-//===-- Utils/ELF.cpp - Common ELF functionality --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Common ELF functionality for target plugins.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Utils/ELF.h"
-
-#include "llvm/BinaryFormat/Magic.h"
-#include "llvm/Object/Binary.h"
-#include "llvm/Object/ELFObjectFile.h"
-#include "llvm/Object/ELFTypes.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/MemoryBuffer.h"
-
-using namespace llvm;
-using namespace llvm::ELF;
-using namespace llvm::object;
-
-bool utils::elf::isELF(StringRef Buffer) {
-  switch (identify_magic(Buffer)) {
-  case file_magic::elf:
-  case file_magic::elf_relocatable:
-  case file_magic::elf_executable:
-  case file_magic::elf_shared_object:
-  case file_magic::elf_core:
-    return true;
-  default:
-    return false;
-  }
-}
-
-template <class ELFT>
-static Expected<bool>
-checkMachineImpl(const object::ELFObjectFile<ELFT> &ELFObj, uint16_t EMachine) {
-  const auto Header = ELFObj.getELFFile().getHeader();
-  if (Header.e_type != ET_EXEC && Header.e_type != ET_DYN)
-    return createError("Only executable ELF files are supported");
-
-  if (Header.e_machine == EM_AMDGPU) {
-    if (Header.e_ident[EI_OSABI] != ELFOSABI_AMDGPU_HSA)
-      return createError("Invalid AMD OS/ABI, must be AMDGPU_HSA");
-    if (Header.e_ident[EI_ABIVERSION] != ELFABIVERSION_AMDGPU_HSA_V4 &&
-        Header.e_ident[EI_ABIVERSION] != ELFABIVERSION_AMDGPU_HSA_V5)
-      return createError("Invalid AMD ABI version, must be version 4 or 5");
-    if ((Header.e_flags & EF_AMDGPU_MACH) < EF_AMDGPU_MACH_AMDGCN_GFX700 ||
-        (Header.e_flags & EF_AMDGPU_MACH) > EF_AMDGPU_MACH_AMDGCN_GFX1201)
-      return createError("Unsupported AMDGPU architecture");
-  } else if (Header.e_machine == EM_CUDA) {
-    if (~Header.e_flags & EF_CUDA_64BIT_ADDRESS)
-      return createError("Invalid CUDA addressing mode");
-    if ((Header.e_flags & EF_CUDA_SM) < EF_CUDA_SM35 ||
-        (Header.e_flags & EF_CUDA_SM) > EF_CUDA_SM90)
-      return createError("Unsupported NVPTX architecture");
-  }
-
-  return Header.e_machine == EMachine;
-}
-
-Expected<bool> utils::elf::checkMachine(StringRef Object, uint16_t EMachine) {
-  assert(isELF(Object) && "Input is not an ELF!");
-
-  Expected<std::unique_ptr<ObjectFile>> ElfOrErr =
-      ObjectFile::createELFObjectFile(
-          MemoryBufferRef(Object, /*Identifier=*/""),
-          /*InitContent=*/false);
-  if (!ElfOrErr)
-    return ElfOrErr.takeError();
-
-  if (const ELF64LEObjectFile *ELFObj =
-          dyn_cast<ELF64LEObjectFile>(&**ElfOrErr))
-    return checkMachineImpl(*ELFObj, EMachine);
-  if (const ELF64BEObjectFile *ELFObj =
-          dyn_cast<ELF64BEObjectFile>(&**ElfOrErr))
-    return checkMachineImpl(*ELFObj, EMachine);
-  return createError("Only 64-bit ELF files are supported");
-}
-
-template <class ELFT>
-static Expected<const typename ELFT::Sym *>
-getSymbolFromGnuHashTable(StringRef Name, const typename ELFT::GnuHash &HashTab,
-                          ArrayRef<typename ELFT::Sym> SymTab,
-                          StringRef StrTab) {
-  const uint32_t NameHash = hashGnu(Name);
-  const typename ELFT::Word NBucket = HashTab.nbuckets;
-  const typename ELFT::Word SymOffset = HashTab.symndx;
-  ArrayRef<typename ELFT::Off> Filter = HashTab.filter();
-  ArrayRef<typename ELFT::Word> Bucket = HashTab.buckets();
-  ArrayRef<typename ELFT::Word> Chain = HashTab.values(SymTab.size());
-
-  // Check the bloom filter and exit early if the symbol is not present.
-  uint64_t ElfClassBits = ELFT::Is64Bits ? 64 : 32;
-  typename ELFT::Off Word =
-      Filter[(NameHash / ElfClassBits) % HashTab.maskwords];
-  uint64_t Mask = (0x1ull << (NameHash % ElfClassBits)) |
-                  (0x1ull << ((NameHash >> HashTab.shift2) % ElfClassBits));
-  if ((Word & Mask) != Mask)
-    return nullptr;
-
-  // The symbol may or may not be present, check the hash values.
-  for (typename ELFT::Word I = Bucket[NameHash % NBucket];
-       I >= SymOffset && I < SymTab.size(); I = I + 1) {
-    const uint32_t ChainHash = Chain[I - SymOffset];
-
-    if ((NameHash | 0x1) != (ChainHash | 0x1))
-      continue;
-
-    if (SymTab[I].st_name >= StrTab.size())
-      return createError("symbol [index " + Twine(I) +
-                         "] has invalid st_name: " + Twine(SymTab[I].st_name));
-    if (StrTab.drop_front(SymTab[I].st_name).data() == Name)
-      return &SymTab[I];
-
-    if (ChainHash & 0x1)
-      return nullptr;
-  }
-  return nullptr;
-}
-
-template <class ELFT>
-static Expected<const typename ELFT::Sym *>
-getSymbolFromSysVHashTable(StringRef Name, const typename ELFT::Hash &HashTab,
-                           ArrayRef<typename ELFT::Sym> SymTab,
-                           StringRef StrTab) {
-  const uint32_t Hash = hashSysV(Name);
-  const typename ELFT::Word NBucket = HashTab.nbucket;
-  ArrayRef<typename ELFT::Word> Bucket = HashTab.buckets();
-  ArrayRef<typename ELFT::Word> Chain = HashTab.chains();
-  for (typename ELFT::Word I = Bucket[Hash % NBucket]; I != ELF::STN_UNDEF;
-       I = Chain[I]) {
-    if (I >= SymTab.size())
-      return createError(
-          "symbol [index " + Twine(I) +
-          "] is greater than the number of symbols: " + Twine(SymTab.size()));
-    if (SymTab[I].st_name >= StrTab.size())
-      return createError("symbol [index " + Twine(I) +
-                         "] has invalid st_name: " + Twine(SymTab[I].st_name));
-
-    if (StrTab.drop_front(SymTab[I].st_name).data() == Name)
-      return &SymTab[I];
-  }
-  return nullptr;
-}
-
-template <class ELFT>
-static Expected<std::optional<ELFSymbolRef>>
-getHashTableSymbol(const ELFObjectFile<ELFT> &ELFObj,
-                   const typename ELFT::Shdr &Sec, StringRef Name) {
-  const ELFFile<ELFT> &Elf = ELFObj.getELFFile();
-  if (Sec.sh_type != ELF::SHT_HASH && Sec.sh_type != ELF::SHT_GNU_HASH)
-    return createError(
-        "invalid sh_type for hash table, expected SHT_HASH or SHT_GNU_HASH");
-  Expected<typename ELFT::ShdrRange> SectionsOrError = Elf.sections();
-  if (!SectionsOrError)
-    return SectionsOrError.takeError();
-
-  auto SymTabOrErr = getSection<ELFT>(*SectionsOrError, Sec.sh_link);
-  if (!SymTabOrErr)
-    return SymTabOrErr.takeError();
-
-  auto StrTabOrErr =
-      Elf.getStringTableForSymtab(**SymTabOrErr, *SectionsOrError);
-  if (!StrTabOrErr)
-    return StrTabOrErr.takeError();
-  StringRef StrTab = *StrTabOrErr;
-
-  auto SymsOrErr = Elf.symbols(*SymTabOrErr);
-  if (!SymsOrErr)
-    return SymsOrErr.takeError();
-  ArrayRef<typename ELFT::Sym> SymTab = *SymsOrErr;
-
-  // If this is a GNU hash table we verify its size and search the symbol
-  // table using the GNU hash table format.
-  if (Sec.sh_type == ELF::SHT_GNU_HASH) {
-    const typename ELFT::GnuHash *HashTab =
-        reinterpret_cast<const typename ELFT::GnuHash *>(Elf.base() +
-                                                         Sec.sh_offset);
-    if (Sec.sh_offset + Sec.sh_size >= Elf.getBufSize())
-      return createError("section has invalid sh_offset: " +
-                         Twine(Sec.sh_offset));
-    if (Sec.sh_size < sizeof(typename ELFT::GnuHash) ||
-        Sec.sh_size <
-            sizeof(typename ELFT::GnuHash) +
-                sizeof(typename ELFT::Word) * HashTab->maskwords +
-                sizeof(typename ELFT::Word) * HashTab->nbuckets +
-                sizeof(typename ELFT::Word) * (SymTab.size() - HashTab->symndx))
-      return createError("section has invalid sh_size: " + Twine(Sec.sh_size));
-    auto Sym = getSymbolFromGnuHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
-    if (!Sym)
-      return Sym.takeError();
-    if (!*Sym)
-      return std::nullopt;
-    return ELFObj.toSymbolRef(*SymTabOrErr, *Sym - &SymTab[0]);
-  }
-
-  // If this is a Sys-V hash table we verify its size and search the symbol
-  // table using the Sys-V hash table format.
-  if (Sec.sh_type == ELF::SHT_HASH) {
-    const typename ELFT::Hash *HashTab =
-        reinterpret_cast<const typename ELFT::Hash *>(Elf.base() +
-                                                      Sec.sh_offset);
-    if (Sec.sh_offset + Sec.sh_size >= Elf.getBufSize())
-      return createError("section has invalid sh_offset: " +
-                         Twine(Sec.sh_offset));
-    if (Sec.sh_size < sizeof(typename ELFT::Hash) ||
-        Sec.sh_size < sizeof(typename ELFT::Hash) +
-                          sizeof(typename ELFT::Word) * HashTab->nbucket +
-                          sizeof(typename ELFT::Word) * HashTab->nchain)
-      return createError("section has invalid sh_size: " + Twine(Sec.sh_size));
-
-    auto Sym = getSymbolFromSysVHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
-    if (!Sym)
-      return Sym.takeError();
-    if (!*Sym)
-      return std::nullopt;
-    return ELFObj.toSymbolRef(*SymTabOrErr, *Sym - &SymTab[0]);
-  }
-
-  return std::nullopt;
-}
-
-template <class ELFT>
-static Expected<std::optional<ELFSymbolRef>>
-getSymTableSymbol(const ELFObjectFile<ELFT> &ELFObj,
-                  const typename ELFT::Shdr &Sec, StringRef Name) {
-  const ELFFile<ELFT> &Elf = ELFObj.getELFFile();
-  if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM)
-    return createError(
-        "invalid sh_type for hash table, expected SHT_SYMTAB or SHT_DYNSYM");
-  Expected<typename ELFT::ShdrRange> SectionsOrError = Elf.sections();
-  if (!SectionsOrError)
-    return SectionsOrError.takeError();
-
-  auto StrTabOrErr = Elf.getStringTableForSymtab(Sec, *SectionsOrError);
-  if (!StrTabOrErr)
-    return StrTabOrErr.takeError();
-  StringRef StrTab = *StrTabOrErr;
-
-  auto SymsOrErr = Elf.symbols(&Sec);
-  if (!SymsOrErr)
-    return SymsOrErr.takeError();
-  ArrayRef<typename ELFT::Sym> SymTab = *SymsOrErr;
-
-  for (const typename ELFT::Sym &Sym : SymTab)
-    if (StrTab.drop_front(Sym.st_name).data() == Name)
-      return ELFObj.toSymbolRef(&Sec, &Sym - &SymTab[0]);
-
-  return std::nullopt;
-}
-
-template <class ELFT>
-static Expected<std::optional<ELFSymbolRef>>
-getSymbolImpl(const ELFObjectFile<ELFT> &ELFObj, StringRef Name) {
-  // First try to look up the symbol via the hash table.
-  for (ELFSectionRef Sec : ELFObj.sections()) {
-    if (Sec.getType() != SHT_HASH && Sec.getType() != SHT_GNU_HASH)
-      continue;
-
-    auto HashTabOrErr = ELFObj.getELFFile().getSection(Sec.getIndex());
-    if (!HashTabOrErr)
-      return HashTabOrErr.takeError();
-    return getHashTableSymbol<ELFT>(ELFObj, **HashTabOrErr, Name);
-  }
-
-  // If this is an executable file check the entire standard symbol table.
-  for (ELFSectionRef Sec : ELFObj.sections()) {
-    if (Sec.getType() != SHT_SYMTAB)
-      continue;
-
-    auto SymTabOrErr = ELFObj.getELFFile().getSection(Sec.getIndex());
-    if (!SymTabOrErr)
-      return SymTabOrErr.takeError();
-    return getSymTableSymbol<ELFT>(ELFObj, **SymTabOrErr, Name);
-  }
-
-  return std::nullopt;
-}
-
-Expected<std::optional<ELFSymbolRef>>
-utils::elf::getSymbol(const ObjectFile &Obj, StringRef Name) {
-  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(&Obj))
-    return getSymbolImpl(*ELFObj, Name);
-  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(&Obj))
-    return getSymbolImpl(*ELFObj, Name);
-  return createError("Only 64-bit ELF files are supported");
-}
-
-template <class ELFT>
-static Expected<const void *>
-getSymbolAddressImpl(const ELFObjectFile<ELFT> &ELFObj,
-                     const ELFSymbolRef &SymRef) {
-  const ELFFile<ELFT> &ELFFile = ELFObj.getELFFile();
-
-  auto SymOrErr = ELFObj.getSymbol(SymRef.getRawDataRefImpl());
-  if (!SymOrErr)
-    return SymOrErr.takeError();
-  const auto &Symbol = **SymOrErr;
-
-  auto SecOrErr = ELFFile.getSection(Symbol.st_shndx);
-  if (!SecOrErr)
-    return SecOrErr.takeError();
-  const auto &Section = *SecOrErr;
-
-  // A section with SHT_NOBITS occupies no space in the file and has no
-  // offset.
-  if (Section->sh_type == ELF::SHT_NOBITS)
-    return createError(
-        "invalid sh_type for symbol lookup, cannot be SHT_NOBITS");
-
-  uint64_t Offset = Section->sh_offset - Section->sh_addr + Symbol.st_value;
-  if (Offset > ELFFile.getBufSize())
-    return createError("invalid offset [" + Twine(Offset) +
-                       "] into ELF file of size [" +
-                       Twine(ELFFile.getBufSize()) + "]");
-
-  return ELFFile.base() + Offset;
-}
-
-Expected<const void *>
-utils::elf::getSymbolAddress(const ELFSymbolRef &SymRef) {
-  const ObjectFile *Obj = SymRef.getObject();
-  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
-    return getSymbolAddressImpl(*ELFObj, SymRef);
-  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
-    return getSymbolAddressImpl(*ELFObj, SymRef);
-  return createError("Only 64-bit ELF files are supported");
-}

diff --git a/libomptarget/plugins-nextgen/cuda/CMakeLists.txt b/libomptarget/plugins-nextgen/cuda/CMakeLists.txt
deleted file mode 100644
index b353046..0000000
--- a/libomptarget/plugins-nextgen/cuda/CMakeLists.txt
+++ /dev/null

@@ -1,58 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a CUDA machine if available.
-#
-##===----------------------------------------------------------------------===##
-set(LIBOMPTARGET_BUILD_CUDA_PLUGIN TRUE CACHE BOOL
-  "Whether to build CUDA plugin")
-if (NOT LIBOMPTARGET_BUILD_CUDA_PLUGIN)
-  libomptarget_say("Not building CUDA NextGen offloading plugin: LIBOMPTARGET_BUILD_CUDA_PLUGIN is false")
-  return()
-endif()
-
-if (NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
-  libomptarget_say("Not building CUDA NextGen offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.")
-  return()
-endif()
-
-libomptarget_say("Building CUDA NextGen offloading plugin.")
-
-# Create the library and add the default arguments.
-add_target_library(omptarget.rtl.cuda CUDA)
-
-target_sources(omptarget.rtl.cuda PRIVATE src/rtl.cpp)
-
-option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" OFF)
-if(LIBOMPTARGET_DEP_CUDA_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
-  libomptarget_say("Building CUDA plugin linked against libcuda")
-  target_link_libraries(omptarget.rtl.cuda PRIVATE CUDA::cuda_driver)
-else()
-  libomptarget_say("Building CUDA plugin for dlopened libcuda")
-  target_include_directories(omptarget.rtl.cuda PRIVATE dynamic_cuda)
-  target_sources(omptarget.rtl.cuda PRIVATE dynamic_cuda/cuda.cpp)
-endif()
-
-# Configure testing for the CUDA plugin. We will build tests if we could a
-# functional NVIDIA GPU on the system, or if manually specifies by the user.
-option(LIBOMPTARGET_FORCE_NVIDIA_TESTS "Build NVIDIA libomptarget tests" OFF)
-if (LIBOMPTARGET_FOUND_NVIDIA_GPU OR LIBOMPTARGET_FORCE_NVIDIA_TESTS)
-  libomptarget_say("Enable tests using CUDA plugin")
-  set(LIBOMPTARGET_SYSTEM_TARGETS 
-      "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda nvptx64-nvidia-cuda-LTO" PARENT_SCOPE)
-  list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.cuda")
-  set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-else()
-  libomptarget_say("Not generating NVIDIA tests, no supported devices detected."
-                   " Use 'LIBOMPTARGET_FORCE_NVIDIA_TESTS' to override.")
-endif()
-
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.cuda PROPERTIES
-  INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")

diff --git a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
deleted file mode 100644
index 5ec3adb..0000000
--- a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ /dev/null

@@ -1,170 +0,0 @@
-//===--- cuda/dynamic_cuda/cuda.pp ------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implement subset of cuda api by calling into cuda library via dlopen
-// Does the dlopen/dlsym calls as part of the call to cuInit
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/DynamicLibrary.h"
-
-#include "Shared/Debug.h"
-
-#include "DLWrap.h"
-#include "cuda.h"
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-DLWRAP_INITIALIZE()
-
-DLWRAP_INTERNAL(cuInit, 1)
-
-DLWRAP(cuCtxGetDevice, 1)
-DLWRAP(cuDeviceGet, 2)
-DLWRAP(cuDeviceGetAttribute, 3)
-DLWRAP(cuDeviceGetCount, 1)
-DLWRAP(cuFuncGetAttribute, 3)
-
-// Device info
-DLWRAP(cuDeviceGetName, 3)
-DLWRAP(cuDeviceTotalMem, 2)
-DLWRAP(cuDriverGetVersion, 1)
-
-DLWRAP(cuGetErrorString, 2)
-DLWRAP(cuLaunchKernel, 11)
-
-DLWRAP(cuMemAlloc, 2)
-DLWRAP(cuMemAllocHost, 2)
-DLWRAP(cuMemAllocManaged, 3)
-DLWRAP(cuMemAllocAsync, 3)
-
-DLWRAP(cuMemcpyDtoDAsync, 4)
-DLWRAP(cuMemcpyDtoH, 3)
-DLWRAP(cuMemcpyDtoHAsync, 4)
-DLWRAP(cuMemcpyHtoD, 3)
-DLWRAP(cuMemcpyHtoDAsync, 4)
-
-DLWRAP(cuMemFree, 1)
-DLWRAP(cuMemFreeHost, 1)
-DLWRAP(cuMemFreeAsync, 2)
-
-DLWRAP(cuModuleGetFunction, 3)
-DLWRAP(cuModuleGetGlobal, 4)
-
-DLWRAP(cuModuleUnload, 1)
-DLWRAP(cuStreamCreate, 2)
-DLWRAP(cuStreamDestroy, 1)
-DLWRAP(cuStreamSynchronize, 1)
-DLWRAP(cuStreamQuery, 1)
-DLWRAP(cuCtxSetCurrent, 1)
-DLWRAP(cuDevicePrimaryCtxRelease, 1)
-DLWRAP(cuDevicePrimaryCtxGetState, 3)
-DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
-DLWRAP(cuDevicePrimaryCtxRetain, 2)
-DLWRAP(cuModuleLoadDataEx, 5)
-
-DLWRAP(cuDeviceCanAccessPeer, 3)
-DLWRAP(cuCtxEnablePeerAccess, 2)
-DLWRAP(cuMemcpyPeerAsync, 6)
-
-DLWRAP(cuCtxGetLimit, 2)
-DLWRAP(cuCtxSetLimit, 2)
-
-DLWRAP(cuEventCreate, 2)
-DLWRAP(cuEventRecord, 2)
-DLWRAP(cuStreamWaitEvent, 3)
-DLWRAP(cuEventSynchronize, 1)
-DLWRAP(cuEventDestroy, 1)
-
-DLWRAP_FINALIZE()
-
-DLWRAP(cuMemUnmap, 2)
-DLWRAP(cuMemRelease, 1)
-DLWRAP(cuMemAddressFree, 2)
-DLWRAP(cuMemGetInfo, 2)
-DLWRAP(cuMemAddressReserve, 5)
-DLWRAP(cuMemMap, 5)
-DLWRAP(cuMemCreate, 4)
-DLWRAP(cuMemSetAccess, 4)
-DLWRAP(cuMemGetAllocationGranularity, 3)
-
-#ifndef DYNAMIC_CUDA_PATH
-#define DYNAMIC_CUDA_PATH "libcuda.so"
-#endif
-
-#ifndef TARGET_NAME
-#define TARGET_NAME CUDA
-#endif
-#ifndef DEBUG_PREFIX
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-#endif
-
-static bool checkForCUDA() {
-  // return true if dlopen succeeded and all functions found
-
-  // Prefer _v2 versions of functions if found in the library
-  std::unordered_map<std::string, const char *> TryFirst = {
-      {"cuMemAlloc", "cuMemAlloc_v2"},
-      {"cuMemFree", "cuMemFree_v2"},
-      {"cuMemcpyDtoH", "cuMemcpyDtoH_v2"},
-      {"cuMemcpyHtoD", "cuMemcpyHtoD_v2"},
-      {"cuStreamDestroy", "cuStreamDestroy_v2"},
-      {"cuModuleGetGlobal", "cuModuleGetGlobal_v2"},
-      {"cuMemcpyDtoHAsync", "cuMemcpyDtoHAsync_v2"},
-      {"cuMemcpyDtoDAsync", "cuMemcpyDtoDAsync_v2"},
-      {"cuMemcpyHtoDAsync", "cuMemcpyHtoDAsync_v2"},
-      {"cuDevicePrimaryCtxRelease", "cuDevicePrimaryCtxRelease_v2"},
-      {"cuDevicePrimaryCtxSetFlags", "cuDevicePrimaryCtxSetFlags_v2"},
-  };
-
-  const char *CudaLib = DYNAMIC_CUDA_PATH;
-  std::string ErrMsg;
-  auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
-      llvm::sys::DynamicLibrary::getPermanentLibrary(CudaLib, &ErrMsg));
-  if (!DynlibHandle->isValid()) {
-    DP("Unable to load library '%s': %s!\n", CudaLib, ErrMsg.c_str());
-    return false;
-  }
-
-  for (size_t I = 0; I < dlwrap::size(); I++) {
-    const char *Sym = dlwrap::symbol(I);
-
-    auto It = TryFirst.find(Sym);
-    if (It != TryFirst.end()) {
-      const char *First = It->second;
-      void *P = DynlibHandle->getAddressOfSymbol(First);
-      if (P) {
-        DP("Implementing %s with dlsym(%s) -> %p\n", Sym, First, P);
-        *dlwrap::pointer(I) = P;
-        continue;
-      }
-    }
-
-    void *P = DynlibHandle->getAddressOfSymbol(Sym);
-    if (P == nullptr) {
-      DP("Unable to find '%s' in '%s'!\n", Sym, CudaLib);
-      return false;
-    }
-    DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
-
-    *dlwrap::pointer(I) = P;
-  }
-
-  return true;
-}
-
-CUresult cuInit(unsigned X) {
-  // Note: Called exactly once from cuda rtl.cpp in a global constructor so
-  // does not need to handle being called repeatedly or concurrently
-  if (!checkForCUDA()) {
-    return CUDA_ERROR_INVALID_HANDLE;
-  }
-  return dlwrap_cuInit(X);
-}

diff --git a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
deleted file mode 100644
index 32031c2..0000000
--- a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ /dev/null

@@ -1,355 +0,0 @@
-//===--- cuda/dynamic_cuda/cuda.h --------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The parts of the cuda api that are presently in use by the openmp cuda plugin
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef DYNAMIC_CUDA_CUDA_H_INCLUDED
-#define DYNAMIC_CUDA_CUDA_H_INCLUDED
-
-#include <cstddef>
-#include <cstdint>
-
-typedef int CUdevice;
-typedef uintptr_t CUdeviceptr;
-typedef struct CUmod_st *CUmodule;
-typedef struct CUctx_st *CUcontext;
-typedef struct CUfunc_st *CUfunction;
-typedef struct CUstream_st *CUstream;
-typedef struct CUevent_st *CUevent;
-
-#define CU_DEVICE_INVALID ((CUdevice)-2)
-
-typedef unsigned long long CUmemGenericAllocationHandle_v1;
-typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle;
-
-#define CU_DEVICE_INVALID ((CUdevice)-2)
-
-typedef enum CUmemAllocationGranularity_flags_enum {
-  CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0,
-  CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1
-} CUmemAllocationGranularity_flags;
-
-typedef enum CUmemAccess_flags_enum {
-  CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0,
-  CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1,
-  CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3,
-  CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF
-} CUmemAccess_flags;
-
-typedef enum CUmemLocationType_enum {
-  CU_MEM_LOCATION_TYPE_INVALID = 0x0,
-  CU_MEM_LOCATION_TYPE_DEVICE = 0x1,
-  CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF
-} CUmemLocationType;
-
-typedef struct CUmemLocation_st {
-  CUmemLocationType type;
-  int id;
-} CUmemLocation_v1;
-typedef CUmemLocation_v1 CUmemLocation;
-
-typedef struct CUmemAccessDesc_st {
-  CUmemLocation location;
-  CUmemAccess_flags flags;
-} CUmemAccessDesc_v1;
-
-typedef CUmemAccessDesc_v1 CUmemAccessDesc;
-
-typedef enum CUmemAllocationType_enum {
-  CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
-  CU_MEM_ALLOCATION_TYPE_PINNED = 0x1,
-  CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF
-} CUmemAllocationType;
-
-typedef enum CUmemAllocationHandleType_enum {
-  CU_MEM_HANDLE_TYPE_NONE = 0x0,
-  CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,
-  CU_MEM_HANDLE_TYPE_WIN32 = 0x2,
-  CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4,
-  CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF
-} CUmemAllocationHandleType;
-
-typedef struct CUmemAllocationProp_st {
-  CUmemAllocationType type;
-  CUmemAllocationHandleType requestedHandleTypes;
-  CUmemLocation location;
-
-  void *win32HandleMetaData;
-  struct {
-    unsigned char compressionType;
-    unsigned char gpuDirectRDMACapable;
-    unsigned short usage;
-    unsigned char reserved[4];
-  } allocFlags;
-} CUmemAllocationProp_v1;
-typedef CUmemAllocationProp_v1 CUmemAllocationProp;
-
-typedef enum cudaError_enum {
-  CUDA_SUCCESS = 0,
-  CUDA_ERROR_INVALID_VALUE = 1,
-  CUDA_ERROR_NO_DEVICE = 100,
-  CUDA_ERROR_INVALID_HANDLE = 400,
-  CUDA_ERROR_NOT_READY = 600,
-  CUDA_ERROR_TOO_MANY_PEERS = 711,
-} CUresult;
-
-typedef enum CUstream_flags_enum {
-  CU_STREAM_DEFAULT = 0x0,
-  CU_STREAM_NON_BLOCKING = 0x1,
-} CUstream_flags;
-
-typedef enum CUlimit_enum {
-  CU_LIMIT_STACK_SIZE = 0x0,
-  CU_LIMIT_PRINTF_FIFO_SIZE = 0x1,
-  CU_LIMIT_MALLOC_HEAP_SIZE = 0x2,
-  CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x3,
-  CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x4,
-  CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x5,
-  CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x6,
-  CU_LIMIT_MAX
-} CUlimit;
-
-typedef enum CUdevice_attribute_enum {
-  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
-  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
-  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,
-  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,
-  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
-  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,
-  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,
-  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,
-  CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,
-  CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
-  CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
-  CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
-  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
-  CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,
-  CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
-  CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,
-  CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
-  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
-  CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,
-  CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
-  CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
-  CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,
-  CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,
-  CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
-  CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,
-  CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,
-  CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,
-  CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,
-  CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,
-  CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,
-  CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,
-  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
-  CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
-  CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,
-  CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,
-  CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,
-  CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,
-  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
-  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,
-  CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,
-  CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,
-  CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,
-  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,
-  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
-  CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,
-  CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,
-  CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,
-  CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,
-  CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,
-  CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,
-  CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,
-  CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,
-  CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,
-  CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92,
-  CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93,
-  CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94,
-  CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,
-  CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,
-  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97,
-  CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,
-  CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,
-  CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100,
-  CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101,
-  CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102,
-  CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102,
-  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103,
-  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104,
-  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105,
-  CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106,
-  CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107,
-  CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108,
-  CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109,
-  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110,
-  CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111,
-  CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112,
-  CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113,
-  CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114,
-  CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115,
-  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116,
-  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117,
-  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118,
-  CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119,
-  CU_DEVICE_ATTRIBUTE_MAX,
-} CUdevice_attribute;
-
-typedef enum CUfunction_attribute_enum {
-  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
-} CUfunction_attribute;
-
-typedef enum CUctx_flags_enum {
-  CU_CTX_SCHED_BLOCKING_SYNC = 0x04,
-  CU_CTX_SCHED_MASK = 0x07,
-} CUctx_flags;
-
-typedef enum CUmemAttach_flags_enum {
-  CU_MEM_ATTACH_GLOBAL = 0x1,
-  CU_MEM_ATTACH_HOST = 0x2,
-  CU_MEM_ATTACH_SINGLE = 0x4,
-} CUmemAttach_flags;
-
-typedef enum CUcomputeMode_enum {
-  CU_COMPUTEMODE_DEFAULT = 0,
-  CU_COMPUTEMODE_PROHIBITED = 2,
-  CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3,
-} CUcompute_mode;
-
-typedef enum CUevent_flags_enum {
-  CU_EVENT_DEFAULT = 0x0,
-  CU_EVENT_BLOCKING_SYNC = 0x1,
-  CU_EVENT_DISABLE_TIMING = 0x2,
-  CU_EVENT_INTERPROCESS = 0x4
-} CUevent_flags;
-
-CUresult cuCtxGetDevice(CUdevice *);
-CUresult cuDeviceGet(CUdevice *, int);
-CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
-CUresult cuDeviceGetCount(int *);
-CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction);
-
-// Device info
-CUresult cuDeviceGetName(char *, int, CUdevice);
-CUresult cuDeviceTotalMem(size_t *, CUdevice);
-CUresult cuDriverGetVersion(int *);
-
-CUresult cuGetErrorString(CUresult, const char **);
-CUresult cuInit(unsigned);
-CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
-                        unsigned, unsigned, unsigned, CUstream, void **,
-                        void **);
-
-CUresult cuMemAlloc(CUdeviceptr *, size_t);
-CUresult cuMemAllocHost(void **, size_t);
-CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
-CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);
-
-CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
-CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
-CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
-CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
-
-CUresult cuMemFree(CUdeviceptr);
-CUresult cuMemFreeHost(void *);
-CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
-
-CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
-CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
-
-CUresult cuModuleUnload(CUmodule);
-CUresult cuStreamCreate(CUstream *, unsigned);
-CUresult cuStreamDestroy(CUstream);
-CUresult cuStreamSynchronize(CUstream);
-CUresult cuStreamQuery(CUstream);
-CUresult cuCtxSetCurrent(CUcontext);
-CUresult cuDevicePrimaryCtxRelease(CUdevice);
-CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
-CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned);
-CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice);
-CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *,
-                            void **);
-
-CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice);
-CUresult cuCtxEnablePeerAccess(CUcontext, unsigned);
-CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext,
-                           size_t, CUstream);
-
-CUresult cuCtxGetLimit(size_t *, CUlimit);
-CUresult cuCtxSetLimit(CUlimit, size_t);
-
-CUresult cuEventCreate(CUevent *, unsigned int);
-CUresult cuEventRecord(CUevent, CUstream);
-CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
-CUresult cuEventSynchronize(CUevent);
-CUresult cuEventDestroy(CUevent);
-
-CUresult cuMemUnmap(CUdeviceptr ptr, size_t size);
-CUresult cuMemRelease(CUmemGenericAllocationHandle handle);
-CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size);
-CUresult cuMemGetInfo(size_t *free, size_t *total);
-CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
-                             CUdeviceptr addr, unsigned long long flags);
-CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
-                  CUmemGenericAllocationHandle handle,
-                  unsigned long long flags);
-CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
-                     const CUmemAllocationProp *prop, unsigned long long flags);
-CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
-                        const CUmemAccessDesc *desc, size_t count);
-CUresult cuMemGetAllocationGranularity(size_t *granularity,
-                                       const CUmemAllocationProp *prop,
-                                       CUmemAllocationGranularity_flags option);
-
-#endif

diff --git a/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
deleted file mode 100644
index fc74c6a..0000000
--- a/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ /dev/null

@@ -1,1518 +0,0 @@
-//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL NextGen for CUDA machine
-//
-//===----------------------------------------------------------------------===//
-
-#include <cassert>
-#include <cstddef>
-#include <cuda.h>
-#include <string>
-#include <unordered_map>
-
-#include "Shared/Debug.h"
-#include "Shared/Environment.h"
-
-#include "GlobalHandler.h"
-#include "OpenMP/OMPT/Callback.h"
-#include "PluginInterface.h"
-#include "Utils/ELF.h"
-
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/FileOutputBuffer.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Program.h"
-
-namespace llvm {
-namespace omp {
-namespace target {
-namespace plugin {
-
-/// Forward declarations for all specialized data structures.
-struct CUDAKernelTy;
-struct CUDADeviceTy;
-struct CUDAPluginTy;
-
-#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11000))
-/// Forward declarations for all Virtual Memory Management
-/// related data structures and functions. This is necessary
-/// for older cuda versions.
-typedef void *CUmemGenericAllocationHandle;
-typedef void *CUmemAllocationProp;
-typedef void *CUmemAccessDesc;
-typedef void *CUmemAllocationGranularity_flags;
-CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
-                             CUdeviceptr addr, unsigned long long flags) {}
-CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
-                  CUmemGenericAllocationHandle handle,
-                  unsigned long long flags) {}
-CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
-                     const CUmemAllocationProp *prop,
-                     unsigned long long flags) {}
-CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
-                        const CUmemAccessDesc *desc, size_t count) {}
-CUresult
-cuMemGetAllocationGranularity(size_t *granularity,
-                              const CUmemAllocationProp *prop,
-                              CUmemAllocationGranularity_flags option) {}
-#endif
-
-#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
-// Forward declarations of asynchronous memory management functions. This is
-// necessary for older versions of CUDA.
-CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) { *ptr = 0; }
-
-CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {}
-#endif
-
-/// Class implementing the CUDA device images properties.
-struct CUDADeviceImageTy : public DeviceImageTy {
-  /// Create the CUDA image with the id and the target image pointer.
-  CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
-                    const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {}
-
-  /// Load the image as a CUDA module.
-  Error loadModule() {
-    assert(!Module && "Module already loaded");
-
-    CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
-    if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
-      return Err;
-
-    return Plugin::success();
-  }
-
-  /// Unload the CUDA module corresponding to the image.
-  Error unloadModule() {
-    assert(Module && "Module not loaded");
-
-    CUresult Res = cuModuleUnload(Module);
-    if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
-      return Err;
-
-    Module = nullptr;
-
-    return Plugin::success();
-  }
-
-  /// Getter of the CUDA module.
-  CUmodule getModule() const { return Module; }
-
-private:
-  /// The CUDA module that loaded the image.
-  CUmodule Module;
-};
-
-/// Class implementing the CUDA kernel functionalities which derives from the
-/// generic kernel class.
-struct CUDAKernelTy : public GenericKernelTy {
-  /// Create a CUDA kernel with a name and an execution mode.
-  CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}
-
-  /// Initialize the CUDA kernel.
-  Error initImpl(GenericDeviceTy &GenericDevice,
-                 DeviceImageTy &Image) override {
-    CUresult Res;
-    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
-
-    // Retrieve the function pointer of the kernel.
-    Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
-    if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
-                                 getName()))
-      return Err;
-
-    // Check that the function pointer is valid.
-    if (!Func)
-      return Plugin::error("Invalid function for kernel %s", getName());
-
-    int MaxThreads;
-    Res = cuFuncGetAttribute(&MaxThreads,
-                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
-    if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
-      return Err;
-
-    // The maximum number of threads cannot exceed the maximum of the kernel.
-    MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
-
-    return Plugin::success();
-  }
-
-  /// Launch the CUDA kernel function.
-  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
-                   uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
-                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
-
-private:
-  /// The CUDA kernel function to execute.
-  CUfunction Func;
-};
-
-/// Class wrapping a CUDA stream reference. These are the objects handled by the
-/// Stream Manager for the CUDA plugin.
-struct CUDAStreamRef final : public GenericDeviceResourceRef {
-  /// The underlying handle type for streams.
-  using HandleTy = CUstream;
-
-  /// Create an empty reference to an invalid stream.
-  CUDAStreamRef() : Stream(nullptr) {}
-
-  /// Create a reference to an existing stream.
-  CUDAStreamRef(HandleTy Stream) : Stream(Stream) {}
-
-  /// Create a new stream and save the reference. The reference must be empty
-  /// before calling to this function.
-  Error create(GenericDeviceTy &Device) override {
-    if (Stream)
-      return Plugin::error("Creating an existing stream");
-
-    CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
-    if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s"))
-      return Err;
-
-    return Plugin::success();
-  }
-
-  /// Destroy the referenced stream and invalidate the reference. The reference
-  /// must be to a valid stream before calling to this function.
-  Error destroy(GenericDeviceTy &Device) override {
-    if (!Stream)
-      return Plugin::error("Destroying an invalid stream");
-
-    CUresult Res = cuStreamDestroy(Stream);
-    if (auto Err = Plugin::check(Res, "Error in cuStreamDestroy: %s"))
-      return Err;
-
-    Stream = nullptr;
-    return Plugin::success();
-  }
-
-  /// Get the underlying CUDA stream.
-  operator HandleTy() const { return Stream; }
-
-private:
-  /// The reference to the CUDA stream.
-  HandleTy Stream;
-};
-
-/// Class wrapping a CUDA event reference. These are the objects handled by the
-/// Event Manager for the CUDA plugin.
-struct CUDAEventRef final : public GenericDeviceResourceRef {
-  /// The underlying handle type for events.
-  using HandleTy = CUevent;
-
-  /// Create an empty reference to an invalid event.
-  CUDAEventRef() : Event(nullptr) {}
-
-  /// Create a reference to an existing event.
-  CUDAEventRef(HandleTy Event) : Event(Event) {}
-
-  /// Create a new event and save the reference. The reference must be empty
-  /// before calling to this function.
-  Error create(GenericDeviceTy &Device) override {
-    if (Event)
-      return Plugin::error("Creating an existing event");
-
-    CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT);
-    if (auto Err = Plugin::check(Res, "Error in cuEventCreate: %s"))
-      return Err;
-
-    return Plugin::success();
-  }
-
-  /// Destroy the referenced event and invalidate the reference. The reference
-  /// must be to a valid event before calling to this function.
-  Error destroy(GenericDeviceTy &Device) override {
-    if (!Event)
-      return Plugin::error("Destroying an invalid event");
-
-    CUresult Res = cuEventDestroy(Event);
-    if (auto Err = Plugin::check(Res, "Error in cuEventDestroy: %s"))
-      return Err;
-
-    Event = nullptr;
-    return Plugin::success();
-  }
-
-  /// Get the underlying CUevent.
-  operator HandleTy() const { return Event; }
-
-private:
-  /// The reference to the CUDA event.
-  HandleTy Event;
-};
-
-/// Class implementing the CUDA device functionalities which derives from the
-/// generic device class.
-struct CUDADeviceTy : public GenericDeviceTy {
-  // Create a CUDA device with a device id and the default CUDA grid values.
-  CUDADeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices)
-      : GenericDeviceTy(Plugin, DeviceId, NumDevices, NVPTXGridValues),
-        CUDAStreamManager(*this), CUDAEventManager(*this) {}
-
-  ~CUDADeviceTy() {}
-
-  /// Initialize the device, its resources and get its properties.
-  Error initImpl(GenericPluginTy &Plugin) override {
-    CUresult Res = cuDeviceGet(&Device, DeviceId);
-    if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
-      return Err;
-
-    // Query the current flags of the primary context and set its flags if
-    // it is inactive.
-    unsigned int FormerPrimaryCtxFlags = 0;
-    int FormerPrimaryCtxIsActive = 0;
-    Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
-                                     &FormerPrimaryCtxIsActive);
-    if (auto Err =
-            Plugin::check(Res, "Error in cuDevicePrimaryCtxGetState: %s"))
-      return Err;
-
-    if (FormerPrimaryCtxIsActive) {
-      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-           "The primary context is active, no change to its flags\n");
-      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
-          CU_CTX_SCHED_BLOCKING_SYNC)
-        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-             "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
-    } else {
-      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-           "The primary context is inactive, set its flags to "
-           "CU_CTX_SCHED_BLOCKING_SYNC\n");
-      Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
-      if (auto Err =
-              Plugin::check(Res, "Error in cuDevicePrimaryCtxSetFlags: %s"))
-        return Err;
-    }
-
-    // Retain the per device primary context and save it to use whenever this
-    // device is selected.
-    Res = cuDevicePrimaryCtxRetain(&Context, Device);
-    if (auto Err = Plugin::check(Res, "Error in cuDevicePrimaryCtxRetain: %s"))
-      return Err;
-
-    if (auto Err = setContext())
-      return Err;
-
-    // Initialize stream pool.
-    if (auto Err = CUDAStreamManager.init(OMPX_InitialNumStreams))
-      return Err;
-
-    // Initialize event pool.
-    if (auto Err = CUDAEventManager.init(OMPX_InitialNumEvents))
-      return Err;
-
-    // Query attributes to determine number of threads/block and blocks/grid.
-    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
-                                 GridValues.GV_Max_Teams))
-      return Err;
-
-    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
-                                 GridValues.GV_Max_WG_Size))
-      return Err;
-
-    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE,
-                                 GridValues.GV_Warp_Size))
-      return Err;
-
-    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                                 ComputeCapability.Major))
-      return Err;
-
-    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                                 ComputeCapability.Minor))
-      return Err;
-
-    uint32_t NumMuliprocessors = 0;
-    uint32_t MaxThreadsPerSM = 0;
-    uint32_t WarpSize = 0;
-    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-                                 NumMuliprocessors))
-      return Err;
-    if (auto Err =
-            getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
-                          MaxThreadsPerSM))
-      return Err;
-    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
-      return Err;
-    HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);
-
-    return Plugin::success();
-  }
-
-  /// Deinitialize the device and release its resources.
-  Error deinitImpl() override {
-    if (Context) {
-      if (auto Err = setContext())
-        return Err;
-    }
-
-    // Deinitialize the stream manager.
-    if (auto Err = CUDAStreamManager.deinit())
-      return Err;
-
-    if (auto Err = CUDAEventManager.deinit())
-      return Err;
-
-    // Close modules if necessary.
-    if (!LoadedImages.empty()) {
-      assert(Context && "Invalid CUDA context");
-
-      // Each image has its own module.
-      for (DeviceImageTy *Image : LoadedImages) {
-        CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);
-
-        // Unload the module of the image.
-        if (auto Err = CUDAImage.unloadModule())
-          return Err;
-      }
-    }
-
-    if (Context) {
-      CUresult Res = cuDevicePrimaryCtxRelease(Device);
-      if (auto Err =
-              Plugin::check(Res, "Error in cuDevicePrimaryCtxRelease: %s"))
-        return Err;
-    }
-
-    // Invalidate context and device references.
-    Context = nullptr;
-    Device = CU_DEVICE_INVALID;
-
-    return Plugin::success();
-  }
-
-  virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
-                                       DeviceImageTy &Image) override {
-    // Check for the presense of global destructors at initialization time. This
-    // is required when the image may be deallocated before destructors are run.
-    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
-      Image.setPendingGlobalDtors();
-
-    return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
-  }
-
-  virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
-                                      DeviceImageTy &Image) override {
-    if (Image.hasPendingGlobalDtors())
-      return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
-    return Plugin::success();
-  }
-
-  Expected<std::unique_ptr<MemoryBuffer>>
-  doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
-    // TODO: We should be able to use the 'nvidia-ptxjitcompiler' interface to
-    //       avoid the call to 'ptxas'.
-    SmallString<128> PTXInputFilePath;
-    std::error_code EC = sys::fs::createTemporaryFile("nvptx-pre-link-jit", "s",
-                                                      PTXInputFilePath);
-    if (EC)
-      return Plugin::error("Failed to create temporary file for ptxas");
-
-    // Write the file's contents to the output file.
-    Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
-        FileOutputBuffer::create(PTXInputFilePath, MB->getBuffer().size());
-    if (!OutputOrErr)
-      return OutputOrErr.takeError();
-    std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
-    llvm::copy(MB->getBuffer(), Output->getBufferStart());
-    if (Error E = Output->commit())
-      return std::move(E);
-
-    SmallString<128> PTXOutputFilePath;
-    EC = sys::fs::createTemporaryFile("nvptx-post-link-jit", "cubin",
-                                      PTXOutputFilePath);
-    if (EC)
-      return Plugin::error("Failed to create temporary file for ptxas");
-
-    // Try to find `ptxas` in the path to compile the PTX to a binary.
-    const auto ErrorOrPath = sys::findProgramByName("ptxas");
-    if (!ErrorOrPath)
-      return Plugin::error("Failed to find 'ptxas' on the PATH.");
-
-    std::string Arch = getComputeUnitKind();
-    StringRef Args[] = {*ErrorOrPath,
-                        "-m64",
-                        "-O2",
-                        "--gpu-name",
-                        Arch,
-                        "--output-file",
-                        PTXOutputFilePath,
-                        PTXInputFilePath};
-
-    std::string ErrMsg;
-    if (sys::ExecuteAndWait(*ErrorOrPath, Args, std::nullopt, {}, 0, 0,
-                            &ErrMsg))
-      return Plugin::error("Running 'ptxas' failed: %s\n", ErrMsg.c_str());
-
-    auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(PTXOutputFilePath.data());
-    if (!BufferOrErr)
-      return Plugin::error("Failed to open temporary file for ptxas");
-
-    // Clean up the temporary files afterwards.
-    if (sys::fs::remove(PTXOutputFilePath))
-      return Plugin::error("Failed to remove temporary file for ptxas");
-    if (sys::fs::remove(PTXInputFilePath))
-      return Plugin::error("Failed to remove temporary file for ptxas");
-
-    return std::move(*BufferOrErr);
-  }
-
-  /// Allocate and construct a CUDA kernel.
-  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
-    // Allocate and construct the CUDA kernel.
-    CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
-    if (!CUDAKernel)
-      return Plugin::error("Failed to allocate memory for CUDA kernel");
-
-    new (CUDAKernel) CUDAKernelTy(Name);
-
-    return *CUDAKernel;
-  }
-
-  /// Set the current context to this device's context.
-  Error setContext() override {
-    CUresult Res = cuCtxSetCurrent(Context);
-    return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
-  }
-
-  /// NVIDIA returns the product of the SM count and the number of warps that
-  /// fit if the maximum number of threads were scheduled on each SM.
-  uint64_t getHardwareParallelism() const override {
-    return HardwareParallelism;
-  }
-
-  /// We want to set up the RPC server for host services to the GPU if it is
-  /// availible.
-  bool shouldSetupRPCServer() const override {
-    return libomptargetSupportsRPC();
-  }
-
-  /// The RPC interface should have enough space for all availible parallelism.
-  uint64_t requestedRPCPortCount() const override {
-    return getHardwareParallelism();
-  }
-
-  /// Get the stream of the asynchronous info sructure or get a new one.
-  Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
-    // Get the stream (if any) from the async info.
-    Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
-    if (!Stream) {
-      // There was no stream; get an idle one.
-      if (auto Err = CUDAStreamManager.getResource(Stream))
-        return Err;
-
-      // Modify the async info's stream.
-      AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
-    }
-    return Plugin::success();
-  }
-
-  /// Getters of CUDA references.
-  CUcontext getCUDAContext() const { return Context; }
-  CUdevice getCUDADevice() const { return Device; }
-
-  /// Load the binary image into the device and allocate an image object.
-  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
-                                           int32_t ImageId) override {
-    if (auto Err = setContext())
-      return std::move(Err);
-
-    // Allocate and initialize the image object.
-    CUDADeviceImageTy *CUDAImage = Plugin.allocate<CUDADeviceImageTy>();
-    new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage);
-
-    // Load the CUDA module.
-    if (auto Err = CUDAImage->loadModule())
-      return std::move(Err);
-
-    return CUDAImage;
-  }
-
-  /// Allocate memory on the device or related to the device.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
-    if (Size == 0)
-      return nullptr;
-
-    if (auto Err = setContext()) {
-      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
-      return nullptr;
-    }
-
-    void *MemAlloc = nullptr;
-    CUdeviceptr DevicePtr;
-    CUresult Res;
-
-    switch (Kind) {
-    case TARGET_ALLOC_DEFAULT:
-    case TARGET_ALLOC_DEVICE:
-      Res = cuMemAlloc(&DevicePtr, Size);
-      MemAlloc = (void *)DevicePtr;
-      break;
-    case TARGET_ALLOC_HOST:
-      Res = cuMemAllocHost(&MemAlloc, Size);
-      break;
-    case TARGET_ALLOC_SHARED:
-      Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
-      MemAlloc = (void *)DevicePtr;
-      break;
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
-      CUstream Stream;
-      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
-        break;
-      if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
-        break;
-      cuStreamSynchronize(Stream);
-      Res = cuStreamDestroy(Stream);
-      MemAlloc = (void *)DevicePtr;
-    }
-    }
-
-    if (auto Err =
-            Plugin::check(Res, "Error in cuMemAlloc[Host|Managed]: %s")) {
-      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
-      return nullptr;
-    }
-    return MemAlloc;
-  }
-
-  /// Deallocate memory on the device or related to the device.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
-    if (TgtPtr == nullptr)
-      return OFFLOAD_SUCCESS;
-
-    if (auto Err = setContext()) {
-      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
-      return OFFLOAD_FAIL;
-    }
-
-    CUresult Res;
-    switch (Kind) {
-    case TARGET_ALLOC_DEFAULT:
-    case TARGET_ALLOC_DEVICE:
-    case TARGET_ALLOC_SHARED:
-      Res = cuMemFree((CUdeviceptr)TgtPtr);
-      break;
-    case TARGET_ALLOC_HOST:
-      Res = cuMemFreeHost(TgtPtr);
-      break;
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
-      CUstream Stream;
-      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
-        break;
-      cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
-      cuStreamSynchronize(Stream);
-      if ((Res = cuStreamDestroy(Stream)))
-        break;
-    }
-    }
-
-    if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
-      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
-      return OFFLOAD_FAIL;
-    }
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// Synchronize current thread with the pending operations on the async info.
-  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
-    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
-    CUresult Res;
-    // If we have an RPC server running on this device we will continuously
-    // query it for work rather than blocking.
-    if (!getRPCServer()) {
-      Res = cuStreamSynchronize(Stream);
-    } else {
-      do {
-        Res = cuStreamQuery(Stream);
-        if (auto Err = getRPCServer()->runServer(*this))
-          return Err;
-      } while (Res == CUDA_ERROR_NOT_READY);
-    }
-
-    // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
-    AsyncInfo.Queue = nullptr;
-    if (auto Err = CUDAStreamManager.returnResource(Stream))
-      return Err;
-
-    return Plugin::check(Res, "Error in cuStreamSynchronize: %s");
-  }
-
-  /// CUDA support VA management
-  bool supportVAManagement() const override {
-#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11000))
-    return true;
-#else
-    return false;
-#endif
-  }
-
-  /// Allocates \p RSize bytes (rounded up to page size) and hints the cuda
-  /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
-  /// At return \p RSize contains the actual size
-  Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
-    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
-    auto IHandle = DeviceMMaps.find(DVAddr);
-    size_t Size = *RSize;
-
-    if (Size == 0)
-      return Plugin::error("Memory Map Size must be larger than 0");
-
-    // Check if we have already mapped this address
-    if (IHandle != DeviceMMaps.end())
-      return Plugin::error("Address already memory mapped");
-
-    CUmemAllocationProp Prop = {};
-    size_t Granularity = 0;
-
-    size_t Free, Total;
-    CUresult Res = cuMemGetInfo(&Free, &Total);
-    if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
-      return Err;
-
-    if (Size >= Free) {
-      *Addr = nullptr;
-      return Plugin::error(
-          "Canot map memory size larger than the available device memory");
-    }
-
-    // currently NVidia only supports pinned device types
-    Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-    Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-
-    Prop.location.id = DeviceId;
-    cuMemGetAllocationGranularity(&Granularity, &Prop,
-                                  CU_MEM_ALLOC_GRANULARITY_MINIMUM);
-    if (auto Err =
-            Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s"))
-      return Err;
-
-    if (Granularity == 0)
-      return Plugin::error("Wrong device Page size");
-
-    // Ceil to page size.
-    Size = roundUp(Size, Granularity);
-
-    // Create a handler of our allocation
-    CUmemGenericAllocationHandle AHandle;
-    Res = cuMemCreate(&AHandle, Size, &Prop, 0);
-    if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s"))
-      return Err;
-
-    CUdeviceptr DevPtr = 0;
-    Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
-    if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s"))
-      return Err;
-
-    Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
-    if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s"))
-      return Err;
-
-    CUmemAccessDesc ADesc = {};
-    ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-    ADesc.location.id = DeviceId;
-    ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-
-    // Sets address
-    Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
-    if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s"))
-      return Err;
-
-    *Addr = reinterpret_cast<void *>(DevPtr);
-    *RSize = Size;
-    DeviceMMaps.insert({DevPtr, AHandle});
-    return Plugin::success();
-  }
-
-  /// De-allocates device memory and Unmaps the Virtual Addr
-  Error memoryVAUnMap(void *VAddr, size_t Size) override {
-    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
-    auto IHandle = DeviceMMaps.find(DVAddr);
-    // Mapping does not exist
-    if (IHandle == DeviceMMaps.end()) {
-      return Plugin::error("Addr is not MemoryMapped");
-    }
-
-    if (IHandle == DeviceMMaps.end())
-      return Plugin::error("Addr is not MemoryMapped");
-
-    CUmemGenericAllocationHandle &AllocHandle = IHandle->second;
-
-    CUresult Res = cuMemUnmap(DVAddr, Size);
-    if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s"))
-      return Err;
-
-    Res = cuMemRelease(AllocHandle);
-    if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s"))
-      return Err;
-
-    Res = cuMemAddressFree(DVAddr, Size);
-    if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s"))
-      return Err;
-
-    DeviceMMaps.erase(IHandle);
-    return Plugin::success();
-  }
-
-  /// Query for the completion of the pending operations on the async info.
-  Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
-    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
-    CUresult Res = cuStreamQuery(Stream);
-
-    // Not ready streams must be considered as successful operations.
-    if (Res == CUDA_ERROR_NOT_READY)
-      return Plugin::success();
-
-    // Once the stream is synchronized and the operations completed (or an error
-    // occurs), return it to stream pool and reset AsyncInfo. This is to make
-    // sure the synchronization only works for its own tasks.
-    AsyncInfo.Queue = nullptr;
-    if (auto Err = CUDAStreamManager.returnResource(Stream))
-      return Err;
-
-    return Plugin::check(Res, "Error in cuStreamQuery: %s");
-  }
-
-  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
-    // TODO: Register the buffer as CUDA host memory.
-    return HstPtr;
-  }
-
-  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
-
-  Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
-                                 void *&BaseDevAccessiblePtr,
-                                 size_t &BaseSize) const override {
-    // TODO: Implement pinning feature for CUDA.
-    return false;
-  }
-
-  /// Submit data to the device (host to device transfer).
-  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
-                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    if (auto Err = setContext())
-      return Err;
-
-    CUstream Stream;
-    if (auto Err = getStream(AsyncInfoWrapper, Stream))
-      return Err;
-
-    CUresult Res = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
-    return Plugin::check(Res, "Error in cuMemcpyHtoDAsync: %s");
-  }
-
-  /// Retrieve data from the device (device to host transfer).
-  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
-                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    if (auto Err = setContext())
-      return Err;
-
-    CUstream Stream;
-    if (auto Err = getStream(AsyncInfoWrapper, Stream))
-      return Err;
-
-    // If there is already pending work on the stream it could be waiting for
-    // someone to check the RPC server.
-    if (auto *RPCServer = getRPCServer()) {
-      CUresult Res = cuStreamQuery(Stream);
-      while (Res == CUDA_ERROR_NOT_READY) {
-        if (auto Err = RPCServer->runServer(*this))
-          return Err;
-        Res = cuStreamQuery(Stream);
-      }
-    }
-
-    CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
-    return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s");
-  }
-
-  /// Exchange data between two devices directly. We may use peer access if
-  /// the CUDA devices and driver allow them.
-  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
-                         void *DstPtr, int64_t Size,
-                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;
-
-  /// Initialize the async info for interoperability purposes.
-  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    if (auto Err = setContext())
-      return Err;
-
-    CUstream Stream;
-    if (auto Err = getStream(AsyncInfoWrapper, Stream))
-      return Err;
-
-    return Plugin::success();
-  }
-
-  /// Initialize the device info for interoperability purposes.
-  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
-    assert(Context && "Context is null");
-    assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");
-
-    if (auto Err = setContext())
-      return Err;
-
-    if (!DeviceInfo->Context)
-      DeviceInfo->Context = Context;
-
-    if (!DeviceInfo->Device)
-      DeviceInfo->Device = reinterpret_cast<void *>(Device);
-
-    return Plugin::success();
-  }
-
-  /// Create an event.
-  Error createEventImpl(void **EventPtrStorage) override {
-    CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
-    return CUDAEventManager.getResource(*Event);
-  }
-
-  /// Destroy a previously created event.
-  Error destroyEventImpl(void *EventPtr) override {
-    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-    return CUDAEventManager.returnResource(Event);
-  }
-
-  /// Record the event.
-  Error recordEventImpl(void *EventPtr,
-                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
-    CUstream Stream;
-    if (auto Err = getStream(AsyncInfoWrapper, Stream))
-      return Err;
-
-    CUresult Res = cuEventRecord(Event, Stream);
-    return Plugin::check(Res, "Error in cuEventRecord: %s");
-  }
-
-  /// Make the stream wait on the event.
-  Error waitEventImpl(void *EventPtr,
-                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
-    CUstream Stream;
-    if (auto Err = getStream(AsyncInfoWrapper, Stream))
-      return Err;
-
-    // Do not use CU_EVENT_WAIT_DEFAULT here as it is only available from
-    // specific CUDA version, and defined as 0x0. In previous version, per CUDA
-    // API document, that argument has to be 0x0.
-    CUresult Res = cuStreamWaitEvent(Stream, Event, 0);
-    return Plugin::check(Res, "Error in cuStreamWaitEvent: %s");
-  }
-
-  /// Synchronize the current thread with the event.
-  Error syncEventImpl(void *EventPtr) override {
-    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-    CUresult Res = cuEventSynchronize(Event);
-    return Plugin::check(Res, "Error in cuEventSynchronize: %s");
-  }
-
-  /// Print information about the device.
-  Error obtainInfoImpl(InfoQueueTy &Info) override {
-    char TmpChar[1000];
-    const char *TmpCharPtr;
-    size_t TmpSt;
-    int TmpInt;
-
-    CUresult Res = cuDriverGetVersion(&TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("CUDA Driver Version", TmpInt);
-
-    Info.add("CUDA OpenMP Device Number", DeviceId);
-
-    Res = cuDeviceGetName(TmpChar, 1000, Device);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Device Name", TmpChar);
-
-    Res = cuDeviceTotalMem(&TmpSt, Device);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Global Memory Size", TmpSt, "bytes");
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Number of Multiprocessors", TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Concurrent Copy and Execution", (bool)TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Total Constant Memory", TmpInt, "bytes");
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
-                           TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Max Shared Memory per Block", TmpInt, "bytes");
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Registers per Block", TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Warp Size", TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Maximum Threads per Block", TmpInt);
-
-    Info.add("Maximum Block Dimensions", "");
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("x", TmpInt);
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("y", TmpInt);
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("z", TmpInt);
-
-    Info.add("Maximum Grid Dimensions", "");
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("x", TmpInt);
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("y", TmpInt);
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("z", TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Maximum Memory Pitch", TmpInt, "bytes");
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Texture Alignment", TmpInt, "bytes");
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Clock Rate", TmpInt, "kHz");
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Execution Timeout", (bool)TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Integrated Device", (bool)TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Can Map Host Memory", (bool)TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
-    if (Res == CUDA_SUCCESS) {
-      if (TmpInt == CU_COMPUTEMODE_DEFAULT)
-        TmpCharPtr = "Default";
-      else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
-        TmpCharPtr = "Prohibited";
-      else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
-        TmpCharPtr = "Exclusive process";
-      else
-        TmpCharPtr = "Unknown";
-      Info.add("Compute Mode", TmpCharPtr);
-    }
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Concurrent Kernels", (bool)TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("ECC Enabled", (bool)TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Memory Clock Rate", TmpInt, "kHz");
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Memory Bus Width", TmpInt, "bits");
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("L2 Cache Size", TmpInt, "bytes");
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
-                           TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Max Threads Per SMP", TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Async Engines", TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Unified Addressing", (bool)TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Managed Memory", (bool)TmpInt);
-
-    Res =
-        getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Concurrent Managed Memory", (bool)TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
-                           TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Preemption Supported", (bool)TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Cooperative Launch", (bool)TmpInt);
-
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Multi-Device Boars", (bool)TmpInt);
-
-    Info.add("Compute Capabilities", ComputeCapability.str());
-
-    return Plugin::success();
-  }
-
-  virtual bool shouldSetupDeviceMemoryPool() const override {
-    /// We use the CUDA malloc for now.
-    return false;
-  }
-
-  /// Getters and setters for stack and heap sizes.
-  Error getDeviceStackSize(uint64_t &Value) override {
-    return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
-  }
-  Error setDeviceStackSize(uint64_t Value) override {
-    return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
-  }
-  Error getDeviceHeapSize(uint64_t &Value) override {
-    return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
-  }
-  Error setDeviceHeapSize(uint64_t Value) override {
-    return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
-  }
-  Error getDeviceMemorySize(uint64_t &Value) override {
-    CUresult Res = cuDeviceTotalMem(&Value, Device);
-    return Plugin::check(Res, "Error in getDeviceMemorySize %s");
-  }
-
-  /// CUDA-specific functions for getting and setting context limits.
-  Error setCtxLimit(CUlimit Kind, uint64_t Value) {
-    CUresult Res = cuCtxSetLimit(Kind, Value);
-    return Plugin::check(Res, "Error in cuCtxSetLimit: %s");
-  }
-  Error getCtxLimit(CUlimit Kind, uint64_t &Value) {
-    CUresult Res = cuCtxGetLimit(&Value, Kind);
-    return Plugin::check(Res, "Error in cuCtxGetLimit: %s");
-  }
-
-  /// CUDA-specific function to get device attributes.
-  Error getDeviceAttr(uint32_t Kind, uint32_t &Value) {
-    // TODO: Warn if the new value is larger than the old.
-    CUresult Res =
-        cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
-    return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s");
-  }
-
-  CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
-    return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
-  }
-
-  /// See GenericDeviceTy::getComputeUnitKind().
-  std::string getComputeUnitKind() const override {
-    return ComputeCapability.str();
-  }
-
-  /// Returns the clock frequency for the given NVPTX device.
-  uint64_t getClockFrequency() const override { return 1000000000; }
-
-private:
-  using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
-  using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
-
-  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
-                                 bool IsCtor) {
-    const char *KernelName = IsCtor ? "nvptx$device$init" : "nvptx$device$fini";
-    // Perform a quick check for the named kernel in the image. The kernel
-    // should be created by the 'nvptx-lower-ctor-dtor' pass.
-    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
-      return Plugin::success();
-
-    // The Nvidia backend cannot handle creating the ctor / dtor array
-    // automatically so we must create it ourselves. The backend will emit
-    // several globals that contain function pointers we can call. These are
-    // prefixed with a known name due to Nvidia's lack of section support.
-    auto ELFObjOrErr = Handler.getELFObjectFile(Image);
-    if (!ELFObjOrErr)
-      return ELFObjOrErr.takeError();
-
-    // Search for all symbols that contain a constructor or destructor.
-    SmallVector<std::pair<StringRef, uint16_t>> Funcs;
-    for (ELFSymbolRef Sym : (*ELFObjOrErr)->symbols()) {
-      auto NameOrErr = Sym.getName();
-      if (!NameOrErr)
-        return NameOrErr.takeError();
-
-      if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
-                                         : "__fini_array_object_"))
-        continue;
-
-      uint16_t Priority;
-      if (NameOrErr->rsplit('_').second.getAsInteger(10, Priority))
-        return Plugin::error("Invalid priority for constructor or destructor");
-
-      Funcs.emplace_back(*NameOrErr, Priority);
-    }
-
-    // Sort the created array to be in priority order.
-    llvm::sort(Funcs, [=](auto X, auto Y) { return X.second < Y.second; });
-
-    // Allocate a buffer to store all of the known constructor / destructor
-    // functions in so we can iterate them on the device.
-    void *Buffer =
-        allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE);
-    if (!Buffer)
-      return Plugin::error("Failed to allocate memory for global buffer");
-
-    auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
-    auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();
-
-    SmallVector<void *> FunctionPtrs(Funcs.size());
-    std::size_t Idx = 0;
-    for (auto [Name, Priority] : Funcs) {
-      GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]);
-      if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
-        return Err;
-    }
-
-    // Copy the local buffer to the device.
-    if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(),
-                              FunctionPtrs.size() * sizeof(void *), nullptr))
-      return Err;
-
-    // Copy the created buffer to the appropriate symbols so the kernel can
-    // iterate through them.
-    GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
-                         sizeof(void *), &GlobalPtrStart);
-    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
-      return Err;
-
-    GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
-                        sizeof(void *), &GlobalPtrStop);
-    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
-      return Err;
-
-    CUDAKernelTy CUDAKernel(KernelName);
-
-    if (auto Err = CUDAKernel.init(*this, Image))
-      return Err;
-
-    AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
-
-    KernelArgsTy KernelArgs = {};
-    if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
-                                         /*NumBlocks=*/1ul, KernelArgs, nullptr,
-                                         AsyncInfoWrapper))
-      return Err;
-
-    Error Err = Plugin::success();
-    AsyncInfoWrapper.finalize(Err);
-
-    if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
-      return Plugin::error("Failed to free memory for global buffer");
-
-    return Err;
-  }
-
-  /// Stream manager for CUDA streams.
-  CUDAStreamManagerTy CUDAStreamManager;
-
-  /// Event manager for CUDA events.
-  CUDAEventManagerTy CUDAEventManager;
-
-  /// The device's context. This context should be set before performing
-  /// operations on the device.
-  CUcontext Context = nullptr;
-
-  /// The CUDA device handler.
-  CUdevice Device = CU_DEVICE_INVALID;
-
-  /// The memory mapped addresses and their handles
-  std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;
-
-  /// The compute capability of the corresponding CUDA device.
-  struct ComputeCapabilityTy {
-    uint32_t Major;
-    uint32_t Minor;
-    std::string str() const {
-      return "sm_" + std::to_string(Major * 10 + Minor);
-    }
-  } ComputeCapability;
-
-  /// The maximum number of warps that can be resident on all the SMs
-  /// simultaneously.
-  uint32_t HardwareParallelism = 0;
-};
-
-Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
-                               uint32_t NumThreads, uint64_t NumBlocks,
-                               KernelArgsTy &KernelArgs, void *Args,
-                               AsyncInfoWrapperTy &AsyncInfoWrapper) const {
-  CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
-
-  CUstream Stream;
-  if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
-    return Err;
-
-  uint32_t MaxDynCGroupMem =
-      std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
-
-  CUresult Res =
-      cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
-                     /*gridDimZ=*/1, NumThreads,
-                     /*blockDimY=*/1, /*blockDimZ=*/1, MaxDynCGroupMem, Stream,
-                     (void **)Args, nullptr);
-  return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
-}
-
-/// Class implementing the CUDA-specific functionalities of the global handler.
-class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
-public:
-  /// Get the metadata of a global from the device. The name and size of the
-  /// global is read from DeviceGlobal and the address of the global is written
-  /// to DeviceGlobal.
-  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
-                                    DeviceImageTy &Image,
-                                    GlobalTy &DeviceGlobal) override {
-    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
-
-    const char *GlobalName = DeviceGlobal.getName().data();
-
-    size_t CUSize;
-    CUdeviceptr CUPtr;
-    CUresult Res =
-        cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
-    if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s",
-                                 GlobalName))
-      return Err;
-
-    if (CUSize != DeviceGlobal.getSize())
-      return Plugin::error(
-          "Failed to load global '%s' due to size mismatch (%zu != %zu)",
-          GlobalName, CUSize, (size_t)DeviceGlobal.getSize());
-
-    DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
-    return Plugin::success();
-  }
-};
-
-/// Class implementing the CUDA-specific functionalities of the plugin.
-struct CUDAPluginTy final : public GenericPluginTy {
-  /// Create a CUDA plugin.
-  CUDAPluginTy() : GenericPluginTy(getTripleArch()) {}
-
-  /// This class should not be copied.
-  CUDAPluginTy(const CUDAPluginTy &) = delete;
-  CUDAPluginTy(CUDAPluginTy &&) = delete;
-
-  /// Initialize the plugin and return the number of devices.
-  Expected<int32_t> initImpl() override {
-    CUresult Res = cuInit(0);
-    if (Res == CUDA_ERROR_INVALID_HANDLE) {
-      // Cannot call cuGetErrorString if dlsym failed.
-      DP("Failed to load CUDA shared library\n");
-      return 0;
-    }
-
-#ifdef OMPT_SUPPORT
-    ompt::connectLibrary();
-#endif
-
-    if (Res == CUDA_ERROR_NO_DEVICE) {
-      // Do not initialize if there are no devices.
-      DP("There are no devices supporting CUDA.\n");
-      return 0;
-    }
-
-    if (auto Err = Plugin::check(Res, "Error in cuInit: %s"))
-      return std::move(Err);
-
-    // Get the number of devices.
-    int NumDevices;
-    Res = cuDeviceGetCount(&NumDevices);
-    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s"))
-      return std::move(Err);
-
-    // Do not initialize if there are no devices.
-    if (NumDevices == 0)
-      DP("There are no devices supporting CUDA.\n");
-
-    return NumDevices;
-  }
-
-  /// Deinitialize the plugin.
-  Error deinitImpl() override { return Plugin::success(); }
-
-  /// Creates a CUDA device to use for offloading.
-  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
-                                int32_t NumDevices) override {
-    return new CUDADeviceTy(Plugin, DeviceId, NumDevices);
-  }
-
-  /// Creates a CUDA global handler.
-  GenericGlobalHandlerTy *createGlobalHandler() override {
-    return new CUDAGlobalHandlerTy();
-  }
-
-  /// Get the ELF code for recognizing the compatible image binary.
-  uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }
-
-  Triple::ArchType getTripleArch() const override {
-    // TODO: I think we can drop the support for 32-bit NVPTX devices.
-    return Triple::nvptx64;
-  }
-
-  /// Check whether the image is compatible with the available CUDA devices.
-  Expected<bool> isELFCompatible(StringRef Image) const override {
-    auto ElfOrErr =
-        ELF64LEObjectFile::create(MemoryBufferRef(Image, /*Identifier=*/""),
-                                  /*InitContent=*/false);
-    if (!ElfOrErr)
-      return ElfOrErr.takeError();
-
-    // Get the numeric value for the image's `sm_` value.
-    auto SM = ElfOrErr->getPlatformFlags() & ELF::EF_CUDA_SM;
-
-    for (int32_t DevId = 0; DevId < getNumDevices(); ++DevId) {
-      CUdevice Device;
-      CUresult Res = cuDeviceGet(&Device, DevId);
-      if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
-        return std::move(Err);
-
-      int32_t Major, Minor;
-      Res = cuDeviceGetAttribute(
-          &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
-      if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
-        return std::move(Err);
-
-      Res = cuDeviceGetAttribute(
-          &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
-      if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
-        return std::move(Err);
-
-      int32_t ImageMajor = SM / 10;
-      int32_t ImageMinor = SM % 10;
-
-      // A cubin generated for a certain compute capability is supported to
-      // run on any GPU with the same major revision and same or higher minor
-      // revision.
-      if (Major != ImageMajor || Minor < ImageMinor)
-        return false;
-    }
-    return true;
-  }
-};
-
-Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
-                                     GenericDeviceTy &DstGenericDevice,
-                                     void *DstPtr, int64_t Size,
-                                     AsyncInfoWrapperTy &AsyncInfoWrapper) {
-  if (auto Err = setContext())
-    return Err;
-
-  CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);
-
-  CUresult Res;
-  int32_t DstDeviceId = DstDevice.DeviceId;
-  CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
-  CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;
-
-  int CanAccessPeer = 0;
-  if (DeviceId != DstDeviceId) {
-    // Make sure the lock is released before performing the copies.
-    std::lock_guard<std::mutex> Lock(PeerAccessesLock);
-
-    switch (PeerAccesses[DstDeviceId]) {
-    case PeerAccessState::AVAILABLE:
-      CanAccessPeer = 1;
-      break;
-    case PeerAccessState::UNAVAILABLE:
-      CanAccessPeer = 0;
-      break;
-    case PeerAccessState::PENDING:
-      // Check whether the source device can access the destination device.
-      Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
-      if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s"))
-        return Err;
-
-      if (CanAccessPeer) {
-        Res = cuCtxEnablePeerAccess(DstDevice.Context, 0);
-        if (Res == CUDA_ERROR_TOO_MANY_PEERS) {
-          // Resources may be exhausted due to many P2P links.
-          CanAccessPeer = 0;
-          DP("Too many P2P so fall back to D2D memcpy");
-        } else if (auto Err =
-                       Plugin::check(Res, "Error in cuCtxEnablePeerAccess: %s"))
-          return Err;
-      }
-      PeerAccesses[DstDeviceId] = (CanAccessPeer)
-                                      ? PeerAccessState::AVAILABLE
-                                      : PeerAccessState::UNAVAILABLE;
-    }
-  }
-
-  CUstream Stream;
-  if (auto Err = getStream(AsyncInfoWrapper, Stream))
-    return Err;
-
-  if (CanAccessPeer) {
-    // TODO: Should we fallback to D2D if peer access fails?
-    Res = cuMemcpyPeerAsync(CUDstPtr, Context, CUSrcPtr, DstDevice.Context,
-                            Size, Stream);
-    return Plugin::check(Res, "Error in cuMemcpyPeerAsync: %s");
-  }
-
-  // Fallback to D2D copy.
-  Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream);
-  return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
-}
-
-GenericPluginTy *PluginTy::createPlugin() { return new CUDAPluginTy(); }
-
-template <typename... ArgsTy>
-static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
-  CUresult ResultCode = static_cast<CUresult>(Code);
-  if (ResultCode == CUDA_SUCCESS)
-    return Error::success();
-
-  const char *Desc = "Unknown error";
-  CUresult Ret = cuGetErrorString(ResultCode, &Desc);
-  if (Ret != CUDA_SUCCESS)
-    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
-
-  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
-                                                    ErrFmt, Args..., Desc);
-}
-
-} // namespace plugin
-} // namespace target
-} // namespace omp
-} // namespace llvm

diff --git a/libomptarget/plugins-nextgen/exports b/libomptarget/plugins-nextgen/exports
deleted file mode 100644
index cc7beda..0000000
--- a/libomptarget/plugins-nextgen/exports
+++ /dev/null

@@ -1,6 +0,0 @@
-VERS1.0 {
-  global:
-    __tgt_rtl*;
-  local:
-    *;
-};

diff --git a/libomptarget/plugins-nextgen/host/CMakeLists.txt b/libomptarget/plugins-nextgen/host/CMakeLists.txt
deleted file mode 100644
index c1493d2..0000000
--- a/libomptarget/plugins-nextgen/host/CMakeLists.txt
+++ /dev/null

@@ -1,89 +0,0 @@
-if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
-  return()
-endif()
-
-set(supported_targets x86_64 aarch64 ppc64 ppc64le s390x)
-if(NOT ${CMAKE_SYSTEM_PROCESSOR} IN_LIST supported_targets)
-  libomptarget_say("Not building ${machine} NextGen offloading plugin")
-  return()
-endif()
-
-set(machine ${CMAKE_SYSTEM_PROCESSOR})
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le$")
-  set(machine ppc64)
-endif()
-
-# Create the library and add the default arguments.
-add_target_library(omptarget.rtl.${machine} ${machine})
-
-target_sources(omptarget.rtl.${machine} PRIVATE src/rtl.cpp)
-
-if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
-  libomptarget_say("Building ${machine} plugin linked with libffi")
-  if(FFI_STATIC_LIBRARIES)
-    target_link_libraries(omptarget.rtl.${machine} PRIVATE FFI::ffi_static)
-  else()
-    target_link_libraries(omptarget.rtl.${machine} PRIVATE FFI::ffi)
-  endif()
-else()
-  libomptarget_say("Building ${machine} plugin for dlopened libffi")
-  target_sources(omptarget.rtl.${machine} PRIVATE dynamic_ffi/ffi.cpp)
-  target_include_directories(omptarget.rtl.${machine} PRIVATE dynamic_ffi)
-endif()
-
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.${machine}
-        LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.${machine} PROPERTIES
-  INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
-  POSITION_INDEPENDENT_CODE ON
-  CXX_VISIBILITY_PRESET protected)
-
-target_include_directories(omptarget.rtl.${machine} PRIVATE
-                           ${LIBOMPTARGET_INCLUDE_DIR})
-
-if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
-  list(APPEND LIBOMPTARGET_TESTED_PLUGINS omptarget.rtl.${machine})
-  set(LIBOMPTARGET_TESTED_PLUGINS
-      "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-else()
-  libomptarget_say("Not generating ${tmachine_name} tests. LibFFI not found.")
-endif()
-
-# Define the target specific triples and ELF machine values.
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le$")
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_PPC64)
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE
-      LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="powerpc64le-ibm-linux-gnu")
-  list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
-       "powerpc64le-ibm-linux-gnu" "powerpc64le-ibm-linux-gnu-LTO")
-  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64$")
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_PPC64)
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE
-      LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="powerpc64-ibm-linux-gnu")
-  list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
-       "powerpc64-ibm-linux-gnu" "powerpc64-ibm-linux-gnu-LTO")
-  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64$")
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_X86_64)
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE
-      LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="x86_64-pc-linux-gnu")
-  list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
-       "x86_64-pc-linux-gnu" "x86_64-pc-linux-gnu-LTO")
-  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64$")
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_AARCH64)
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE
-      LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="aarch64-unknown-linux-gnu")
-  list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
-       "aarch64-unknown-linux-gnu" "aarch64-unknown-linux-gnu-LTO")
-  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x$")
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_S390)
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE
-      LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="s390x-ibm-linux-gnu")
-  list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
-       "s390x-ibm-linux-gnu" "s390x-ibm-linux-gnu-LTO")
-  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
-endif()

diff --git a/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.cpp b/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.cpp
deleted file mode 100644
index c586ad1..0000000
--- a/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.cpp
+++ /dev/null

@@ -1,75 +0,0 @@
-//===--- generic-elf-64bit/dynamic_ffi/ffi.cpp -------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implement subset of the FFI api by calling into the FFI library via dlopen
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/DynamicLibrary.h"
-
-#include "Shared/Debug.h"
-#include <memory>
-
-#include "DLWrap.h"
-#include "ffi.h"
-
-DLWRAP_INITIALIZE()
-
-DLWRAP(ffi_call, 4);
-DLWRAP(ffi_prep_cif, 5);
-
-DLWRAP_FINALIZE()
-
-ffi_type ffi_type_void;
-ffi_type ffi_type_pointer;
-
-// Name of the FFI shared library.
-constexpr const char *FFI_PATH = "libffi.so";
-
-#define DYNAMIC_FFI_SUCCESS 0
-#define DYNAMIC_FFI_FAIL 1
-
-// Initializes the dynamic FFI wrapper.
-uint32_t ffi_init() {
-  std::string ErrMsg;
-  auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
-      llvm::sys::DynamicLibrary::getPermanentLibrary(FFI_PATH, &ErrMsg));
-
-  if (!DynlibHandle->isValid()) {
-    DP("Unable to load library '%s': %s!\n", FFI_PATH, ErrMsg.c_str());
-    return DYNAMIC_FFI_FAIL;
-  }
-
-  for (size_t I = 0; I < dlwrap::size(); I++) {
-    const char *Sym = dlwrap::symbol(I);
-
-    void *P = DynlibHandle->getAddressOfSymbol(Sym);
-    if (P == nullptr) {
-      DP("Unable to find '%s' in '%s'!\n", Sym, FFI_PATH);
-      return DYNAMIC_FFI_FAIL;
-    }
-    DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
-
-    *dlwrap::pointer(I) = P;
-  }
-
-#define DYNAMIC_INIT(SYMBOL)                                                   \
-  {                                                                            \
-    void *SymbolPtr = DynlibHandle->getAddressOfSymbol(#SYMBOL);               \
-    if (!SymbolPtr) {                                                          \
-      DP("Unable to find '%s' in '%s'!\n", #SYMBOL, FFI_PATH);                 \
-      return DYNAMIC_FFI_FAIL;                                                 \
-    }                                                                          \
-    SYMBOL = *reinterpret_cast<decltype(SYMBOL) *>(SymbolPtr);                 \
-  }
-  DYNAMIC_INIT(ffi_type_void);
-  DYNAMIC_INIT(ffi_type_pointer);
-#undef DYNAMIC_INIT
-
-  return DYNAMIC_FFI_SUCCESS;
-}

diff --git a/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.h b/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.h
deleted file mode 100644
index 0ae0258..0000000
--- a/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.h
+++ /dev/null

@@ -1,78 +0,0 @@
-//===--- generic-elf-64bit/dynamic_ffi/ffi.cpp -------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Provides a mirror to the parts of the FFI interface that the plugins require.
-//
-// libffi
-//   - Copyright (c) 2011, 2014, 2019, 2021, 2022 Anthony Green
-//   - Copyright (c) 1996-2003, 2007, 2008 Red Hat, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef DYNAMIC_FFI_FFI_H
-#define DYNAMIC_FFI_FFI_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#define USES_DYNAMIC_FFI
-
-uint32_t ffi_init();
-
-typedef struct _ffi_type {
-  size_t size;
-  unsigned short alignment;
-  unsigned short type;
-  struct _ffi_type **elements;
-} ffi_type;
-
-typedef enum {
-  FFI_OK = 0,
-  FFI_BAD_TYPEDEF,
-  FFI_BAD_ABI,
-  FFI_BAD_ARGTYPE
-} ffi_status;
-
-// These are target depenent so we set them manually for each ABI by referencing
-// the FFI source.
-typedef enum ffi_abi {
-#if (defined(_M_X64) || defined(__x86_64__))
-  FFI_DEFAULT_ABI = 2, // FFI_UNIX64.
-#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
-  FFI_DEFAULT_ABI = 1, // FFI_SYSV.
-#elif defined(__powerpc64__)
-  FFI_DEFAULT_ABI = 8, // FFI_LINUX.
-#elif defined(__s390x__)
-  FFI_DEFAULT_ABI = 1, // FFI_SYSV.
-#else
-#error "Unknown ABI"
-#endif
-} ffi_cif;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define FFI_EXTERN extern
-#define FFI_API
-
-FFI_EXTERN ffi_type ffi_type_void;
-FFI_EXTERN ffi_type ffi_type_pointer;
-
-FFI_API
-void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue);
-
-FFI_API
-ffi_status ffi_prep_cif(ffi_cif *cif, ffi_abi abi, unsigned int nargs,
-                        ffi_type *rtype, ffi_type **atypes);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // DYNAMIC_FFI_FFI_H

diff --git a/libomptarget/plugins-nextgen/host/src/rtl.cpp b/libomptarget/plugins-nextgen/host/src/rtl.cpp
deleted file mode 100644
index f0ce242..0000000
--- a/libomptarget/plugins-nextgen/host/src/rtl.cpp
+++ /dev/null

@@ -1,442 +0,0 @@
-//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL NextGen for generic 64-bit machine
-//
-//===----------------------------------------------------------------------===//
-
-#include <cassert>
-#include <cstddef>
-#include <ffi.h>
-#include <string>
-#include <unordered_map>
-
-#include "Shared/Debug.h"
-#include "Shared/Environment.h"
-
-#include "GlobalHandler.h"
-#include "OpenMP/OMPT/Callback.h"
-#include "PluginInterface.h"
-#include "omptarget.h"
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-#include "llvm/Support/DynamicLibrary.h"
-
-// The number of devices in this plugin.
-#define NUM_DEVICES 4
-
-// The ELF ID should be defined at compile-time by the build system.
-#ifndef TARGET_ELF_ID
-#define TARGET_ELF_ID EM_NONE
-#endif
-
-// The target triple should be defined at compile-time by the build system.
-#ifndef LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE
-#define LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE ""
-#endif
-
-namespace llvm {
-namespace omp {
-namespace target {
-namespace plugin {
-
-/// Forward declarations for all specialized data structures.
-struct GenELF64KernelTy;
-struct GenELF64DeviceTy;
-struct GenELF64PluginTy;
-
-using llvm::sys::DynamicLibrary;
-
-/// Class implementing kernel functionalities for GenELF64.
-struct GenELF64KernelTy : public GenericKernelTy {
-  /// Construct the kernel with a name and an execution mode.
-  GenELF64KernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}
-
-  /// Initialize the kernel.
-  Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
-    // Functions have zero size.
-    GlobalTy Global(getName(), 0);
-
-    // Get the metadata (address) of the kernel function.
-    GenericGlobalHandlerTy &GHandler = Device.Plugin.getGlobalHandler();
-    if (auto Err = GHandler.getGlobalMetadataFromDevice(Device, Image, Global))
-      return Err;
-
-    // Check that the function pointer is valid.
-    if (!Global.getPtr())
-      return Plugin::error("Invalid function for kernel %s", getName());
-
-    // Save the function pointer.
-    Func = (void (*)())Global.getPtr();
-
-    KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_GENERIC;
-    KernelEnvironment.Configuration.MayUseNestedParallelism = /*Unknown=*/2;
-    KernelEnvironment.Configuration.UseGenericStateMachine = /*Unknown=*/2;
-
-    // Set the maximum number of threads to a single.
-    MaxNumThreads = 1;
-    return Plugin::success();
-  }
-
-  /// Launch the kernel using the libffi.
-  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
-                   uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
-                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
-    // Create a vector of ffi_types, one per argument.
-    SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
-    ffi_type **ArgTypesPtr = (ArgTypes.size()) ? &ArgTypes[0] : nullptr;
-
-    // Prepare the cif structure before running the kernel function.
-    ffi_cif Cif;
-    ffi_status Status = ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, KernelArgs.NumArgs,
-                                     &ffi_type_void, ArgTypesPtr);
-    if (Status != FFI_OK)
-      return Plugin::error("Error in ffi_prep_cif: %d", Status);
-
-    // Call the kernel function through libffi.
-    long Return;
-    ffi_call(&Cif, Func, &Return, (void **)Args);
-
-    return Plugin::success();
-  }
-
-private:
-  /// The kernel function to execute.
-  void (*Func)(void);
-};
-
-/// Class implementing the GenELF64 device images properties.
-struct GenELF64DeviceImageTy : public DeviceImageTy {
-  /// Create the GenELF64 image with the id and the target image pointer.
-  GenELF64DeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
-                        const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, Device, TgtImage), DynLib() {}
-
-  /// Getter and setter for the dynamic library.
-  DynamicLibrary &getDynamicLibrary() { return DynLib; }
-  void setDynamicLibrary(const DynamicLibrary &Lib) { DynLib = Lib; }
-
-private:
-  /// The dynamic library that loaded the image.
-  DynamicLibrary DynLib;
-};
-
-/// Class implementing the device functionalities for GenELF64.
-struct GenELF64DeviceTy : public GenericDeviceTy {
-  /// Create the device with a specific id.
-  GenELF64DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
-                   int32_t NumDevices)
-      : GenericDeviceTy(Plugin, DeviceId, NumDevices, GenELF64GridValues) {}
-
-  ~GenELF64DeviceTy() {}
-
-  /// Initialize the device, which is a no-op
-  Error initImpl(GenericPluginTy &Plugin) override { return Plugin::success(); }
-
-  /// Deinitialize the device, which is a no-op
-  Error deinitImpl() override { return Plugin::success(); }
-
-  /// See GenericDeviceTy::getComputeUnitKind().
-  std::string getComputeUnitKind() const override { return "generic-64bit"; }
-
-  /// Construct the kernel for a specific image on the device.
-  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
-    // Allocate and construct the kernel.
-    GenELF64KernelTy *GenELF64Kernel = Plugin.allocate<GenELF64KernelTy>();
-    if (!GenELF64Kernel)
-      return Plugin::error("Failed to allocate memory for GenELF64 kernel");
-
-    new (GenELF64Kernel) GenELF64KernelTy(Name);
-
-    return *GenELF64Kernel;
-  }
-
-  /// Set the current context to this device, which is a no-op.
-  Error setContext() override { return Plugin::success(); }
-
-  /// Load the binary image into the device and allocate an image object.
-  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
-                                           int32_t ImageId) override {
-    // Allocate and initialize the image object.
-    GenELF64DeviceImageTy *Image = Plugin.allocate<GenELF64DeviceImageTy>();
-    new (Image) GenELF64DeviceImageTy(ImageId, *this, TgtImage);
-
-    // Create a temporary file.
-    char TmpFileName[] = "/tmp/tmpfile_XXXXXX";
-    int TmpFileFd = mkstemp(TmpFileName);
-    if (TmpFileFd == -1)
-      return Plugin::error("Failed to create tmpfile for loading target image");
-
-    // Open the temporary file.
-    FILE *TmpFile = fdopen(TmpFileFd, "wb");
-    if (!TmpFile)
-      return Plugin::error("Failed to open tmpfile %s for loading target image",
-                           TmpFileName);
-
-    // Write the image into the temporary file.
-    size_t Written = fwrite(Image->getStart(), Image->getSize(), 1, TmpFile);
-    if (Written != 1)
-      return Plugin::error("Failed to write target image to tmpfile %s",
-                           TmpFileName);
-
-    // Close the temporary file.
-    int Ret = fclose(TmpFile);
-    if (Ret)
-      return Plugin::error("Failed to close tmpfile %s with the target image",
-                           TmpFileName);
-
-    // Load the temporary file as a dynamic library.
-    std::string ErrMsg;
-    DynamicLibrary DynLib =
-        DynamicLibrary::getPermanentLibrary(TmpFileName, &ErrMsg);
-
-    // Check if the loaded library is valid.
-    if (!DynLib.isValid())
-      return Plugin::error("Failed to load target image: %s", ErrMsg.c_str());
-
-    // Save a reference of the image's dynamic library.
-    Image->setDynamicLibrary(DynLib);
-
-    return Image;
-  }
-
-  /// Allocate memory. Use std::malloc in all cases.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
-    if (Size == 0)
-      return nullptr;
-
-    void *MemAlloc = nullptr;
-    switch (Kind) {
-    case TARGET_ALLOC_DEFAULT:
-    case TARGET_ALLOC_DEVICE:
-    case TARGET_ALLOC_HOST:
-    case TARGET_ALLOC_SHARED:
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
-      MemAlloc = std::malloc(Size);
-      break;
-    }
-    return MemAlloc;
-  }
-
-  /// Free the memory. Use std::free in all cases.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
-    std::free(TgtPtr);
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// This plugin does nothing to lock buffers. Do not return an error, just
-  /// return the same pointer as the device pointer.
-  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
-    return HstPtr;
-  }
-
-  /// Nothing to do when unlocking the buffer.
-  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
-
-  /// Indicate that the buffer is not pinned.
-  Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
-                                 void *&BaseDevAccessiblePtr,
-                                 size_t &BaseSize) const override {
-    return false;
-  }
-
-  /// Submit data to the device (host to device transfer).
-  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
-                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    std::memcpy(TgtPtr, HstPtr, Size);
-    return Plugin::success();
-  }
-
-  /// Retrieve data from the device (device to host transfer).
-  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
-                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    std::memcpy(HstPtr, TgtPtr, Size);
-    return Plugin::success();
-  }
-
-  /// Exchange data between two devices within the plugin. This function is not
-  /// supported in this plugin.
-  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
-                         void *DstPtr, int64_t Size,
-                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    // This function should never be called because the function
-    // GenELF64PluginTy::isDataExchangable() returns false.
-    return Plugin::error("dataExchangeImpl not supported");
-  }
-
-  /// All functions are already synchronous. No need to do anything on this
-  /// synchronization function.
-  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
-    return Plugin::success();
-  }
-
-  /// All functions are already synchronous. No need to do anything on this
-  /// query function.
-  Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
-    return Plugin::success();
-  }
-
-  /// This plugin does not support interoperability
-  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    return Plugin::error("initAsyncInfoImpl not supported");
-  }
-
-  /// This plugin does not support interoperability
-  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
-    return Plugin::error("initDeviceInfoImpl not supported");
-  }
-
-  /// This plugin does not support the event API. Do nothing without failing.
-  Error createEventImpl(void **EventPtrStorage) override {
-    *EventPtrStorage = nullptr;
-    return Plugin::success();
-  }
-  Error destroyEventImpl(void *EventPtr) override { return Plugin::success(); }
-  Error recordEventImpl(void *EventPtr,
-                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    return Plugin::success();
-  }
-  Error waitEventImpl(void *EventPtr,
-                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    return Plugin::success();
-  }
-  Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
-
-  /// Print information about the device.
-  Error obtainInfoImpl(InfoQueueTy &Info) override {
-    Info.add("Device Type", "Generic-elf-64bit");
-    return Plugin::success();
-  }
-
-  /// This plugin should not setup the device environment or memory pool.
-  virtual bool shouldSetupDeviceEnvironment() const override { return false; };
-  virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
-
-  /// Getters and setters for stack size and heap size not relevant.
-  Error getDeviceStackSize(uint64_t &Value) override {
-    Value = 0;
-    return Plugin::success();
-  }
-  Error setDeviceStackSize(uint64_t Value) override {
-    return Plugin::success();
-  }
-  Error getDeviceHeapSize(uint64_t &Value) override {
-    Value = 0;
-    return Plugin::success();
-  }
-  Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
-
-private:
-  /// Grid values for Generic ELF64 plugins.
-  static constexpr GV GenELF64GridValues = {
-      1, // GV_Slot_Size
-      1, // GV_Warp_Size
-      1, // GV_Max_Teams
-      1, // GV_Default_Num_Teams
-      1, // GV_SimpleBufferSize
-      1, // GV_Max_WG_Size
-      1, // GV_Default_WG_Size
-  };
-};
-
-class GenELF64GlobalHandlerTy final : public GenericGlobalHandlerTy {
-public:
-  Error getGlobalMetadataFromDevice(GenericDeviceTy &GenericDevice,
-                                    DeviceImageTy &Image,
-                                    GlobalTy &DeviceGlobal) override {
-    const char *GlobalName = DeviceGlobal.getName().data();
-    GenELF64DeviceImageTy &GenELF64Image =
-        static_cast<GenELF64DeviceImageTy &>(Image);
-
-    // Get dynamic library that has loaded the device image.
-    DynamicLibrary &DynLib = GenELF64Image.getDynamicLibrary();
-
-    // Get the address of the symbol.
-    void *Addr = DynLib.getAddressOfSymbol(GlobalName);
-    if (Addr == nullptr) {
-      return Plugin::error("Failed to load global '%s'", GlobalName);
-    }
-
-    // Save the pointer to the symbol.
-    DeviceGlobal.setPtr(Addr);
-
-    return Plugin::success();
-  }
-};
-
-/// Class implementing the plugin functionalities for GenELF64.
-struct GenELF64PluginTy final : public GenericPluginTy {
-  /// Create the GenELF64 plugin.
-  GenELF64PluginTy() : GenericPluginTy(getTripleArch()) {}
-
-  /// This class should not be copied.
-  GenELF64PluginTy(const GenELF64PluginTy &) = delete;
-  GenELF64PluginTy(GenELF64PluginTy &&) = delete;
-
-  /// Initialize the plugin and return the number of devices.
-  Expected<int32_t> initImpl() override {
-#ifdef OMPT_SUPPORT
-    ompt::connectLibrary();
-#endif
-
-#ifdef USES_DYNAMIC_FFI
-    if (auto Err = Plugin::check(ffi_init(), "Failed to initialize libffi"))
-      return std::move(Err);
-#endif
-
-    return NUM_DEVICES;
-  }
-
-  /// Deinitialize the plugin.
-  Error deinitImpl() override { return Plugin::success(); }
-
-  /// Creates a generic ELF device.
-  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
-                                int32_t NumDevices) override {
-    return new GenELF64DeviceTy(Plugin, DeviceId, NumDevices);
-  }
-
-  /// Creates a generic global handler.
-  GenericGlobalHandlerTy *createGlobalHandler() override {
-    return new GenELF64GlobalHandlerTy();
-  }
-
-  /// Get the ELF code to recognize the compatible binary images.
-  uint16_t getMagicElfBits() const override { return ELF::TARGET_ELF_ID; }
-
-  /// This plugin does not support exchanging data between two devices.
-  bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
-    return false;
-  }
-
-  /// All images (ELF-compatible) should be compatible with this plugin.
-  Expected<bool> isELFCompatible(StringRef) const override { return true; }
-
-  Triple::ArchType getTripleArch() const override {
-    return llvm::Triple(LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE).getArch();
-  }
-};
-
-GenericPluginTy *PluginTy::createPlugin() { return new GenELF64PluginTy(); }
-
-template <typename... ArgsTy>
-static Error Plugin::check(int32_t Code, const char *ErrMsg, ArgsTy... Args) {
-  if (Code == 0)
-    return Error::success();
-
-  return createStringError<ArgsTy..., const char *>(
-      inconvertibleErrorCode(), ErrMsg, Args..., std::to_string(Code).data());
-}
-
-} // namespace plugin
-} // namespace target
-} // namespace omp
-} // namespace llvm

diff --git a/libomptarget/src/CMakeLists.txt b/libomptarget/src/CMakeLists.txt
deleted file mode 100644
index d0971bd..0000000
--- a/libomptarget/src/CMakeLists.txt
+++ /dev/null

@@ -1,90 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build offloading library libomptarget.so.
-#
-##===----------------------------------------------------------------------===##
-
-libomptarget_say("Building offloading runtime library libomptarget.")
-
-add_llvm_library(omptarget
-  SHARED
-
-  device.cpp
-  interface.cpp
-  omptarget.cpp
-  OffloadRTL.cpp
-  LegacyAPI.cpp
-  PluginManager.cpp
-  DeviceImage.cpp
-
-  OpenMP/API.cpp
-  OpenMP/Mapping.cpp
-  OpenMP/InteropAPI.cpp
-  OpenMP/OMPT/Callback.cpp
-
-
-  ADDITIONAL_HEADER_DIRS
-  ${LIBOMPTARGET_INCLUDE_DIR}
-
-  LINK_COMPONENTS
-  Support
-  Object
-
-  LINK_LIBS
-  PUBLIC
-  omp
-
-  NO_INSTALL_RPATH
-  BUILDTREE_ONLY
-)
-target_include_directories(omptarget PRIVATE ${LIBOMPTARGET_INCLUDE_DIR})
-
-if (LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
-  target_link_libraries(omptarget PRIVATE
-    "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports")
-endif()
-
-# Define the TARGET_NAME and DEBUG_PREFIX.
-target_compile_definitions(omptarget PRIVATE
-  TARGET_NAME=omptarget
-  DEBUG_PREFIX="omptarget"
-)
-
-target_compile_options(omptarget PUBLIC ${offload_compile_flags})
-target_link_options(omptarget PUBLIC ${offload_link_flags})
-
-macro(check_plugin_target target)
-if (TARGET omptarget.rtl.${target})
-	list(APPEND LIBOMPTARGET_PLUGINS_TO_LOAD ${target})
-endif()
-endmacro()
-
-set(LIBOMPTARGET_PLUGINS_TO_LOAD "" CACHE STRING
-  "Comma separated list of plugin names to look for at runtime")
-if (NOT LIBOMPTARGET_PLUGINS_TO_LOAD)
-	check_plugin_target(ppc64)
-	check_plugin_target(x86_64)
-	check_plugin_target(cuda)
-	check_plugin_target(aarch64)
-	check_plugin_target(amdgpu)
-	check_plugin_target(s390x)
-endif()
-
-list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD PREPEND "\"libomptarget.rtl.")
-list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD APPEND "\"")
-list(JOIN LIBOMPTARGET_PLUGINS_TO_LOAD "," ENABLED_OFFLOAD_PLUGINS)
-target_compile_definitions(omptarget PRIVATE ENABLED_OFFLOAD_PLUGINS=${ENABLED_OFFLOAD_PLUGINS})
-
-# libomptarget.so needs to be aware of where the plugins live as they
-# are now separated in the build directory.
-set_target_properties(omptarget PROPERTIES
-                      POSITION_INDEPENDENT_CODE ON
-                      INSTALL_RPATH "$ORIGIN"
-                      BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..")
-install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OPENMP_INSTALL_LIBDIR}")

diff --git a/libomptarget/src/DeviceImage.cpp b/libomptarget/src/DeviceImage.cpp
deleted file mode 100644
index e42460b..0000000
--- a/libomptarget/src/DeviceImage.cpp
+++ /dev/null

@@ -1,49 +0,0 @@
-//===-- DeviceImage.cpp - Representation of the device code/image ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "DeviceImage.h"
-
-#include "OffloadEntry.h"
-#include "Shared/APITypes.h"
-#include "Shared/Debug.h"
-#include "Shared/Utils.h"
-
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Support/Error.h"
-#include <memory>
-
-__tgt_bin_desc *OffloadEntryTy::getBinaryDescription() const {
-  return &DeviceImage.getBinaryDesc();
-}
-
-DeviceImageTy::DeviceImageTy(__tgt_bin_desc &BinaryDesc,
-                             __tgt_device_image &TgtDeviceImage)
-    : BinaryDesc(&BinaryDesc), Image(TgtDeviceImage) {
-
-  llvm::StringRef ImageStr(
-      static_cast<char *>(Image.ImageStart),
-      llvm::omp::target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
-
-  auto BinaryOrErr =
-      llvm::object::OffloadBinary::create(llvm::MemoryBufferRef(ImageStr, ""));
-
-  if (!BinaryOrErr) {
-    consumeError(BinaryOrErr.takeError());
-    return;
-  }
-
-  Binary = std::move(*BinaryOrErr);
-  void *Begin = const_cast<void *>(
-      static_cast<const void *>(Binary->getImage().bytes_begin()));
-  void *End = const_cast<void *>(
-      static_cast<const void *>(Binary->getImage().bytes_end()));
-
-  Image = __tgt_device_image{Begin, End, Image.EntriesBegin, Image.EntriesEnd};
-}

diff --git a/libomptarget/src/LegacyAPI.cpp b/libomptarget/src/LegacyAPI.cpp
deleted file mode 100644
index 91d5642..0000000
--- a/libomptarget/src/LegacyAPI.cpp
+++ /dev/null

@@ -1,199 +0,0 @@
-//===-------- LegacyAPI.cpp - Target independent OpenMP target RTL --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Legacy interfaces for libomptarget used to maintain backwards-compatibility.
-//
-//===----------------------------------------------------------------------===//
-
-#include "OpenMP/OMPT/Interface.h"
-#include "omptarget.h"
-#include "private.h"
-
-#include "Shared/Profile.h"
-
-#ifdef OMPT_SUPPORT
-using namespace llvm::omp::target::ompt;
-#endif
-
-EXTERN void __tgt_target_data_begin(int64_t DeviceId, int32_t ArgNum,
-                                    void **ArgsBase, void **Args,
-                                    int64_t *ArgSizes, int64_t *ArgTypes) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  __tgt_target_data_begin_mapper(nullptr, DeviceId, ArgNum, ArgsBase, Args,
-                                 ArgSizes, ArgTypes, nullptr, nullptr);
-}
-
-EXTERN void __tgt_target_data_begin_nowait(int64_t DeviceId, int32_t ArgNum,
-                                           void **ArgsBase, void **Args,
-                                           int64_t *ArgSizes, int64_t *ArgTypes,
-                                           int32_t DepNum, void *DepList,
-                                           int32_t NoAliasDepNum,
-                                           void *NoAliasDepList) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  __tgt_target_data_begin_mapper(nullptr, DeviceId, ArgNum, ArgsBase, Args,
-                                 ArgSizes, ArgTypes, nullptr, nullptr);
-}
-
-EXTERN void __tgt_target_data_end(int64_t DeviceId, int32_t ArgNum,
-                                  void **ArgsBase, void **Args,
-                                  int64_t *ArgSizes, int64_t *ArgTypes) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  __tgt_target_data_end_mapper(nullptr, DeviceId, ArgNum, ArgsBase, Args,
-                               ArgSizes, ArgTypes, nullptr, nullptr);
-}
-
-EXTERN void __tgt_target_data_update(int64_t DeviceId, int32_t ArgNum,
-                                     void **ArgsBase, void **Args,
-                                     int64_t *ArgSizes, int64_t *ArgTypes) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  __tgt_target_data_update_mapper(nullptr, DeviceId, ArgNum, ArgsBase, Args,
-                                  ArgSizes, ArgTypes, nullptr, nullptr);
-}
-
-EXTERN void __tgt_target_data_update_nowait(
-    int64_t DeviceId, int32_t ArgNum, void **ArgsBase, void **Args,
-    int64_t *ArgSizes, int64_t *ArgTypes, int32_t DepNum, void *DepList,
-    int32_t NoAliasDepNum, void *NoAliasDepList) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  __tgt_target_data_update_mapper(nullptr, DeviceId, ArgNum, ArgsBase, Args,
-                                  ArgSizes, ArgTypes, nullptr, nullptr);
-}
-
-EXTERN void __tgt_target_data_end_nowait(int64_t DeviceId, int32_t ArgNum,
-                                         void **ArgsBase, void **Args,
-                                         int64_t *ArgSizes, int64_t *ArgTypes,
-                                         int32_t DepNum, void *DepList,
-                                         int32_t NoAliasDepNum,
-                                         void *NoAliasDepList) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  __tgt_target_data_end_mapper(nullptr, DeviceId, ArgNum, ArgsBase, Args,
-                               ArgSizes, ArgTypes, nullptr, nullptr);
-}
-
-EXTERN int __tgt_target_mapper(ident_t *Loc, int64_t DeviceId, void *HostPtr,
-                               uint32_t ArgNum, void **ArgsBase, void **Args,
-                               int64_t *ArgSizes, int64_t *ArgTypes,
-                               map_var_info_t *ArgNames, void **ArgMappers) {
-  TIMESCOPE_WITH_IDENT(Loc);
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  KernelArgsTy KernelArgs{1,        ArgNum,   ArgsBase,   Args, ArgSizes,
-                          ArgTypes, ArgNames, ArgMappers, 0};
-  return __tgt_target_kernel(Loc, DeviceId, -1, -1, HostPtr, &KernelArgs);
-}
-
-EXTERN int __tgt_target(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
-                        void **ArgsBase, void **Args, int64_t *ArgSizes,
-                        int64_t *ArgTypes) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return __tgt_target_mapper(nullptr, DeviceId, HostPtr, ArgNum, ArgsBase, Args,
-                             ArgSizes, ArgTypes, nullptr, nullptr);
-}
-
-EXTERN int __tgt_target_nowait(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
-                               void **ArgsBase, void **Args, int64_t *ArgSizes,
-                               int64_t *ArgTypes, int32_t DepNum, void *DepList,
-                               int32_t NoAliasDepNum, void *NoAliasDepList) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return __tgt_target_mapper(nullptr, DeviceId, HostPtr, ArgNum, ArgsBase, Args,
-                             ArgSizes, ArgTypes, nullptr, nullptr);
-}
-
-EXTERN int __tgt_target_nowait_mapper(
-    ident_t *Loc, int64_t DeviceId, void *HostPtr, int32_t ArgNum,
-    void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
-    map_var_info_t *ArgNames, void **ArgMappers, int32_t DepNum, void *DepList,
-    int32_t NoAliasDepNum, void *NoAliasDepList) {
-  TIMESCOPE_WITH_IDENT(Loc);
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return __tgt_target_mapper(Loc, DeviceId, HostPtr, ArgNum, ArgsBase, Args,
-                             ArgSizes, ArgTypes, ArgNames, ArgMappers);
-}
-
-EXTERN int __tgt_target_teams_mapper(ident_t *Loc, int64_t DeviceId,
-                                     void *HostPtr, uint32_t ArgNum,
-                                     void **ArgsBase, void **Args,
-                                     int64_t *ArgSizes, int64_t *ArgTypes,
-                                     map_var_info_t *ArgNames,
-                                     void **ArgMappers, int32_t NumTeams,
-                                     int32_t ThreadLimit) {
-  TIMESCOPE_WITH_IDENT(Loc);
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  KernelArgsTy KernelArgs{1,        ArgNum,   ArgsBase,   Args, ArgSizes,
-                          ArgTypes, ArgNames, ArgMappers, 0};
-  return __tgt_target_kernel(Loc, DeviceId, NumTeams, ThreadLimit, HostPtr,
-                             &KernelArgs);
-}
-
-EXTERN int __tgt_target_teams(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
-                              void **ArgsBase, void **Args, int64_t *ArgSizes,
-                              int64_t *ArgTypes, int32_t NumTeams,
-                              int32_t ThreadLimit) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return __tgt_target_teams_mapper(nullptr, DeviceId, HostPtr, ArgNum, ArgsBase,
-                                   Args, ArgSizes, ArgTypes, nullptr, nullptr,
-                                   NumTeams, ThreadLimit);
-}
-
-EXTERN int __tgt_target_teams_nowait(int64_t DeviceId, void *HostPtr,
-                                     int32_t ArgNum, void **ArgsBase,
-                                     void **Args, int64_t *ArgSizes,
-                                     int64_t *ArgTypes, int32_t NumTeams,
-                                     int32_t ThreadLimit, int32_t DepNum,
-                                     void *DepList, int32_t NoAliasDepNum,
-                                     void *NoAliasDepList) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return __tgt_target_teams_mapper(nullptr, DeviceId, HostPtr, ArgNum, ArgsBase,
-                                   Args, ArgSizes, ArgTypes, nullptr, nullptr,
-                                   NumTeams, ThreadLimit);
-}
-
-EXTERN int __tgt_target_teams_nowait_mapper(
-    ident_t *Loc, int64_t DeviceId, void *HostPtr, int32_t ArgNum,
-    void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
-    map_var_info_t *ArgNames, void **ArgMappers, int32_t NumTeams,
-    int32_t ThreadLimit, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
-    void *NoAliasDepList) {
-  TIMESCOPE_WITH_IDENT(Loc);
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return __tgt_target_teams_mapper(Loc, DeviceId, HostPtr, ArgNum, ArgsBase,
-                                   Args, ArgSizes, ArgTypes, ArgNames,
-                                   ArgMappers, NumTeams, ThreadLimit);
-}
-
-EXTERN void __kmpc_push_target_tripcount_mapper(ident_t *Loc, int64_t DeviceId,
-                                                uint64_t LoopTripcount) {
-  TIMESCOPE_WITH_IDENT(Loc);
-  DP("WARNING: __kmpc_push_target_tripcount has been deprecated and is a noop");
-}
-
-EXTERN void __kmpc_push_target_tripcount(int64_t DeviceId,
-                                         uint64_t LoopTripcount) {
-  __kmpc_push_target_tripcount_mapper(nullptr, DeviceId, LoopTripcount);
-}
-
-EXTERN int __tgt_target_kernel_nowait(ident_t *Loc, int64_t DeviceId,
-                                      int32_t NumTeams, int32_t ThreadLimit,
-                                      void *HostPtr, KernelArgsTy *KernelArgs,
-                                      int32_t DepNum, void *DepList,
-                                      int32_t NoAliasDepNum,
-                                      void *NoAliasDepList) {
-  TIMESCOPE_WITH_IDENT(Loc);
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return __tgt_target_kernel(Loc, DeviceId, NumTeams, ThreadLimit, HostPtr,
-                             KernelArgs);
-}

diff --git a/libomptarget/src/OffloadRTL.cpp b/libomptarget/src/OffloadRTL.cpp
deleted file mode 100644
index dd75b1b..0000000
--- a/libomptarget/src/OffloadRTL.cpp
+++ /dev/null

@@ -1,58 +0,0 @@
-//===----------- rtl.cpp - Target independent OpenMP target RTL -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Initialization and tear down of the offload runtime.
-//
-//===----------------------------------------------------------------------===//
-
-#include "OpenMP/OMPT/Callback.h"
-#include "PluginManager.h"
-
-#include "Shared/Debug.h"
-#include "Shared/Profile.h"
-
-#ifdef OMPT_SUPPORT
-extern void llvm::omp::target::ompt::connectLibrary();
-#endif
-
-static std::mutex PluginMtx;
-static uint32_t RefCount = 0;
-
-void initRuntime() {
-  std::scoped_lock<decltype(PluginMtx)> Lock(PluginMtx);
-  Profiler::get();
-  TIMESCOPE();
-
-  if (PM == nullptr)
-    PM = new PluginManager();
-
-  RefCount++;
-  if (RefCount == 1) {
-    DP("Init offload library!\n");
-#ifdef OMPT_SUPPORT
-    // Initialize OMPT first
-    llvm::omp::target::ompt::connectLibrary();
-#endif
-
-    PM->init();
-    PM->registerDelayedLibraries();
-  }
-}
-
-void deinitRuntime() {
-  std::scoped_lock<decltype(PluginMtx)> Lock(PluginMtx);
-  assert(PM && "Runtime not initialized");
-
-  if (RefCount == 1) {
-    DP("Deinit offload library!\n");
-    delete PM;
-    PM = nullptr;
-  }
-
-  RefCount--;
-}

diff --git a/libomptarget/src/OpenMP/API.cpp b/libomptarget/src/OpenMP/API.cpp
deleted file mode 100644
index c85f986..0000000
--- a/libomptarget/src/OpenMP/API.cpp
+++ /dev/null

@@ -1,674 +0,0 @@
-//===----------- api.cpp - Target independent OpenMP target RTL -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implementation of OpenMP API interface functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PluginManager.h"
-#include "device.h"
-#include "omptarget.h"
-#include "rtl.h"
-
-#include "OpenMP/InternalTypes.h"
-#include "OpenMP/Mapping.h"
-#include "OpenMP/OMPT/Interface.h"
-#include "OpenMP/omp.h"
-#include "Shared/Profile.h"
-
-#include "llvm/ADT/SmallVector.h"
-
-#include <climits>
-#include <cstdlib>
-#include <cstring>
-#include <mutex>
-
-EXTERN void ompx_dump_mapping_tables() {
-  ident_t Loc = {0, 0, 0, 0, ";libomptarget;libomptarget;0;0;;"};
-  auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
-  for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
-    dumpTargetPointerMappings(&Loc, Device, true);
-}
-
-#ifdef OMPT_SUPPORT
-using namespace llvm::omp::target::ompt;
-#endif
-
-void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
-                          const char *Name);
-void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
-                        const char *Name);
-void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
-                         const char *Name);
-void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name);
-
-// Implemented in libomp, they are called from within __tgt_* functions.
-extern "C" {
-int __kmpc_get_target_offload(void) __attribute__((weak));
-kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, int32_t gtid, int32_t flags,
-                                  size_t sizeof_kmp_task_t,
-                                  size_t sizeof_shareds,
-                                  kmp_routine_entry_t task_entry)
-    __attribute__((weak));
-
-kmp_task_t *
-__kmpc_omp_target_task_alloc(ident_t *loc_ref, int32_t gtid, int32_t flags,
-                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
-                             kmp_routine_entry_t task_entry, int64_t device_id)
-    __attribute__((weak));
-
-int32_t __kmpc_omp_task_with_deps(ident_t *loc_ref, int32_t gtid,
-                                  kmp_task_t *new_task, int32_t ndeps,
-                                  kmp_depend_info_t *dep_list,
-                                  int32_t ndeps_noalias,
-                                  kmp_depend_info_t *noalias_dep_list)
-    __attribute__((weak));
-}
-
-EXTERN int omp_get_num_devices(void) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  size_t NumDevices = PM->getNumDevices();
-
-  DP("Call to omp_get_num_devices returning %zd\n", NumDevices);
-
-  return NumDevices;
-}
-
-EXTERN int omp_get_device_num(void) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  int HostDevice = omp_get_initial_device();
-
-  DP("Call to omp_get_device_num returning %d\n", HostDevice);
-
-  return HostDevice;
-}
-
-EXTERN int omp_get_initial_device(void) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  int HostDevice = omp_get_num_devices();
-  DP("Call to omp_get_initial_device returning %d\n", HostDevice);
-  return HostDevice;
-}
-
-EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
-  TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
-                         ";size=" + std::to_string(Size));
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
-}
-
-EXTERN void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEVICE, __func__);
-}
-
-EXTERN void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_HOST, __func__);
-}
-
-EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_SHARED, __func__);
-}
-
-EXTERN void omp_target_free(void *Ptr, int DeviceNum) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
-}
-
-EXTERN void llvm_omp_target_free_device(void *Ptr, int DeviceNum) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEVICE, __func__);
-}
-
-EXTERN void llvm_omp_target_free_host(void *Ptr, int DeviceNum) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_HOST, __func__);
-}
-
-EXTERN void llvm_omp_target_free_shared(void *Ptre, int DeviceNum) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return targetFreeExplicit(Ptre, DeviceNum, TARGET_ALLOC_SHARED, __func__);
-}
-
-EXTERN void *llvm_omp_target_dynamic_shared_alloc() {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return nullptr;
-}
-
-EXTERN void *llvm_omp_get_dynamic_shared() {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return nullptr;
-}
-
-EXTERN [[nodiscard]] void *llvm_omp_target_lock_mem(void *Ptr, size_t Size,
-                                                    int DeviceNum) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  return targetLockExplicit(Ptr, Size, DeviceNum, __func__);
-}
-
-EXTERN void llvm_omp_target_unlock_mem(void *Ptr, int DeviceNum) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  targetUnlockExplicit(Ptr, DeviceNum, __func__);
-}
-
-EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n",
-     DeviceNum, DPxPTR(Ptr));
-
-  if (!Ptr) {
-    DP("Call to omp_target_is_present with NULL ptr, returning false\n");
-    return false;
-  }
-
-  if (DeviceNum == omp_get_initial_device()) {
-    DP("Call to omp_target_is_present on host, returning true\n");
-    return true;
-  }
-
-  auto DeviceOrErr = PM->getDevice(DeviceNum);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  // omp_target_is_present tests whether a host pointer refers to storage that
-  // is mapped to a given device. However, due to the lack of the storage size,
-  // only check 1 byte. Cannot set size 0 which checks whether the pointer (zero
-  // lengh array) is mapped instead of the referred storage.
-  TargetPointerResultTy TPR =
-      DeviceOrErr->getMappingInfo().getTgtPtrBegin(const_cast<void *>(Ptr), 1,
-                                                   /*UpdateRefCount=*/false,
-                                                   /*UseHoldRefCount=*/false);
-  int Rc = TPR.isPresent();
-  DP("Call to omp_target_is_present returns %d\n", Rc);
-  return Rc;
-}
-
-EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
-                             size_t DstOffset, size_t SrcOffset, int DstDevice,
-                             int SrcDevice) {
-  TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
-                         ";src_dev=" + std::to_string(SrcDevice) +
-                         ";size=" + std::to_string(Length));
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  DP("Call to omp_target_memcpy, dst device %d, src device %d, "
-     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
-     "src offset %zu, length %zu\n",
-     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DstOffset, SrcOffset,
-     Length);
-
-  if (!Dst || !Src || Length <= 0) {
-    if (Length == 0) {
-      DP("Call to omp_target_memcpy with zero length, nothing to do\n");
-      return OFFLOAD_SUCCESS;
-    }
-
-    REPORT("Call to omp_target_memcpy with invalid arguments\n");
-    return OFFLOAD_FAIL;
-  }
-
-  int Rc = OFFLOAD_SUCCESS;
-  void *SrcAddr = (char *)const_cast<void *>(Src) + SrcOffset;
-  void *DstAddr = (char *)Dst + DstOffset;
-
-  if (SrcDevice == omp_get_initial_device() &&
-      DstDevice == omp_get_initial_device()) {
-    DP("copy from host to host\n");
-    const void *P = memcpy(DstAddr, SrcAddr, Length);
-    if (P == NULL)
-      Rc = OFFLOAD_FAIL;
-  } else if (SrcDevice == omp_get_initial_device()) {
-    DP("copy from host to device\n");
-    auto DstDeviceOrErr = PM->getDevice(DstDevice);
-    if (!DstDeviceOrErr)
-      FATAL_MESSAGE(DstDevice, "%s",
-                    toString(DstDeviceOrErr.takeError()).c_str());
-    AsyncInfoTy AsyncInfo(*DstDeviceOrErr);
-    Rc = DstDeviceOrErr->submitData(DstAddr, SrcAddr, Length, AsyncInfo);
-  } else if (DstDevice == omp_get_initial_device()) {
-    DP("copy from device to host\n");
-    auto SrcDeviceOrErr = PM->getDevice(SrcDevice);
-    if (!SrcDeviceOrErr)
-      FATAL_MESSAGE(SrcDevice, "%s",
-                    toString(SrcDeviceOrErr.takeError()).c_str());
-    AsyncInfoTy AsyncInfo(*SrcDeviceOrErr);
-    Rc = SrcDeviceOrErr->retrieveData(DstAddr, SrcAddr, Length, AsyncInfo);
-  } else {
-    DP("copy from device to device\n");
-    auto SrcDeviceOrErr = PM->getDevice(SrcDevice);
-    if (!SrcDeviceOrErr)
-      FATAL_MESSAGE(SrcDevice, "%s",
-                    toString(SrcDeviceOrErr.takeError()).c_str());
-    AsyncInfoTy AsyncInfo(*SrcDeviceOrErr);
-    auto DstDeviceOrErr = PM->getDevice(DstDevice);
-    if (!DstDeviceOrErr)
-      FATAL_MESSAGE(DstDevice, "%s",
-                    toString(DstDeviceOrErr.takeError()).c_str());
-    // First try to use D2D memcpy which is more efficient. If fails, fall back
-    // to unefficient way.
-    if (SrcDeviceOrErr->isDataExchangable(*DstDeviceOrErr)) {
-      AsyncInfoTy AsyncInfo(*SrcDeviceOrErr);
-      Rc = SrcDeviceOrErr->dataExchange(SrcAddr, *DstDeviceOrErr, DstAddr,
-                                        Length, AsyncInfo);
-      if (Rc == OFFLOAD_SUCCESS)
-        return OFFLOAD_SUCCESS;
-    }
-
-    void *Buffer = malloc(Length);
-    {
-      AsyncInfoTy AsyncInfo(*SrcDeviceOrErr);
-      Rc = SrcDeviceOrErr->retrieveData(Buffer, SrcAddr, Length, AsyncInfo);
-    }
-    if (Rc == OFFLOAD_SUCCESS) {
-      AsyncInfoTy AsyncInfo(*DstDeviceOrErr);
-      Rc = DstDeviceOrErr->submitData(DstAddr, Buffer, Length, AsyncInfo);
-    }
-    free(Buffer);
-  }
-
-  DP("omp_target_memcpy returns %d\n", Rc);
-  return Rc;
-}
-
-// The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
-static int libomp_target_memcpy_async_task(int32_t Gtid, kmp_task_t *Task) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  if (Task == nullptr)
-    return OFFLOAD_FAIL;
-
-  TargetMemcpyArgsTy *Args = (TargetMemcpyArgsTy *)Task->shareds;
-
-  if (Args == nullptr)
-    return OFFLOAD_FAIL;
-
-  // Call blocked version
-  int Rc = OFFLOAD_SUCCESS;
-  if (Args->IsRectMemcpy) {
-    Rc = omp_target_memcpy_rect(
-        Args->Dst, Args->Src, Args->ElementSize, Args->NumDims, Args->Volume,
-        Args->DstOffsets, Args->SrcOffsets, Args->DstDimensions,
-        Args->SrcDimensions, Args->DstDevice, Args->SrcDevice);
-
-    DP("omp_target_memcpy_rect returns %d\n", Rc);
-  } else {
-    Rc = omp_target_memcpy(Args->Dst, Args->Src, Args->Length, Args->DstOffset,
-                           Args->SrcOffset, Args->DstDevice, Args->SrcDevice);
-
-    DP("omp_target_memcpy returns %d\n", Rc);
-  }
-
-  // Release the arguments object
-  delete Args;
-
-  return Rc;
-}
-
-static int libomp_target_memset_async_task(int32_t Gtid, kmp_task_t *Task) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  if (!Task)
-    return OFFLOAD_FAIL;
-
-  auto *Args = reinterpret_cast<TargetMemsetArgsTy *>(Task->shareds);
-  if (!Args)
-    return OFFLOAD_FAIL;
-
-  // call omp_target_memset()
-  omp_target_memset(Args->Ptr, Args->C, Args->N, Args->DeviceNum);
-
-  delete Args;
-
-  return OFFLOAD_SUCCESS;
-}
-
-static inline void
-convertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec, int DepObjCount,
-                    omp_depend_t *DepObjList) {
-  for (int i = 0; i < DepObjCount; ++i) {
-    omp_depend_t DepObj = DepObjList[i];
-    Vec.push_back(*((kmp_depend_info_t *)DepObj));
-  }
-}
-
-template <class T>
-static inline int
-libomp_helper_task_creation(T *Args, int (*Fn)(int32_t, kmp_task_t *),
-                            int DepObjCount, omp_depend_t *DepObjList) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  // Create global thread ID
-  int Gtid = __kmpc_global_thread_num(nullptr);
-
-  // Setup the hidden helper flags
-  int32_t Flags = 0;
-  kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
-  InputFlags->hidden_helper = 1;
-
-  // Alloc the helper task
-  kmp_task_t *Task = __kmpc_omp_target_task_alloc(
-      nullptr, Gtid, Flags, sizeof(kmp_task_t), 0, Fn, -1);
-  if (!Task) {
-    delete Args;
-    return OFFLOAD_FAIL;
-  }
-
-  // Setup the arguments for the helper task
-  Task->shareds = Args;
-
-  // Convert types of depend objects
-  llvm::SmallVector<kmp_depend_info_t> DepObjs;
-  convertDepObjVector(DepObjs, DepObjCount, DepObjList);
-
-  // Launch the helper task
-  int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount,
-                                     DepObjs.data(), 0, nullptr);
-
-  return Rc;
-}
-
-/// Fill \p NumBytes bytes starting at \p Ptr on device \p DeviceNum with the
-/// byte value \p ByteVal and return \p Ptr.
-///
-/// A null \p Ptr or a zero \p NumBytes turns the call into a no-op. On the
-/// initial (host) device the memory is filled directly with memset(). For any
-/// other device a staging buffer is allocated via omp_target_alloc() on the
-/// initial device, filled, and copied over with omp_target_memcpy(). The
-/// function has no error return; failures on the slow path are only reported
-/// through debug output.
-EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
-                               int DeviceNum) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
-     DeviceNum, Ptr, NumBytes);
-
-  // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
-  // of unspecified behavior, see OpenMP spec).
-  if (!Ptr || NumBytes == 0) {
-    return Ptr;
-  }
-
-  if (DeviceNum == omp_get_initial_device()) {
-    DP("filling memory on host via memset\n");
-    memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail
-  } else {
-    // TODO: replace the omp_target_memset() slow path with the fast path.
-    // That will require the ability to execute a kernel from within
-    // libomptarget.so (which we do not have at the moment).
-
-    // This is a very slow path: create a filled array on the host and upload
-    // it to the GPU device.
-    int InitialDevice = omp_get_initial_device();
-    void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
-    if (Shadow) {
-      (void)memset(Shadow, ByteVal, NumBytes);
-      // The copy can fail, but there is no way to tell the caller; make the
-      // failure at least visible in the debug log instead of dropping it.
-      int Rc = omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
-                                 InitialDevice);
-      if (Rc != OFFLOAD_SUCCESS) {
-        DP("omp_target_memset failed to fill memory due to error with "
-           "omp_target_memcpy\n");
-      }
-      (void)omp_target_free(Shadow, InitialDevice);
-    } else {
-      // If the omp_target_alloc has failed, let's just not do anything.
-      // omp_target_memset does not have any good way to fail, so we
-      // simply avoid a catastrophic failure of the process for now.
-      DP("omp_target_memset failed to fill memory due to error with "
-         "omp_target_alloc\n");
-    }
-  }
-
-  DP("omp_target_memset returns %p\n", Ptr);
-  return Ptr;
-}
-
-/// Asynchronous variant of omp_target_memset().
-///
-/// Packs the request into a heap-allocated TargetMemsetArgsTy and hands it to
-/// libomp_helper_task_creation() together with the \p DepObjCount entries of
-/// \p DepObjList. A null \p Ptr or a zero \p NumBytes makes the call a no-op.
-/// \returns \p Ptr in every case; task creation failures are not reported.
-EXTERN void *omp_target_memset_async(void *Ptr, int ByteVal, size_t NumBytes,
-                                     int DeviceNum, int DepObjCount,
-                                     omp_depend_t *DepObjList) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  // Terminate the message with a newline like every other DP call in this
-  // file; without it the next debug line is glued onto this one.
-  DP("Call to omp_target_memset_async, device %d, device pointer %p, size "
-     "%zu\n",
-     DeviceNum, Ptr, NumBytes);
-
-  // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
-  // of unspecified behavior, see OpenMP spec).
-  if (!Ptr || NumBytes == 0)
-    return Ptr;
-
-  // Create the task object to deal with the async invocation
-  auto *Args = new TargetMemsetArgsTy{Ptr, ByteVal, NumBytes, DeviceNum};
-
-  // omp_target_memset_async() cannot fail via a return code, so ignore the
-  // return code of the helper function
-  (void)libomp_helper_task_creation(Args, &libomp_target_memset_async_task,
-                                    DepObjCount, DepObjList);
-
-  return Ptr;
-}
-
-/// Asynchronous counterpart of omp_target_memcpy().
-///
-/// Bundles the copy description into a TargetMemcpyArgsTy and launches it via
-/// libomp_helper_task_creation() with the given depend objects.
-/// \returns OFFLOAD_FAIL if \p Dst or \p Src is null, otherwise whatever the
-/// task creation helper returns.
-EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
-                                   size_t DstOffset, size_t SrcOffset,
-                                   int DstDevice, int SrcDevice,
-                                   int DepObjCount, omp_depend_t *DepObjList) {
-  TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
-                         ";src_dev=" + std::to_string(SrcDevice) +
-                         ";size=" + std::to_string(Length));
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
-     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
-     "src offset %zu, length %zu\n",
-     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DstOffset, SrcOffset,
-     Length);
-
-  // Both endpoints of the copy are mandatory.
-  if (!Dst || !Src)
-    return OFFLOAD_FAIL;
-
-  // Describe the copy for the helper task.
-  auto *TaskArgs = new TargetMemcpyArgsTy(Dst, Src, Length, DstOffset,
-                                          SrcOffset, DstDevice, SrcDevice);
-
-  // Hand the description over to a helper task and launch it.
-  const int Status = libomp_helper_task_creation(
-      TaskArgs, &libomp_target_memcpy_async_task, DepObjCount, DepObjList);
-
-  DP("omp_target_memcpy_async returns %d\n", Status);
-  return Status;
-}
-
-/// Copy a rectangular sub-volume between two multi-dimensional arrays.
-///
-/// When both \p Dst and \p Src are null the call only reports the maximum
-/// number of supported dimensions (INT_MAX). Otherwise every pointer argument
-/// must be non-null and \p ElementSize / \p NumDims must be at least 1, or
-/// OFFLOAD_FAIL is returned. A one-dimensional request is forwarded to
-/// omp_target_memcpy(); higher dimensions are handled by recursing over the
-/// outermost dimension, one slice at a time.
-/// \returns the result of the last copy, or the first non-zero result of a
-/// recursive call.
-EXTERN int
-omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
-                       int NumDims, const size_t *Volume,
-                       const size_t *DstOffsets, const size_t *SrcOffsets,
-                       const size_t *DstDimensions, const size_t *SrcDimensions,
-                       int DstDevice, int SrcDevice) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
-     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
-     "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
-     "volume " DPxMOD ", element size %zu, num_dims %d\n",
-     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
-     DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
-     DPxPTR(Volume), ElementSize, NumDims);
-
-  if (!(Dst || Src)) {
-    DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
-       INT_MAX);
-    return INT_MAX;
-  }
-
-  if (!Dst || !Src || ElementSize < 1 || NumDims < 1 || !Volume ||
-      !DstOffsets || !SrcOffsets || !DstDimensions || !SrcDimensions) {
-    REPORT("Call to omp_target_memcpy_rect with invalid arguments\n");
-    return OFFLOAD_FAIL;
-  }
-
-  // Start from success: with NumDims > 1 and Volume[0] == 0 the slice loop
-  // below never runs, and Rc would otherwise be returned uninitialized.
-  int Rc = OFFLOAD_SUCCESS;
-  if (NumDims == 1) {
-    Rc = omp_target_memcpy(Dst, Src, ElementSize * Volume[0],
-                           ElementSize * DstOffsets[0],
-                           ElementSize * SrcOffsets[0], DstDevice, SrcDevice);
-  } else {
-    // Size in bytes of one slice along the outermost dimension, i.e. the
-    // product of all inner dimensions times the element size.
-    size_t DstSliceSize = ElementSize;
-    size_t SrcSliceSize = ElementSize;
-    for (int I = 1; I < NumDims; ++I) {
-      DstSliceSize *= DstDimensions[I];
-      SrcSliceSize *= SrcDimensions[I];
-    }
-
-    size_t DstOff = DstOffsets[0] * DstSliceSize;
-    size_t SrcOff = SrcOffsets[0] * SrcSliceSize;
-    for (size_t I = 0; I < Volume[0]; ++I) {
-      // Recurse with the outermost dimension stripped from every array.
-      Rc = omp_target_memcpy_rect(
-          (char *)Dst + DstOff + DstSliceSize * I,
-          (char *)const_cast<void *>(Src) + SrcOff + SrcSliceSize * I,
-          ElementSize, NumDims - 1, Volume + 1, DstOffsets + 1, SrcOffsets + 1,
-          DstDimensions + 1, SrcDimensions + 1, DstDevice, SrcDevice);
-
-      if (Rc) {
-        DP("Recursive call to omp_target_memcpy_rect returns unsuccessfully\n");
-        return Rc;
-      }
-    }
-  }
-
-  DP("omp_target_memcpy_rect returns %d\n", Rc);
-  return Rc;
-}
-
-/// Asynchronous counterpart of omp_target_memcpy_rect().
-///
-/// When both \p Dst and \p Src are null the call reports the maximum number
-/// of supported dimensions (INT_MAX). If only one of them is null it returns
-/// OFFLOAD_FAIL. Otherwise the request is packed into a TargetMemcpyArgsTy and
-/// launched through libomp_helper_task_creation() with the given depend
-/// objects; the helper's return code is passed back to the caller.
-EXTERN int omp_target_memcpy_rect_async(
-    void *Dst, const void *Src, size_t ElementSize, int NumDims,
-    const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
-    const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
-    int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
-  TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
-                         ";src_dev=" + std::to_string(SrcDevice) +
-                         ";size=" + std::to_string(ElementSize) +
-                         ";num_dims=" + std::to_string(NumDims));
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
-     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
-     "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
-     "volume " DPxMOD ", element size %zu, num_dims %d\n",
-     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
-     DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
-     DPxPTR(Volume), ElementSize, NumDims);
-
-  // Need to check this first to not return OFFLOAD_FAIL instead
-  if (!Dst && !Src) {
-    // Name this function (not the synchronous one) so the log is attributable.
-    DP("Call to omp_target_memcpy_rect_async returns max supported dimensions "
-       "%d\n",
-       INT_MAX);
-    return INT_MAX;
-  }
-
-  // Check the source and dest address
-  if (Dst == nullptr || Src == nullptr)
-    return OFFLOAD_FAIL;
-
-  // Create task object
-  TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
-      Dst, Src, ElementSize, NumDims, Volume, DstOffsets, SrcOffsets,
-      DstDimensions, SrcDimensions, DstDevice, SrcDevice);
-
-  // Create and launch helper task
-  int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task,
-                                       DepObjCount, DepObjList);
-
-  DP("omp_target_memcpy_rect_async returns %d\n", Rc);
-  return Rc;
-}
-
-/// Register \p DevicePtr + \p DeviceOffset on device \p DeviceNum as the
-/// device counterpart of the host range starting at \p HostPtr.
-///
-/// \returns OFFLOAD_FAIL for null pointers, a zero \p Size, or when
-/// \p DeviceNum is the initial device; otherwise the result of
-/// MappingInfoTy::associatePtr(). An unknown device aborts via FATAL_MESSAGE.
-EXTERN int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
-                                    size_t Size, size_t DeviceOffset,
-                                    int DeviceNum) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", "
-     "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n",
-     DPxPTR(HostPtr), DPxPTR(DevicePtr), Size, DeviceOffset, DeviceNum);
-
-  // Size is unsigned, so "not positive" simply means zero.
-  const bool ArgsAreValid = HostPtr && DevicePtr && Size != 0;
-  if (!ArgsAreValid) {
-    REPORT("Call to omp_target_associate_ptr with invalid arguments\n");
-    return OFFLOAD_FAIL;
-  }
-
-  if (omp_get_initial_device() == DeviceNum) {
-    REPORT("omp_target_associate_ptr: no association possible on the host\n");
-    return OFFLOAD_FAIL;
-  }
-
-  auto DeviceOrErr = PM->getDevice(DeviceNum);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  // Apply the byte offset to obtain the address that is actually associated.
-  uint64_t OffsetAddr = (uint64_t)DevicePtr + (uint64_t)DeviceOffset;
-  void *TargetAddr = (void *)OffsetAddr;
-  auto &Mappings = DeviceOrErr->getMappingInfo();
-  int Status =
-      Mappings.associatePtr(const_cast<void *>(HostPtr), TargetAddr, Size);
-  DP("omp_target_associate_ptr returns %d\n", Status);
-  return Status;
-}
-
-/// Remove the association that omp_target_associate_ptr() created for
-/// \p HostPtr on device \p DeviceNum.
-///
-/// \returns OFFLOAD_FAIL for a null \p HostPtr or when \p DeviceNum is the
-/// initial device; otherwise the result of MappingInfoTy::disassociatePtr().
-/// An unknown device aborts via FATAL_MESSAGE.
-EXTERN int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", "
-     "device_num %d\n",
-     DPxPTR(HostPtr), DeviceNum);
-
-  if (!HostPtr) {
-    // Report under this function's own name; the message previously blamed
-    // omp_target_associate_ptr.
-    REPORT("Call to omp_target_disassociate_ptr with invalid host_ptr\n");
-    return OFFLOAD_FAIL;
-  }
-
-  if (DeviceNum == omp_get_initial_device()) {
-    REPORT(
-        "omp_target_disassociate_ptr: no association possible on the host\n");
-    return OFFLOAD_FAIL;
-  }
-
-  auto DeviceOrErr = PM->getDevice(DeviceNum);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  int Rc = DeviceOrErr->getMappingInfo().disassociatePtr(
-      const_cast<void *>(HostPtr));
-  DP("omp_target_disassociate_ptr returns %d\n", Rc);
-  return Rc;
-}
-
-/// Return the device address that \p Ptr is currently mapped to on device
-/// \p DeviceNum.
-///
-/// \returns \p Ptr itself when \p DeviceNum is the initial device, nullptr
-/// when \p Ptr is null, \p DeviceNum is out of range, or no mapping is
-/// present, and the mapped target pointer otherwise. An unknown device aborts
-/// via FATAL_MESSAGE. Reference counts are not modified by the lookup.
-EXTERN void *omp_get_mapped_ptr(const void *Ptr, int DeviceNum) {
-  TIMESCOPE();
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  DP("Call to omp_get_mapped_ptr with ptr " DPxMOD ", device_num %d.\n",
-     DPxPTR(Ptr), DeviceNum);
-
-  if (!Ptr) {
-    REPORT("Call to omp_get_mapped_ptr with nullptr.\n");
-    return nullptr;
-  }
-
-  // Compare as signed integers. Storing the initial device number in a size_t
-  // rejected negative ids only by accident of unsigned wraparound.
-  const int InitialDevice = omp_get_initial_device();
-  if (DeviceNum == InitialDevice) {
-    DP("Device %d is initial device, returning Ptr " DPxMOD ".\n", DeviceNum,
-       DPxPTR(Ptr));
-    return const_cast<void *>(Ptr);
-  }
-
-  if (DeviceNum < 0 || DeviceNum > InitialDevice) {
-    DP("DeviceNum %d is invalid, returning nullptr.\n", DeviceNum);
-    return nullptr;
-  }
-
-  auto DeviceOrErr = PM->getDevice(DeviceNum);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  // Pure query with a size of 1: neither reference count is updated.
-  TargetPointerResultTy TPR =
-      DeviceOrErr->getMappingInfo().getTgtPtrBegin(const_cast<void *>(Ptr), 1,
-                                                   /*UpdateRefCount=*/false,
-                                                   /*UseHoldRefCount=*/false);
-  if (!TPR.isPresent()) {
-    DP("Ptr " DPxMOD " is not present on device %d, returning nullptr.\n",
-       DPxPTR(Ptr), DeviceNum);
-    return nullptr;
-  }
-
-  DP("omp_get_mapped_ptr returns " DPxMOD ".\n", DPxPTR(TPR.TargetPointer));
-
-  return TPR.TargetPointer;
-}

diff --git a/libomptarget/src/OpenMP/InteropAPI.cpp b/libomptarget/src/OpenMP/InteropAPI.cpp
deleted file mode 100644
index 1a995cd..0000000
--- a/libomptarget/src/OpenMP/InteropAPI.cpp
+++ /dev/null

@@ -1,311 +0,0 @@
-//===-- InteropAPI.cpp - Implementation of OpenMP interoperability API ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "OpenMP/InteropAPI.h"
-#include "OpenMP/InternalTypes.h"
-#include "OpenMP/omp.h"
-
-#include "PluginManager.h"
-#include "device.h"
-#include "omptarget.h"
-#include "llvm/Support/Error.h"
-#include <cstdlib>
-#include <cstring>
-
-extern "C" {
-
-// Weak declaration of the tasking entry point that this file calls before
-// handling `tasksync` interop objects (see the __tgt_interop_* functions).
-// NOTE(review): presumably provided by the OpenMP host runtime and declared
-// weak so this library still links when it is absent - confirm. The call
-// sites in this file do not test the symbol for null before calling it.
-void __kmpc_omp_wait_deps(ident_t *loc_ref, int32_t gtid, int32_t ndeps,
-                          kmp_depend_info_t *dep_list, int32_t ndeps_noalias,
-                          kmp_depend_info_t *noalias_dep_list)
-    __attribute__((weak));
-
-} // extern "C"
-
-namespace {
-/// Map an interop property to the return code naming the value type that
-/// property holds (integer, string or pointer). Identifiers outside the known
-/// set yield omp_irc_no_value.
-omp_interop_rc_t getPropertyErrorType(omp_interop_property_t Property) {
-  switch (Property) {
-  // Integer-valued properties.
-  case omp_ipr_fr_id:
-  case omp_ipr_vendor:
-  case omp_ipr_device_num:
-  case omp_ipr_platform:
-    return omp_irc_type_int;
-  // String-valued properties.
-  case omp_ipr_fr_name:
-  case omp_ipr_vendor_name:
-    return omp_irc_type_str;
-  // Pointer-valued properties.
-  case omp_ipr_device:
-  case omp_ipr_device_context:
-  case omp_ipr_targetsync:
-    return omp_irc_type_ptr;
-  }
-  return omp_irc_no_value;
-}
-
-/// Store the type-mismatch code for \p Property in \p Err. Does nothing when
-/// the caller did not supply an error slot.
-void getTypeMismatch(omp_interop_property_t Property, int *Err) {
-  if (!Err)
-    return;
-  *Err = getPropertyErrorType(Property);
-}
-
-/// Return the lower-case name of a foreign runtime id, or "unknown" for any
-/// value that is not one of the enumerators handled below.
-const char *getVendorIdToStr(const omp_foreign_runtime_ids_t VendorId) {
-  switch (VendorId) {
-  case cuda:
-    return "cuda";
-  case cuda_driver:
-    return "cuda_driver";
-  case opencl:
-    return "opencl";
-  case sycl:
-    return "sycl";
-  case hip:
-    return "hip";
-  case level_zero:
-    return "level_zero";
-  }
-  return "unknown";
-}
-
-/// Typed property accessor; only the explicit specializations are defined.
-template <typename PropertyTy>
-PropertyTy getProperty(omp_interop_val_t &InteropVal,
-                       omp_interop_property_t Property, int *Err);
-
-/// Integer properties: backend type id, vendor id and device number. Any
-/// other property records a type mismatch in \p Err (if given) and yields 0.
-template <>
-intptr_t getProperty<intptr_t>(omp_interop_val_t &InteropVal,
-                               omp_interop_property_t Property, int *Err) {
-  if (Property == omp_ipr_fr_id)
-    return InteropVal.backend_type_id;
-  if (Property == omp_ipr_vendor)
-    return InteropVal.vendor_id;
-  if (Property == omp_ipr_device_num)
-    return InteropVal.device_id;
-
-  // Not an integer-valued property.
-  getTypeMismatch(Property, Err);
-  return 0;
-}
-
-/// String properties. Any property other than the two handled here records a
-/// type mismatch in \p Err (if given) and yields nullptr.
-// NOTE(review): the first branch answers omp_ipr_fr_id, which
-// getPropertyErrorType classifies as an integer property, while
-// omp_ipr_fr_name (classified as a string) is not handled - confirm whether
-// omp_ipr_fr_name was intended.
-template <>
-const char *getProperty<const char *>(omp_interop_val_t &InteropVal,
-                                      omp_interop_property_t Property,
-                                      int *Err) {
-  if (Property == omp_ipr_fr_id) {
-    if (InteropVal.interop_type == kmp_interop_type_tasksync)
-      return "tasksync";
-    return "device+context";
-  }
-
-  if (Property == omp_ipr_vendor_name)
-    return getVendorIdToStr(InteropVal.vendor_id);
-
-  getTypeMismatch(Property, Err);
-  return nullptr;
-}
-
-/// Pointer properties: device handle, device context and target sync queue.
-///
-/// For omp_ipr_device without a device handle, omp_irc_no_value is recorded
-/// and the interop's error string is returned in place of the handle. Any
-/// other property records a type mismatch and yields nullptr. \p Err may be
-/// null, in which case no code is recorded.
-// NOTE(review): omp_ipr_targetsync dereferences async_info unconditionally;
-// whether it can be null at this point is not visible here - confirm.
-template <>
-void *getProperty<void *>(omp_interop_val_t &InteropVal,
-                          omp_interop_property_t Property, int *Err) {
-  switch (Property) {
-  case omp_ipr_device:
-    if (InteropVal.device_info.Device)
-      return InteropVal.device_info.Device;
-    // Err is optional everywhere else in this file; do not write through a
-    // null pointer here either.
-    if (Err)
-      *Err = omp_irc_no_value;
-    return const_cast<char *>(InteropVal.err_str);
-  case omp_ipr_device_context:
-    return InteropVal.device_info.Context;
-  case omp_ipr_targetsync:
-    return InteropVal.async_info->Queue;
-  default:;
-  }
-  getTypeMismatch(Property, Err);
-  return nullptr;
-}
-
-/// Validate an interop property query before it is dispatched to getProperty.
-///
-/// \param InteropPtr address of the interop object pointer being queried.
-/// \param Property   property identifier requested by the caller.
-/// \param Err        optional out-parameter; set to omp_irc_success when the
-///                   query is valid and to a specific omp_irc_* code
-///                   otherwise.
-/// \returns true if the query may proceed, false if it must be rejected.
-bool getPropertyCheck(omp_interop_val_t **InteropPtr,
-                      omp_interop_property_t Property, int *Err) {
-  if (Err)
-    *Err = omp_irc_success;
-  // Reject a null interop object as well as a null slot: the type checks
-  // below dereference *InteropPtr.
-  if (!InteropPtr || !*InteropPtr) {
-    if (Err)
-      *Err = omp_irc_empty;
-    return false;
-  }
-  // Valid identifiers lie in [omp_ipr_first, 0); everything else is out of
-  // range.
-  if (Property >= 0 || Property < omp_ipr_first) {
-    if (Err)
-      *Err = omp_irc_out_of_range;
-    return false;
-  }
-  // The target sync object is only served for tasksync interop objects ...
-  if (Property == omp_ipr_targetsync &&
-      (*InteropPtr)->interop_type != kmp_interop_type_tasksync) {
-    if (Err)
-      *Err = omp_irc_other;
-    return false;
-  }
-  // ... and device handle/context only for non-tasksync ones.
-  if ((Property == omp_ipr_device || Property == omp_ipr_device_context) &&
-      (*InteropPtr)->interop_type == kmp_interop_type_tasksync) {
-    if (Err)
-      *Err = omp_irc_other;
-    return false;
-  }
-  return true;
-}
-
-} // namespace
-
-// Generates omp_get_interop_int / _ptr / _str. Each accessor validates the
-// query with getPropertyCheck and then forwards to the matching getProperty
-// specialization. A null interop object is rejected up front with
-// omp_irc_empty so that the assert below never dereferences null.
-// NOTE(review): the assert requires a tasksync interop object although
-// getPropertyCheck also handles other interop types - confirm it is intended.
-#define __OMP_GET_INTEROP_TY(RETURN_TYPE, SUFFIX)                              \
-  RETURN_TYPE omp_get_interop_##SUFFIX(const omp_interop_t interop,            \
-                                       omp_interop_property_t property_id,     \
-                                       int *err) {                             \
-    omp_interop_val_t *interop_val = (omp_interop_val_t *)interop;             \
-    if (!interop_val) {                                                        \
-      if (err)                                                                 \
-        *err = omp_irc_empty;                                                  \
-      return (RETURN_TYPE)(0);                                                 \
-    }                                                                          \
-    assert((interop_val)->interop_type == kmp_interop_type_tasksync);          \
-    if (!getPropertyCheck(&interop_val, property_id, err)) {                   \
-      return (RETURN_TYPE)(0);                                                 \
-    }                                                                          \
-    return getProperty<RETURN_TYPE>(*interop_val, property_id, err);           \
-  }
-__OMP_GET_INTEROP_TY(intptr_t, int)
-__OMP_GET_INTEROP_TY(void *, ptr)
-__OMP_GET_INTEROP_TY(const char *, str)
-#undef __OMP_GET_INTEROP_TY
-
-// Generates omp_get_interop_name / _type_desc / _rc_desc. These are stubs:
-// after validating the query they always return a null string. The former
-// unreachable getProperty call after `return nullptr;` was dropped, and a
-// null interop object is rejected before getPropertyCheck can dereference it.
-#define __OMP_GET_INTEROP_TY3(RETURN_TYPE, SUFFIX)                             \
-  RETURN_TYPE omp_get_interop_##SUFFIX(const omp_interop_t interop,            \
-                                       omp_interop_property_t property_id) {   \
-    int err;                                                                   \
-    omp_interop_val_t *interop_val = (omp_interop_val_t *)interop;             \
-    if (!interop_val || !getPropertyCheck(&interop_val, property_id, &err)) {  \
-      return (RETURN_TYPE)(0);                                                 \
-    }                                                                          \
-    return nullptr;                                                            \
-  }
-__OMP_GET_INTEROP_TY3(const char *, name)
-__OMP_GET_INTEROP_TY3(const char *, type_desc)
-__OMP_GET_INTEROP_TY3(const char *, rc_desc)
-#undef __OMP_GET_INTEROP_TY3
-
-/// Consume \p Err and return its message as a malloc'ed, NUL-terminated C
-/// string. Nothing in this file frees the result (see TODO).
-/// \returns nullptr if the allocation fails.
-static const char *copyErrorString(llvm::Error &&Err) {
-  // TODO: Use the error string while avoiding leaks.
-  std::string ErrMsg = llvm::toString(std::move(Err));
-  const size_t NumBytes = ErrMsg.size() + 1; // include the terminating NUL
-  char *UsrMsg = static_cast<char *>(malloc(NumBytes));
-  // malloc may fail; copying into a null buffer would crash the process.
-  if (!UsrMsg)
-    return nullptr;
-  memcpy(UsrMsg, ErrMsg.c_str(), NumBytes);
-  return UsrMsg;
-}
-
-extern "C" {
-
-/// Create an interop object of type \p InteropType for \p DeviceId and store
-/// it in \p InteropPtr.
-///
-/// A \p DeviceId of -1 selects the default device. For tasksync objects the
-/// given dependences are waited on first. If the device cannot be obtained,
-/// the object is kept and its err_str carries the error text. If the plugin
-/// lacks, or fails in, the device-info (or, for tasksync, async-info)
-/// initialization, the object is destroyed and \p InteropPtr is reset to
-/// omp_interop_none. \p HaveNowait is not used.
-void __tgt_interop_init(ident_t *LocRef, int32_t Gtid,
-                        omp_interop_val_t *&InteropPtr,
-                        kmp_interop_type_t InteropType, int32_t DeviceId,
-                        int32_t Ndeps, kmp_depend_info_t *DepList,
-                        int32_t HaveNowait) {
-  int32_t NdepsNoalias = 0;
-  kmp_depend_info_t *NoaliasDepList = NULL;
-  assert(InteropType != kmp_interop_type_unknown &&
-         "Cannot initialize with unknown interop_type!");
-  if (DeviceId == -1) {
-    DeviceId = omp_get_default_device();
-  }
-
-  if (InteropType == kmp_interop_type_tasksync) {
-    __kmpc_omp_wait_deps(LocRef, Gtid, Ndeps, DepList, NdepsNoalias,
-                         NoaliasDepList);
-  }
-
-  InteropPtr = new omp_interop_val_t(DeviceId, InteropType);
-
-  auto DeviceOrErr = PM->getDevice(DeviceId);
-  if (!DeviceOrErr) {
-    InteropPtr->err_str = copyErrorString(DeviceOrErr.takeError());
-    return;
-  }
-
-  DeviceTy &Device = *DeviceOrErr;
-  if (!Device.RTL || !Device.RTL->init_device_info ||
-      Device.RTL->init_device_info(DeviceId, &(InteropPtr)->device_info,
-                                   &(InteropPtr)->err_str)) {
-    delete InteropPtr;
-    InteropPtr = omp_interop_none;
-    // Stop here: the tasksync step below would otherwise form addresses from
-    // the pointer that was just reset and pass them to the plugin.
-    return;
-  }
-  if (InteropType == kmp_interop_type_tasksync) {
-    if (!Device.RTL->init_async_info ||
-        Device.RTL->init_async_info(DeviceId, &(InteropPtr)->async_info)) {
-      delete InteropPtr;
-      InteropPtr = omp_interop_none;
-    }
-  }
-}
-
-/// Handle a use of an already initialized interop object.
-///
-/// A \p DeviceId of -1 selects the default device. If the device cannot be
-/// obtained, the error text is stored in the object's err_str and the call
-/// returns. For tasksync objects the given dependences are waited on.
-/// \p HaveNowait is not used.
-void __tgt_interop_use(ident_t *LocRef, int32_t Gtid,
-                       omp_interop_val_t *&InteropPtr, int32_t DeviceId,
-                       int32_t Ndeps, kmp_depend_info_t *DepList,
-                       int32_t HaveNowait) {
-  assert(InteropPtr && "Cannot use nullptr!");
-  omp_interop_val_t *const Interop = InteropPtr;
-
-  // -1 stands for "use the default device".
-  if (DeviceId == -1)
-    DeviceId = omp_get_default_device();
-
-  assert(Interop != omp_interop_none &&
-         "Cannot use uninitialized interop_ptr!");
-  assert((DeviceId == -1 || Interop->device_id == DeviceId) &&
-         "Inconsistent device-id usage!");
-
-  auto DeviceOrErr = PM->getDevice(DeviceId);
-  if (!DeviceOrErr) {
-    InteropPtr->err_str = copyErrorString(DeviceOrErr.takeError());
-    return;
-  }
-
-  if (Interop->interop_type == kmp_interop_type_tasksync) {
-    // No no-alias dependences are forwarded.
-    const int32_t NumNoaliasDeps = 0;
-    kmp_depend_info_t *NoaliasDeps = NULL;
-    __kmpc_omp_wait_deps(LocRef, Gtid, Ndeps, DepList, NumNoaliasDeps,
-                         NoaliasDeps);
-  }
-  // TODO Flush the queue associated with the interop through the plugin
-}
-
-/// Destroy an interop object and reset \p InteropPtr to omp_interop_none.
-///
-/// A \p DeviceId of -1 selects the default device. An object that already is
-/// omp_interop_none is left alone. If the device cannot be obtained, the
-/// error text is stored in the object's err_str and the object is NOT
-/// destroyed. For tasksync objects the given dependences are waited on before
-/// destruction. \p HaveNowait is not used.
-void __tgt_interop_destroy(ident_t *LocRef, int32_t Gtid,
-                           omp_interop_val_t *&InteropPtr, int32_t DeviceId,
-                           int32_t Ndeps, kmp_depend_info_t *DepList,
-                           int32_t HaveNowait) {
-  assert(InteropPtr && "Cannot use nullptr!");
-  omp_interop_val_t *const Interop = InteropPtr;
-
-  // -1 stands for "use the default device".
-  if (DeviceId == -1)
-    DeviceId = omp_get_default_device();
-
-  // Nothing to tear down.
-  if (Interop == omp_interop_none)
-    return;
-
-  assert((DeviceId == -1 || Interop->device_id == DeviceId) &&
-         "Inconsistent device-id usage!");
-  auto DeviceOrErr = PM->getDevice(DeviceId);
-  if (!DeviceOrErr) {
-    InteropPtr->err_str = copyErrorString(DeviceOrErr.takeError());
-    return;
-  }
-
-  if (Interop->interop_type == kmp_interop_type_tasksync) {
-    // No no-alias dependences are forwarded.
-    const int32_t NumNoaliasDeps = 0;
-    kmp_depend_info_t *NoaliasDeps = NULL;
-    __kmpc_omp_wait_deps(LocRef, Gtid, Ndeps, DepList, NumNoaliasDeps,
-                         NoaliasDeps);
-  }
-  // TODO Flush the queue associated with the interop through the plugin
-  // TODO Signal out dependences
-
-  delete InteropPtr;
-  InteropPtr = omp_interop_none;
-}
-
-} // extern "C"

diff --git a/libomptarget/src/OpenMP/Mapping.cpp b/libomptarget/src/OpenMP/Mapping.cpp
deleted file mode 100644
index c6ff3aa..0000000
--- a/libomptarget/src/OpenMP/Mapping.cpp
+++ /dev/null

@@ -1,537 +0,0 @@
-//===-- OpenMP/Mapping.cpp - OpenMP/OpenACC pointer mapping impl. ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "OpenMP/Mapping.h"
-
-#include "PluginManager.h"
-#include "Shared/Debug.h"
-#include "Shared/Requirements.h"
-#include "device.h"
-
-/// Print the host-to-device pointer mapping table of \p Device.
-///
-/// The table is read under exclusive access. An empty table produces a single
-/// "table empty" line; otherwise a header naming the source location \p Loc
-/// is followed by one row per mapping. \p toStdOut is forwarded to DUMP_INFO.
-void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device,
-                               bool toStdOut) {
-  MappingInfoTy::HDTTMapAccessorTy HDTTMap =
-      Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
-
-  const bool HasMappings = !HDTTMap->empty();
-  if (!HasMappings) {
-    DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
-              "OpenMP Host-Device pointer mappings table empty\n");
-    return;
-  }
-
-  // Table caption and column headings.
-  SourceInfo BlockLoc(Loc);
-  DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
-            "OpenMP Host-Device pointer mappings after block at %s:%d:%d:\n",
-            BlockLoc.getFilename(), BlockLoc.getLine(), BlockLoc.getColumn());
-  DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
-            "%-18s %-18s %s %s %s %s\n", "Host Ptr", "Target Ptr", "Size (B)",
-            "DynRefCount", "HoldRefCount", "Declaration");
-
-  // One row per mapped host range.
-  for (const auto &Item : *HDTTMap) {
-    HostDataToTargetTy &Entry = *Item.HDTT;
-    SourceInfo DeclLoc(Entry.HstPtrName);
-    DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
-              DPxMOD " " DPxMOD " %-8" PRIuPTR " %-11s %-12s %s at %s:%d:%d\n",
-              DPxPTR(Entry.HstPtrBegin), DPxPTR(Entry.TgtPtrBegin),
-              Entry.HstPtrEnd - Entry.HstPtrBegin,
-              Entry.dynRefCountToStr().c_str(),
-              Entry.holdRefCountToStr().c_str(), DeclLoc.getName(),
-              DeclLoc.getFilename(), DeclLoc.getLine(), DeclLoc.getColumn());
-  }
-}
-
-/// Record a user-provided association between the host range starting at
-/// \p HstPtrBegin (\p Size bytes) and the device address \p TgtPtrBegin.
-///
-/// If the host pointer already has an entry, the call succeeds only when that
-/// entry has the same end address and device address; a different association
-/// is rejected with OFFLOAD_FAIL. A new entry is created with an infinite
-/// reference count and reported to the device via notifyDataMapped(), whose
-/// result is returned.
-int MappingInfoTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin,
-                                int64_t Size) {
-  HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
-
-  auto Found = HDTTMap->find(HstPtrBegin);
-  if (Found == HDTTMap->end()) {
-    // No entry yet: allocate one with refCount=INF.
-    const HostDataToTargetTy &Created =
-        *HDTTMap
-             ->emplace(new HostDataToTargetTy(
-                 /*HstPtrBase=*/(uintptr_t)HstPtrBegin,
-                 /*HstPtrBegin=*/(uintptr_t)HstPtrBegin,
-                 /*HstPtrEnd=*/(uintptr_t)HstPtrBegin + Size,
-                 /*TgtAllocBegin=*/(uintptr_t)TgtPtrBegin,
-                 /*TgtPtrBegin=*/(uintptr_t)TgtPtrBegin,
-                 /*UseHoldRefCount=*/false, /*Name=*/nullptr,
-                 /*IsRefCountINF=*/true))
-             .first->HDTT;
-    DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD
-       ", HstEnd=" DPxMOD ", TgtBegin=" DPxMOD ", DynRefCount=%s, "
-       "HoldRefCount=%s\n",
-       DPxPTR(Created.HstPtrBase), DPxPTR(Created.HstPtrBegin),
-       DPxPTR(Created.HstPtrEnd), DPxPTR(Created.TgtPtrBegin),
-       Created.dynRefCountToStr().c_str(),
-       Created.holdRefCountToStr().c_str());
-    (void)Created;
-
-    // Let the plugin know about the new mapping.
-    return Device.notifyDataMapped(HstPtrBegin, Size);
-  }
-
-  // An entry for this host pointer exists: accept only an identical one.
-  HostDataToTargetTy &Existing = *Found->HDTT;
-  std::lock_guard<HostDataToTargetTy> Guard(Existing);
-  const bool SameAssociation =
-      Existing.HstPtrEnd == (uintptr_t)HstPtrBegin + Size &&
-      Existing.TgtPtrBegin == (uintptr_t)TgtPtrBegin;
-  if (!SameAssociation) {
-    REPORT("Not allowed to re-associate a different device ptr+offset with "
-           "the same host ptr\n");
-    return OFFLOAD_FAIL;
-  }
-
-  DP("Attempt to re-associate the same device ptr+offset with the same "
-     "host ptr, nothing to do\n");
-  return OFFLOAD_SUCCESS;
-}
-
-/// Remove an association previously created by associatePtr().
-///
-/// \returns OFFLOAD_FAIL if \p HstPtrBegin has no entry, if the entry has a
-/// non-zero hold reference count, or if its dynamic reference count is not
-/// infinite (i.e. it was not created via omp_target_associate_ptr).
-/// Otherwise the entry is destroyed, its event (if any) is released, and the
-/// result of Device.notifyDataUnmapped() is returned.
-int MappingInfoTy::disassociatePtr(void *HstPtrBegin) {
-  HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
-
-  auto It = HDTTMap->find(HstPtrBegin);
-  if (It == HDTTMap->end()) {
-    REPORT("Association not found\n");
-    return OFFLOAD_FAIL;
-  }
-  // Mapping exists
-  HostDataToTargetTy &HDTT = *It->HDTT;
-  // Use a unique_lock rather than a lock_guard so the entry can be unlocked
-  // before it is deleted; a guard that outlives the entry would call unlock()
-  // on a destroyed object when it goes out of scope.
-  std::unique_lock<HostDataToTargetTy> LG(HDTT);
-
-  if (HDTT.getHoldRefCount()) {
-    // This is based on OpenACC 3.1, sec 3.2.33 "acc_unmap_data", L3656-3657:
-    // "It is an error to call acc_unmap_data if the structured reference
-    // count for the pointer is not zero."
-    REPORT("Trying to disassociate a pointer with a non-zero hold reference "
-           "count\n");
-    return OFFLOAD_FAIL;
-  }
-
-  if (HDTT.isDynRefCountInf()) {
-    DP("Association found, removing it\n");
-    void *Event = HDTT.getEvent();
-    // Release the entry lock first; the table itself is still held
-    // exclusively through HDTTMap.
-    LG.unlock();
-    delete &HDTT;
-    if (Event)
-      Device.destroyEvent(Event);
-    HDTTMap->erase(It);
-    return Device.notifyDataUnmapped(HstPtrBegin);
-  }
-
-  REPORT("Trying to disassociate a pointer which was not mapped via "
-         "omp_target_associate_ptr\n");
-  return OFFLOAD_FAIL;
-}
-
-/// Look up the mapping-table entry that relates to the host range
-/// [\p HstPtrBegin, \p HstPtrBegin + \p Size).
-///
-/// Only the two neighbours of the upper_bound position for \p HstPtrBegin are
-/// inspected: the entry just before it ("left") and the entry at it
-/// ("right"). The returned LookupResult carries the entry that was examined
-/// last in LR.TPR together with the flags IsContained, ExtendsBefore and
-/// ExtendsAfter. \p OwnedTPR is passed through to setEntry().
-/// An empty table yields a default-constructed LookupResult.
-LookupResult MappingInfoTy::lookupMapping(HDTTMapAccessorTy &HDTTMap,
-                                          void *HstPtrBegin, int64_t Size,
-                                          HostDataToTargetTy *OwnedTPR) {
-
-  uintptr_t HP = (uintptr_t)HstPtrBegin;
-  LookupResult LR;
-
-  DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%" PRId64 ")...\n",
-     DPxPTR(HP), Size);
-
-  if (HDTTMap->empty())
-    return LR;
-
-  // NOTE(review): presumably the table is ordered by HstPtrBegin, so Upper is
-  // the first entry starting after HP - confirm against the container type.
-  auto Upper = HDTTMap->upper_bound(HP);
-
-  if (Size == 0) {
-    // Zero-sized lookup: only containment is determined here; the Extends*
-    // flags are left untouched.
-    // specification v5.1 Pointer Initialization for Device Data Environments
-    // upper_bound satisfies
-    //   std::prev(upper)->HDTT.HstPtrBegin <= hp < upper->HDTT.HstPtrBegin
-    if (Upper != HDTTMap->begin()) {
-      LR.TPR.setEntry(std::prev(Upper)->HDTT, OwnedTPR);
-      // the left side of extended address range is satisfied.
-      // hp >= LR.TPR.getEntry()->HstPtrBegin || hp >=
-      // LR.TPR.getEntry()->HstPtrBase
-      LR.Flags.IsContained = HP < LR.TPR.getEntry()->HstPtrEnd ||
-                             HP < LR.TPR.getEntry()->HstPtrBase;
-    }
-
-    // Fall back to the right neighbour only if the left one did not match.
-    if (!LR.Flags.IsContained && Upper != HDTTMap->end()) {
-      LR.TPR.setEntry(Upper->HDTT, OwnedTPR);
-      // the right side of extended address range is satisfied.
-      // hp < LR.TPR.getEntry()->HstPtrEnd || hp < LR.TPR.getEntry()->HstPtrBase
-      LR.Flags.IsContained = HP >= LR.TPR.getEntry()->HstPtrBase;
-    }
-  } else {
-    // check the left bin
-    if (Upper != HDTTMap->begin()) {
-      LR.TPR.setEntry(std::prev(Upper)->HDTT, OwnedTPR);
-      // Is it contained?
-      LR.Flags.IsContained = HP >= LR.TPR.getEntry()->HstPtrBegin &&
-                             HP < LR.TPR.getEntry()->HstPtrEnd &&
-                             (HP + Size) <= LR.TPR.getEntry()->HstPtrEnd;
-      // Does it extend beyond the mapped region?
-      LR.Flags.ExtendsAfter = HP < LR.TPR.getEntry()->HstPtrEnd &&
-                              (HP + Size) > LR.TPR.getEntry()->HstPtrEnd;
-    }
-
-    // check the right bin
-    // (only when the left entry neither contains nor overlaps the range)
-    if (!(LR.Flags.IsContained || LR.Flags.ExtendsAfter) &&
-        Upper != HDTTMap->end()) {
-      LR.TPR.setEntry(Upper->HDTT, OwnedTPR);
-      // Does it extend into an already mapped region?
-      LR.Flags.ExtendsBefore = HP < LR.TPR.getEntry()->HstPtrBegin &&
-                               (HP + Size) > LR.TPR.getEntry()->HstPtrBegin;
-      // Does it extend beyond the mapped region?
-      LR.Flags.ExtendsAfter = HP < LR.TPR.getEntry()->HstPtrEnd &&
-                              (HP + Size) > LR.TPR.getEntry()->HstPtrEnd;
-    }
-
-    // Partial overlaps are only reported through debug output here; the
-    // flags are returned to the caller unchanged.
-    if (LR.Flags.ExtendsBefore) {
-      DP("WARNING: Pointer is not mapped but section extends into already "
-         "mapped data\n");
-    }
-    if (LR.Flags.ExtendsAfter) {
-      DP("WARNING: Pointer is already mapped but section extends beyond mapped "
-         "region\n");
-    }
-  }
-
-  return LR;
-}
-
-TargetPointerResultTy MappingInfoTy::getTargetPointer(
-    HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, void *HstPtrBase,
-    int64_t TgtPadding, int64_t Size, map_var_info_t HstPtrName, bool HasFlagTo,
-    bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount,
-    bool HasCloseModifier, bool HasPresentModifier, bool HasHoldModifier,
-    AsyncInfoTy &AsyncInfo, HostDataToTargetTy *OwnedTPR, bool ReleaseHDTTMap) {
-
-  LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size, OwnedTPR);
-  LR.TPR.Flags.IsPresent = true;
-
-  // Release the mapping table lock only after the entry is locked by
-  // attaching it to TPR. Once TPR is destroyed it will release the lock
-  // on entry. If it is returned the lock will move to the returned object.
-  // If LR.Entry is already owned/locked we avoid trying to lock it again.
-
-  // Check if the pointer is contained.
-  // If a variable is mapped to the device manually by the user - which would
-  // lead to the IsContained flag to be true - then we must ensure that the
-  // device address is returned even under unified memory conditions.
-  if (LR.Flags.IsContained ||
-      ((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && IsImplicit)) {
-    const char *RefCountAction;
-    if (UpdateRefCount) {
-      // After this, reference count >= 1. If the reference count was 0 but the
-      // entry was still there we can reuse the data on the device and avoid a
-      // new submission.
-      LR.TPR.getEntry()->incRefCount(HasHoldModifier);
-      RefCountAction = " (incremented)";
-    } else {
-      // It might have been allocated with the parent, but it's still new.
-      LR.TPR.Flags.IsNewEntry = LR.TPR.getEntry()->getTotalRefCount() == 1;
-      RefCountAction = " (update suppressed)";
-    }
-    const char *DynRefCountAction = HasHoldModifier ? "" : RefCountAction;
-    const char *HoldRefCountAction = HasHoldModifier ? RefCountAction : "";
-    uintptr_t Ptr = LR.TPR.getEntry()->TgtPtrBegin +
-                    ((uintptr_t)HstPtrBegin - LR.TPR.getEntry()->HstPtrBegin);
-    INFO(OMP_INFOTYPE_MAPPING_EXISTS, Device.DeviceID,
-         "Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
-         ", Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s, Name=%s\n",
-         (IsImplicit ? " (implicit)" : ""), DPxPTR(HstPtrBegin), DPxPTR(Ptr),
-         Size, LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction,
-         LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction,
-         (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
-    LR.TPR.TargetPointer = (void *)Ptr;
-  } else if ((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && !IsImplicit) {
-    // Explicit extension of mapped data - not allowed.
-    MESSAGE("explicit extension not allowed: host address specified is " DPxMOD
-            " (%" PRId64
-            " bytes), but device allocation maps to host at " DPxMOD
-            " (%" PRId64 " bytes)",
-            DPxPTR(HstPtrBegin), Size, DPxPTR(LR.TPR.getEntry()->HstPtrBegin),
-            LR.TPR.getEntry()->HstPtrEnd - LR.TPR.getEntry()->HstPtrBegin);
-    if (HasPresentModifier)
-      MESSAGE("device mapping required by 'present' map type modifier does not "
-              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
-              DPxPTR(HstPtrBegin), Size);
-  } else if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
-              !HasCloseModifier) ||
-             (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
-
-    // If unified shared memory is active, implicitly mapped variables that are
-    // not privatized use host address. Any explicitly mapped variables also use
-    // host address where correctness is not impeded. In all other cases maps
-    // are respected.
-    // In addition to the mapping rules above, the close map modifier forces the
-    // mapping of the variable to the device.
-    if (Size) {
-      INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
-           "Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
-           "memory\n",
-           DPxPTR((uintptr_t)HstPtrBegin), Size);
-      DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
-         "memory\n",
-         DPxPTR((uintptr_t)HstPtrBegin), Size);
-      LR.TPR.Flags.IsPresent = false;
-      LR.TPR.Flags.IsHostPointer = true;
-      LR.TPR.TargetPointer = HstPtrBegin;
-    }
-  } else if (HasPresentModifier) {
-    DP("Mapping required by 'present' map type modifier does not exist for "
-       "HstPtrBegin=" DPxMOD ", Size=%" PRId64 "\n",
-       DPxPTR(HstPtrBegin), Size);
-    MESSAGE("device mapping required by 'present' map type modifier does not "
-            "exist for host address " DPxMOD " (%" PRId64 " bytes)",
-            DPxPTR(HstPtrBegin), Size);
-  } else if (Size) {
-    // If it is not contained and Size > 0, we should create a new entry for it.
-    LR.TPR.Flags.IsNewEntry = true;
-    uintptr_t TgtAllocBegin =
-        (uintptr_t)Device.allocData(TgtPadding + Size, HstPtrBegin);
-    uintptr_t TgtPtrBegin = TgtAllocBegin + TgtPadding;
-    // Release the mapping table lock only after the entry is locked by
-    // attaching it to TPR.
-    LR.TPR.setEntry(HDTTMap
-                        ->emplace(new HostDataToTargetTy(
-                            (uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin,
-                            (uintptr_t)HstPtrBegin + Size, TgtAllocBegin,
-                            TgtPtrBegin, HasHoldModifier, HstPtrName))
-                        .first->HDTT);
-    INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
-         "Creating new map entry with HstPtrBase=" DPxMOD
-         ", HstPtrBegin=" DPxMOD ", TgtAllocBegin=" DPxMOD
-         ", TgtPtrBegin=" DPxMOD
-         ", Size=%ld, DynRefCount=%s, HoldRefCount=%s, Name=%s\n",
-         DPxPTR(HstPtrBase), DPxPTR(HstPtrBegin), DPxPTR(TgtAllocBegin),
-         DPxPTR(TgtPtrBegin), Size,
-         LR.TPR.getEntry()->dynRefCountToStr().c_str(),
-         LR.TPR.getEntry()->holdRefCountToStr().c_str(),
-         (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
-    LR.TPR.TargetPointer = (void *)TgtPtrBegin;
-
-    // Notify the plugin about the new mapping.
-    if (Device.notifyDataMapped(HstPtrBegin, Size))
-      return {{false /*IsNewEntry=*/, false /*IsHostPointer=*/},
-              nullptr /*Entry=*/,
-              nullptr /*TargetPointer=*/};
-  } else {
-    // This entry is not present and we did not create a new entry for it.
-    LR.TPR.Flags.IsPresent = false;
-  }
-
-  // All mapping table modifications have been made. If the user requested it we
-  // give up the lock.
-  if (ReleaseHDTTMap)
-    HDTTMap.destroy();
-
-  // If the target pointer is valid, and we need to transfer data, issue the
-  // data transfer.
-  if (LR.TPR.TargetPointer && !LR.TPR.Flags.IsHostPointer && HasFlagTo &&
-      (LR.TPR.Flags.IsNewEntry || HasFlagAlways) && Size != 0) {
-    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", Size,
-       DPxPTR(HstPtrBegin), DPxPTR(LR.TPR.TargetPointer));
-
-    int Ret = Device.submitData(LR.TPR.TargetPointer, HstPtrBegin, Size,
-                                AsyncInfo, LR.TPR.getEntry());
-    if (Ret != OFFLOAD_SUCCESS) {
-      REPORT("Copying data to device failed.\n");
-      // We will also return nullptr if the data movement fails because that
-      // pointer points to a corrupted memory region so it doesn't make any
-      // sense to continue to use it.
-      LR.TPR.TargetPointer = nullptr;
-    } else if (LR.TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
-               OFFLOAD_SUCCESS)
-      return {{false /*IsNewEntry=*/, false /*IsHostPointer=*/},
-              nullptr /*Entry=*/,
-              nullptr /*TargetPointer=*/};
-  } else {
-    // If not a host pointer and no present modifier, we need to wait for the
-    // event if it exists.
-    // Note: Entry might be nullptr because of zero length array section.
-    if (LR.TPR.getEntry() && !LR.TPR.Flags.IsHostPointer &&
-        !HasPresentModifier) {
-      void *Event = LR.TPR.getEntry()->getEvent();
-      if (Event) {
-        int Ret = Device.waitEvent(Event, AsyncInfo);
-        if (Ret != OFFLOAD_SUCCESS) {
-          // If it fails to wait for the event, we need to return nullptr in
-          // case of any data race.
-          REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
-          return {{false /*IsNewEntry=*/, false /*IsHostPointer=*/},
-                  nullptr /*Entry=*/,
-                  nullptr /*TargetPointer=*/};
-        }
-      }
-    }
-  }
-
-  return std::move(LR.TPR);
-}
-
-TargetPointerResultTy MappingInfoTy::getTgtPtrBegin(
-    void *HstPtrBegin, int64_t Size, bool UpdateRefCount, bool UseHoldRefCount,
-    bool MustContain, bool ForceDelete, bool FromDataEnd) {
-  HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
-
-  LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
-
-  LR.TPR.Flags.IsPresent = true;
-
-  if (LR.Flags.IsContained ||
-      (!MustContain && (LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter))) {
-    LR.TPR.Flags.IsLast =
-        LR.TPR.getEntry()->decShouldRemove(UseHoldRefCount, ForceDelete);
-
-    if (ForceDelete) {
-      LR.TPR.getEntry()->resetRefCount(UseHoldRefCount);
-      assert(LR.TPR.Flags.IsLast ==
-                 LR.TPR.getEntry()->decShouldRemove(UseHoldRefCount) &&
-             "expected correct IsLast prediction for reset");
-    }
-
-    // Increment the number of threads that is using the entry on a
-    // targetDataEnd, tracking the number of possible "deleters". A thread may
-    // come to own the entry deletion even if it was not the last one querying
-    // for it. Thus, we must track every query on targetDataEnds to ensure only
-    // the last thread that holds a reference to an entry actually deletes it.
-    if (FromDataEnd)
-      LR.TPR.getEntry()->incDataEndThreadCount();
-
-    const char *RefCountAction;
-    if (!UpdateRefCount) {
-      RefCountAction = " (update suppressed)";
-    } else if (LR.TPR.Flags.IsLast) {
-      LR.TPR.getEntry()->decRefCount(UseHoldRefCount);
-      assert(LR.TPR.getEntry()->getTotalRefCount() == 0 &&
-             "Expected zero reference count when deletion is scheduled");
-      if (ForceDelete)
-        RefCountAction = " (reset, delayed deletion)";
-      else
-        RefCountAction = " (decremented, delayed deletion)";
-    } else {
-      LR.TPR.getEntry()->decRefCount(UseHoldRefCount);
-      RefCountAction = " (decremented)";
-    }
-    const char *DynRefCountAction = UseHoldRefCount ? "" : RefCountAction;
-    const char *HoldRefCountAction = UseHoldRefCount ? RefCountAction : "";
-    uintptr_t TP = LR.TPR.getEntry()->TgtPtrBegin +
-                   ((uintptr_t)HstPtrBegin - LR.TPR.getEntry()->HstPtrBegin);
-    INFO(OMP_INFOTYPE_MAPPING_EXISTS, Device.DeviceID,
-         "Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
-         "Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s\n",
-         DPxPTR(HstPtrBegin), DPxPTR(TP), Size,
-         LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction,
-         LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction);
-    LR.TPR.TargetPointer = (void *)TP;
-  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY ||
-             PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY) {
-    // If the value isn't found in the mapping and unified shared memory
-    // is on then it means we have stumbled upon a value which we need to
-    // use directly from the host.
-    DP("Get HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
-       "memory\n",
-       DPxPTR((uintptr_t)HstPtrBegin), Size);
-    LR.TPR.Flags.IsPresent = false;
-    LR.TPR.Flags.IsHostPointer = true;
-    LR.TPR.TargetPointer = HstPtrBegin;
-  } else {
-    // OpenMP Specification v5.2: if a matching list item is not found, the
-    // pointer retains its original value as per firstprivate semantics.
-    LR.TPR.Flags.IsPresent = false;
-    LR.TPR.Flags.IsHostPointer = false;
-    LR.TPR.TargetPointer = HstPtrBegin;
-  }
-
-  return std::move(LR.TPR);
-}
-
-// Return the target pointer begin (where the data will be moved).
-void *MappingInfoTy::getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap,
-                                    void *HstPtrBegin, int64_t Size) {
-  uintptr_t HP = (uintptr_t)HstPtrBegin;
-  LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
-  if (LR.Flags.IsContained || LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) {
-    uintptr_t TP =
-        LR.TPR.getEntry()->TgtPtrBegin + (HP - LR.TPR.getEntry()->HstPtrBegin);
-    return (void *)TP;
-  }
-
-  return NULL;
-}
-
-int MappingInfoTy::eraseMapEntry(HDTTMapAccessorTy &HDTTMap,
-                                 HostDataToTargetTy *Entry, int64_t Size) {
-  assert(Entry && "Trying to delete a null entry from the HDTT map.");
-  assert(Entry->getTotalRefCount() == 0 &&
-         Entry->getDataEndThreadCount() == 0 &&
-         "Trying to delete entry that is in use or owned by another thread.");
-
-  INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
-       "Removing map entry with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
-       ", Size=%" PRId64 ", Name=%s\n",
-       DPxPTR(Entry->HstPtrBegin), DPxPTR(Entry->TgtPtrBegin), Size,
-       (Entry->HstPtrName) ? getNameFromMapping(Entry->HstPtrName).c_str()
-                           : "unknown");
-
-  if (HDTTMap->erase(Entry) == 0) {
-    REPORT("Trying to remove a non-existent map entry\n");
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int MappingInfoTy::deallocTgtPtrAndEntry(HostDataToTargetTy *Entry,
-                                         int64_t Size) {
-  assert(Entry && "Trying to deallocate a null entry.");
-
-  DP("Deleting tgt data " DPxMOD " of size %" PRId64 " by freeing allocation "
-     "starting at " DPxMOD "\n",
-     DPxPTR(Entry->TgtPtrBegin), Size, DPxPTR(Entry->TgtAllocBegin));
-
-  void *Event = Entry->getEvent();
-  if (Event && Device.destroyEvent(Event) != OFFLOAD_SUCCESS) {
-    REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event));
-    return OFFLOAD_FAIL;
-  }
-
-  int Ret = Device.deleteData((void *)Entry->TgtAllocBegin);
-
-  // Notify the plugin about the unmapped memory.
-  Ret |= Device.notifyDataUnmapped((void *)Entry->HstPtrBegin);
-
-  delete Entry;
-
-  return Ret;
-}
-
-static void printCopyInfoImpl(int DeviceId, bool H2D, void *SrcPtrBegin,
-                              void *DstPtrBegin, int64_t Size,
-                              HostDataToTargetTy *HT) {
-
-  INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId,
-       "Copying data from %s to %s, %sPtr=" DPxMOD ", %sPtr=" DPxMOD
-       ", Size=%" PRId64 ", Name=%s\n",
-       H2D ? "host" : "device", H2D ? "device" : "host", H2D ? "Hst" : "Tgt",
-       DPxPTR(H2D ? SrcPtrBegin : DstPtrBegin), H2D ? "Tgt" : "Hst",
-       DPxPTR(H2D ? DstPtrBegin : SrcPtrBegin), Size,
-       (HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str()
-                              : "unknown");
-}
-
-void MappingInfoTy::printCopyInfo(
-    void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, bool H2D,
-    HostDataToTargetTy *Entry, MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr) {
-  auto HDTTMap =
-      HostDataToTargetMap.getExclusiveAccessor(!!Entry || !!HDTTMapPtr);
-  LookupResult LR;
-  if (!Entry) {
-    LR = lookupMapping(HDTTMapPtr ? *HDTTMapPtr : HDTTMap, HstPtrBegin, Size);
-    Entry = LR.TPR.getEntry();
-  }
-  printCopyInfoImpl(Device.DeviceID, H2D, HstPtrBegin, TgtPtrBegin, Size,
-                    Entry);
-}

diff --git a/libomptarget/src/OpenMP/OMPT/Callback.cpp b/libomptarget/src/OpenMP/OMPT/Callback.cpp
deleted file mode 100644
index f285843..0000000
--- a/libomptarget/src/OpenMP/OMPT/Callback.cpp
+++ /dev/null

@@ -1,496 +0,0 @@
-//===-- OpenMP/OMPT/Callback.cpp - OpenMP Tooling Callback implementation -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implementation of OMPT callback interfaces for target independent layer
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPT_SUPPORT
-
-extern "C" {
-/// Dummy definition when OMPT is disabled
-void ompt_libomptarget_connect() {}
-}
-
-#else // OMPT_SUPPORT is set
-
-#include <cstdlib>
-#include <cstring>
-#include <memory>
-
-#include "Shared/Debug.h"
-
-#include "OpenMP/OMPT/Callback.h"
-#include "OpenMP/OMPT/Connector.h"
-#include "OpenMP/OMPT/Interface.h"
-
-#include "llvm/Support/DynamicLibrary.h"
-
-#undef DEBUG_PREFIX
-#define DEBUG_PREFIX "OMPT"
-
-using namespace llvm::omp::target::ompt;
-
-// Define OMPT callback functions (bound to actual callbacks later on)
-#define defineOmptCallback(Name, Type, Code)                                   \
-  Name##_t llvm::omp::target::ompt::Name##_fn = nullptr;
-FOREACH_OMPT_NOEMI_EVENT(defineOmptCallback)
-FOREACH_OMPT_EMI_EVENT(defineOmptCallback)
-#undef defineOmptCallback
-
-/// Forward declaration
-class LibomptargetRtlFinalizer;
-
-/// Object that will maintain the RTL finalizer from the plugin
-LibomptargetRtlFinalizer *LibraryFinalizer = nullptr;
-
-thread_local Interface llvm::omp::target::ompt::RegionInterface;
-
-thread_local void *llvm::omp::target::ompt::ReturnAddress = nullptr;
-
-bool llvm::omp::target::ompt::Initialized = false;
-
-ompt_get_callback_t llvm::omp::target::ompt::lookupCallbackByCode = nullptr;
-ompt_function_lookup_t llvm::omp::target::ompt::lookupCallbackByName = nullptr;
-ompt_get_target_task_data_t ompt_get_target_task_data_fn = nullptr;
-ompt_get_task_data_t ompt_get_task_data_fn = nullptr;
-
-/// Unique correlation id
-static std::atomic<uint64_t> IdCounter(1);
-
-/// Used to create a new correlation id
-static uint64_t createId() { return IdCounter.fetch_add(1); }
-
-/// Create a new correlation id and update the operations id
-static uint64_t createOpId() {
-  uint64_t NewId = createId();
-  RegionInterface.setHostOpId(NewId);
-  return NewId;
-}
-
-/// Create a new correlation id and update the target region id
-static uint64_t createRegionId() {
-  uint64_t NewId = createId();
-  RegionInterface.setTargetDataValue(NewId);
-  return NewId;
-}
-
-void Interface::beginTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
-                                     void **TgtPtrBegin, size_t Size,
-                                     void *Code) {
-  beginTargetDataOperation();
-  if (ompt_callback_target_data_op_emi_fn) {
-    // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
-    // callback
-    ompt_callback_target_data_op_emi_fn(
-        ompt_scope_begin, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_alloc, HstPtrBegin,
-        /*SrcDeviceNum=*/omp_get_initial_device(), *TgtPtrBegin,
-        /*TgtDeviceNum=*/DeviceId, Size, Code);
-  } else if (ompt_callback_target_data_op_fn) {
-    // HostOpId is set by the runtime
-    HostOpId = createOpId();
-    // Invoke the tool supplied data op callback
-    ompt_callback_target_data_op_fn(
-        TargetData.value, HostOpId, ompt_target_data_alloc, HstPtrBegin,
-        /*SrcDeviceNum=*/omp_get_initial_device(), *TgtPtrBegin,
-        /*TgtDeviceNum=*/DeviceId, Size, Code);
-  }
-}
-
-void Interface::endTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
-                                   void **TgtPtrBegin, size_t Size,
-                                   void *Code) {
-  // Only EMI callback handles end scope
-  if (ompt_callback_target_data_op_emi_fn) {
-    // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
-    // callback
-    ompt_callback_target_data_op_emi_fn(
-        ompt_scope_end, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_alloc, HstPtrBegin,
-        /*SrcDeviceNum=*/omp_get_initial_device(), *TgtPtrBegin,
-        /*TgtDeviceNum=*/DeviceId, Size, Code);
-  }
-  endTargetDataOperation();
-}
-
-void Interface::beginTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin,
-                                      int64_t DstDeviceId, void *DstPtrBegin,
-                                      size_t Size, void *Code) {
-  beginTargetDataOperation();
-  if (ompt_callback_target_data_op_emi_fn) {
-    // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
-    // callback
-    ompt_callback_target_data_op_emi_fn(
-        ompt_scope_begin, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_transfer_to_device, SrcPtrBegin, SrcDeviceId,
-        DstPtrBegin, DstDeviceId, Size, Code);
-  } else if (ompt_callback_target_data_op_fn) {
-    // HostOpId is set by the runtime
-    HostOpId = createOpId();
-    // Invoke the tool supplied data op callback
-    ompt_callback_target_data_op_fn(
-        TargetData.value, HostOpId, ompt_target_data_transfer_to_device,
-        SrcPtrBegin, SrcDeviceId, DstPtrBegin, DstDeviceId, Size, Code);
-  }
-}
-
-void Interface::endTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin,
-                                    int64_t DstDeviceId, void *DstPtrBegin,
-                                    size_t Size, void *Code) {
-  // Only EMI callback handles end scope
-  if (ompt_callback_target_data_op_emi_fn) {
-    // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
-    // callback
-    ompt_callback_target_data_op_emi_fn(
-        ompt_scope_end, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_transfer_to_device, SrcPtrBegin, SrcDeviceId,
-        DstPtrBegin, DstDeviceId, Size, Code);
-  }
-  endTargetDataOperation();
-}
-
-void Interface::beginTargetDataDelete(int64_t DeviceId, void *TgtPtrBegin,
-                                      void *Code) {
-  beginTargetDataOperation();
-  if (ompt_callback_target_data_op_emi_fn) {
-    // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
-    // callback
-    ompt_callback_target_data_op_emi_fn(
-        ompt_scope_begin, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_delete, TgtPtrBegin, DeviceId,
-        /*TgtPtrBegin=*/nullptr, /*TgtDeviceNum=*/-1, /*Bytes=*/0, Code);
-  } else if (ompt_callback_target_data_op_fn) {
-    // HostOpId is set by the runtime
-    HostOpId = createOpId();
-    // Invoke the tool supplied data op callback
-    ompt_callback_target_data_op_fn(TargetData.value, HostOpId,
-                                    ompt_target_data_delete, TgtPtrBegin,
-                                    DeviceId, /*TgtPtrBegin=*/nullptr,
-                                    /*TgtDeviceNum=*/-1, /*Bytes=*/0, Code);
-  }
-}
-
-void Interface::endTargetDataDelete(int64_t DeviceId, void *TgtPtrBegin,
-                                    void *Code) {
-  // Only EMI callback handles end scope
-  if (ompt_callback_target_data_op_emi_fn) {
-    // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
-    // callback
-    ompt_callback_target_data_op_emi_fn(
-        ompt_scope_end, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_delete, TgtPtrBegin, DeviceId,
-        /*TgtPtrBegin=*/nullptr, /*TgtDeviceNum=*/-1, /*Bytes=*/0, Code);
-  }
-  endTargetDataOperation();
-}
-
-void Interface::beginTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin,
-                                        int64_t DstDeviceId, void *DstPtrBegin,
-                                        size_t Size, void *Code) {
-  beginTargetDataOperation();
-  if (ompt_callback_target_data_op_emi_fn) {
-    // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
-    // callback
-    ompt_callback_target_data_op_emi_fn(
-        ompt_scope_begin, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_transfer_from_device, SrcPtrBegin, SrcDeviceId,
-        DstPtrBegin, DstDeviceId, Size, Code);
-  } else if (ompt_callback_target_data_op_fn) {
-    // HostOpId is set by the runtime
-    HostOpId = createOpId();
-    // Invoke the tool supplied data op callback
-    ompt_callback_target_data_op_fn(
-        TargetData.value, HostOpId, ompt_target_data_transfer_from_device,
-        SrcPtrBegin, SrcDeviceId, DstPtrBegin, DstDeviceId, Size, Code);
-  }
-}
-
-void Interface::endTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin,
-                                      int64_t DstDeviceId, void *DstPtrBegin,
-                                      size_t Size, void *Code) {
-  // Only EMI callback handles end scope
-  if (ompt_callback_target_data_op_emi_fn) {
-    // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
-    // callback
-    ompt_callback_target_data_op_emi_fn(
-        ompt_scope_end, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_transfer_from_device, SrcPtrBegin, SrcDeviceId,
-        DstPtrBegin, DstDeviceId, Size, Code);
-  }
-  endTargetDataOperation();
-}
-
-void Interface::beginTargetSubmit(unsigned int numTeams) {
-  if (ompt_callback_target_submit_emi_fn) {
-    // HostOpId is set by the tool. Invoke the tool supplied target submit EMI
-    // callback
-    ompt_callback_target_submit_emi_fn(ompt_scope_begin, &TargetData, &HostOpId,
-                                       numTeams);
-  } else if (ompt_callback_target_submit_fn) {
-    // HostOpId is set by the runtime
-    HostOpId = createOpId();
-    ompt_callback_target_submit_fn(TargetData.value, HostOpId, numTeams);
-  }
-}
-
-void Interface::endTargetSubmit(unsigned int numTeams) {
-  // Only EMI callback handles end scope
-  if (ompt_callback_target_submit_emi_fn) {
-    // HostOpId is set by the tool. Invoke the tool supplied target submit EMI
-    // callback
-    ompt_callback_target_submit_emi_fn(ompt_scope_end, &TargetData, &HostOpId,
-                                       numTeams);
-  }
-}
-
-void Interface::beginTargetDataEnter(int64_t DeviceId, void *Code) {
-  beginTargetRegion();
-  if (ompt_callback_target_emi_fn) {
-    // Invoke the tool supplied target EMI callback
-    ompt_callback_target_emi_fn(ompt_target_enter_data, ompt_scope_begin,
-                                DeviceId, TaskData, TargetTaskData, &TargetData,
-                                Code);
-  } else if (ompt_callback_target_fn) {
-    // Invoke the tool supplied target callback
-    ompt_callback_target_fn(ompt_target_enter_data, ompt_scope_begin, DeviceId,
-                            TaskData, TargetData.value, Code);
-  }
-}
-
-void Interface::endTargetDataEnter(int64_t DeviceId, void *Code) {
-  if (ompt_callback_target_emi_fn) {
-    // Invoke the tool supplied target EMI callback
-    ompt_callback_target_emi_fn(ompt_target_enter_data, ompt_scope_end,
-                                DeviceId, TaskData, TargetTaskData, &TargetData,
-                                Code);
-  } else if (ompt_callback_target_fn) {
-    // Invoke the tool supplied target callback
-    ompt_callback_target_fn(ompt_target_enter_data, ompt_scope_end, DeviceId,
-                            TaskData, TargetData.value, Code);
-  }
-  endTargetRegion();
-}
-
-void Interface::beginTargetDataExit(int64_t DeviceId, void *Code) {
-  beginTargetRegion();
-  if (ompt_callback_target_emi_fn) {
-    // Invoke the tool supplied target EMI callback
-    ompt_callback_target_emi_fn(ompt_target_exit_data, ompt_scope_begin,
-                                DeviceId, TaskData, TargetTaskData, &TargetData,
-                                Code);
-  } else if (ompt_callback_target_fn) {
-    TargetData.value = createRegionId();
-    // Invoke the tool supplied target callback
-    ompt_callback_target_fn(ompt_target_exit_data, ompt_scope_begin, DeviceId,
-                            TaskData, TargetData.value, Code);
-  }
-}
-
-void Interface::endTargetDataExit(int64_t DeviceId, void *Code) {
-  if (ompt_callback_target_emi_fn) {
-    // Invoke the tool supplied target EMI callback
-    ompt_callback_target_emi_fn(ompt_target_exit_data, ompt_scope_end, DeviceId,
-                                TaskData, TargetTaskData, &TargetData, Code);
-  } else if (ompt_callback_target_fn) {
-    // Invoke the tool supplied target callback
-    ompt_callback_target_fn(ompt_target_exit_data, ompt_scope_end, DeviceId,
-                            TaskData, TargetData.value, Code);
-  }
-  endTargetRegion();
-}
-
-void Interface::beginTargetUpdate(int64_t DeviceId, void *Code) {
-  beginTargetRegion();
-  if (ompt_callback_target_emi_fn) {
-    // Invoke the tool supplied target EMI callback
-    ompt_callback_target_emi_fn(ompt_target_update, ompt_scope_begin, DeviceId,
-                                TaskData, TargetTaskData, &TargetData, Code);
-  } else if (ompt_callback_target_fn) {
-    TargetData.value = createRegionId();
-    // Invoke the tool supplied target callback
-    ompt_callback_target_fn(ompt_target_update, ompt_scope_begin, DeviceId,
-                            TaskData, TargetData.value, Code);
-  }
-}
-
-void Interface::endTargetUpdate(int64_t DeviceId, void *Code) {
-  if (ompt_callback_target_emi_fn) {
-    // Invoke the tool supplied target EMI callback
-    ompt_callback_target_emi_fn(ompt_target_update, ompt_scope_end, DeviceId,
-                                TaskData, TargetTaskData, &TargetData, Code);
-  } else if (ompt_callback_target_fn) {
-    // Invoke the tool supplied target callback
-    ompt_callback_target_fn(ompt_target_update, ompt_scope_end, DeviceId,
-                            TaskData, TargetData.value, Code);
-  }
-  endTargetRegion();
-}
-
-void Interface::beginTarget(int64_t DeviceId, void *Code) {
-  beginTargetRegion();
-  if (ompt_callback_target_emi_fn) {
-    // Invoke the tool supplied target EMI callback
-    ompt_callback_target_emi_fn(ompt_target, ompt_scope_begin, DeviceId,
-                                TaskData, TargetTaskData, &TargetData, Code);
-  } else if (ompt_callback_target_fn) {
-    TargetData.value = createRegionId();
-    // Invoke the tool supplied target callback
-    ompt_callback_target_fn(ompt_target, ompt_scope_begin, DeviceId, TaskData,
-                            TargetData.value, Code);
-  }
-}
-
-void Interface::endTarget(int64_t DeviceId, void *Code) {
-  if (ompt_callback_target_emi_fn) {
-    // Invoke the tool supplied target EMI callback
-    ompt_callback_target_emi_fn(ompt_target, ompt_scope_end, DeviceId, TaskData,
-                                TargetTaskData, &TargetData, Code);
-  } else if (ompt_callback_target_fn) {
-    // Invoke the tool supplied target callback
-    ompt_callback_target_fn(ompt_target, ompt_scope_end, DeviceId, TaskData,
-                            TargetData.value, Code);
-  }
-  endTargetRegion();
-}
-
-void Interface::beginTargetDataOperation() {
-  DP("in ompt_target_region_begin (TargetRegionId = %lu)\n", TargetData.value);
-}
-
-void Interface::endTargetDataOperation() {
-  DP("in ompt_target_region_end (TargetRegionId = %lu)\n", TargetData.value);
-}
-
-void Interface::beginTargetRegion() {
-  // Set up task state
-  assert(ompt_get_task_data_fn && "Calling a null task data function");
-  TaskData = ompt_get_task_data_fn();
-  // Set up target task state
-  assert(ompt_get_target_task_data_fn &&
-         "Calling a null target task data function");
-  TargetTaskData = ompt_get_target_task_data_fn();
-  // Target state will be set later
-  TargetData = ompt_data_none;
-}
-
-void Interface::endTargetRegion() {
-  TaskData = 0;
-  TargetTaskData = 0;
-  TargetData = ompt_data_none;
-}
-
-/// Used to maintain the finalization functions that are received
-/// from the plugins during connect.
-/// Note: Currently, there are no plugin-specific finalizations, so each plugin
-/// will call the same (empty) function.
-class LibomptargetRtlFinalizer {
-public:
-  LibomptargetRtlFinalizer() {}
-
-  void registerRtl(ompt_finalize_t FinalizationFunction) {
-    if (FinalizationFunction) {
-      RtlFinalizationFunctions.emplace_back(FinalizationFunction);
-    }
-  }
-
-  void finalize() {
-    for (auto FinalizationFunction : RtlFinalizationFunctions)
-      FinalizationFunction(/* tool_data */ nullptr);
-    RtlFinalizationFunctions.clear();
-  }
-
-private:
-  llvm::SmallVector<ompt_finalize_t> RtlFinalizationFunctions;
-};
-
-int llvm::omp::target::ompt::initializeLibrary(ompt_function_lookup_t lookup,
-                                               int initial_device_num,
-                                               ompt_data_t *tool_data) {
-  DP("Executing initializeLibrary (libomp)\n");
-#define bindOmptFunctionName(OmptFunction, DestinationFunction)                \
-  DestinationFunction = (OmptFunction##_t)lookup(#OmptFunction);               \
-  DP("initializeLibrary (libomp) bound %s=%p\n", #DestinationFunction,         \
-     ((void *)(uint64_t)DestinationFunction));
-
-  bindOmptFunctionName(ompt_get_callback, lookupCallbackByCode);
-  bindOmptFunctionName(ompt_get_task_data, ompt_get_task_data_fn);
-  bindOmptFunctionName(ompt_get_target_task_data, ompt_get_target_task_data_fn);
-#undef bindOmptFunctionName
-
-  // Store pointer of 'ompt_libomp_target_fn_lookup' for use by libomptarget
-  lookupCallbackByName = lookup;
-
-  assert(lookupCallbackByCode && "lookupCallbackByCode should be non-null");
-  assert(lookupCallbackByName && "lookupCallbackByName should be non-null");
-  assert(ompt_get_task_data_fn && "ompt_get_task_data_fn should be non-null");
-  assert(ompt_get_target_task_data_fn &&
-         "ompt_get_target_task_data_fn should be non-null");
-  assert(LibraryFinalizer == nullptr &&
-         "LibraryFinalizer should not be initialized yet");
-
-  LibraryFinalizer = new LibomptargetRtlFinalizer();
-
-  Initialized = true;
-
-  return 0;
-}
-
-void llvm::omp::target::ompt::finalizeLibrary(ompt_data_t *data) {
-  DP("Executing finalizeLibrary (libomp)\n");
-  // Before disabling OMPT, call the (plugin) finalizations that were registered
-  // with this library
-  LibraryFinalizer->finalize();
-  delete LibraryFinalizer;
-  Initialized = false;
-}
-
-void llvm::omp::target::ompt::connectLibrary() {
-  DP("Entering connectLibrary (libomp)\n");
-  // Connect with libomp
-  static OmptLibraryConnectorTy LibompConnector("libomp");
-  static ompt_start_tool_result_t OmptResult;
-
-  // Initialize OmptResult with the init and fini functions that will be
-  // called by the connector
-  OmptResult.initialize = ompt::initializeLibrary;
-  OmptResult.finalize = ompt::finalizeLibrary;
-  OmptResult.tool_data.value = 0;
-
-  // Now call connect that causes the above init/fini functions to be called
-  LibompConnector.connect(&OmptResult);
-
-#define bindOmptCallback(Name, Type, Code)                                     \
-  if (lookupCallbackByCode)                                                    \
-    lookupCallbackByCode(                                                      \
-        (ompt_callbacks_t)(Code),                                              \
-        (ompt_callback_t *)&(llvm::omp::target::ompt::Name##_fn));
-  FOREACH_OMPT_NOEMI_EVENT(bindOmptCallback)
-  FOREACH_OMPT_EMI_EVENT(bindOmptCallback)
-#undef bindOmptCallback
-
-  DP("Exiting connectLibrary (libomp)\n");
-}
-
-extern "C" {
-/// Used for connecting libomptarget with a plugin
-void ompt_libomptarget_connect(ompt_start_tool_result_t *result) {
-  DP("Enter ompt_libomptarget_connect\n");
-  if (Initialized && result && LibraryFinalizer) {
-    // Cache each fini function, so that they can be invoked on exit
-    LibraryFinalizer->registerRtl(result->finalize);
-    // Invoke the provided init function with the lookup function maintained
-    // in this library so that callbacks maintained by this library are
-    // retrieved.
-    result->initialize(lookupCallbackByName,
-                       /* initial_device_num */ 0, /* tool_data */ nullptr);
-  }
-  DP("Leave ompt_libomptarget_connect\n");
-}
-}
-#endif // OMPT_SUPPORT

diff --git a/libomptarget/src/PluginManager.cpp b/libomptarget/src/PluginManager.cpp
deleted file mode 100644
index 792cae3..0000000
--- a/libomptarget/src/PluginManager.cpp
+++ /dev/null

@@ -1,349 +0,0 @@
-//===-- PluginManager.cpp - Plugin loading and communication API ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Functionality for handling plugins.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PluginManager.h"
-#include "Shared/Debug.h"
-#include "Shared/Profile.h"
-
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <memory>
-
-using namespace llvm;
-using namespace llvm::sys;
-
-PluginManager *PM = nullptr;
-
-// List of all plugins that can support offloading.
-static const char *RTLNames[] = {ENABLED_OFFLOAD_PLUGINS};
-
-Expected<std::unique_ptr<PluginAdaptorTy>>
-PluginAdaptorTy::create(const std::string &Name) {
-  DP("Attempting to load library '%s'...\n", Name.c_str());
-  TIMESCOPE_WITH_NAME_AND_IDENT(Name, (const ident_t *)nullptr);
-
-  std::string ErrMsg;
-  auto LibraryHandler = std::make_unique<DynamicLibrary>(
-      DynamicLibrary::getPermanentLibrary(Name.c_str(), &ErrMsg));
-
-  if (!LibraryHandler->isValid()) {
-    // Library does not exist or cannot be found.
-    return createStringError(inconvertibleErrorCode(),
-                             "Unable to load library '%s': %s!\n", Name.c_str(),
-                             ErrMsg.c_str());
-  }
-
-  DP("Successfully loaded library '%s'!\n", Name.c_str());
-  auto PluginAdaptor = std::unique_ptr<PluginAdaptorTy>(
-      new PluginAdaptorTy(Name, std::move(LibraryHandler)));
-  if (auto Err = PluginAdaptor->init())
-    return Err;
-  return std::move(PluginAdaptor);
-}
-
-PluginAdaptorTy::PluginAdaptorTy(const std::string &Name,
-                                 std::unique_ptr<llvm::sys::DynamicLibrary> DL)
-    : Name(Name), LibraryHandler(std::move(DL)) {}
-
-Error PluginAdaptorTy::init() {
-
-#define PLUGIN_API_HANDLE(NAME)                                                \
-  NAME = reinterpret_cast<decltype(NAME)>(                                     \
-      LibraryHandler->getAddressOfSymbol(GETNAME(__tgt_rtl_##NAME)));          \
-  if (!NAME) {                                                                 \
-    return createStringError(inconvertibleErrorCode(),                         \
-                             "Invalid plugin as necessary interface function " \
-                             "(%s) was not found.\n",                          \
-                             std::string(#NAME).c_str());                      \
-  }
-
-#include "Shared/PluginAPI.inc"
-#undef PLUGIN_API_HANDLE
-
-  // Remove plugin on failure to call optional init_plugin
-  int32_t Rc = init_plugin();
-  if (Rc != OFFLOAD_SUCCESS) {
-    return createStringError(inconvertibleErrorCode(),
-                             "Unable to initialize library '%s': %u!\n",
-                             Name.c_str(), Rc);
-  }
-
-  // No devices are supported by this RTL?
-  int32_t NumberOfPluginDevices = number_of_devices();
-  if (!NumberOfPluginDevices) {
-    return createStringError(inconvertibleErrorCode(),
-                             "No devices supported in this RTL\n");
-  }
-
-  DP("Registered '%s' with %d plugin visible devices!\n", Name.c_str(),
-     NumberOfPluginDevices);
-  return Error::success();
-}
-
-void PluginManager::init() {
-  TIMESCOPE();
-  DP("Loading RTLs...\n");
-
-  // Attempt to open all the plugins and, if they exist, check if the interface
-  // is correct and if they are supporting any devices.
-  for (const char *Name : RTLNames) {
-    auto PluginAdaptorOrErr =
-        PluginAdaptorTy::create(std::string(Name) + ".so");
-    if (!PluginAdaptorOrErr) {
-      [[maybe_unused]] std::string InfoMsg =
-          toString(PluginAdaptorOrErr.takeError());
-      DP("%s", InfoMsg.c_str());
-    } else {
-      PluginAdaptors.push_back(std::move(*PluginAdaptorOrErr));
-    }
-  }
-
-  DP("RTLs loaded!\n");
-}
-
-void PluginManager::initDevices(PluginAdaptorTy &RTL) {
-  // If this RTL has already been initialized.
-  if (PM->DeviceOffsets.contains(&RTL))
-    return;
-  TIMESCOPE();
-
-  // If this RTL is not already in use, initialize it.
-  assert(RTL.number_of_devices() > 0 &&
-         "Tried to initialize useless plugin adaptor");
-
-  // Initialize the device information for the RTL we are about to use.
-  auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor();
-
-  // Initialize the index of this RTL and save it in the used RTLs.
-  int32_t DeviceOffset = ExclusiveDevicesAccessor->size();
-
-  // Set the device identifier offset in the plugin.
-  RTL.set_device_offset(DeviceOffset);
-
-  int32_t NumberOfUserDevices = 0;
-  int32_t NumPD = RTL.number_of_devices();
-  ExclusiveDevicesAccessor->reserve(DeviceOffset + NumPD);
-  // Auto zero-copy is a per-device property. We need to ensure
-  // that all devices are suggesting to use it.
-  bool UseAutoZeroCopy = !(NumPD == 0);
-  for (int32_t PDevI = 0, UserDevId = DeviceOffset; PDevI < NumPD; PDevI++) {
-    auto Device = std::make_unique<DeviceTy>(&RTL, UserDevId, PDevI);
-    if (auto Err = Device->init()) {
-      DP("Skip plugin known device %d: %s\n", PDevI,
-         toString(std::move(Err)).c_str());
-      continue;
-    }
-    UseAutoZeroCopy = UseAutoZeroCopy && Device->useAutoZeroCopy();
-
-    ExclusiveDevicesAccessor->push_back(std::move(Device));
-    ++NumberOfUserDevices;
-    ++UserDevId;
-  }
-
-  // Auto Zero-Copy can only be currently triggered when the system is an
-  // homogeneous APU architecture without attached discrete GPUs.
-  // If all devices suggest to use it, change requirment flags to trigger
-  // zero-copy behavior when mapping memory.
-  if (UseAutoZeroCopy)
-    addRequirements(OMPX_REQ_AUTO_ZERO_COPY);
-
-  DeviceOffsets[&RTL] = DeviceOffset;
-  DeviceUsed[&RTL] = NumberOfUserDevices;
-  DP("Plugin adaptor " DPxMOD " has index %d, exposes %d out of %d devices!\n",
-     DPxPTR(RTL.LibraryHandler.get()), DeviceOffset, NumberOfUserDevices,
-     RTL.number_of_devices());
-}
-
-void PluginManager::initAllPlugins() {
-  for (auto &R : PluginAdaptors)
-    initDevices(*R);
-}
-
-static void registerImageIntoTranslationTable(TranslationTable &TT,
-                                              int32_t DeviceOffset,
-                                              int32_t NumberOfUserDevices,
-                                              __tgt_device_image *Image) {
-
-  // same size, as when we increase one, we also increase the other.
-  assert(TT.TargetsTable.size() == TT.TargetsImages.size() &&
-         "We should have as many images as we have tables!");
-
-  // Resize the Targets Table and Images to accommodate the new targets if
-  // required
-  unsigned TargetsTableMinimumSize = DeviceOffset + NumberOfUserDevices;
-
-  if (TT.TargetsTable.size() < TargetsTableMinimumSize) {
-    TT.DeviceTables.resize(TargetsTableMinimumSize, {});
-    TT.TargetsImages.resize(TargetsTableMinimumSize, 0);
-    TT.TargetsEntries.resize(TargetsTableMinimumSize, {});
-    TT.TargetsTable.resize(TargetsTableMinimumSize, 0);
-  }
-
-  // Register the image in all devices for this target type.
-  for (int32_t I = 0; I < NumberOfUserDevices; ++I) {
-    // If we are changing the image we are also invalidating the target table.
-    if (TT.TargetsImages[DeviceOffset + I] != Image) {
-      TT.TargetsImages[DeviceOffset + I] = Image;
-      TT.TargetsTable[DeviceOffset + I] =
-          0; // lazy initialization of target table.
-    }
-  }
-}
-
-void PluginManager::registerLib(__tgt_bin_desc *Desc) {
-  PM->RTLsMtx.lock();
-
-  // Add in all the OpenMP requirements associated with this binary.
-  for (__tgt_offload_entry &Entry :
-       llvm::make_range(Desc->HostEntriesBegin, Desc->HostEntriesEnd))
-    if (Entry.flags == OMP_REGISTER_REQUIRES)
-      PM->addRequirements(Entry.data);
-
-  // Extract the exectuable image and extra information if availible.
-  for (int32_t i = 0; i < Desc->NumDeviceImages; ++i)
-    PM->addDeviceImage(*Desc, Desc->DeviceImages[i]);
-
-  // Register the images with the RTLs that understand them, if any.
-  for (DeviceImageTy &DI : PM->deviceImages()) {
-    // Obtain the image and information that was previously extracted.
-    __tgt_device_image *Img = &DI.getExecutableImage();
-
-    PluginAdaptorTy *FoundRTL = nullptr;
-
-    // Scan the RTLs that have associated images until we find one that supports
-    // the current image.
-    for (auto &R : PM->pluginAdaptors()) {
-      if (!R.is_valid_binary(Img)) {
-        DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
-           DPxPTR(Img->ImageStart), R.Name.c_str());
-        continue;
-      }
-
-      DP("Image " DPxMOD " is compatible with RTL %s!\n",
-         DPxPTR(Img->ImageStart), R.Name.c_str());
-
-      PM->initDevices(R);
-
-      // Initialize (if necessary) translation table for this library.
-      PM->TrlTblMtx.lock();
-      if (!PM->HostEntriesBeginToTransTable.count(Desc->HostEntriesBegin)) {
-        PM->HostEntriesBeginRegistrationOrder.push_back(Desc->HostEntriesBegin);
-        TranslationTable &TransTable =
-            (PM->HostEntriesBeginToTransTable)[Desc->HostEntriesBegin];
-        TransTable.HostTable.EntriesBegin = Desc->HostEntriesBegin;
-        TransTable.HostTable.EntriesEnd = Desc->HostEntriesEnd;
-      }
-
-      // Retrieve translation table for this library.
-      TranslationTable &TransTable =
-          (PM->HostEntriesBeginToTransTable)[Desc->HostEntriesBegin];
-
-      DP("Registering image " DPxMOD " with RTL %s!\n", DPxPTR(Img->ImageStart),
-         R.Name.c_str());
-
-      registerImageIntoTranslationTable(TransTable, PM->DeviceOffsets[&R],
-                                        PM->DeviceUsed[&R], Img);
-      PM->UsedImages.insert(Img);
-
-      PM->TrlTblMtx.unlock();
-      FoundRTL = &R;
-
-      // if an RTL was found we are done - proceed to register the next image
-      break;
-    }
-
-    if (!FoundRTL) {
-      DP("No RTL found for image " DPxMOD "!\n", DPxPTR(Img->ImageStart));
-    }
-  }
-  PM->RTLsMtx.unlock();
-
-  DP("Done registering entries!\n");
-}
-
-// Temporary forward declaration, old style CTor/DTor handling is going away.
-int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
-           KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo);
-
-void PluginManager::unregisterLib(__tgt_bin_desc *Desc) {
-  DP("Unloading target library!\n");
-
-  PM->RTLsMtx.lock();
-  // Find which RTL understands each image, if any.
-  for (DeviceImageTy &DI : PM->deviceImages()) {
-    // Obtain the image and information that was previously extracted.
-    __tgt_device_image *Img = &DI.getExecutableImage();
-
-    PluginAdaptorTy *FoundRTL = NULL;
-
-    // Scan the RTLs that have associated images until we find one that supports
-    // the current image. We only need to scan RTLs that are already being used.
-    for (auto &R : PM->pluginAdaptors()) {
-      if (!DeviceOffsets.contains(&R))
-        continue;
-
-      // Ensure that we do not use any unused images associated with this RTL.
-      if (!UsedImages.contains(Img))
-        continue;
-
-      FoundRTL = &R;
-
-      DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n",
-         DPxPTR(Img->ImageStart), DPxPTR(R.LibraryHandler.get()));
-
-      break;
-    }
-
-    // if no RTL was found proceed to unregister the next image
-    if (!FoundRTL) {
-      DP("No RTLs in use support the image " DPxMOD "!\n",
-         DPxPTR(Img->ImageStart));
-    }
-  }
-  PM->RTLsMtx.unlock();
-  DP("Done unregistering images!\n");
-
-  // Remove entries from PM->HostPtrToTableMap
-  PM->TblMapMtx.lock();
-  for (__tgt_offload_entry *Cur = Desc->HostEntriesBegin;
-       Cur < Desc->HostEntriesEnd; ++Cur) {
-    PM->HostPtrToTableMap.erase(Cur->addr);
-  }
-
-  // Remove translation table for this descriptor.
-  auto TransTable =
-      PM->HostEntriesBeginToTransTable.find(Desc->HostEntriesBegin);
-  if (TransTable != PM->HostEntriesBeginToTransTable.end()) {
-    DP("Removing translation table for descriptor " DPxMOD "\n",
-       DPxPTR(Desc->HostEntriesBegin));
-    PM->HostEntriesBeginToTransTable.erase(TransTable);
-  } else {
-    DP("Translation table for descriptor " DPxMOD " cannot be found, probably "
-       "it has been already removed.\n",
-       DPxPTR(Desc->HostEntriesBegin));
-  }
-
-  PM->TblMapMtx.unlock();
-
-  DP("Done unregistering library!\n");
-}
-
-Expected<DeviceTy &> PluginManager::getDevice(uint32_t DeviceNo) {
-  auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor();
-  if (DeviceNo >= ExclusiveDevicesAccessor->size())
-    return createStringError(
-        inconvertibleErrorCode(),
-        "Device number '%i' out of range, only %i devices available", DeviceNo,
-        ExclusiveDevicesAccessor->size());
-
-  return *(*ExclusiveDevicesAccessor)[DeviceNo];
-}

diff --git a/libomptarget/src/device.cpp b/libomptarget/src/device.cpp
deleted file mode 100644
index 44a2fac..0000000
--- a/libomptarget/src/device.cpp
+++ /dev/null

@@ -1,290 +0,0 @@
-//===--------- device.cpp - Target independent OpenMP target RTL ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Functionality for managing devices that are handled by RTL plugins.
-//
-//===----------------------------------------------------------------------===//
-
-#include "device.h"
-#include "OffloadEntry.h"
-#include "OpenMP/Mapping.h"
-#include "OpenMP/OMPT/Callback.h"
-#include "OpenMP/OMPT/Interface.h"
-#include "PluginManager.h"
-#include "Shared/APITypes.h"
-#include "Shared/Debug.h"
-#include "omptarget.h"
-#include "private.h"
-#include "rtl.h"
-
-#include "Shared/EnvironmentVar.h"
-#include "llvm/Support/Error.h"
-
-#include <cassert>
-#include <climits>
-#include <cstdint>
-#include <cstdio>
-#include <mutex>
-#include <string>
-#include <thread>
-
-#ifdef OMPT_SUPPORT
-using namespace llvm::omp::target::ompt;
-#endif
-
-int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
-                                            AsyncInfoTy &AsyncInfo) const {
-  // First, check if the user disabled atomic map transfer/malloc/dealloc.
-  if (!MappingConfig::get().UseEventsForAtomicTransfers)
-    return OFFLOAD_SUCCESS;
-
-  void *Event = getEvent();
-  bool NeedNewEvent = Event == nullptr;
-  if (NeedNewEvent && Device.createEvent(&Event) != OFFLOAD_SUCCESS) {
-    REPORT("Failed to create event\n");
-    return OFFLOAD_FAIL;
-  }
-
-  // We cannot assume the event should not be nullptr because we don't
-  // know if the target support event. But if a target doesn't,
-  // recordEvent should always return success.
-  if (Device.recordEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
-    REPORT("Failed to set dependence on event " DPxMOD "\n", DPxPTR(Event));
-    return OFFLOAD_FAIL;
-  }
-
-  if (NeedNewEvent)
-    setEvent(Event);
-
-  return OFFLOAD_SUCCESS;
-}
-
-DeviceTy::DeviceTy(PluginAdaptorTy *RTL, int32_t DeviceID, int32_t RTLDeviceID)
-    : DeviceID(DeviceID), RTL(RTL), RTLDeviceID(RTLDeviceID),
-      MappingInfo(*this) {}
-
-DeviceTy::~DeviceTy() {
-  if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE))
-    return;
-
-  ident_t Loc = {0, 0, 0, 0, ";libomptarget;libomptarget;0;0;;"};
-  dumpTargetPointerMappings(&Loc, *this);
-}
-
-llvm::Error DeviceTy::init() {
-  // Make call to init_requires if it exists for this plugin.
-  int32_t Ret = 0;
-  Ret = RTL->init_requires(PM->getRequirements());
-  if (Ret != OFFLOAD_SUCCESS)
-    return llvm::createStringError(
-        llvm::inconvertibleErrorCode(),
-        "Failed to initialize requirements for device %d\n", DeviceID);
-
-  Ret = RTL->init_device(RTLDeviceID);
-  if (Ret != OFFLOAD_SUCCESS)
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "Failed to initialize device %d\n",
-                                   DeviceID);
-
-  // Enables recording kernels if set.
-  BoolEnvar OMPX_RecordKernel("LIBOMPTARGET_RECORD", false);
-  if (OMPX_RecordKernel) {
-    // Enables saving the device memory kernel output post execution if set.
-    BoolEnvar OMPX_ReplaySaveOutput("LIBOMPTARGET_RR_SAVE_OUTPUT", false);
-
-    uint64_t ReqPtrArgOffset;
-    RTL->initialize_record_replay(RTLDeviceID, 0, nullptr, true,
-                                  OMPX_ReplaySaveOutput, ReqPtrArgOffset);
-  }
-
-  return llvm::Error::success();
-}
-
-// Load binary to device.
-llvm::Expected<__tgt_device_binary>
-DeviceTy::loadBinary(__tgt_device_image *Img) {
-  __tgt_device_binary Binary;
-
-  if (RTL->load_binary(RTLDeviceID, Img, &Binary) != OFFLOAD_SUCCESS)
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "Failed to load binary %p", Img);
-  return Binary;
-}
-
-void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
-  /// RAII to establish tool anchors before and after data allocation
-  void *TargetPtr = nullptr;
-  OMPT_IF_BUILT(InterfaceRAII TargetDataAllocRAII(
-                    RegionInterface.getCallbacks<ompt_target_data_alloc>(),
-                    DeviceID, HstPtr, &TargetPtr, Size,
-                    /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
-
-  TargetPtr = RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
-  return TargetPtr;
-}
-
-int32_t DeviceTy::deleteData(void *TgtAllocBegin, int32_t Kind) {
-  /// RAII to establish tool anchors before and after data deletion
-  OMPT_IF_BUILT(InterfaceRAII TargetDataDeleteRAII(
-                    RegionInterface.getCallbacks<ompt_target_data_delete>(),
-                    DeviceID, TgtAllocBegin,
-                    /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
-
-  return RTL->data_delete(RTLDeviceID, TgtAllocBegin, Kind);
-}
-
-// Submit data to device
-int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
-                             AsyncInfoTy &AsyncInfo, HostDataToTargetTy *Entry,
-                             MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr) {
-  if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER)
-    MappingInfo.printCopyInfo(TgtPtrBegin, HstPtrBegin, Size, /*H2D=*/true,
-                              Entry, HDTTMapPtr);
-
-  /// RAII to establish tool anchors before and after data submit
-  OMPT_IF_BUILT(
-      InterfaceRAII TargetDataSubmitRAII(
-          RegionInterface.getCallbacks<ompt_target_data_transfer_to_device>(),
-          omp_get_initial_device(), HstPtrBegin, DeviceID, TgtPtrBegin, Size,
-          /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
-
-  return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
-                                AsyncInfo);
-}
-
-// Retrieve data from device
-int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
-                               int64_t Size, AsyncInfoTy &AsyncInfo,
-                               HostDataToTargetTy *Entry,
-                               MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr) {
-  if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER)
-    MappingInfo.printCopyInfo(TgtPtrBegin, HstPtrBegin, Size, /*H2D=*/false,
-                              Entry, HDTTMapPtr);
-
-  /// RAII to establish tool anchors before and after data retrieval
-  OMPT_IF_BUILT(
-      InterfaceRAII TargetDataRetrieveRAII(
-          RegionInterface.getCallbacks<ompt_target_data_transfer_from_device>(),
-          DeviceID, TgtPtrBegin, omp_get_initial_device(), HstPtrBegin, Size,
-          /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
-
-  return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
-                                  AsyncInfo);
-}
-
-// Copy data from current device to destination device directly
-int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
-                               int64_t Size, AsyncInfoTy &AsyncInfo) {
-  /// RAII to establish tool anchors before and after data exchange
-  /// Note: Despite the fact that this is a data exchange, we use 'from_device'
-  ///       operation enum (w.r.t. ompt_target_data_op_t) as there is currently
-  ///       no better alternative. It is still possible to distinguish this
-  ///       scenario from a real data retrieve by checking if both involved
-  ///       device numbers are less than omp_get_num_devices().
-  OMPT_IF_BUILT(
-      InterfaceRAII TargetDataExchangeRAII(
-          RegionInterface.getCallbacks<ompt_target_data_transfer_from_device>(),
-          RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, Size,
-          /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
-  if (!AsyncInfo) {
-    assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
-    return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
-                              Size);
-  }
-  return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
-                                  DstPtr, Size, AsyncInfo);
-}
-
-int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
-  DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n",
-     DPxPTR(HstPtr), Size);
-
-  if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, Size)) {
-    REPORT("Notifiying about data mapping failed.\n");
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) {
-  DP("Notifying about an unmapping: HstPtr=" DPxMOD "\n", DPxPTR(HstPtr));
-
-  if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr)) {
-    REPORT("Notifiying about data unmapping failed.\n");
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
-
-// Run region on device
-int32_t DeviceTy::launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
-                               ptrdiff_t *TgtOffsets, KernelArgsTy &KernelArgs,
-                               AsyncInfoTy &AsyncInfo) {
-  return RTL->launch_kernel(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
-                            &KernelArgs, AsyncInfo);
-}
-
-// Run region on device
-bool DeviceTy::printDeviceInfo() {
-  RTL->print_device_info(RTLDeviceID);
-  return true;
-}
-
-// Whether data can be copied to DstDevice directly
-bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
-  if (RTL != DstDevice.RTL)
-    return false;
-
-  if (RTL->is_data_exchangable(RTLDeviceID, DstDevice.RTLDeviceID))
-    return true;
-  return false;
-}
-
-int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
-  return RTL->synchronize(RTLDeviceID, AsyncInfo);
-}
-
-int32_t DeviceTy::queryAsync(AsyncInfoTy &AsyncInfo) {
-  return RTL->query_async(RTLDeviceID, AsyncInfo);
-}
-
-int32_t DeviceTy::createEvent(void **Event) {
-  return RTL->create_event(RTLDeviceID, Event);
-}
-
-int32_t DeviceTy::recordEvent(void *Event, AsyncInfoTy &AsyncInfo) {
-  return RTL->record_event(RTLDeviceID, Event, AsyncInfo);
-}
-
-int32_t DeviceTy::waitEvent(void *Event, AsyncInfoTy &AsyncInfo) {
-  return RTL->wait_event(RTLDeviceID, Event, AsyncInfo);
-}
-
-int32_t DeviceTy::syncEvent(void *Event) {
-  return RTL->sync_event(RTLDeviceID, Event);
-}
-
-int32_t DeviceTy::destroyEvent(void *Event) {
-  return RTL->destroy_event(RTLDeviceID, Event);
-}
-
-void DeviceTy::dumpOffloadEntries() {
-  fprintf(stderr, "Device %i offload entries:\n", DeviceID);
-  for (auto &It : *DeviceOffloadEntries.getExclusiveAccessor()) {
-    const char *Kind = "kernel";
-    if (It.second.isLink())
-      Kind = "link";
-    else if (It.second.isGlobal())
-      Kind = "global var.";
-    fprintf(stderr, "  %11s: %s\n", Kind, It.second.getNameAsCStr());
-  }
-}
-
-bool DeviceTy::useAutoZeroCopy() {
-  return RTL->use_auto_zero_copy(RTLDeviceID);
-}

diff --git a/libomptarget/src/exports b/libomptarget/src/exports
deleted file mode 100644
index f95544e..0000000
--- a/libomptarget/src/exports
+++ /dev/null

@@ -1,76 +0,0 @@
-VERS1.0 {
-  global:
-    __tgt_rtl_init;
-    __tgt_rtl_deinit;
-    __tgt_register_requires;
-    __tgt_register_lib;
-    __tgt_unregister_lib;
-    __tgt_init_all_rtls;
-    __tgt_target_data_begin;
-    __tgt_target_data_end;
-    __tgt_target_data_update;
-    __tgt_target;
-    __tgt_target_teams;
-    __tgt_target_data_begin_nowait;
-    __tgt_target_data_end_nowait;
-    __tgt_target_data_update_nowait;
-    __tgt_target_nowait;
-    __tgt_target_teams_nowait;
-    __tgt_target_data_begin_mapper;
-    __tgt_target_data_end_mapper;
-    __tgt_target_data_update_mapper;
-    __tgt_target_mapper;
-    __tgt_target_teams_mapper;
-    __tgt_target_data_begin_nowait_mapper;
-    __tgt_target_data_end_nowait_mapper;
-    __tgt_target_data_update_nowait_mapper;
-    __tgt_target_nowait_mapper;
-    __tgt_target_teams_nowait_mapper;
-    __tgt_target_kernel;
-    __tgt_target_kernel_nowait;
-    __tgt_target_nowait_query;
-    __tgt_target_kernel_replay;
-    __tgt_activate_record_replay;
-    __tgt_mapper_num_components;
-    __tgt_push_mapper_component;
-    __kmpc_push_target_tripcount;
-    __kmpc_push_target_tripcount_mapper;
-    ompx_dump_mapping_tables;
-    omp_get_mapped_ptr;
-    omp_get_num_devices;
-    omp_get_device_num;
-    omp_get_initial_device;
-    omp_target_alloc;
-    omp_target_free;
-    omp_target_is_present;
-    omp_target_memcpy;
-    omp_target_memcpy_rect;
-    omp_target_memcpy_async;
-    omp_target_memcpy_rect_async;
-    omp_target_memset;
-    omp_target_memset_async;
-    omp_target_associate_ptr;
-    omp_target_disassociate_ptr;
-    llvm_omp_target_alloc_host;
-    llvm_omp_target_alloc_shared;
-    llvm_omp_target_alloc_device;
-    llvm_omp_target_free_host;
-    llvm_omp_target_free_shared;
-    llvm_omp_target_free_device;
-    llvm_omp_target_dynamic_shared_alloc;
-    llvm_omp_target_lock_mem;
-    llvm_omp_target_unlock_mem;
-    __tgt_set_info_flag;
-    __tgt_print_device_info;
-    omp_get_interop_ptr;
-    omp_get_interop_str;
-    omp_get_interop_int;
-    omp_get_interop_name;
-    omp_get_interop_type_desc;
-    __tgt_interop_init;
-    __tgt_interop_use;
-    __tgt_interop_destroy;
-    ompt_libomptarget_connect;
-  local:
-    *;
-};

diff --git a/libomptarget/src/interface.cpp b/libomptarget/src/interface.cpp
deleted file mode 100644
index 5577036..0000000
--- a/libomptarget/src/interface.cpp
+++ /dev/null

@@ -1,514 +0,0 @@
-//===-------- interface.cpp - Target independent OpenMP target RTL --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implementation of the interface to be used by Clang during the codegen of a
-// target region.
-//
-//===----------------------------------------------------------------------===//
-
-#include "OpenMP/OMPT/Interface.h"
-#include "OpenMP/OMPT/Callback.h"
-#include "PluginManager.h"
-#include "private.h"
-
-#include "Shared/EnvironmentVar.h"
-#include "Shared/Profile.h"
-
-#include "Utils/ExponentialBackoff.h"
-
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-
-#include <cassert>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-
-#ifdef OMPT_SUPPORT
-using namespace llvm::omp::target::ompt;
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// adds requires flags
-EXTERN void __tgt_register_requires(int64_t Flags) {
-  MESSAGE("The %s function has been removed. Old OpenMP requirements will not "
-          "be handled",
-          __PRETTY_FUNCTION__);
-}
-
-EXTERN void __tgt_rtl_init() { initRuntime(); }
-EXTERN void __tgt_rtl_deinit() { deinitRuntime(); }
-
-////////////////////////////////////////////////////////////////////////////////
-/// adds a target shared library to the target execution image
-EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
-  initRuntime();
-  if (PM->delayRegisterLib(Desc))
-    return;
-
-  PM->registerLib(Desc);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Initialize all available devices without registering any image
-EXTERN void __tgt_init_all_rtls() {
-  assert(PM && "Runtime not initialized");
-  PM->initAllPlugins();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// unloads a target shared library
-EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
-  PM->unregisterLib(Desc);
-
-  deinitRuntime();
-}
-
-template <typename TargetAsyncInfoTy>
-static inline void
-targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
-           void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
-           map_var_info_t *ArgNames, void **ArgMappers,
-           TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg,
-           const char *RegionName) {
-  assert(PM && "Runtime not initialized");
-  static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
-                "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
-
-  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
-                                   "NumArgs=" + std::to_string(ArgNum), Loc);
-
-  DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
-     RegionName, DeviceId, ArgNum);
-
-  if (checkDeviceAndCtors(DeviceId, Loc)) {
-    DP("Not offloading to device %" PRId64 "\n", DeviceId);
-    return;
-  }
-
-  if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
-    printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames,
-                         RegionTypeMsg);
-#ifdef OMPTARGET_DEBUG
-  for (int I = 0; I < ArgNum; ++I) {
-    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
-       ", Type=0x%" PRIx64 ", Name=%s\n",
-       I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I],
-       (ArgNames) ? getNameFromMapping(ArgNames[I]).c_str() : "unknown");
-  }
-#endif
-
-  auto DeviceOrErr = PM->getDevice(DeviceId);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
-  AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
-
-  /// RAII to establish tool anchors before and after data begin / end / update
-  OMPT_IF_BUILT(assert((TargetDataFunction == targetDataBegin ||
-                        TargetDataFunction == targetDataEnd ||
-                        TargetDataFunction == targetDataUpdate) &&
-                       "Encountered unexpected TargetDataFunction during "
-                       "execution of targetData");
-                auto CallbackFunctions =
-                    (TargetDataFunction == targetDataBegin)
-                        ? RegionInterface.getCallbacks<ompt_target_enter_data>()
-                    : (TargetDataFunction == targetDataEnd)
-                        ? RegionInterface.getCallbacks<ompt_target_exit_data>()
-                        : RegionInterface.getCallbacks<ompt_target_update>();
-                InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId,
-                                             OMPT_GET_RETURN_ADDRESS);)
-
-  int Rc = OFFLOAD_SUCCESS;
-  Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes,
-                          ArgTypes, ArgNames, ArgMappers, AsyncInfo,
-                          false /*FromMapper=*/);
-
-  if (Rc == OFFLOAD_SUCCESS)
-    Rc = AsyncInfo.synchronize();
-
-  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
-}
-
-/// creates host-to-target data mapping, stores it in the
-/// libomptarget.so internal structure (an entry in a stack of data maps)
-/// and passes the data to the device.
-EXTERN void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
-                                           int32_t ArgNum, void **ArgsBase,
-                                           void **Args, int64_t *ArgSizes,
-                                           int64_t *ArgTypes,
-                                           map_var_info_t *ArgNames,
-                                           void **ArgMappers) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
-                          ArgTypes, ArgNames, ArgMappers, targetDataBegin,
-                          "Entering OpenMP data region with being_mapper",
-                          "begin");
-}
-
-EXTERN void __tgt_target_data_begin_nowait_mapper(
-    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
-    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
-    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
-    void *NoAliasDepList) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  targetData<TaskAsyncInfoWrapperTy>(
-      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
-      ArgMappers, targetDataBegin,
-      "Entering OpenMP data region with being_nowait_mapper", "begin");
-}
-
-/// passes data from the target, releases target memory and destroys
-/// the host-target mapping (top entry from the stack of data maps)
-/// created by the last __tgt_target_data_begin.
-EXTERN void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
-                                         int32_t ArgNum, void **ArgsBase,
-                                         void **Args, int64_t *ArgSizes,
-                                         int64_t *ArgTypes,
-                                         map_var_info_t *ArgNames,
-                                         void **ArgMappers) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
-                          ArgTypes, ArgNames, ArgMappers, targetDataEnd,
-                          "Exiting OpenMP data region with end_mapper", "end");
-}
-
-EXTERN void __tgt_target_data_end_nowait_mapper(
-    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
-    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
-    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
-    void *NoAliasDepList) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  targetData<TaskAsyncInfoWrapperTy>(
-      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
-      ArgMappers, targetDataEnd,
-      "Exiting OpenMP data region with end_nowait_mapper", "end");
-}
-
-EXTERN void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
-                                            int32_t ArgNum, void **ArgsBase,
-                                            void **Args, int64_t *ArgSizes,
-                                            int64_t *ArgTypes,
-                                            map_var_info_t *ArgNames,
-                                            void **ArgMappers) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  targetData<AsyncInfoTy>(
-      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
-      ArgMappers, targetDataUpdate,
-      "Updating data within the OpenMP data region with update_mapper",
-      "update");
-}
-
-EXTERN void __tgt_target_data_update_nowait_mapper(
-    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
-    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
-    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
-    void *NoAliasDepList) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  targetData<TaskAsyncInfoWrapperTy>(
-      Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
-      ArgMappers, targetDataUpdate,
-      "Updating data within the OpenMP data region with update_nowait_mapper",
-      "update");
-}
-
-static KernelArgsTy *upgradeKernelArgs(KernelArgsTy *KernelArgs,
-                                       KernelArgsTy &LocalKernelArgs,
-                                       int32_t NumTeams, int32_t ThreadLimit) {
-  if (KernelArgs->Version > OMP_KERNEL_ARG_VERSION)
-    DP("Unexpected ABI version: %u\n", KernelArgs->Version);
-
-  uint32_t UpgradedVersion = KernelArgs->Version;
-  if (KernelArgs->Version < OMP_KERNEL_ARG_VERSION) {
-    // The upgraded version will be based on the kernel launch environment.
-    if (KernelArgs->Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
-      UpgradedVersion = OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR - 1;
-    else
-      UpgradedVersion = OMP_KERNEL_ARG_VERSION;
-  }
-  if (UpgradedVersion != KernelArgs->Version) {
-    LocalKernelArgs.Version = UpgradedVersion;
-    LocalKernelArgs.NumArgs = KernelArgs->NumArgs;
-    LocalKernelArgs.ArgBasePtrs = KernelArgs->ArgBasePtrs;
-    LocalKernelArgs.ArgPtrs = KernelArgs->ArgPtrs;
-    LocalKernelArgs.ArgSizes = KernelArgs->ArgSizes;
-    LocalKernelArgs.ArgTypes = KernelArgs->ArgTypes;
-    LocalKernelArgs.ArgNames = KernelArgs->ArgNames;
-    LocalKernelArgs.ArgMappers = KernelArgs->ArgMappers;
-    LocalKernelArgs.Tripcount = KernelArgs->Tripcount;
-    LocalKernelArgs.Flags = KernelArgs->Flags;
-    LocalKernelArgs.DynCGroupMem = 0;
-    LocalKernelArgs.NumTeams[0] = NumTeams;
-    LocalKernelArgs.NumTeams[1] = 0;
-    LocalKernelArgs.NumTeams[2] = 0;
-    LocalKernelArgs.ThreadLimit[0] = ThreadLimit;
-    LocalKernelArgs.ThreadLimit[1] = 0;
-    LocalKernelArgs.ThreadLimit[2] = 0;
-    return &LocalKernelArgs;
-  }
-
-  return KernelArgs;
-}
-
-template <typename TargetAsyncInfoTy>
-static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
-                               int32_t ThreadLimit, void *HostPtr,
-                               KernelArgsTy *KernelArgs) {
-  assert(PM && "Runtime not initialized");
-  static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
-                "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
-  DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
-     "\n",
-     DeviceId, DPxPTR(HostPtr));
-
-  if (checkDeviceAndCtors(DeviceId, Loc)) {
-    DP("Not offloading to device %" PRId64 "\n", DeviceId);
-    return OMP_TGT_FAIL;
-  }
-
-  bool IsTeams = NumTeams != -1;
-  if (!IsTeams)
-    KernelArgs->NumTeams[0] = NumTeams = 1;
-
-  // Auto-upgrade kernel args version 1 to 2.
-  KernelArgsTy LocalKernelArgs;
-  KernelArgs =
-      upgradeKernelArgs(KernelArgs, LocalKernelArgs, NumTeams, ThreadLimit);
-
-  assert(KernelArgs->NumTeams[0] == static_cast<uint32_t>(NumTeams) &&
-         !KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
-         "OpenMP interface should not use multiple dimensions");
-  assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
-         !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
-         "OpenMP interface should not use multiple dimensions");
-  TIMESCOPE_WITH_DETAILS_AND_IDENT(
-      "Runtime: target exe",
-      "NumTeams=" + std::to_string(NumTeams) +
-          ";NumArgs=" + std::to_string(KernelArgs->NumArgs),
-      Loc);
-
-  if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
-    printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
-                         KernelArgs->ArgSizes, KernelArgs->ArgTypes,
-                         KernelArgs->ArgNames, "Entering OpenMP kernel");
-#ifdef OMPTARGET_DEBUG
-  for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) {
-    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
-       ", Type=0x%" PRIx64 ", Name=%s\n",
-       I, DPxPTR(KernelArgs->ArgBasePtrs[I]), DPxPTR(KernelArgs->ArgPtrs[I]),
-       KernelArgs->ArgSizes[I], KernelArgs->ArgTypes[I],
-       (KernelArgs->ArgNames)
-           ? getNameFromMapping(KernelArgs->ArgNames[I]).c_str()
-           : "unknown");
-  }
-#endif
-
-  auto DeviceOrErr = PM->getDevice(DeviceId);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
-  AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
-  /// RAII to establish tool anchors before and after target region
-  OMPT_IF_BUILT(InterfaceRAII TargetRAII(
-                    RegionInterface.getCallbacks<ompt_target>(), DeviceId,
-                    /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
-
-  int Rc = OFFLOAD_SUCCESS;
-  Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
-  { // required to show syncronization
-    TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: syncronize", "", Loc);
-    if (Rc == OFFLOAD_SUCCESS)
-      Rc = AsyncInfo.synchronize();
-
-    handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
-    assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
-  }
-  return OMP_TGT_SUCCESS;
-}
-
-/// Implements a kernel entry that executes the target region on the specified
-/// device.
-///
-/// \param Loc Source location associated with this target region.
-/// \param DeviceId The device to execute this region, -1 indicated the default.
-/// \param NumTeams Number of teams to launch the region with, -1 indicates a
-///                 non-teams region and 0 indicates it was unspecified.
-/// \param ThreadLimit Limit to the number of threads to use in the kernel
-///                    launch, 0 indicates it was unspecified.
-/// \param HostPtr  The pointer to the host function registered with the kernel.
-/// \param Args     All arguments to this kernel launch (see struct definition).
-EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
-                               int32_t ThreadLimit, void *HostPtr,
-                               KernelArgsTy *KernelArgs) {
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  if (KernelArgs->Flags.NoWait)
-    return targetKernel<TaskAsyncInfoWrapperTy>(
-        Loc, DeviceId, NumTeams, ThreadLimit, HostPtr, KernelArgs);
-  return targetKernel<AsyncInfoTy>(Loc, DeviceId, NumTeams, ThreadLimit,
-                                   HostPtr, KernelArgs);
-}
-
-/// Activates the record replay mechanism.
-/// \param DeviceId The device identifier to execute the target region.
-/// \param MemorySize The number of bytes to be (pre-)allocated
-///                   by the bump allocator
-/// /param IsRecord Activates the record replay mechanism in
-///                 'record' mode or 'replay' mode.
-/// /param SaveOutput Store the device memory after kernel
-///                   execution on persistent storage
-EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
-                                        void *VAddr, bool IsRecord,
-                                        bool SaveOutput,
-                                        uint64_t &ReqPtrArgOffset) {
-  assert(PM && "Runtime not initialized");
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  auto DeviceOrErr = PM->getDevice(DeviceId);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  [[maybe_unused]] int Rc = target_activate_rr(
-      *DeviceOrErr, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset);
-  assert(Rc == OFFLOAD_SUCCESS &&
-         "__tgt_activate_record_replay unexpected failure!");
-  return OMP_TGT_SUCCESS;
-}
-
-/// Implements a target kernel entry that replays a pre-recorded kernel.
-/// \param Loc Source location associated with this target region (unused).
-/// \param DeviceId The device identifier to execute the target region.
-/// \param HostPtr A pointer to an address that uniquely identifies the kernel.
-/// \param DeviceMemory A pointer to an array storing device memory data to move
-///                     prior to kernel execution.
-/// \param DeviceMemorySize The size of the above device memory data in bytes.
-/// \param TgtArgs An array of pointers of the pre-recorded target kernel
-///                arguments.
-/// \param TgtOffsets An array of pointers of the pre-recorded target kernel
-///                   argument offsets.
-/// \param NumArgs The number of kernel arguments.
-/// \param NumTeams Number of teams to launch the target region with.
-/// \param ThreadLimit Limit to the number of threads to use in kernel
-///                    execution.
-/// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
-/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
-EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
-                                      void *HostPtr, void *DeviceMemory,
-                                      int64_t DeviceMemorySize, void **TgtArgs,
-                                      ptrdiff_t *TgtOffsets, int32_t NumArgs,
-                                      int32_t NumTeams, int32_t ThreadLimit,
-                                      uint64_t LoopTripCount) {
-  assert(PM && "Runtime not initialized");
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-  if (checkDeviceAndCtors(DeviceId, Loc)) {
-    DP("Not offloading to device %" PRId64 "\n", DeviceId);
-    return OMP_TGT_FAIL;
-  }
-  auto DeviceOrErr = PM->getDevice(DeviceId);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  /// RAII to establish tool anchors before and after target region
-  OMPT_IF_BUILT(InterfaceRAII TargetRAII(
-                    RegionInterface.getCallbacks<ompt_target>(), DeviceId,
-                    /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
-
-  AsyncInfoTy AsyncInfo(*DeviceOrErr);
-  int Rc = target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory,
-                         DeviceMemorySize, TgtArgs, TgtOffsets, NumArgs,
-                         NumTeams, ThreadLimit, LoopTripCount, AsyncInfo);
-  if (Rc == OFFLOAD_SUCCESS)
-    Rc = AsyncInfo.synchronize();
-  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
-  assert(Rc == OFFLOAD_SUCCESS &&
-         "__tgt_target_kernel_replay unexpected failure!");
-  return OMP_TGT_SUCCESS;
-}
-
-// Get the current number of components for a user-defined mapper.
-EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
-  auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
-  int64_t Size = MapperComponentsPtr->Components.size();
-  DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
-     DPxPTR(RtMapperHandle), Size);
-  return Size;
-}
-
-// Push back one component for a user-defined mapper.
-EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
-                                        void *Begin, int64_t Size, int64_t Type,
-                                        void *Name) {
-  DP("__tgt_push_mapper_component(Handle=" DPxMOD
-     ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
-     ", Type=0x%" PRIx64 ", Name=%s).\n",
-     DPxPTR(RtMapperHandle), DPxPTR(Base), DPxPTR(Begin), Size, Type,
-     (Name) ? getNameFromMapping(Name).c_str() : "unknown");
-  auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
-  MapperComponentsPtr->Components.push_back(
-      MapComponentInfoTy(Base, Begin, Size, Type, Name));
-}
-
-EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) {
-  assert(PM && "Runtime not initialized");
-  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
-  InfoLevel.store(NewInfoLevel);
-  for (auto &R : PM->pluginAdaptors())
-    R.set_info_flag(NewInfoLevel);
-}
-
-EXTERN int __tgt_print_device_info(int64_t DeviceId) {
-  assert(PM && "Runtime not initialized");
-  auto DeviceOrErr = PM->getDevice(DeviceId);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  return DeviceOrErr->printDeviceInfo();
-}
-
-EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
-  assert(PM && "Runtime not initialized");
-  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
-
-  if (!AsyncHandle || !*AsyncHandle) {
-    FATAL_MESSAGE0(
-        1, "Receive an invalid async handle from the current OpenMP task. Is "
-           "this a target nowait region?\n");
-  }
-
-  // Exponential backoff tries to optimally decide if a thread should just query
-  // for the device operations (work/spin wait on them) or block until they are
-  // completed (use device side blocking mechanism). This allows the runtime to
-  // adapt itself when there are a lot of long-running target regions in-flight.
-  static thread_local utils::ExponentialBackoff QueryCounter(
-      Int64Envar("OMPTARGET_QUERY_COUNT_MAX", 10),
-      Int64Envar("OMPTARGET_QUERY_COUNT_THRESHOLD", 5),
-      Envar<float>("OMPTARGET_QUERY_COUNT_BACKOFF_FACTOR", 0.5f));
-
-  auto *AsyncInfo = (AsyncInfoTy *)*AsyncHandle;
-
-  // If the thread is actively waiting on too many target nowait regions, we
-  // should use the blocking sync type.
-  if (QueryCounter.isAboveThreshold())
-    AsyncInfo->SyncType = AsyncInfoTy::SyncTy::BLOCKING;
-
-  if (AsyncInfo->synchronize())
-    FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n");
-  // If there are device operations still pending, return immediately without
-  // deallocating the handle and increase the current thread query count.
-  if (!AsyncInfo->isDone()) {
-    QueryCounter.increment();
-    return;
-  }
-
-  // When a thread successfully completes a target nowait region, we
-  // exponentially backoff its query counter by the query factor.
-  QueryCounter.decrement();
-
-  // Delete the handle and unset it from the OpenMP task data.
-  delete AsyncInfo;
-  *AsyncHandle = nullptr;
-}

diff --git a/libomptarget/src/omptarget.cpp b/libomptarget/src/omptarget.cpp
deleted file mode 100644
index 803e941..0000000
--- a/libomptarget/src/omptarget.cpp
+++ /dev/null

@@ -1,1767 +0,0 @@
-//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implementation of the interface to be used by Clang during the codegen of a
-// target region.
-//
-//===----------------------------------------------------------------------===//
-
-#include "omptarget.h"
-#include "OffloadPolicy.h"
-#include "OpenMP/OMPT/Callback.h"
-#include "OpenMP/OMPT/Interface.h"
-#include "PluginManager.h"
-#include "Shared/Debug.h"
-#include "Shared/EnvironmentVar.h"
-#include "Shared/Utils.h"
-#include "device.h"
-#include "private.h"
-#include "rtl.h"
-
-#include "Shared/Profile.h"
-
-#include "OpenMP/Mapping.h"
-#include "OpenMP/omp.h"
-
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/bit.h"
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-#include "llvm/Object/ObjectFile.h"
-
-#include <cassert>
-#include <cstdint>
-#include <vector>
-
-using llvm::SmallVector;
-#ifdef OMPT_SUPPORT
-using namespace llvm::omp::target::ompt;
-#endif
-
-int AsyncInfoTy::synchronize() {
-  int Result = OFFLOAD_SUCCESS;
-  if (!isQueueEmpty()) {
-    switch (SyncType) {
-    case SyncTy::BLOCKING:
-      // If we have a queue we need to synchronize it now.
-      Result = Device.synchronize(*this);
-      assert(AsyncInfo.Queue == nullptr &&
-             "The device plugin should have nulled the queue to indicate there "
-             "are no outstanding actions!");
-      break;
-    case SyncTy::NON_BLOCKING:
-      Result = Device.queryAsync(*this);
-      break;
-    }
-  }
-
-  // Run any pending post-processing function registered on this async object.
-  if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
-    Result = runPostProcessing();
-
-  return Result;
-}
-
-void *&AsyncInfoTy::getVoidPtrLocation() {
-  BufferLocations.push_back(nullptr);
-  return BufferLocations.back();
-}
-
-bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }
-
-int32_t AsyncInfoTy::runPostProcessing() {
-  size_t Size = PostProcessingFunctions.size();
-  for (size_t I = 0; I < Size; ++I) {
-    const int Result = PostProcessingFunctions[I]();
-    if (Result != OFFLOAD_SUCCESS)
-      return Result;
-  }
-
-  // Clear the vector up until the last known function, since post-processing
-  // procedures might add new procedures themselves.
-  const auto *PrevBegin = PostProcessingFunctions.begin();
-  PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);
-
-  return OFFLOAD_SUCCESS;
-}
-
-bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }
-
-/* All begin addresses for partially mapped structs must be aligned, up to 16,
- * in order to ensure proper alignment of members. E.g.
- *
- * struct S {
- *   int a;   // 4-aligned
- *   int b;   // 4-aligned
- *   int *p;  // 8-aligned
- * } s1;
- * ...
- * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
- * {
- *   s1.b = 5;
- *   for (int i...) s1.p[i] = ...;
- * }
- *
- * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
- * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
- * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
- * requirements for its type. Now, when we allocate memory on the device, in
- * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
- * This means that the chunk of the struct on the device will start at a
- * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
- * address of p will be a misaligned 0x204 (on the host there was no need to add
- * padding between b and p, so p comes exactly 4 bytes after b). If the device
- * kernel tries to access s1.p, a misaligned address error occurs (as reported
- * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
- * extending the size of the allocated chuck accordingly, the chuck on the
- * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
- * &s1.p=0x208, as they should be to satisfy the alignment requirements.
- */
-static const int64_t MaxAlignment = 16;
-
-/// Return the alignment requirement of partially mapped structs, see
-/// MaxAlignment above.
-static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
-  int LowestOneBit = __builtin_ffsl(reinterpret_cast<uintptr_t>(HstPtrBase));
-  uint64_t BaseAlignment = 1 << (LowestOneBit - 1);
-  return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
-}
-
-/// Map global data and execute pending ctors
-static int initLibrary(DeviceTy &Device) {
-  /*
-   * Map global data
-   */
-  int32_t DeviceId = Device.DeviceID;
-  int Rc = OFFLOAD_SUCCESS;
-  {
-    std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
-    for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
-      TranslationTable *TransTable =
-          &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
-      if (TransTable->HostTable.EntriesBegin ==
-          TransTable->HostTable.EntriesEnd) {
-        // No host entry so no need to proceed
-        continue;
-      }
-
-      if (TransTable->TargetsTable[DeviceId] != 0) {
-        // Library entries have already been processed
-        continue;
-      }
-
-      // 1) get image.
-      assert(TransTable->TargetsImages.size() > (size_t)DeviceId &&
-             "Not expecting a device ID outside the table's bounds!");
-      __tgt_device_image *Img = TransTable->TargetsImages[DeviceId];
-      if (!Img) {
-        REPORT("No image loaded for device id %d.\n", DeviceId);
-        Rc = OFFLOAD_FAIL;
-        break;
-      }
-
-      // 2) Load the image onto the given device.
-      auto BinaryOrErr = Device.loadBinary(Img);
-      if (llvm::Error Err = BinaryOrErr.takeError()) {
-        REPORT("Failed to load image %s\n",
-               llvm::toString(std::move(Err)).c_str());
-        Rc = OFFLOAD_FAIL;
-        break;
-      }
-
-      // 3) Create the translation table.
-      llvm::SmallVector<__tgt_offload_entry> &DeviceEntries =
-          TransTable->TargetsEntries[DeviceId];
-      for (__tgt_offload_entry &Entry :
-           llvm::make_range(Img->EntriesBegin, Img->EntriesEnd)) {
-        __tgt_device_binary &Binary = *BinaryOrErr;
-
-        __tgt_offload_entry DeviceEntry = Entry;
-        if (Entry.size) {
-          if (Device.RTL->get_global(Binary, Entry.size, Entry.name,
-                                     &DeviceEntry.addr) != OFFLOAD_SUCCESS)
-            REPORT("Failed to load symbol %s\n", Entry.name);
-
-          // If unified memory is active, the corresponding global is a device
-          // reference to the host global. We need to initialize the pointer on
-          // the deive to point to the memory on the host.
-          if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
-              (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
-            if (Device.RTL->data_submit(DeviceId, DeviceEntry.addr, Entry.addr,
-                                        Entry.size) != OFFLOAD_SUCCESS)
-              REPORT("Failed to write symbol for USM %s\n", Entry.name);
-          }
-        } else if (Entry.addr) {
-          if (Device.RTL->get_function(Binary, Entry.name, &DeviceEntry.addr) !=
-              OFFLOAD_SUCCESS)
-            REPORT("Failed to load kernel %s\n", Entry.name);
-        }
-        DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n",
-           DPxPTR(Entry.addr), (Entry.size) ? " global" : "", Entry.name,
-           DPxPTR(DeviceEntry.addr));
-
-        DeviceEntries.emplace_back(DeviceEntry);
-      }
-
-      // Set the storage for the table and get a pointer to it.
-      __tgt_target_table DeviceTable{&DeviceEntries[0],
-                                     &DeviceEntries[0] + DeviceEntries.size()};
-      TransTable->DeviceTables[DeviceId] = DeviceTable;
-      __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
-          &TransTable->DeviceTables[DeviceId];
-
-      // 4) Verify whether the two table sizes match.
-      size_t Hsize =
-          TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
-      size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
-
-      // Invalid image for these host entries!
-      if (Hsize != Tsize) {
-        REPORT(
-            "Host and Target tables mismatch for device id %d [%zx != %zx].\n",
-            DeviceId, Hsize, Tsize);
-        TransTable->TargetsImages[DeviceId] = 0;
-        TransTable->TargetsTable[DeviceId] = 0;
-        Rc = OFFLOAD_FAIL;
-        break;
-      }
-
-      MappingInfoTy::HDTTMapAccessorTy HDTTMap =
-          Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
-
-      __tgt_target_table *HostTable = &TransTable->HostTable;
-      for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
-                               *CurrHostEntry = HostTable->EntriesBegin,
-                               *EntryDeviceEnd = TargetTable->EntriesEnd;
-           CurrDeviceEntry != EntryDeviceEnd;
-           CurrDeviceEntry++, CurrHostEntry++) {
-        if (CurrDeviceEntry->size == 0)
-          continue;
-
-        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
-               "data size mismatch");
-
-        // Fortran may use multiple weak declarations for the same symbol,
-        // therefore we must allow for multiple weak symbols to be loaded from
-        // the fat binary. Treat these mappings as any other "regular"
-        // mapping. Add entry to map.
-        if (Device.getMappingInfo().getTgtPtrBegin(HDTTMap, CurrHostEntry->addr,
-                                                   CurrHostEntry->size))
-          continue;
-
-        void *CurrDeviceEntryAddr = CurrDeviceEntry->addr;
-
-        // For indirect mapping, follow the indirection and map the actual
-        // target.
-        if (CurrDeviceEntry->flags & OMP_DECLARE_TARGET_INDIRECT) {
-          AsyncInfoTy AsyncInfo(Device);
-          void *DevPtr;
-          Device.retrieveData(&DevPtr, CurrDeviceEntryAddr, sizeof(void *),
-                              AsyncInfo, /*Entry=*/nullptr, &HDTTMap);
-          if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
-            return OFFLOAD_FAIL;
-          CurrDeviceEntryAddr = DevPtr;
-        }
-
-        DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
-           ", name \"%s\"\n",
-           DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
-           CurrDeviceEntry->size, CurrDeviceEntry->name);
-        HDTTMap->emplace(new HostDataToTargetTy(
-            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
-            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
-            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
-            (uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/,
-            (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/,
-            false /*UseHoldRefCount*/, CurrHostEntry->name,
-            true /*IsRefCountINF*/));
-
-        // Notify about the new mapping.
-        if (Device.notifyDataMapped(CurrHostEntry->addr, CurrHostEntry->size))
-          return OFFLOAD_FAIL;
-      }
-    }
-  }
-
-  if (Rc != OFFLOAD_SUCCESS)
-    return Rc;
-
-  static Int32Envar DumpOffloadEntries =
-      Int32Envar("OMPTARGET_DUMP_OFFLOAD_ENTRIES", -1);
-  if (DumpOffloadEntries.get() == DeviceId)
-    Device.dumpOffloadEntries();
-
-  return OFFLOAD_SUCCESS;
-}
-
-void handleTargetOutcome(bool Success, ident_t *Loc) {
-  switch (OffloadPolicy::get(*PM).Kind) {
-  case OffloadPolicy::DISABLED:
-    if (Success) {
-      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
-    }
-    break;
-  case OffloadPolicy::MANDATORY:
-    if (!Success) {
-      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
-        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
-        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
-          dumpTargetPointerMappings(Loc, Device);
-      } else
-        FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
-                        "for debugging options.\n");
-
-      if (!PM->getNumUsedPlugins()) {
-        FAILURE_MESSAGE(
-            "No images found compatible with the installed hardware. ");
-
-        llvm::SmallVector<llvm::StringRef> Archs;
-        for (auto &Image : PM->deviceImages()) {
-          const char *Start = reinterpret_cast<const char *>(
-              Image.getExecutableImage().ImageStart);
-          uint64_t Length = llvm::omp::target::getPtrDiff(
-              Start, Image.getExecutableImage().ImageEnd);
-          llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
-                                       /*Identifier=*/"");
-
-          auto ObjectOrErr = llvm::object::ObjectFile::createObjectFile(Buffer);
-          if (auto Err = ObjectOrErr.takeError()) {
-            llvm::consumeError(std::move(Err));
-            continue;
-          }
-
-          if (auto CPU = (*ObjectOrErr)->tryGetCPUName())
-            Archs.push_back(*CPU);
-        }
-        fprintf(stderr, "Found %zu image(s): (%s)\n", Archs.size(),
-                llvm::join(Archs, ",").c_str());
-      }
-
-      SourceInfo Info(Loc);
-      if (Info.isAvailible())
-        fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
-                Info.getColumn());
-      else
-        FAILURE_MESSAGE("Source location information not present. Compile with "
-                        "-g or -gline-tables-only.\n");
-      FATAL_MESSAGE0(
-          1, "failure of target construct while offloading is mandatory");
-    } else {
-      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
-        auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
-        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
-          dumpTargetPointerMappings(Loc, Device);
-      }
-    }
-    break;
-  }
-}
-
-// If offload is enabled, ensure that device DeviceID has been initialized,
-// global ctors have been executed, and global data has been mapped.
-//
-// The return bool indicates if the offload is to the host device
-// There are three possible results:
-// - Return false if the taregt device is ready for offload
-// - Return true without reporting a runtime error if offload is
-//   disabled, perhaps because the initial device was specified.
-// - Report a runtime error and return true.
-//
-// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
-// This step might be skipped if offload is disabled.
-bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
-  if (OffloadPolicy::get(*PM).Kind == OffloadPolicy::DISABLED) {
-    DP("Offload is disabled\n");
-    return true;
-  }
-
-  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
-    DeviceID = omp_get_default_device();
-    DP("Use default device id %" PRId64 "\n", DeviceID);
-  }
-
-  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
-  if (omp_get_num_devices() == 0) {
-    DP("omp_get_num_devices() == 0 but offload is manadatory\n");
-    handleTargetOutcome(false, Loc);
-    return true;
-  }
-
-  if (DeviceID == omp_get_initial_device()) {
-    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
-       DeviceID);
-    return true;
-  }
-
-  auto DeviceOrErr = PM->getDevice(DeviceID);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceID, "%s", toString(DeviceOrErr.takeError()).data());
-
-  // Check whether global data has been mapped for this device
-  if (initLibrary(*DeviceOrErr) != OFFLOAD_SUCCESS) {
-    REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
-    handleTargetOutcome(false, Loc);
-    return true;
-  }
-
-  return false;
-}
-
-static int32_t getParentIndex(int64_t Type) {
-  return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
-}
-
-void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
-                          const char *Name) {
-  DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
-
-  if (Size <= 0) {
-    DP("Call to %s with non-positive length\n", Name);
-    return NULL;
-  }
-
-  void *Rc = NULL;
-
-  if (DeviceNum == omp_get_initial_device()) {
-    Rc = malloc(Size);
-    DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
-    return Rc;
-  }
-
-  auto DeviceOrErr = PM->getDevice(DeviceNum);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  Rc = DeviceOrErr->allocData(Size, nullptr, Kind);
-  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
-  return Rc;
-}
-
-void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
-                        const char *Name) {
-  DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
-     DPxPTR(DevicePtr));
-
-  if (!DevicePtr) {
-    DP("Call to %s with NULL ptr\n", Name);
-    return;
-  }
-
-  if (DeviceNum == omp_get_initial_device()) {
-    free(DevicePtr);
-    DP("%s deallocated host ptr\n", Name);
-    return;
-  }
-
-  auto DeviceOrErr = PM->getDevice(DeviceNum);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  DeviceOrErr->deleteData(DevicePtr, Kind);
-  DP("omp_target_free deallocated device ptr\n");
-}
-
-void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
-                         const char *Name) {
-  DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
-
-  if (Size <= 0) {
-    DP("Call to %s with non-positive length\n", Name);
-    return NULL;
-  }
-
-  void *RC = NULL;
-
-  auto DeviceOrErr = PM->getDevice(DeviceNum);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  int32_t Err = 0;
-  Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC);
-  if (Err) {
-    DP("Could not lock ptr %p\n", HostPtr);
-    return nullptr;
-  }
-  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(RC));
-  return RC;
-}
-
-void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
-  DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
-
-  auto DeviceOrErr = PM->getDevice(DeviceNum);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr);
-  DP("%s returns\n", Name);
-}
-
-/// Call the user-defined mapper function followed by the appropriate
-// targetData* function (targetData{Begin,End,Update}).
-int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
-                     int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
-                     void *ArgMapper, AsyncInfoTy &AsyncInfo,
-                     TargetDataFuncPtrTy TargetDataFunction) {
-  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));
-
-  // The mapper function fills up Components.
-  MapperComponentsTy MapperComponents;
-  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
-  (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
-                   ArgNames);
-
-  // Construct new arrays for args_base, args, arg_sizes and arg_types
-  // using the information in MapperComponents and call the corresponding
-  // targetData* function using these new arrays.
-  SmallVector<void *> MapperArgsBase(MapperComponents.Components.size());
-  SmallVector<void *> MapperArgs(MapperComponents.Components.size());
-  SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size());
-  SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size());
-  SmallVector<void *> MapperArgNames(MapperComponents.Components.size());
-
-  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
-    auto &C = MapperComponents.Components[I];
-    MapperArgsBase[I] = C.Base;
-    MapperArgs[I] = C.Begin;
-    MapperArgSizes[I] = C.Size;
-    MapperArgTypes[I] = C.Type;
-    MapperArgNames[I] = C.Name;
-  }
-
-  int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
-                              MapperArgsBase.data(), MapperArgs.data(),
-                              MapperArgSizes.data(), MapperArgTypes.data(),
-                              MapperArgNames.data(), /*arg_mappers*/ nullptr,
-                              AsyncInfo, /*FromMapper=*/true);
-
-  return Rc;
-}
-
-/// Internal function to do the mapping and transfer the data to the device
-int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
-                    void **ArgsBase, void **Args, int64_t *ArgSizes,
-                    int64_t *ArgTypes, map_var_info_t *ArgNames,
-                    void **ArgMappers, AsyncInfoTy &AsyncInfo,
-                    bool FromMapper) {
-  // process each input.
-  for (int32_t I = 0; I < ArgNum; ++I) {
-    // Ignore private variables and arrays - there is no mapping for them.
-    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
-        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
-      continue;
-    TIMESCOPE_WITH_DETAILS_AND_IDENT(
-        "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
-    if (ArgMappers && ArgMappers[I]) {
-      // Instead of executing the regular path of targetDataBegin, call the
-      // targetDataMapper variant which will call targetDataBegin again
-      // with new arguments.
-      DP("Calling targetDataMapper for the %dth argument\n", I);
-
-      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
-      int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
-                                ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
-                                targetDataBegin);
-
-      if (Rc != OFFLOAD_SUCCESS) {
-        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
-               " failed.\n");
-        return OFFLOAD_FAIL;
-      }
-
-      // Skip the rest of this function, continue to the next argument.
-      continue;
-    }
-
-    void *HstPtrBegin = Args[I];
-    void *HstPtrBase = ArgsBase[I];
-    int64_t DataSize = ArgSizes[I];
-    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
-
-    // Adjust for proper alignment if this is a combined entry (for structs).
-    // Look at the next argument - if that is MEMBER_OF this one, then this one
-    // is a combined entry.
-    int64_t TgtPadding = 0;
-    const int NextI = I + 1;
-    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
-        getParentIndex(ArgTypes[NextI]) == I) {
-      int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase);
-      TgtPadding = (int64_t)HstPtrBegin % Alignment;
-      if (TgtPadding) {
-        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
-           "\n",
-           TgtPadding, DPxPTR(HstPtrBegin));
-      }
-    }
-
-    // Address of pointer on the host and device, respectively.
-    void *PointerHstPtrBegin, *PointerTgtPtrBegin;
-    TargetPointerResultTy PointerTpr;
-    bool IsHostPtr = false;
-    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
-    // Force the creation of a device side copy of the data when:
-    // a close map modifier was associated with a map that contained a to.
-    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
-    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
-    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
-    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
-    // have reached this point via __tgt_target_data_begin and not __tgt_target
-    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
-    // associated with a target region, so there are no target parameters). This
-    // may be considered a hack, we could revise the scheme in the future.
-    bool UpdateRef =
-        !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);
-
-    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
-        Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
-    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
-      DP("Has a pointer entry: \n");
-      // Base is address of pointer.
-      //
-      // Usually, the pointer is already allocated by this time.  For example:
-      //
-      //   #pragma omp target map(s.p[0:N])
-      //
-      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
-      // afterward, so the pointer is already allocated by the time the
-      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
-      // non-null.  However, "declare target link" can produce a PTR_AND_OBJ
-      // entry for a global that might not already be allocated by the time the
-      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
-      // when HasPresentModifier.
-      PointerTpr = Device.getMappingInfo().getTargetPointer(
-          HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0, sizeof(void *),
-          /*HstPtrName=*/nullptr,
-          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
-          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
-          /*OwnedTPR=*/nullptr, /*ReleaseHDTTMap=*/false);
-      PointerTgtPtrBegin = PointerTpr.TargetPointer;
-      IsHostPtr = PointerTpr.Flags.IsHostPointer;
-      if (!PointerTgtPtrBegin) {
-        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
-               HasPresentModifier ? "'present' map type modifier"
-                                  : "device failure or illegal mapping");
-        return OFFLOAD_FAIL;
-      }
-      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
-         "\n",
-         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
-         (PointerTpr.Flags.IsNewEntry ? "" : " not"));
-      PointerHstPtrBegin = HstPtrBase;
-      // modify current entry.
-      HstPtrBase = *(void **)HstPtrBase;
-      // No need to update pointee ref count for the first element of the
-      // subelement that comes from mapper.
-      UpdateRef =
-          (!FromMapper || I != 0); // subsequently update ref count of pointee
-    }
-
-    const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
-    const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
-    // Note that HDTTMap will be released in getTargetPointer.
-    auto TPR = Device.getMappingInfo().getTargetPointer(
-        HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
-        HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
-        HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
-    void *TgtPtrBegin = TPR.TargetPointer;
-    IsHostPtr = TPR.Flags.IsHostPointer;
-    // If data_size==0, then the argument could be a zero-length pointer to
-    // NULL, so getOrAlloc() returning NULL is not an error.
-    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
-      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
-             HasPresentModifier ? "'present' map type modifier"
-                                : "device failure or illegal mapping");
-      return OFFLOAD_FAIL;
-    }
-    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
-       " - is%s new\n",
-       DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
-
-    if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
-      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
-      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
-      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
-      ArgsBase[I] = TgtPtrBase;
-    }
-
-    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
-
-      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
-      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
-
-      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
-              (void **)PointerHstPtrBegin, HstPtrBase,
-              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
-        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
-           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
-
-        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
-        TgtPtrBase = ExpectedTgtPtrBase;
-
-        int Ret =
-            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
-                              AsyncInfo, PointerTpr.getEntry());
-        if (Ret != OFFLOAD_SUCCESS) {
-          REPORT("Copying data to device failed.\n");
-          return OFFLOAD_FAIL;
-        }
-        if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
-            OFFLOAD_SUCCESS)
-          return OFFLOAD_FAIL;
-      }
-    }
-
-    // Check if variable can be used on the device:
-    bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF;
-    if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 &&
-        !IsStructMember && !IsImplicit && !TPR.isPresent() &&
-        !TPR.isContained() && !TPR.isHostPointer())
-      INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID,
-           "variable %s does not have a valid device counterpart\n",
-           (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-namespace {
-/// This structure contains information to deallocate a target pointer, aka.
-/// used to fix up the shadow map and potentially delete the entry from the
-/// mapping table via \p DeviceTy::deallocTgtPtr.
-struct PostProcessingInfo {
-  /// Host pointer used to look up into the map table
-  void *HstPtrBegin;
-
-  /// Size of the data
-  int64_t DataSize;
-
-  /// The mapping type (bitfield).
-  int64_t ArgType;
-
-  /// The target pointer information.
-  TargetPointerResultTy TPR;
-
-  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
-                     TargetPointerResultTy &&TPR)
-      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
-        TPR(std::move(TPR)) {}
-};
-
-} // namespace
-
-/// Applies the necessary post-processing procedures to entries listed in \p
-/// EntriesInfo after the execution of all device side operations from a target
-/// data end. This includes the update of pointers at the host and removal of
-/// device buffer when needed. It returns OFFLOAD_FAIL or OFFLOAD_SUCCESS
-/// according to the successfulness of the operations.
-[[nodiscard]] static int
-postProcessingTargetDataEnd(DeviceTy *Device,
-                            SmallVector<PostProcessingInfo> &EntriesInfo) {
-  int Ret = OFFLOAD_SUCCESS;
-
-  for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
-    bool DelEntry = !TPR.isHostPointer();
-
-    // If the last element from the mapper (for end transfer args comes in
-    // reverse order), do not remove the partial entry, the parent struct still
-    // exists.
-    if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
-        !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
-      DelEntry = false; // protect parent struct from being deallocated
-    }
-
-    // If we marked the entry to be deleted we need to verify no other
-    // thread reused it by now. If deletion is still supposed to happen by
-    // this thread LR will be set and exclusive access to the HDTT map
-    // will avoid another thread reusing the entry now. Note that we do
-    // not request (exclusive) access to the HDTT map if DelEntry is
-    // not set.
-    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
-        Device->getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
-
-    // We cannot use a lock guard because we may end up delete the mutex.
-    // We also explicitly unlocked the entry after it was put in the EntriesInfo
-    // so it can be reused.
-    TPR.getEntry()->lock();
-    auto *Entry = TPR.getEntry();
-
-    const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
-    if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
-      // The thread is not in charge of deletion anymore. Give up access
-      // to the HDTT map and unset the deletion flag.
-      HDTTMap.destroy();
-      DelEntry = false;
-    }
-
-    // If we copied back to the host a struct/array containing pointers,
-    // we need to restore the original host pointer values from their
-    // shadow copies. If the struct is going to be deallocated, remove any
-    // remaining shadow pointer entries for this struct.
-    const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
-    if (HasFrom) {
-      Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
-        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
-        DP("Restoring original host pointer value " DPxMOD " for host "
-           "pointer " DPxMOD "\n",
-           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
-        return OFFLOAD_SUCCESS;
-      });
-    }
-
-    // Give up the lock as we either don't need it anymore (e.g., done with
-    // TPR), or erase TPR.
-    TPR.setEntry(nullptr);
-
-    if (!DelEntry)
-      continue;
-
-    Ret = Device->getMappingInfo().eraseMapEntry(HDTTMap, Entry, DataSize);
-    // Entry is already remove from the map, we can unlock it now.
-    HDTTMap.destroy();
-    Ret |= Device->getMappingInfo().deallocTgtPtrAndEntry(Entry, DataSize);
-    if (Ret != OFFLOAD_SUCCESS) {
-      REPORT("Deallocating data from device failed.\n");
-      break;
-    }
-  }
-
-  delete &EntriesInfo;
-  return Ret;
-}
-
-/// Internal function to undo the mapping and retrieve the data from the device.
-int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
-                  void **ArgBases, void **Args, int64_t *ArgSizes,
-                  int64_t *ArgTypes, map_var_info_t *ArgNames,
-                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
-  int Ret = OFFLOAD_SUCCESS;
-  auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
-  // process each input.
-  for (int32_t I = ArgNum - 1; I >= 0; --I) {
-    // Ignore private variables and arrays - there is no mapping for them.
-    // Also, ignore the use_device_ptr directive, it has no effect here.
-    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
-        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
-      continue;
-
-    if (ArgMappers && ArgMappers[I]) {
-      // Instead of executing the regular path of targetDataEnd, call the
-      // targetDataMapper variant which will call targetDataEnd again
-      // with new arguments.
-      DP("Calling targetDataMapper for the %dth argument\n", I);
-
-      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
-      Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
-                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
-                             targetDataEnd);
-
-      if (Ret != OFFLOAD_SUCCESS) {
-        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
-               " failed.\n");
-        return OFFLOAD_FAIL;
-      }
-
-      // Skip the rest of this function, continue to the next argument.
-      continue;
-    }
-
-    void *HstPtrBegin = Args[I];
-    int64_t DataSize = ArgSizes[I];
-    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
-    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
-                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
-                     !(FromMapper && I == 0);
-    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
-    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
-    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
-
-    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
-    TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
-        HstPtrBegin, DataSize, UpdateRef, HasHoldModifier, !IsImplicit,
-        ForceDelete, /*FromDataEnd=*/true);
-    void *TgtPtrBegin = TPR.TargetPointer;
-    if (!TPR.isPresent() && !TPR.isHostPointer() &&
-        (DataSize || HasPresentModifier)) {
-      DP("Mapping does not exist (%s)\n",
-         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
-      if (HasPresentModifier) {
-        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
-        // "If a map clause appears on a target, target data, target enter data
-        // or target exit data construct with a present map-type-modifier then
-        // on entry to the region if the corresponding list item does not appear
-        // in the device data environment then an error occurs and the program
-        // terminates."
-        //
-        // This should be an error upon entering an "omp target exit data".  It
-        // should not be an error upon exiting an "omp target data" or "omp
-        // target".  For "omp target data", Clang thus doesn't include present
-        // modifiers for end calls.  For "omp target", we have not found a valid
-        // OpenMP program for which the error matters: it appears that, if a
-        // program can guarantee that data is present at the beginning of an
-        // "omp target" region so that there's no error there, that data is also
-        // guaranteed to be present at the end.
-        MESSAGE("device mapping required by 'present' map type modifier does "
-                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
-                DPxPTR(HstPtrBegin), DataSize);
-        return OFFLOAD_FAIL;
-      }
-    } else {
-      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
-         " - is%s last\n",
-         DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not"));
-    }
-
-    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
-    // "If the map clause appears on a target, target data, or target exit data
-    // construct and a corresponding list item of the original list item is not
-    // present in the device data environment on exit from the region then the
-    // list item is ignored."
-    if (!TPR.isPresent())
-      continue;
-
-    // Move data back to the host
-    const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
-    const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
-    if (HasFrom && (HasAlways || TPR.Flags.IsLast) &&
-        !TPR.Flags.IsHostPointer && DataSize != 0) {
-      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
-         DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-      TIMESCOPE_WITH_DETAILS_AND_IDENT(
-          "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
-      // Wait for any previous transfer if an event is present.
-      if (void *Event = TPR.getEntry()->getEvent()) {
-        if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
-          REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
-          return OFFLOAD_FAIL;
-        }
-      }
-
-      Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo,
-                                TPR.getEntry());
-      if (Ret != OFFLOAD_SUCCESS) {
-        REPORT("Copying data from device failed.\n");
-        return OFFLOAD_FAIL;
-      }
-
-      // As we are expecting to delete the entry the d2h copy might race
-      // with another one that also tries to delete the entry. This happens
-      // as the entry can be reused and the reuse might happen after the
-      // copy-back was issued but before it completed. Since the reuse might
-      // also copy-back a value we would race.
-      if (TPR.Flags.IsLast) {
-        if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
-            OFFLOAD_SUCCESS)
-          return OFFLOAD_FAIL;
-      }
-    }
-
-    // Add pointer to the buffer for post-synchronize processing.
-    PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
-                                     std::move(TPR));
-    PostProcessingPtrs->back().TPR.getEntry()->unlock();
-  }
-
-  // Add post-processing functions
-  // TODO: We might want to remove `mutable` in the future by not changing the
-  // captured variables somehow.
-  AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int {
-    return postProcessingTargetDataEnd(Device, *PostProcessingPtrs);
-  });
-
-  return Ret;
-}
-
-static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
-                                void *HstPtrBegin, int64_t ArgSize,
-                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
-  TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
-      HstPtrBegin, ArgSize, /*UpdateRefCount=*/false,
-      /*UseHoldRefCount=*/false, /*MustContain=*/true);
-  void *TgtPtrBegin = TPR.TargetPointer;
-  if (!TPR.isPresent()) {
-    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
-    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
-      MESSAGE("device mapping required by 'present' motion modifier does not "
-              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
-              DPxPTR(HstPtrBegin), ArgSize);
-      return OFFLOAD_FAIL;
-    }
-    return OFFLOAD_SUCCESS;
-  }
-
-  if (TPR.Flags.IsHostPointer) {
-    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
-       DPxPTR(HstPtrBegin));
-    return OFFLOAD_SUCCESS;
-  }
-
-  if (ArgType & OMP_TGT_MAPTYPE_TO) {
-    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
-       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
-    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo,
-                                TPR.getEntry());
-    if (Ret != OFFLOAD_SUCCESS) {
-      REPORT("Copying data to device failed.\n");
-      return OFFLOAD_FAIL;
-    }
-    if (TPR.getEntry()) {
-      int Ret = TPR.getEntry()->foreachShadowPointerInfo(
-          [&](ShadowPtrInfoTy &ShadowPtr) {
-            DP("Restoring original target pointer value " DPxMOD " for target "
-               "pointer " DPxMOD "\n",
-               DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
-            Ret = Device.submitData(ShadowPtr.TgtPtrAddr,
-                                    (void *)&ShadowPtr.TgtPtrVal,
-                                    sizeof(void *), AsyncInfo);
-            if (Ret != OFFLOAD_SUCCESS) {
-              REPORT("Copying data to device failed.\n");
-              return OFFLOAD_FAIL;
-            }
-            return OFFLOAD_SUCCESS;
-          });
-      if (Ret != OFFLOAD_SUCCESS) {
-        DP("Updating shadow map failed\n");
-        return Ret;
-      }
-    }
-  }
-
-  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
-    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
-       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo,
-                                  TPR.getEntry());
-    if (Ret != OFFLOAD_SUCCESS) {
-      REPORT("Copying data from device failed.\n");
-      return OFFLOAD_FAIL;
-    }
-
-    // Wait for device-to-host memcopies for whole struct to complete,
-    // before restoring the correct host pointer.
-    if (auto *Entry = TPR.getEntry()) {
-      AsyncInfo.addPostProcessingFunction([=]() -> int {
-        int Ret = Entry->foreachShadowPointerInfo(
-            [&](const ShadowPtrInfoTy &ShadowPtr) {
-              *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
-              DP("Restoring original host pointer value " DPxMOD
-                 " for host pointer " DPxMOD "\n",
-                 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
-              return OFFLOAD_SUCCESS;
-            });
-        Entry->unlock();
-        if (Ret != OFFLOAD_SUCCESS) {
-          DP("Updating shadow map failed\n");
-          return Ret;
-        }
-        return OFFLOAD_SUCCESS;
-      });
-    }
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
-                                   void *ArgsBase,
-                                   __tgt_target_non_contig *NonContig,
-                                   uint64_t Size, int64_t ArgType,
-                                   int CurrentDim, int DimSize, uint64_t Offset,
-                                   AsyncInfoTy &AsyncInfo) {
-  int Ret = OFFLOAD_SUCCESS;
-  if (CurrentDim < DimSize) {
-    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
-      uint64_t CurOffset =
-          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
-      // we only need to transfer the first element for the last dimension
-      // since we've already got a contiguous piece.
-      if (CurrentDim != DimSize - 1 || I == 0) {
-        Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
-                                      ArgType, CurrentDim + 1, DimSize,
-                                      Offset + CurOffset, AsyncInfo);
-        // Stop the whole process if any contiguous piece returns anything
-        // other than OFFLOAD_SUCCESS.
-        if (Ret != OFFLOAD_SUCCESS)
-          return Ret;
-      }
-    }
-  } else {
-    char *Ptr = (char *)ArgsBase + Offset;
-    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
-       " len %" PRIu64 "\n",
-       DPxPTR(Ptr), Offset, Size);
-    Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
-                               AsyncInfo);
-  }
-  return Ret;
-}
-
-static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
-                                       int32_t DimSize) {
-  int RemovedDim = 0;
-  for (int I = DimSize - 1; I > 0; --I) {
-    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
-      RemovedDim++;
-  }
-  return RemovedDim;
-}
-
-/// Internal function to pass data to/from the target.
-int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
-                     void **ArgsBase, void **Args, int64_t *ArgSizes,
-                     int64_t *ArgTypes, map_var_info_t *ArgNames,
-                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
-  // process each input.
-  for (int32_t I = 0; I < ArgNum; ++I) {
-    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
-        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
-      continue;
-
-    if (ArgMappers && ArgMappers[I]) {
-      // Instead of executing the regular path of targetDataUpdate, call the
-      // targetDataMapper variant which will call targetDataUpdate again
-      // with new arguments.
-      DP("Calling targetDataMapper for the %dth argument\n", I);
-
-      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
-      int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
-                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
-                                 targetDataUpdate);
-
-      if (Ret != OFFLOAD_SUCCESS) {
-        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
-               " failed.\n");
-        return OFFLOAD_FAIL;
-      }
-
-      // Skip the rest of this function, continue to the next argument.
-      continue;
-    }
-
-    int Ret = OFFLOAD_SUCCESS;
-
-    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
-      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
-      int32_t DimSize = ArgSizes[I];
-      uint64_t Size =
-          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
-      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
-      Ret = targetDataNonContiguous(
-          Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
-          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
-    } else {
-      Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
-                                 ArgTypes[I], AsyncInfo);
-    }
-    if (Ret == OFFLOAD_FAIL)
-      return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
-
-static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
-                                      OMP_TGT_MAPTYPE_LITERAL |
-                                      OMP_TGT_MAPTYPE_IMPLICIT;
-static bool isLambdaMapping(int64_t Mapping) {
-  return (Mapping & LambdaMapping) == LambdaMapping;
-}
-
-namespace {
-/// Find the table information in the map or look it up in the translation
-/// tables.
-TableMap *getTableMap(void *HostPtr) {
-  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
-  HostPtrToTableMapTy::iterator TableMapIt =
-      PM->HostPtrToTableMap.find(HostPtr);
-
-  if (TableMapIt != PM->HostPtrToTableMap.end())
-    return &TableMapIt->second;
-
-  // We don't have a map. So search all the registered libraries.
-  TableMap *TM = nullptr;
-  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
-  for (HostEntriesBeginToTransTableTy::iterator Itr =
-           PM->HostEntriesBeginToTransTable.begin();
-       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
-    // get the translation table (which contains all the good info).
-    TranslationTable *TransTable = &Itr->second;
-    // iterate over all the host table entries to see if we can locate the
-    // host_ptr.
-    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
-    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
-      if (Cur->addr != HostPtr)
-        continue;
-      // we got a match, now fill the HostPtrToTableMap so that we
-      // may avoid this search next time.
-      TM = &(PM->HostPtrToTableMap)[HostPtr];
-      TM->Table = TransTable;
-      TM->Index = I;
-      return TM;
-    }
-  }
-
-  return nullptr;
-}
-
-/// A class manages private arguments in a target region.
-class PrivateArgumentManagerTy {
-  /// A data structure for the information of first-private arguments. We can
-  /// use this information to optimize data transfer by packing all
-  /// first-private arguments and transfer them all at once.
-  struct FirstPrivateArgInfoTy {
-    /// Host pointer begin
-    char *HstPtrBegin;
-    /// Host pointer end
-    char *HstPtrEnd;
-    /// The index of the element in \p TgtArgs corresponding to the argument
-    int Index;
-    /// Alignment of the entry (base of the entry, not after the entry).
-    uint32_t Alignment;
-    /// Size (without alignment, see padding)
-    uint32_t Size;
-    /// Padding used to align this argument entry, if necessary.
-    uint32_t Padding;
-    /// Host pointer name
-    map_var_info_t HstPtrName = nullptr;
-
-    FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
-                          uint32_t Alignment, uint32_t Padding,
-                          map_var_info_t HstPtrName = nullptr)
-        : HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
-          HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
-          Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
-  };
-
-  /// A vector of target pointers for all private arguments
-  SmallVector<void *> TgtPtrs;
-
-  /// A vector of information of all first-private arguments to be packed
-  SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
-  /// Host buffer for all arguments to be packed
-  SmallVector<char> FirstPrivateArgBuffer;
-  /// The total size of all arguments to be packed
-  int64_t FirstPrivateArgSize = 0;
-
-  /// A reference to the \p DeviceTy object
-  DeviceTy &Device;
-  /// A pointer to a \p AsyncInfoTy object
-  AsyncInfoTy &AsyncInfo;
-
-  // TODO: What would be the best value here? Should we make it configurable?
-  // If the size is larger than this threshold, we will allocate and transfer it
-  // immediately instead of packing it.
-  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;
-
-public:
-  /// Constructor
-  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
-      : Device(Dev), AsyncInfo(AsyncInfo) {}
-
-  /// Add a private argument
-  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
-             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
-             map_var_info_t HstPtrName = nullptr,
-             const bool AllocImmediately = false) {
-    // If the argument is not first-private, or its size is greater than a
-    // predefined threshold, we will allocate memory and issue the transfer
-    // immediately.
-    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
-        AllocImmediately) {
-      TgtPtr = Device.allocData(ArgSize, HstPtr);
-      if (!TgtPtr) {
-        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
-           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
-        return OFFLOAD_FAIL;
-      }
-#ifdef OMPTARGET_DEBUG
-      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
-      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
-         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
-         "\n",
-         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
-         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
-#endif
-      // If first-private, copy data from host
-      if (IsFirstPrivate) {
-        DP("Submitting firstprivate data to the device.\n");
-        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
-        if (Ret != OFFLOAD_SUCCESS) {
-          DP("Copying data to device failed, failed.\n");
-          return OFFLOAD_FAIL;
-        }
-      }
-      TgtPtrs.push_back(TgtPtr);
-    } else {
-      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
-         DPxPTR(HstPtr), ArgSize);
-      // When reach this point, the argument must meet all following
-      // requirements:
-      // 1. Its size does not exceed the threshold (see the comment for
-      // FirstPrivateArgSizeThreshold);
-      // 2. It must be first-private (needs to be mapped to target device).
-      // We will pack all this kind of arguments to transfer them all at once
-      // to reduce the number of data transfer. We will not take
-      // non-first-private arguments, aka. private arguments that doesn't need
-      // to be mapped to target device, into account because data allocation
-      // can be very efficient with memory manager.
-
-      // Placeholder value
-      TgtPtr = nullptr;
-      auto *LastFPArgInfo =
-          FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back();
-
-      // Compute the start alignment of this entry, add padding if necessary.
-      // TODO: Consider sorting instead.
-      uint32_t Padding = 0;
-      uint32_t StartAlignment =
-          LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment;
-      if (LastFPArgInfo) {
-        // Check if we keep the start alignment or if it is shrunk due to the
-        // size of the last element.
-        uint32_t Offset = LastFPArgInfo->Size % StartAlignment;
-        if (Offset)
-          StartAlignment = Offset;
-        // We only need as much alignment as the host pointer had (since we
-        // don't know the alignment information from the source we might end up
-        // overaligning accesses but not too much).
-        uint32_t RequiredAlignment =
-            llvm::bit_floor(getPartialStructRequiredAlignment(HstPtr));
-        if (RequiredAlignment > StartAlignment) {
-          Padding = RequiredAlignment - StartAlignment;
-          StartAlignment = RequiredAlignment;
-        }
-      }
-
-      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
-                                       StartAlignment, Padding, HstPtrName);
-      FirstPrivateArgSize += Padding + ArgSize;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
-  /// and start the transfer.
-  int packAndTransfer(SmallVector<void *> &TgtArgs) {
-    if (!FirstPrivateArgInfo.empty()) {
-      assert(FirstPrivateArgSize != 0 &&
-             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
-      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
-      auto *Itr = FirstPrivateArgBuffer.begin();
-      // Copy all host data to this buffer
-      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
-        // First pad the pointer as we (have to) pad it on the device too.
-        Itr = std::next(Itr, Info.Padding);
-        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
-        Itr = std::next(Itr, Info.Size);
-      }
-      // Allocate target memory
-      void *TgtPtr =
-          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
-      if (TgtPtr == nullptr) {
-        DP("Failed to allocate target memory for private arguments.\n");
-        return OFFLOAD_FAIL;
-      }
-      TgtPtrs.push_back(TgtPtr);
-      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
-         FirstPrivateArgSize, DPxPTR(TgtPtr));
-      // Transfer data to target device
-      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
-                                  FirstPrivateArgSize, AsyncInfo);
-      if (Ret != OFFLOAD_SUCCESS) {
-        DP("Failed to submit data of private arguments.\n");
-        return OFFLOAD_FAIL;
-      }
-      // Fill in all placeholder pointers
-      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
-      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
-        void *&Ptr = TgtArgs[Info.Index];
-        assert(Ptr == nullptr && "Target pointer is already set by mistaken");
-        // Pad the device pointer to get the right alignment.
-        TP += Info.Padding;
-        Ptr = reinterpret_cast<void *>(TP);
-        TP += Info.Size;
-        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
-           "\n",
-           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
-           DPxPTR(Ptr));
-      }
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// Free all target memory allocated for private arguments
-  int free() {
-    for (void *P : TgtPtrs) {
-      int Ret = Device.deleteData(P);
-      if (Ret != OFFLOAD_SUCCESS) {
-        DP("Deallocation of (first-)private arrays failed.\n");
-        return OFFLOAD_FAIL;
-      }
-    }
-
-    TgtPtrs.clear();
-
-    return OFFLOAD_SUCCESS;
-  }
-};
-
-/// Process data before launching the kernel, including calling targetDataBegin
-/// to map and transfer data to target device, transferring (first-)private
-/// variables.
-static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
-                             int32_t ArgNum, void **ArgBases, void **Args,
-                             int64_t *ArgSizes, int64_t *ArgTypes,
-                             map_var_info_t *ArgNames, void **ArgMappers,
-                             SmallVector<void *> &TgtArgs,
-                             SmallVector<ptrdiff_t> &TgtOffsets,
-                             PrivateArgumentManagerTy &PrivateArgumentManager,
-                             AsyncInfoTy &AsyncInfo) {
-
-  auto DeviceOrErr = PM->getDevice(DeviceId);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  int Ret = targetDataBegin(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
-                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
-  if (Ret != OFFLOAD_SUCCESS) {
-    REPORT("Call to targetDataBegin failed, abort target.\n");
-    return OFFLOAD_FAIL;
-  }
-
-  // List of (first-)private arrays allocated for this target region
-  SmallVector<int> TgtArgsPositions(ArgNum, -1);
-
-  for (int32_t I = 0; I < ArgNum; ++I) {
-    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
-      // This is not a target parameter, do not push it into TgtArgs.
-      // Check for lambda mapping.
-      if (isLambdaMapping(ArgTypes[I])) {
-        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
-               "PTR_AND_OBJ must be also MEMBER_OF.");
-        unsigned Idx = getParentIndex(ArgTypes[I]);
-        int TgtIdx = TgtArgsPositions[Idx];
-        assert(TgtIdx != -1 && "Base address must be translated already.");
-        // The parent lambda must be processed already and it must be the last
-        // in TgtArgs and TgtOffsets arrays.
-        void *HstPtrVal = Args[I];
-        void *HstPtrBegin = ArgBases[I];
-        void *HstPtrBase = Args[Idx];
-        void *TgtPtrBase =
-            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
-        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
-        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
-        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
-        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
-        TargetPointerResultTy TPR =
-            DeviceOrErr->getMappingInfo().getTgtPtrBegin(
-                HstPtrVal, ArgSizes[I], /*UpdateRefCount=*/false,
-                /*UseHoldRefCount=*/false);
-        PointerTgtPtrBegin = TPR.TargetPointer;
-        if (!TPR.isPresent()) {
-          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
-             DPxPTR(HstPtrVal));
-          continue;
-        }
-        if (TPR.Flags.IsHostPointer) {
-          DP("Unified memory is active, no need to map lambda captured"
-             "variable (" DPxMOD ")\n",
-             DPxPTR(HstPtrVal));
-          continue;
-        }
-        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
-           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
-        Ret =
-            DeviceOrErr->submitData(TgtPtrBegin, &PointerTgtPtrBegin,
-                                    sizeof(void *), AsyncInfo, TPR.getEntry());
-        if (Ret != OFFLOAD_SUCCESS) {
-          REPORT("Copying data to device failed.\n");
-          return OFFLOAD_FAIL;
-        }
-      }
-      continue;
-    }
-    void *HstPtrBegin = Args[I];
-    void *HstPtrBase = ArgBases[I];
-    void *TgtPtrBegin;
-    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
-    ptrdiff_t TgtBaseOffset;
-    TargetPointerResultTy TPR;
-    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
-      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
-         DPxPTR(HstPtrBase));
-      TgtPtrBegin = HstPtrBase;
-      TgtBaseOffset = 0;
-    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
-      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
-      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
-      // If there is a next argument and it depends on the current one, we need
-      // to allocate the private memory immediately. If this is not the case,
-      // then the argument can be marked for optimization and packed with the
-      // other privates.
-      const bool AllocImmediately =
-          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
-      Ret = PrivateArgumentManager.addArg(
-          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
-          TgtArgs.size(), HstPtrName, AllocImmediately);
-      if (Ret != OFFLOAD_SUCCESS) {
-        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
-               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
-        return OFFLOAD_FAIL;
-      }
-    } else {
-      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
-        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
-      TPR = DeviceOrErr->getMappingInfo().getTgtPtrBegin(
-          HstPtrBegin, ArgSizes[I],
-          /*UpdateRefCount=*/false,
-          /*UseHoldRefCount=*/false);
-      TgtPtrBegin = TPR.TargetPointer;
-      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
-#ifdef OMPTARGET_DEBUG
-      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
-      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
-         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
-#endif
-    }
-    TgtArgsPositions[I] = TgtArgs.size();
-    TgtArgs.push_back(TgtPtrBegin);
-    TgtOffsets.push_back(TgtBaseOffset);
-  }
-
-  assert(TgtArgs.size() == TgtOffsets.size() &&
-         "Size mismatch in arguments and offsets");
-
-  // Pack and transfer first-private arguments
-  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
-  if (Ret != OFFLOAD_SUCCESS) {
-    DP("Failed to pack and transfer first private arguments\n");
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-/// Process data after launching the kernel, including transferring data back to
-/// host if needed and deallocating target memory of (first-)private variables.
-static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
-                            int32_t ArgNum, void **ArgBases, void **Args,
-                            int64_t *ArgSizes, int64_t *ArgTypes,
-                            map_var_info_t *ArgNames, void **ArgMappers,
-                            PrivateArgumentManagerTy &PrivateArgumentManager,
-                            AsyncInfoTy &AsyncInfo) {
-
-  auto DeviceOrErr = PM->getDevice(DeviceId);
-  if (!DeviceOrErr)
-    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
-
-  // Move data from device.
-  int Ret = targetDataEnd(Loc, *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
-                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
-  if (Ret != OFFLOAD_SUCCESS) {
-    REPORT("Call to targetDataEnd failed, abort target.\n");
-    return OFFLOAD_FAIL;
-  }
-
-  // Free target memory for private arguments after synchronization.
-  // TODO: We might want to remove `mutable` in the future by not changing the
-  // captured variables somehow.
-  AsyncInfo.addPostProcessingFunction(
-      [PrivateArgumentManager =
-           std::move(PrivateArgumentManager)]() mutable -> int {
-        int Ret = PrivateArgumentManager.free();
-        if (Ret != OFFLOAD_SUCCESS) {
-          REPORT("Failed to deallocate target memory for private args\n");
-          return OFFLOAD_FAIL;
-        }
-        return Ret;
-      });
-
-  return OFFLOAD_SUCCESS;
-}
-} // namespace
-
-/// performs the same actions as data_begin in case arg_num is
-/// non-zero and initiates run of the offloaded region on the target platform;
-/// if arg_num is non-zero after the region execution is done it also
-/// performs the same action as data_update and data_end above. This function
-/// returns 0 if it was able to transfer the execution to a target and an
-/// integer different from zero otherwise.
-int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
-           KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) {
-  int32_t DeviceId = Device.DeviceID;
-  TableMap *TM = getTableMap(HostPtr);
-  // No map for this host pointer found!
-  if (!TM) {
-    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
-           DPxPTR(HostPtr));
-    return OFFLOAD_FAIL;
-  }
-
-  // get target table.
-  __tgt_target_table *TargetTable = nullptr;
-  {
-    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
-    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
-           "Not expecting a device ID outside the table's bounds!");
-    TargetTable = TM->Table->TargetsTable[DeviceId];
-  }
-  assert(TargetTable && "Global data has not been mapped\n");
-
-  DP("loop trip count is %" PRIu64 ".\n", KernelArgs.Tripcount);
-
-  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
-  // need to manifest base pointers prior to launching a kernel. Even if we have
-  // mapped an object only partially, e.g. A[N:M], although the kernel is
-  // expected to access elements starting at address &A[N] and beyond, we still
-  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
-  // API, we need the begin address itself, i.e. &A[N], as the API operates on
-  // begin addresses, not bases. That's why we pass args and offsets as two
-  // separate entities so that each plugin can do what it needs. This behavior
-  // was introdued via https://reviews.llvm.org/D33028 and commit 1546d319244c.
-  SmallVector<void *> TgtArgs;
-  SmallVector<ptrdiff_t> TgtOffsets;
-
-  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);
-
-  int NumClangLaunchArgs = KernelArgs.NumArgs;
-  int Ret = OFFLOAD_SUCCESS;
-  if (NumClangLaunchArgs) {
-    // Process data, such as data mapping, before launching the kernel
-    Ret = processDataBefore(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
-                            KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
-                            KernelArgs.ArgSizes, KernelArgs.ArgTypes,
-                            KernelArgs.ArgNames, KernelArgs.ArgMappers, TgtArgs,
-                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
-    if (Ret != OFFLOAD_SUCCESS) {
-      REPORT("Failed to process data before launching the kernel.\n");
-      return OFFLOAD_FAIL;
-    }
-
-    // Clang might pass more values via the ArgPtrs to the runtime that we pass
-    // on to the kernel.
-    // TOOD: Next time we adjust the KernelArgsTy we should introduce a new
-    // NumKernelArgs field.
-    KernelArgs.NumArgs = TgtArgs.size();
-  }
-
-  // Launch device execution.
-  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
-  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
-     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
-
-  {
-    assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
-    TIMESCOPE_WITH_DETAILS_AND_IDENT(
-        "Kernel Target",
-        "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
-            ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
-            ";TripCount=" + std::to_string(KernelArgs.Tripcount),
-        Loc);
-
-#ifdef OMPT_SUPPORT
-    assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
-           "Multi dimensional launch not supported yet.");
-    /// RAII to establish tool anchors before and after kernel launch
-    int32_t NumTeams = KernelArgs.NumTeams[0];
-    // No need to guard this with OMPT_IF_BUILT
-    InterfaceRAII TargetSubmitRAII(
-        RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams);
-#endif
-
-    Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
-                              KernelArgs, AsyncInfo);
-  }
-
-  if (Ret != OFFLOAD_SUCCESS) {
-    REPORT("Executing target region abort target.\n");
-    return OFFLOAD_FAIL;
-  }
-
-  if (NumClangLaunchArgs) {
-    // Transfer data back and deallocate target memory for (first-)private
-    // variables
-    Ret = processDataAfter(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
-                           KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
-                           KernelArgs.ArgSizes, KernelArgs.ArgTypes,
-                           KernelArgs.ArgNames, KernelArgs.ArgMappers,
-                           PrivateArgumentManager, AsyncInfo);
-    if (Ret != OFFLOAD_SUCCESS) {
-      REPORT("Failed to process data after launching the kernel.\n");
-      return OFFLOAD_FAIL;
-    }
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-/// Enables the record replay mechanism by pre-allocating MemorySize
-/// and informing the record-replayer of whether to store the output
-/// in some file.
-int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
-                       bool IsRecord, bool SaveOutput,
-                       uint64_t &ReqPtrArgOffset) {
-  return Device.RTL->initialize_record_replay(Device.DeviceID, MemorySize,
-                                              VAddr, IsRecord, SaveOutput,
-                                              ReqPtrArgOffset);
-}
-
-/// Executes a kernel using pre-recorded information for loading to
-/// device memory to launch the target kernel with the pre-recorded
-/// configuration.
-int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
-                  void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
-                  ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
-                  int32_t ThreadLimit, uint64_t LoopTripCount,
-                  AsyncInfoTy &AsyncInfo) {
-  int32_t DeviceId = Device.DeviceID;
-  TableMap *TM = getTableMap(HostPtr);
-  // Fail if the table map fails to find the target kernel pointer for the
-  // provided host pointer.
-  if (!TM) {
-    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
-           DPxPTR(HostPtr));
-    return OFFLOAD_FAIL;
-  }
-
-  // Retrieve the target table of offloading entries.
-  __tgt_target_table *TargetTable = nullptr;
-  {
-    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
-    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
-           "Not expecting a device ID outside the table's bounds!");
-    TargetTable = TM->Table->TargetsTable[DeviceId];
-  }
-  assert(TargetTable && "Global data has not been mapped\n");
-
-  // Retrieve the target kernel pointer, allocate and store the recorded device
-  // memory data, and launch device execution.
-  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
-  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
-     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
-
-  void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
-                                  TARGET_ALLOC_DEFAULT);
-  Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
-
-  KernelArgsTy KernelArgs = {0};
-  KernelArgs.Version = OMP_KERNEL_ARG_VERSION;
-  KernelArgs.NumArgs = NumArgs;
-  KernelArgs.Tripcount = LoopTripCount;
-  KernelArgs.NumTeams[0] = NumTeams;
-  KernelArgs.ThreadLimit[0] = ThreadLimit;
-
-  int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs,
-                                AsyncInfo);
-
-  if (Ret != OFFLOAD_SUCCESS) {
-    REPORT("Executing target region abort target.\n");
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}

diff --git a/libomptarget/src/private.h b/libomptarget/src/private.h
deleted file mode 100644
index d569f2a..0000000
--- a/libomptarget/src/private.h
+++ /dev/null

@@ -1,79 +0,0 @@
-//===---------- private.h - Target independent OpenMP target RTL ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Private function declarations and helper macros for debugging output.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _OMPTARGET_PRIVATE_H
-#define _OMPTARGET_PRIVATE_H
-
-#include "Shared/Debug.h"
-#include "Shared/SourceInfo.h"
-
-#include "OpenMP/InternalTypes.h"
-
-#include "device.h"
-#include "omptarget.h"
-
-#include <cstdint>
-
-extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
-                  KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo);
-
-extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
-                              void *ReqAddr, bool isRecord, bool SaveOutput,
-                              uint64_t &ReqPtrArgOffset);
-
-extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
-                         void *DeviceMemory, int64_t DeviceMemorySize,
-                         void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
-                         int32_t NumTeams, int32_t ThreadLimit,
-                         uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);
-
-extern void handleTargetOutcome(bool Success, ident_t *Loc);
-extern bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc);
-
-////////////////////////////////////////////////////////////////////////////////
-/// Print out the names and properties of the arguments to each kernel
-static inline void
-printKernelArguments(const ident_t *Loc, const int64_t DeviceId,
-                     const int32_t ArgNum, const int64_t *ArgSizes,
-                     const int64_t *ArgTypes, const map_var_info_t *ArgNames,
-                     const char *RegionType) {
-  SourceInfo Info(Loc);
-  INFO(OMP_INFOTYPE_ALL, DeviceId, "%s at %s:%d:%d with %d arguments:\n",
-       RegionType, Info.getFilename(), Info.getLine(), Info.getColumn(),
-       ArgNum);
-
-  for (int32_t I = 0; I < ArgNum; ++I) {
-    const map_var_info_t VarName = (ArgNames) ? ArgNames[I] : nullptr;
-    const char *Type = nullptr;
-    const char *Implicit =
-        (ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT) ? "(implicit)" : "";
-    if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO && ArgTypes[I] & OMP_TGT_MAPTYPE_FROM)
-      Type = "tofrom";
-    else if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO)
-      Type = "to";
-    else if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM)
-      Type = "from";
-    else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)
-      Type = "private";
-    else if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL)
-      Type = "firstprivate";
-    else if (ArgSizes[I] != 0)
-      Type = "alloc";
-    else
-      Type = "use_address";
-
-    INFO(OMP_INFOTYPE_ALL, DeviceId, "%s(%s)[%" PRId64 "] %s\n", Type,
-         getNameFromMapping(VarName).c_str(), ArgSizes[I], Implicit);
-  }
-}
-
-#endif

diff --git a/libomptarget/test/CMakeLists.txt b/libomptarget/test/CMakeLists.txt
deleted file mode 100644
index a0ba233..0000000
--- a/libomptarget/test/CMakeLists.txt
+++ /dev/null

@@ -1,42 +0,0 @@
-# CMakeLists.txt file for unit testing OpenMP offloading runtime library.
-if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang" OR
-   OPENMP_TEST_COMPILER_VERSION VERSION_LESS 6.0.0)
-  libomptarget_say("Can only test with Clang compiler in version 6.0.0 or later.")
-  libomptarget_warning_say("The check-libomptarget target will not be available!")
-  return()
-endif()
-
-if(LIBOMPTARGET_ENABLE_DEBUG)
-  set(LIBOMPTARGET_DEBUG True)
-else()
-  set(LIBOMPTARGET_DEBUG False)
-endif()
-
-# Replace the space from user's input with ";" in case that CMake add escape
-# char into the lit command.
-string(REPLACE " " ";" LIBOMPTARGET_LIT_ARG_LIST "${LIBOMPTARGET_LIT_ARGS}")
-
-string(REGEX MATCHALL "([^\ ]+\ |[^\ ]+$)" SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}")
-foreach(CURRENT_TARGET IN LISTS SYSTEM_TARGETS)
-  string(STRIP "${CURRENT_TARGET}" CURRENT_TARGET)
-
-  add_openmp_testsuite(check-libomptarget-${CURRENT_TARGET}
-    "Running libomptarget tests"
-    ${CMAKE_CURRENT_BINARY_DIR}/${CURRENT_TARGET}
-    DEPENDS omptarget omp ${LIBOMPTARGET_TESTED_PLUGINS}
-    ARGS ${LIBOMPTARGET_LIT_ARG_LIST})
-  list(APPEND LIBOMPTARGET_LIT_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CURRENT_TARGET})
-
-  # Configure the lit.site.cfg.in file
-  set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!")
-  get_filename_component(CUDA_LIBDIR "${CUDA_cudart_static_LIBRARY}" DIRECTORY)
-  configure_file(lit.site.cfg.in ${CURRENT_TARGET}/lit.site.cfg @ONLY)
-endforeach()
-
-
-add_openmp_testsuite(check-libomptarget
-  "Running libomptarget tests"
-  ${LIBOMPTARGET_LIT_TESTSUITES}
-  EXCLUDE_FROM_CHECK_ALL
-  DEPENDS omptarget omp ${LIBOMPTARGET_TESTED_PLUGINS}
-  ARGS ${LIBOMPTARGET_LIT_ARG_LIST})

diff --git a/libomptarget/test/Inputs/basic_array.f90 b/libomptarget/test/Inputs/basic_array.f90
deleted file mode 100644
index 30fe7c8..0000000
--- a/libomptarget/test/Inputs/basic_array.f90
+++ /dev/null

@@ -1,7 +0,0 @@
-subroutine increment_at(c_index, arr) bind(C, name="increment_at")
-    use ISO_C_BINDING
-    !$omp declare target
-    integer (C_INT), dimension(*), intent(inout) :: arr
-    integer (C_INT), value :: c_index
-    arr(c_index+1) = arr(c_index+1) + 1
-end subroutine

diff --git a/libomptarget/test/Inputs/declare_indirect_func.c b/libomptarget/test/Inputs/declare_indirect_func.c
deleted file mode 100644
index 20ac661..0000000
--- a/libomptarget/test/Inputs/declare_indirect_func.c
+++ /dev/null

@@ -1,3 +0,0 @@
-
-int func() { return 42; }
-#pragma omp declare target indirect to(func)

diff --git a/libomptarget/test/api/assert.c b/libomptarget/test/api/assert.c
deleted file mode 100644
index 9a9e770..0000000
--- a/libomptarget/test/api/assert.c
+++ /dev/null

@@ -1,15 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// RUN: %libomptarget-compileopt-run-and-check-generic
-
-#include <assert.h>
-#include <stdio.h>
-
-int main() {
-  int i = 1;
-#pragma omp target
-  assert(i > 0);
-
-  // CHECK: PASS
-  printf("PASS\n");
-  return 0;
-}

diff --git a/libomptarget/test/api/is_initial_device.c b/libomptarget/test/api/is_initial_device.c
deleted file mode 100644
index f1a85fe..0000000
--- a/libomptarget/test/api/is_initial_device.c
+++ /dev/null

@@ -1,33 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
-// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DUNUSED -Wall -Werror
-
-// only run for x86_64 host offloading:
-// REQUIRES: x86_64-pc-linux-gnu
-
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
-  int errors = 0;
-#ifdef UNUSED
-// Test if it is OK to leave the variants unused in the header
-#else // UNUSED
-  int host = omp_is_initial_device();
-  int device = 1;
-#pragma omp target map(tofrom : device)
-  { device = omp_is_initial_device(); }
-  if (!host) {
-    printf("omp_is_initial_device() returned false on host\n");
-    errors++;
-  }
-  if (device) {
-    printf("omp_is_initial_device() returned true on device\n");
-    errors++;
-  }
-#endif // UNUSED
-
-  // CHECK: PASS
-  printf("%s\n", errors ? "FAIL" : "PASS");
-
-  return errors;
-}

diff --git a/libomptarget/test/api/omp_device_managed_memory.c b/libomptarget/test/api/omp_device_managed_memory.c
deleted file mode 100644
index 2a9fe09..0000000
--- a/libomptarget/test/api/omp_device_managed_memory.c
+++ /dev/null

@@ -1,29 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-void *llvm_omp_target_alloc_shared(size_t, int);
-void llvm_omp_target_free_shared(void *, int);
-
-int main() {
-  const int N = 64;
-  const int device = omp_get_default_device();
-
-  int *shared_ptr = llvm_omp_target_alloc_shared(N * sizeof(int), device);
-
-#pragma omp target teams distribute parallel for device(device)                \
-    is_device_ptr(shared_ptr)
-  for (int i = 0; i < N; ++i) {
-    shared_ptr[i] = 1;
-  }
-
-  int sum = 0;
-  for (int i = 0; i < N; ++i)
-    sum += shared_ptr[i];
-
-  llvm_omp_target_free_shared(shared_ptr, device);
-  // CHECK: PASS
-  if (sum == N)
-    printf("PASS\n");
-}

diff --git a/libomptarget/test/api/omp_device_managed_memory_alloc.c b/libomptarget/test/api/omp_device_managed_memory_alloc.c
deleted file mode 100644
index c488669..0000000
--- a/libomptarget/test/api/omp_device_managed_memory_alloc.c
+++ /dev/null

@@ -1,28 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// RUN: %libomptarget-compileopt-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
-  const int N = 64;
-
-  // Allocates device managed memory that is shared between the host and device.
-  int *shared_ptr =
-      omp_alloc(N * sizeof(int), llvm_omp_target_shared_mem_alloc);
-
-#pragma omp target teams distribute parallel for is_device_ptr(shared_ptr)
-  for (int i = 0; i < N; ++i) {
-    shared_ptr[i] = 1;
-  }
-
-  int sum = 0;
-  for (int i = 0; i < N; ++i)
-    sum += shared_ptr[i];
-
-  // CHECK: PASS
-  if (sum == N)
-    printf("PASS\n");
-
-  omp_free(shared_ptr, llvm_omp_target_shared_mem_alloc);
-}

diff --git a/libomptarget/test/api/omp_device_memory.c b/libomptarget/test/api/omp_device_memory.c
deleted file mode 100644
index 816ae35..0000000
--- a/libomptarget/test/api/omp_device_memory.c
+++ /dev/null

@@ -1,27 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
-  const int N = 64;
-
-  int *device_ptr =
-      omp_alloc(N * sizeof(int), llvm_omp_target_device_mem_alloc);
-
-#pragma omp target teams distribute parallel for is_device_ptr(device_ptr)
-  for (int i = 0; i < N; ++i) {
-    device_ptr[i] = 1;
-  }
-
-  int sum = 0;
-#pragma omp target reduction(+ : sum) is_device_ptr(device_ptr)
-  for (int i = 0; i < N; ++i)
-    sum += device_ptr[i];
-
-  // CHECK: PASS
-  if (sum == N)
-    printf("PASS\n");
-
-  omp_free(device_ptr, llvm_omp_target_device_mem_alloc);
-}

diff --git a/libomptarget/test/api/omp_dynamic_shared_memory.c b/libomptarget/test/api/omp_dynamic_shared_memory.c
deleted file mode 100644
index 3fe75f2..0000000
--- a/libomptarget/test/api/omp_dynamic_shared_memory.c
+++ /dev/null

@@ -1,35 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
-// RUN:   %libomptarget-run-generic | %fcheck-generic
-
-// RUN: %libomptarget-compileopt-generic
-// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
-// RUN:   %libomptarget-run-generic | %fcheck-generic
-
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
-  int x;
-#pragma omp target parallel map(from : x)
-  {
-    int *buf = llvm_omp_target_dynamic_shared_alloc() + 252;
-#pragma omp barrier
-    if (omp_get_thread_num() == 0)
-      *buf = 1;
-#pragma omp barrier
-    if (omp_get_thread_num() == 1)
-      x = *buf;
-  }
-
-  // CHECK: PASS
-  if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL)
-    printf("PASS\n");
-}

diff --git a/libomptarget/test/api/omp_dynamic_shared_memory_amdgpu.c b/libomptarget/test/api/omp_dynamic_shared_memory_amdgpu.c
deleted file mode 100644
index 0b4d9d6..0000000
--- a/libomptarget/test/api/omp_dynamic_shared_memory_amdgpu.c
+++ /dev/null

@@ -1,25 +0,0 @@
-// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O1 -mllvm -openmp-opt-inline-device
-// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
-// RUN:   %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
-// REQUIRES: amdgcn-amd-amdhsa
-
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
-  int x;
-#pragma omp target parallel map(from : x)
-  {
-    int *buf = llvm_omp_target_dynamic_shared_alloc() + 252;
-#pragma omp barrier
-    if (omp_get_thread_num() == 0)
-      *buf = 1;
-#pragma omp barrier
-    if (omp_get_thread_num() == 1)
-      x = *buf;
-  }
-
-  // CHECK: PASS
-  if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL)
-    printf("PASS\n");
-}

diff --git a/libomptarget/test/api/omp_dynamic_shared_memory_mixed.inc b/libomptarget/test/api/omp_dynamic_shared_memory_mixed.inc
deleted file mode 100644
index 16e0bec..0000000
--- a/libomptarget/test/api/omp_dynamic_shared_memory_mixed.inc
+++ /dev/null

@@ -1,49 +0,0 @@
-// Run lines are in the target specific versions.
-
-#include <omp.h>
-#include <stdio.h>
-
-#define N 512
-
-int main() {
-  int Result[N], NumThreads;
-
-#pragma omp target teams num_teams(1) thread_limit(N)                          \
-                         ompx_dyn_cgroup_mem(N * sizeof(Result[0]))            \
-                         map(from : Result, NumThreads)
-  {
-    int Buffer[N];
-#pragma omp parallel
-    {
-      int *DynBuffer = (int *)llvm_omp_target_dynamic_shared_alloc();
-      int TId = omp_get_thread_num();
-      if (TId == 0)
-        NumThreads = omp_get_num_threads();
-      Buffer[TId] = 7;
-      DynBuffer[TId] = 3;
-#pragma omp barrier
-      int WrappedTId = (TId + 37) % NumThreads;
-      Result[TId] = Buffer[WrappedTId] + DynBuffer[WrappedTId];
-    }
-  }
-
-  if (llvm_omp_target_dynamic_shared_alloc())
-    return -1;
-
-  if (NumThreads < N / 2 || NumThreads > N) {
-    printf("Expected number of threads to be in [%i:%i], but got: %i", N / 2, N,
-           NumThreads);
-    return -1;
-  }
-
-  int Failed = 0;
-  for (int i = 0; i < NumThreads; ++i) {
-    if (Result[i] != 7 + 3) {
-      printf("Result[%i] is %i, expected %i\n", i, Result[i], 7 + 3);
-      ++Failed;
-    }
-  }
-
-  if (!Failed)
-    printf("PASS\n");
-}

diff --git a/libomptarget/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c b/libomptarget/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c
deleted file mode 100644
index 656c3a2..0000000
--- a/libomptarget/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c
+++ /dev/null

@@ -1,7 +0,0 @@
-// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O1 -mllvm -openmp-opt-inline-device -I %S
-// RUN: env LIBOMPTARGET_NEXTGEN_PLUGINS=1 \
-// RUN:   %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
-// REQUIRES: amdgcn-amd-amdhsa
-
-#include "omp_dynamic_shared_memory_mixed.inc"
-// CHECK: PASS

diff --git a/libomptarget/test/api/omp_dynamic_shared_memory_mixed_nvptx.c b/libomptarget/test/api/omp_dynamic_shared_memory_mixed_nvptx.c
deleted file mode 100644
index 117809c..0000000
--- a/libomptarget/test/api/omp_dynamic_shared_memory_mixed_nvptx.c
+++ /dev/null

@@ -1,7 +0,0 @@
-// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -I %S
-// RUN: env LIBOMPTARGET_NEXTGEN_PLUGINS=1 \
-// RUN:   %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
-// REQUIRES: nvptx64-nvidia-cuda
-
-#include "omp_dynamic_shared_memory_mixed.inc"
-// CHECK: PASS

diff --git a/libomptarget/test/api/omp_env_vars.c b/libomptarget/test/api/omp_env_vars.c
deleted file mode 100644
index 2e78bb1..0000000
--- a/libomptarget/test/api/omp_env_vars.c
+++ /dev/null

@@ -1,12 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: env OMP_NUM_TEAMS=1 OMP_TEAMS_THREAD_LIMIT=1 LIBOMPTARGET_INFO=16 \
-// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic
-
-#define N 256
-
-int main() {
-  // CHECK: Launching kernel [[KERNEL:.+_main_.+]] with 1 blocks and 1 threads
-#pragma omp target teams
-#pragma omp parallel
-  {}
-}

diff --git a/libomptarget/test/api/omp_get_device_num.c b/libomptarget/test/api/omp_get_device_num.c
deleted file mode 100644
index 31e13b0..0000000
--- a/libomptarget/test/api/omp_get_device_num.c
+++ /dev/null

@@ -1,31 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-int test_omp_get_device_num() {
-  /* checks that omp_get_device_num() == omp_get_num_devices() in the host */
-  int device_num = omp_get_device_num();
-  printf("device_num = %d\n", device_num);
-
-#pragma omp target
-  {}
-
-  return (device_num == omp_get_num_devices());
-}
-
-int main() {
-  int i;
-  int failed = 0;
-
-  if (!test_omp_get_device_num()) {
-    failed++;
-  }
-  if (failed)
-    printf("FAIL\n");
-  else
-    printf("PASS\n");
-  return failed;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/api/omp_get_mapped_ptr.c b/libomptarget/test/api/omp_get_mapped_ptr.c
deleted file mode 100644
index a8e11f9..0000000
--- a/libomptarget/test/api/omp_get_mapped_ptr.c
+++ /dev/null

@@ -1,39 +0,0 @@
-// RUN: %libomptarget-compile-and-run-generic
-
-#include <assert.h>
-#include <omp.h>
-#include <stdlib.h>
-
-#define N 1024
-#define OFFSET 16
-
-int main(int argc, char *argv[]) {
-  int *host_data = (int *)malloc(sizeof(int) * N);
-  void *device_ptr = omp_get_mapped_ptr(host_data, 0);
-
-  assert(device_ptr == NULL && "the pointer should not be mapped right now");
-
-#pragma omp target enter data map(to: host_data[:N])
-
-  device_ptr = omp_get_mapped_ptr(host_data, 0);
-
-  assert(device_ptr && "the pointer should be mapped now");
-
-  void *ptr = NULL;
-
-#pragma omp target map(from: ptr)
-  { ptr = host_data; }
-
-  assert(ptr == device_ptr && "wrong pointer mapping");
-
-  device_ptr = omp_get_mapped_ptr(host_data + OFFSET, 0);
-
-  assert(device_ptr && "the pointer with offset should be mapped");
-
-#pragma omp target map(from: ptr)
-  { ptr = host_data + OFFSET; }
-
-  assert(ptr == device_ptr && "wrong pointer mapping");
-
-  return 0;
-}

diff --git a/libomptarget/test/api/omp_get_num_devices.c b/libomptarget/test/api/omp_get_num_devices.c
deleted file mode 100644
index e8fbdcd..0000000
--- a/libomptarget/test/api/omp_get_num_devices.c
+++ /dev/null

@@ -1,31 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-int test_omp_get_num_devices() {
-  /* checks that omp_get_num_devices() > 0 */
-  int num_devices = omp_get_num_devices();
-  printf("num_devices = %d\n", num_devices);
-
-#pragma omp target
-  {}
-
-  return (num_devices > 0);
-}
-
-int main() {
-  int i;
-  int failed = 0;
-
-  if (!test_omp_get_num_devices()) {
-    failed++;
-  }
-  if (failed)
-    printf("FAIL\n");
-  else
-    printf("PASS\n");
-  return failed;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/api/omp_get_num_devices_with_empty_target.c b/libomptarget/test/api/omp_get_num_devices_with_empty_target.c
deleted file mode 100644
index 74ebb3e..0000000
--- a/libomptarget/test/api/omp_get_num_devices_with_empty_target.c
+++ /dev/null

@@ -1,27 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-static int test_omp_get_num_devices_with_empty_target() {
-  /* checks that omp_get_num_devices() > 0 */
-  return omp_get_num_devices() > 0;
-}
-
-int main() {
-  int failed = 0;
-
-  if (!test_omp_get_num_devices_with_empty_target()) {
-    ++failed;
-  }
-
-  if (failed) {
-    printf("FAIL\n");
-  } else {
-    printf("PASS\n");
-  }
-
-  return failed;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/api/omp_get_num_procs.c b/libomptarget/test/api/omp_get_num_procs.c
deleted file mode 100644
index e958d0c..0000000
--- a/libomptarget/test/api/omp_get_num_procs.c
+++ /dev/null

@@ -1,15 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <stdio.h>
-
-int omp_get_num_procs(void);
-
-int main() {
-  int num_procs;
-#pragma omp target map(from : num_procs)
-  { num_procs = omp_get_num_procs(); }
-
-  // CHECK: PASS
-  if (num_procs > 0)
-    printf("PASS\n");
-}

diff --git a/libomptarget/test/api/omp_host_pinned_memory.c b/libomptarget/test/api/omp_host_pinned_memory.c
deleted file mode 100644
index 7a6a00d..0000000
--- a/libomptarget/test/api/omp_host_pinned_memory.c
+++ /dev/null

@@ -1,33 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-// Allocate pinned memory on the host
-void *llvm_omp_target_alloc_host(size_t, int);
-void llvm_omp_target_free_host(void *, int);
-
-int main() {
-  const int N = 64;
-  const int device = omp_get_default_device();
-  const int host = omp_get_initial_device();
-
-  int *hst_ptr = llvm_omp_target_alloc_host(N * sizeof(int), device);
-
-  for (int i = 0; i < N; ++i)
-    hst_ptr[i] = 2;
-
-#pragma omp target teams distribute parallel for device(device)                \
-    map(tofrom : hst_ptr[0 : N])
-  for (int i = 0; i < N; ++i)
-    hst_ptr[i] -= 1;
-
-  int sum = 0;
-  for (int i = 0; i < N; ++i)
-    sum += hst_ptr[i];
-
-  llvm_omp_target_free_host(hst_ptr, device);
-  // CHECK: PASS
-  if (sum == N)
-    printf("PASS\n");
-}

diff --git a/libomptarget/test/api/omp_host_pinned_memory_alloc.c b/libomptarget/test/api/omp_host_pinned_memory_alloc.c
deleted file mode 100644
index fa3e245..0000000
--- a/libomptarget/test/api/omp_host_pinned_memory_alloc.c
+++ /dev/null

@@ -1,26 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
-  const int N = 64;
-
-  int *hst_ptr = omp_alloc(N * sizeof(int), llvm_omp_target_host_mem_alloc);
-
-  for (int i = 0; i < N; ++i)
-    hst_ptr[i] = 2;
-
-#pragma omp target teams distribute parallel for map(tofrom : hst_ptr[0 : N])
-  for (int i = 0; i < N; ++i)
-    hst_ptr[i] -= 1;
-
-  int sum = 0;
-  for (int i = 0; i < N; ++i)
-    sum += hst_ptr[i];
-
-  omp_free(hst_ptr, llvm_omp_target_host_mem_alloc);
-  // CHECK: PASS
-  if (sum == N)
-    printf("PASS\n");
-}

diff --git a/libomptarget/test/api/omp_indirect_call.c b/libomptarget/test/api/omp_indirect_call.c
deleted file mode 100644
index ac0febf..0000000
--- a/libomptarget/test/api/omp_indirect_call.c
+++ /dev/null

@@ -1,47 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <assert.h>
-#include <stdio.h>
-
-#pragma omp begin declare variant match(device = {kind(gpu)})
-// Provided by the runtime.
-void *__llvm_omp_indirect_call_lookup(void *host_ptr);
-#pragma omp declare target to(__llvm_omp_indirect_call_lookup)                 \
-    device_type(nohost)
-#pragma omp end declare variant
-
-#pragma omp begin declare variant match(device = {kind(cpu)})
-// We assume unified addressing on the CPU target.
-void *__llvm_omp_indirect_call_lookup(void *host_ptr) { return host_ptr; }
-#pragma omp end declare variant
-
-#pragma omp begin declare target indirect
-void foo(int *x) { *x = *x + 1; }
-void bar(int *x) { *x = *x + 1; }
-void baz(int *x) { *x = *x + 1; }
-#pragma omp end declare target
-
-int main() {
-  void *foo_ptr = foo;
-  void *bar_ptr = bar;
-  void *baz_ptr = baz;
-
-  int count = 0;
-  void *foo_res;
-  void *bar_res;
-  void *baz_res;
-#pragma omp target map(to : foo_ptr, bar_ptr, baz_ptr) map(tofrom : count)
-  {
-    foo_res = __llvm_omp_indirect_call_lookup(foo_ptr);
-    ((void (*)(int *))foo_res)(&count);
-    bar_res = __llvm_omp_indirect_call_lookup(bar_ptr);
-    ((void (*)(int *))bar_res)(&count);
-    baz_res = __llvm_omp_indirect_call_lookup(baz_ptr);
-    ((void (*)(int *))baz_res)(&count);
-  }
-
-  assert(count == 3 && "Calling failed");
-
-  // CHECK: PASS
-  printf("PASS\n");
-}

diff --git a/libomptarget/test/api/omp_target_memcpy_async1.c b/libomptarget/test/api/omp_target_memcpy_async1.c
deleted file mode 100644
index 1abcfde..0000000
--- a/libomptarget/test/api/omp_target_memcpy_async1.c
+++ /dev/null

@@ -1,48 +0,0 @@
-// RUN: %libomptarget-compile-and-run-generic
-
-// Test case for omp_target_memcpy_async, oringally from GCC
-
-#include "stdio.h"
-#include <omp.h>
-#include <stdlib.h>
-
-int main() {
-  int d = omp_get_default_device();
-  int id = omp_get_initial_device();
-  int q[128], i;
-  void *p;
-
-  if (d < 0 || d >= omp_get_num_devices())
-    d = id;
-
-  p = omp_target_alloc(130 * sizeof(int), d);
-  if (p == NULL)
-    return 0;
-
-  for (i = 0; i < 128; i++)
-    q[i] = i;
-
-  if (omp_target_memcpy_async(p, q, 128 * sizeof(int), sizeof(int), 0, d, id, 0,
-                              NULL)) {
-    abort();
-  }
-
-#pragma omp taskwait
-
-  int q2[128];
-  for (i = 0; i < 128; ++i)
-    q2[i] = 0;
-  if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d,
-                              0, NULL))
-    abort();
-
-#pragma omp taskwait
-
-  for (i = 0; i < 128; ++i)
-    if (q2[i] != q[i])
-      abort();
-
-  omp_target_free(p, d);
-
-  return 0;
-}

diff --git a/libomptarget/test/api/omp_target_memcpy_async2.c b/libomptarget/test/api/omp_target_memcpy_async2.c
deleted file mode 100644
index d63f610..0000000
--- a/libomptarget/test/api/omp_target_memcpy_async2.c
+++ /dev/null

@@ -1,73 +0,0 @@
-// RUN: %libomptarget-compile-and-run-generic
-
-#include "stdio.h"
-#include <omp.h>
-#include <stdlib.h>
-
-int main() {
-  int d = omp_get_default_device();
-  int id = omp_get_initial_device();
-  int a[128], b[64], c[32], e[16], q[128], i;
-  void *p;
-
-  if (d < 0 || d >= omp_get_num_devices())
-    d = id;
-
-  p = omp_target_alloc(130 * sizeof(int), d);
-  if (p == NULL)
-    return 0;
-
-  for (i = 0; i < 128; ++i)
-    a[i] = i + 1;
-  for (i = 0; i < 64; ++i)
-    b[i] = i + 2;
-  for (i = 0; i < 32; i++)
-    c[i] = 0;
-  for (i = 0; i < 16; i++)
-    e[i] = i + 4;
-
-  omp_depend_t obj[2];
-
-#pragma omp parallel num_threads(5)
-#pragma omp single
-  {
-#pragma omp task depend(out : p)
-    omp_target_memcpy(p, a, 128 * sizeof(int), 0, 0, d, id);
-
-#pragma omp task depend(inout : p)
-    omp_target_memcpy(p, b, 64 * sizeof(int), 0, 0, d, id);
-
-#pragma omp task depend(out : c)
-    for (i = 0; i < 32; i++)
-      c[i] = i + 3;
-
-#pragma omp depobj(obj[0]) depend(inout : p)
-#pragma omp depobj(obj[1]) depend(in : c)
-    omp_target_memcpy_async(p, c, 32 * sizeof(int), 0, 0, d, id, 2, obj);
-
-#pragma omp task depend(in : p)
-    omp_target_memcpy(p, e, 16 * sizeof(int), 0, 0, d, id);
-  }
-
-#pragma omp taskwait
-
-  for (i = 0; i < 128; ++i)
-    q[i] = 0;
-  omp_target_memcpy(q, p, 128 * sizeof(int), 0, 0, id, d);
-  for (i = 0; i < 16; ++i)
-    if (q[i] != i + 4)
-      abort();
-  for (i = 16; i < 32; ++i)
-    if (q[i] != i + 3)
-      abort();
-  for (i = 32; i < 64; ++i)
-    if (q[i] != i + 2)
-      abort();
-  for (i = 64; i < 128; ++i)
-    if (q[i] != i + 1)
-      abort();
-
-  omp_target_free(p, d);
-
-  return 0;
-}

diff --git a/libomptarget/test/api/omp_target_memcpy_rect_async1.c b/libomptarget/test/api/omp_target_memcpy_rect_async1.c
deleted file mode 100644
index 4d07898..0000000
--- a/libomptarget/test/api/omp_target_memcpy_rect_async1.c
+++ /dev/null

@@ -1,66 +0,0 @@
-// RUN: %libomptarget-compile-and-run-generic
-
-#include <omp.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define NUM_DIMS 3
-
-int main() {
-  int d = omp_get_default_device();
-  int id = omp_get_initial_device();
-  int q[128], q2[128], i;
-  void *p;
-
-  if (d < 0 || d >= omp_get_num_devices())
-    d = id;
-
-  p = omp_target_alloc(130 * sizeof(int), d);
-  if (p == NULL)
-    return 0;
-
-  if (omp_target_memcpy_rect_async(NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
-                                   NULL, d, id, 0, NULL) < 3 ||
-      omp_target_memcpy_rect_async(NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
-                                   NULL, id, d, 0, NULL) < 3 ||
-      omp_target_memcpy_rect_async(NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
-                                   NULL, id, id, 0, NULL) < 3)
-    abort();
-
-  for (i = 0; i < 128; i++)
-    q[i] = 0;
-  if (omp_target_memcpy(p, q, 128 * sizeof(int), 0, 0, d, id) != 0)
-    abort();
-
-  for (i = 0; i < 128; i++)
-    q[i] = i + 1;
-
-  size_t volume[NUM_DIMS] = {1, 2, 3};
-  size_t dst_offsets[NUM_DIMS] = {0, 0, 0};
-  size_t src_offsets[NUM_DIMS] = {0, 0, 0};
-  size_t dst_dimensions[NUM_DIMS] = {3, 4, 5};
-  size_t src_dimensions[NUM_DIMS] = {2, 3, 4};
-
-  if (omp_target_memcpy_rect_async(p, q, sizeof(int), NUM_DIMS, volume,
-                                   dst_offsets, src_offsets, dst_dimensions,
-                                   src_dimensions, d, id, 0, NULL) != 0)
-    abort();
-
-#pragma omp taskwait
-
-  for (i = 0; i < 128; i++)
-    q2[i] = 0;
-  if (omp_target_memcpy(q2, p, 128 * sizeof(int), 0, 0, id, d) != 0)
-    abort();
-
-  /* q2 is expected to contain: 1 2 3 0 0 5 6 7 0 0 .. 0  */
-  if (q2[0] != 1 || q2[1] != 2 || q2[2] != 3 || q2[3] != 0 || q2[4] != 0 ||
-      q2[5] != 5 || q2[6] != 6 || q2[7] != 7)
-    abort();
-  for (i = 8; i < 128; ++i)
-    if (q2[i] != 0)
-      abort();
-
-  omp_target_free(p, d);
-  return 0;
-}

diff --git a/libomptarget/test/api/omp_target_memcpy_rect_async2.c b/libomptarget/test/api/omp_target_memcpy_rect_async2.c
deleted file mode 100644
index 1d17d88..0000000
--- a/libomptarget/test/api/omp_target_memcpy_rect_async2.c
+++ /dev/null

@@ -1,89 +0,0 @@
-// RUN: %libomptarget-compile-and-run-generic
-
-#include <omp.h>
-#include <stdlib.h>
-
-#define NUM_DIMS 3
-
-int main() {
-  int d = omp_get_default_device();
-  int id = omp_get_initial_device();
-  int a[128], b[64], c[128], e[16], q[128], i;
-  void *p;
-
-  if (d < 0 || d >= omp_get_num_devices())
-    d = id;
-
-  p = omp_target_alloc(130 * sizeof(int), d);
-  if (p == NULL)
-    return 0;
-
-  for (i = 0; i < 128; i++)
-    q[i] = 0;
-  if (omp_target_memcpy(p, q, 128 * sizeof(int), 0, 0, d, id) != 0)
-    abort();
-
-  size_t volume[NUM_DIMS] = {2, 2, 3};
-  size_t dst_offsets[NUM_DIMS] = {0, 0, 0};
-  size_t src_offsets[NUM_DIMS] = {0, 0, 0};
-  size_t dst_dimensions[NUM_DIMS] = {3, 4, 5};
-  size_t src_dimensions[NUM_DIMS] = {2, 3, 4};
-
-  for (i = 0; i < 128; i++)
-    a[i] = 42;
-  for (i = 0; i < 64; i++)
-    b[i] = 24;
-  for (i = 0; i < 128; i++)
-    c[i] = 0;
-  for (i = 0; i < 16; i++)
-    e[i] = 77;
-
-  omp_depend_t obj[2];
-
-#pragma omp parallel num_threads(5)
-#pragma omp single
-  {
-#pragma omp task depend(out : p)
-    omp_target_memcpy(p, a, 128 * sizeof(int), 0, 0, d, id);
-
-#pragma omp task depend(inout : p)
-    omp_target_memcpy(p, b, 64 * sizeof(int), 0, 0, d, id);
-
-#pragma omp task depend(out : c)
-    for (i = 0; i < 128; i++)
-      c[i] = i + 1;
-
-#pragma omp depobj(obj[0]) depend(inout : p)
-#pragma omp depobj(obj[1]) depend(in : c)
-
-    /*  This produces: 1 2 3 - - 5 6 7 - - at positions 0..9 and
-        13 14 15 - - 17 18 19 - - at positions 20..29.  */
-    omp_target_memcpy_rect_async(p, c, sizeof(int), NUM_DIMS, volume,
-                                 dst_offsets, src_offsets, dst_dimensions,
-                                 src_dimensions, d, id, 2, obj);
-
-#pragma omp task depend(in : p)
-    omp_target_memcpy(p, e, 16 * sizeof(int), 0, 0, d, id);
-  }
-
-#pragma omp taskwait
-
-  if (omp_target_memcpy(q, p, 128 * sizeof(int), 0, 0, id, d) != 0)
-    abort();
-
-  for (i = 0; i < 16; ++i)
-    if (q[i] != 77)
-      abort();
-  if (q[20] != 13 || q[21] != 14 || q[22] != 15 || q[25] != 17 || q[26] != 18 ||
-      q[27] != 19)
-    abort();
-  for (i = 28; i < 64; ++i)
-    if (q[i] != 24)
-      abort();
-  for (i = 64; i < 128; ++i)
-    if (q[i] != 42)
-      abort();
-
-  omp_target_free(p, d);
-  return 0;
-}

diff --git a/libomptarget/test/api/omp_target_memset.c b/libomptarget/test/api/omp_target_memset.c
deleted file mode 100644
index ea0c6d7..0000000
--- a/libomptarget/test/api/omp_target_memset.c
+++ /dev/null

@@ -1,45 +0,0 @@
-// RUN: %libomptarget-compile-and-run-generic
-
-#include "stdio.h"
-#include <omp.h>
-#include <stdlib.h>
-
-int main() {
-  int d = omp_get_default_device();
-  int id = omp_get_initial_device();
-  int q[128], i;
-  void *p;
-  void *result;
-
-  if (d < 0 || d >= omp_get_num_devices())
-    d = id;
-
-  p = omp_target_alloc(130 * sizeof(int), d);
-  if (p == NULL)
-    return 0;
-
-  for (i = 0; i < 128; i++)
-    q[i] = i;
-
-  result = omp_target_memset(p, 0, 130 * sizeof(int), d);
-  if (result != p) {
-    abort();
-  }
-
-  int q2[128];
-  for (i = 0; i < 128; ++i)
-    q2[i] = i;
-  if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d,
-                              0, NULL))
-    abort();
-
-#pragma omp taskwait
-
-  for (i = 0; i < 128; ++i)
-    if (q2[i] != 0)
-      abort();
-
-  omp_target_free(p, d);
-
-  return 0;
-}

diff --git a/libomptarget/test/api/ompx_3d.c b/libomptarget/test/api/ompx_3d.c
deleted file mode 100644
index 2ae17c2..0000000
--- a/libomptarget/test/api/ompx_3d.c
+++ /dev/null

@@ -1,41 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <ompx.h>
-#include <stdio.h>
-
-void foo(int device) {
-  int tid = 0, bid = 0, bdim = 0;
-#pragma omp target teams distribute parallel for map(from                      \
-                                                     : tid, bid, bdim)         \
-    device(device) thread_limit(2) num_teams(5)
-  for (int i = 0; i < 1000; ++i) {
-    if (i == 42) {
-      tid = ompx_block_dim_x();
-      bid = ompx_block_id_x();
-      bdim = ompx_grid_dim_x();
-    }
-  }
-  // CHECK: tid: 2, bid: 1, bdim: 5
-  // CHECK: tid: 2, bid: 0, bdim: 1
-  printf("tid: %i, bid: %i, bdim: %i\n", tid, bid, bdim);
-}
-
-int isGPU() { return 0; }
-#pragma omp declare variant(isGPU) match(device = {kind(gpu)})
-int isGPUvariant() { return 1; }
-
-int defaultIsGPU() {
-  int r = 0;
-#pragma omp target map(from : r)
-  r = isGPU();
-  return r;
-}
-
-int main() {
-  if (defaultIsGPU())
-    foo(omp_get_default_device());
-  else
-    printf("tid: 2, bid: 1, bdim: 5\n");
-  foo(omp_get_initial_device());
-}

diff --git a/libomptarget/test/api/ompx_3d.cpp b/libomptarget/test/api/ompx_3d.cpp
deleted file mode 100644
index 5b54912..0000000
--- a/libomptarget/test/api/ompx_3d.cpp
+++ /dev/null

@@ -1,41 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <omp.h>
-#include <ompx.h>
-#include <stdio.h>
-
-void foo(int device) {
-  int tid = 0, bid = 0, bdim = 0;
-#pragma omp target teams distribute parallel for map(from                      \
-                                                     : tid, bid, bdim)         \
-    device(device) thread_limit(2) num_teams(5)
-  for (int i = 0; i < 1000; ++i) {
-    if (i == 42) {
-      tid = ompx::block_dim_x();
-      bid = ompx::block_id_x();
-      bdim = ompx::grid_dim_x();
-    }
-  }
-  // CHECK: tid: 2, bid: 1, bdim: 5
-  // CHECK: tid: 2, bid: 0, bdim: 1
-  printf("tid: %i, bid: %i, bdim: %i\n", tid, bid, bdim);
-}
-
-int isGPU() { return 0; }
-#pragma omp declare variant(isGPU) match(device = {kind(gpu)})
-int isGPUvariant() { return 1; }
-
-int defaultIsGPU() {
-  int r = 0;
-#pragma omp target map(from : r)
-  r = isGPU();
-  return r;
-}
-
-int main() {
-  if (defaultIsGPU())
-    foo(omp_get_default_device());
-  else
-    printf("tid: 2, bid: 1, bdim: 5\n");
-  foo(omp_get_initial_device());
-}

diff --git a/libomptarget/test/api/ompx_dump_mapping_tables.cpp b/libomptarget/test/api/ompx_dump_mapping_tables.cpp
deleted file mode 100644
index c170c2d..0000000
--- a/libomptarget/test/api/ompx_dump_mapping_tables.cpp
+++ /dev/null

@@ -1,35 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <cstdio>
-#include <omp.h>
-
-#define N 10
-
-int main() {
-  int *a = new __int32_t[N];     // mapped and released from device 0
-  int *b = new __int32_t[2 * N]; // mapped to device 0
-
-  // clang-format off
-  // CHECK: Mapping tables after target enter data:
-  // CHECK-NEXT: omptarget device 0 info: OpenMP Host-Device pointer mappings after block
-  // CHECK-NEXT: omptarget device 0 info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration
-  // CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} {{[48]}}0
-  // CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} {{[48]}}0
-#pragma omp target enter data device(0) map(to : a[ : N])
-#pragma omp target enter data device(0) map(to : b[ : 2*N])
-  // clang-format on
-  printf("Mapping tables after target enter data:\n");
-  ompx_dump_mapping_tables();
-
-  // clang-format off
-  // CHECK: Mapping tables after target exit data for a:
-  // CHECK-NEXT: omptarget device 0 info: OpenMP Host-Device pointer mappings after block
-  // CHECK-NEXT: omptarget device 0 info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration
-  // CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} 80
-#pragma omp target exit data device(0) map(release : a[ : N])
-  // clang-format on
-  printf("\nMapping tables after target exit data for a:\n");
-  ompx_dump_mapping_tables();
-
-  return 0;
-}

diff --git a/libomptarget/test/api/ompx_sync.c b/libomptarget/test/api/ompx_sync.c
deleted file mode 100644
index b71eba4..0000000
--- a/libomptarget/test/api/ompx_sync.c
+++ /dev/null

@@ -1,42 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <ompx.h>
-#include <stdio.h>
-
-void foo(int device) {
-  int X;
-  // clang-format off
-#pragma omp target teams map(from: X) device(device) thread_limit(2) num_teams(1)
-#pragma omp parallel
-  // clang-format on
-  {
-    int tid = ompx_thread_id_x();
-    int bid = ompx_block_id_x();
-    if (tid == 1 && bid == 0) {
-      X = 42;
-      ompx_sync_block_divergent(3);
-    } else {
-      ompx_sync_block_divergent(1);
-    }
-    if (tid == 0 && bid == 0)
-      X++;
-    ompx_sync_block(ompx_seq_cst);
-    if (tid == 1 && bid == 0)
-      X++;
-    ompx_sync_block_acq_rel();
-    if (tid == 0 && bid == 0)
-      X++;
-    ompx_sync_block(ompx_release);
-    if (tid == 0 && bid == 0)
-      X++;
-  }
-  // CHECK: X: 46
-  // CHECK: X: 46
-  printf("X: %i\n", X);
-}
-
-int main() {
-  foo(omp_get_default_device());
-  foo(omp_get_initial_device());
-}

diff --git a/libomptarget/test/api/ompx_sync.cpp b/libomptarget/test/api/ompx_sync.cpp
deleted file mode 100644
index c6e1710..0000000
--- a/libomptarget/test/api/ompx_sync.cpp
+++ /dev/null

@@ -1,42 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <omp.h>
-#include <ompx.h>
-#include <stdio.h>
-
-void foo(int device) {
-  int X;
-  // clang-format off
-#pragma omp target teams map(from: X) device(device) thread_limit(2) num_teams(1)
-#pragma omp parallel
-  // clang-format on
-  {
-    int tid = ompx::thread_id_x();
-    int bid = ompx::block_id_x();
-    if (tid == 1 && bid == 0) {
-      X = 42;
-      ompx::sync_block_divergent(3);
-    } else {
-      ompx::sync_block_divergent();
-    }
-    if (tid == 0 && bid == 0)
-      X++;
-    ompx::sync_block(ompx::seq_cst);
-    if (tid == 1 && bid == 0)
-      X++;
-    ompx::sync_block();
-    if (tid == 0 && bid == 0)
-      X++;
-    ompx_sync_block(ompx_release);
-    if (tid == 0 && bid == 0)
-      X++;
-  }
-  // CHECK: X: 46
-  // CHECK: X: 46
-  printf("X: %i\n", X);
-}
-
-int main() {
-  foo(omp_get_default_device());
-  foo(omp_get_initial_device());
-}

diff --git a/libomptarget/test/env/base_ptr_ref_count.c b/libomptarget/test/env/base_ptr_ref_count.c
deleted file mode 100644
index 9a15568..0000000
--- a/libomptarget/test/env/base_ptr_ref_count.c
+++ /dev/null

@@ -1,44 +0,0 @@
-// RUN: %libomptarget-compile-generic && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 | %fcheck-generic
-// REQUIRES: libomptarget-debug
-
-#include <stdio.h>
-#include <stdlib.h>
-
-int *allocate(size_t n) {
-  int *ptr = malloc(sizeof(int) * n);
-#pragma omp target enter data map(to : ptr[ : n])
-  return ptr;
-}
-
-void deallocate(int *ptr, size_t n) {
-#pragma omp target exit data map(delete : ptr[ : n])
-  free(ptr);
-}
-
-#pragma omp declare target
-int *cnt;
-void foo() { ++(*cnt); }
-#pragma omp end declare target
-
-int main(void) {
-  int *A = allocate(10);
-  int *V = allocate(10);
-  deallocate(A, 10);
-  deallocate(V, 10);
-  // CHECK-NOT: RefCount=2
-  cnt = malloc(sizeof(int));
-  *cnt = 0;
-#pragma omp target map(cnt[ : 1])
-  foo();
-  printf("Cnt = %d.\n", *cnt);
-  // CHECK: Cnt = 1.
-  *cnt = 0;
-#pragma omp target data map(cnt[ : 1])
-#pragma omp target
-  foo();
-  printf("Cnt = %d.\n", *cnt);
-  // CHECK: Cnt = 1.
-  free(cnt);
-
-  return 0;
-}

diff --git a/libomptarget/test/env/omp_target_debug.c b/libomptarget/test/env/omp_target_debug.c
deleted file mode 100644
index ec81873..0000000
--- a/libomptarget/test/env/omp_target_debug.c
+++ /dev/null

@@ -1,13 +0,0 @@
-// RUN: %libomptarget-compile-generic && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 | %fcheck-generic -allow-empty -check-prefix=DEBUG
-// RUN: %libomptarget-compile-generic && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-generic 2>&1 | %fcheck-generic -allow-empty -check-prefix=NDEBUG
-// REQUIRES: libomptarget-debug
-
-int main(void) {
-#pragma omp target
-  {}
-  return 0;
-}
-
-// DEBUG: omptarget
-// NDEBUG-NOT: omptarget
-// NDEBUG-NOT: Target

diff --git a/libomptarget/test/jit/empty_kernel.inc b/libomptarget/test/jit/empty_kernel.inc
deleted file mode 100644
index 4381389..0000000
--- a/libomptarget/test/jit/empty_kernel.inc
+++ /dev/null

@@ -1,41 +0,0 @@
-int main(int argc, char** argv) {
-  #pragma omp TGT1_DIRECTIVE
-  {
-#ifdef LOOP_DIRECTIVE
-    #pragma omp LOOP_DIRECTIVE
-    for (int i = 0; i < argc; ++i)
-#endif
-    {
-#ifdef BODY_DIRECTIVE
-      #pragma omp BODY_DIRECTIVE
-      {
-      }
-#endif
-    }
-  }
-
-#ifdef TGT2_DIRECTIVE
-#pragma omp TGT2_DIRECTIVE
-  {
-#ifdef LOOP_DIRECTIVE
-    #pragma omp LOOP_DIRECTIVE
-    for (int i = 0; i < argc; ++i)
-#endif
-    {
-#ifdef BODY_DIRECTIVE
-      #pragma omp BODY_DIRECTIVE
-      {
-      }
-#endif
-    }
-  }
-#endif
-}
-
-// Check for an empty kernel (IR level)
-// FIRST:       define weak_odr {{.*}} void @__omp_offloading_{{.*}}_main
-// FIRST-NEXT:    ret void
-
-// Check for two empty kernels (IR level)
-// SECOND:      define weak_odr {{.*}} void @__omp_offloading_{{.*}}_main
-// SECOND-NEXT:   ret void

diff --git a/libomptarget/test/jit/empty_kernel_lvl1.c b/libomptarget/test/jit/empty_kernel_lvl1.c
deleted file mode 100644
index a0b8cd4..0000000
--- a/libomptarget/test/jit/empty_kernel_lvl1.c
+++ /dev/null

@@ -1,38 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compileopt-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// RUN: %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefix=FIRST
-// RUN: %libomptarget-compileopt-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target"                        \
-// RUN:     -DTGT2_DIRECTIVE="target"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// RUN: %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefixes=FIRST
-//
-// RUN: %libomptarget-compileopt-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target teams"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// RUN: %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefix=FIRST
-// RUN: %libomptarget-compileopt-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target teams"                  \
-// RUN:     -DTGT2_DIRECTIVE="target teams"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// RUN: %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefixes=FIRST,SECOND
-// clang-format on
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-#include "empty_kernel.inc"

diff --git a/libomptarget/test/jit/empty_kernel_lvl2.c b/libomptarget/test/jit/empty_kernel_lvl2.c
deleted file mode 100644
index 81a04f5..0000000
--- a/libomptarget/test/jit/empty_kernel_lvl2.c
+++ /dev/null

@@ -1,98 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target"                          \
-// RUN:     -DLOOP_DIRECTIVE="for"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// RUN: %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefix=FIRST
-// RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target"                          \
-// RUN:     -DTGT2_DIRECTIVE="target"                          \
-// RUN:     -DLOOP_DIRECTIVE="for"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// RUN: %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefixes=FIRST,SECOND
-//
-// RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target"                          \
-// RUN:     -DLOOP_DIRECTIVE="parallel for"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// TODO:
-// RUN: not %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefix=FIRST
-// RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target"                          \
-// RUN:     -DTGT2_DIRECTIVE="target teams"                    \
-// RUN:     -DLOOP_DIRECTIVE="parallel for"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// TODO:
-// RUN: not %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefixes=FIRST,SECOND
-//
-// RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target teams"                    \
-// RUN:     -DLOOP_DIRECTIVE="distribute"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// TODO:
-// RUN: not %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefix=FIRST
-// RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target teams"                    \
-// RUN:     -DTGT2_DIRECTIVE="target teams"                    \
-// RUN:     -DLOOP_DIRECTIVE="distribute"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// TODO:
-// RUN: not %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefixes=FIRST,SECOND
-//
-// RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target teams"                    \
-// RUN:     -DLOOP_DIRECTIVE="distribute parallel for"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// TODO:
-// RUN: not %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefix=FIRST
-// RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target teams"                    \
-// RUN:     -DTGT2_DIRECTIVE="target teams"                    \
-// RUN:     -DLOOP_DIRECTIVE="distribute parallel for"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// TODO:
-// RUN: not %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefixes=FIRST,SECOND
-//
-// RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target teams"                    \
-// RUN:     -DLOOP_DIRECTIVE="distribute parallel for simd"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// TODO:
-// RUN: not %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefix=FIRST
-// RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
-// RUN:     -DTGT1_DIRECTIVE="target teams"                    \
-// RUN:     -DTGT2_DIRECTIVE="target teams"                    \
-// RUN:     -DLOOP_DIRECTIVE="distribute parallel for simd"
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// TODO:
-// RUN: not %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefixes=FIRST,SECOND
-// clang-format on
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-#include "empty_kernel.inc"

diff --git a/libomptarget/test/jit/type_punning.c b/libomptarget/test/jit/type_punning.c
deleted file mode 100644
index 10e3d2c..0000000
--- a/libomptarget/test/jit/type_punning.c
+++ /dev/null

@@ -1,43 +0,0 @@
-// clang-format off
-//
-// RUN: %libomptarget-compileopt-generic -fopenmp-target-jit
-// RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
-// RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
-// RUN:     %libomptarget-run-generic
-// RUN: %fcheck-plain-generic --input-file %t.pre.ll %s
-//
-// clang-format on
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-// Ensure that there is only the kernel function left, not any outlined
-// parallel regions.
-//
-// CHECK: define
-// CHECK-NOT: define
-
-#include <omp.h>
-#include <stdio.h>
-
-void f(long *A, int N) {
-  long i = 0;
-#pragma omp target map(A[ : N])
-  {
-#pragma omp parallel firstprivate(i)
-    A[omp_get_thread_num()] = i;
-#pragma omp parallel firstprivate(i, N)
-    A[omp_get_thread_num()] += i + N;
-  }
-}
-
-int main() {
-  long A[1];
-  f(&A[0], 1);
-  printf("%li\n", A[0]);
-  return 0;
-}

diff --git a/libomptarget/test/libc/assert.c b/libomptarget/test/libc/assert.c
deleted file mode 100644
index 803a820..0000000
--- a/libomptarget/test/libc/assert.c
+++ /dev/null

@@ -1,23 +0,0 @@
-// RUN: %libomptarget-compile-generic && %libomptarget-run-fail-generic 2>&1 | \
-// RUN:   %fcheck-generic --check-prefix=CHECK
-
-// REQUIRES: libc
-
-// NVPTX without LTO uses the implementation in OpenMP currently.
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: powerpc64-ibm-linux-gnu
-// UNSUPPORTED: powerpc64-ibm-linux-gnu-LTO
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-#include <assert.h>
-
-int main() {
-  // CHECK: Assertion failed: '0 && "Trivial failure"' in function: 'int main()'
-  // CHECK-NOT: Assertion failed:
-#pragma omp target
-#pragma omp parallel
-  { assert(0 && "Trivial failure"); }
-}

diff --git a/libomptarget/test/libc/fwrite.c b/libomptarget/test/libc/fwrite.c
deleted file mode 100644
index 7ffb449..0000000
--- a/libomptarget/test/libc/fwrite.c
+++ /dev/null

@@ -1,22 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// REQUIRES: libc
-
-#include <assert.h>
-#include <stdio.h>
-
-#pragma omp declare target to(stdout)
-
-int main() {
-  int r = 0;
-// CHECK: PASS
-#pragma omp target map(from : r)
-  { r = fwrite("PASS\n", 1, sizeof("PASS\n") - 1, stdout); }
-  assert(r == sizeof("PASS\n") - 1 && "Incorrect number of bytes written");
-
-// CHECK: PASS
-#pragma omp target map(from : r) nowait
-  { r = fwrite("PASS\n", 1, 5, stdout); }
-#pragma omp taskwait
-  assert(r == sizeof("PASS\n") - 1 && "Incorrect number of bytes written");
-}

diff --git a/libomptarget/test/libc/global_ctor_dtor.cpp b/libomptarget/test/libc/global_ctor_dtor.cpp
deleted file mode 100644
index a2d2a48..0000000
--- a/libomptarget/test/libc/global_ctor_dtor.cpp
+++ /dev/null

@@ -1,37 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-// REQUIRES: libc
-
-#include <stdio.h>
-
-#pragma omp begin declare target device_type(nohost)
-
-// CHECK: void ctor1()
-// CHECK: void ctor2()
-// CHECK: void ctor3()
-[[gnu::constructor(101)]] void ctor1() { puts(__PRETTY_FUNCTION__); }
-[[gnu::constructor(102)]] void ctor2() { puts(__PRETTY_FUNCTION__); }
-[[gnu::constructor(103)]] void ctor3() { puts(__PRETTY_FUNCTION__); }
-
-struct S {
-  S() { puts(__PRETTY_FUNCTION__); }
-  ~S() { puts(__PRETTY_FUNCTION__); }
-};
-
-// CHECK: S::S()
-// CHECK: S::~S()
-S s;
-
-// CHECK: void dtor3()
-// CHECK: void dtor2()
-// CHECK: void dtor1()
-[[gnu::destructor(101)]] void dtor1() { puts(__PRETTY_FUNCTION__); }
-[[gnu::destructor(103)]] void dtor3() { puts(__PRETTY_FUNCTION__); }
-[[gnu::destructor(102)]] void dtor2() { puts(__PRETTY_FUNCTION__); }
-
-#pragma omp end declare target
-
-int main() {
-#pragma omp target
-  ;
-}

diff --git a/libomptarget/test/libc/host_call.c b/libomptarget/test/libc/host_call.c
deleted file mode 100644
index 11260cc..0000000
--- a/libomptarget/test/libc/host_call.c
+++ /dev/null

@@ -1,54 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// REQUIRES: libc
-
-#include <assert.h>
-#include <omp.h>
-#include <stdio.h>
-
-#pragma omp begin declare variant match(device = {kind(gpu)})
-// Extension provided by the 'libc' project.
-void rpc_host_call(void *fn, void *args, size_t size);
-#pragma omp declare target to(rpc_host_call) device_type(nohost)
-#pragma omp end declare variant
-
-#pragma omp begin declare variant match(device = {kind(cpu)})
-// Dummy host implementation to make this work for all targets.
-void rpc_host_call(void *fn, void *args, size_t size) {
-  ((void (*)(void *))fn)(args);
-}
-#pragma omp end declare variant
-
-typedef struct args_s {
-  int thread_id;
-  int block_id;
-} args_t;
-
-// CHECK-DAG: Thread: 0, Block: 0
-// CHECK-DAG: Thread: 1, Block: 0
-// CHECK-DAG: Thread: 0, Block: 1
-// CHECK-DAG: Thread: 1, Block: 1
-// CHECK-DAG: Thread: 0, Block: 2
-// CHECK-DAG: Thread: 1, Block: 2
-// CHECK-DAG: Thread: 0, Block: 3
-// CHECK-DAG: Thread: 1, Block: 3
-void foo(void *data) {
-  assert(omp_is_initial_device() && "Not executing on host?");
-  args_t *args = (args_t *)data;
-  printf("Thread: %d, Block: %d\n", args->thread_id, args->block_id);
-}
-
-void *fn_ptr = NULL;
-#pragma omp declare target to(fn_ptr)
-
-int main() {
-  fn_ptr = (void *)&foo;
-#pragma omp target update to(fn_ptr)
-
-#pragma omp target teams num_teams(4)
-#pragma omp parallel num_threads(2)
-  {
-    args_t args = {omp_get_thread_num(), omp_get_team_num()};
-    rpc_host_call(fn_ptr, &args, sizeof(args_t));
-  }
-}

diff --git a/libomptarget/test/libc/malloc.c b/libomptarget/test/libc/malloc.c
deleted file mode 100644
index b587b61..0000000
--- a/libomptarget/test/libc/malloc.c
+++ /dev/null

@@ -1,37 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// REQUIRES: libc
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#pragma omp declare target to(malloc)
-#pragma omp declare target to(free)
-
-int main() {
-  unsigned h_x;
-  unsigned *d_x;
-#pragma omp target map(from : d_x)
-  {
-    d_x = (unsigned *)malloc(sizeof(unsigned));
-    *d_x = 1;
-  }
-
-#pragma omp target is_device_ptr(d_x) map(from : h_x)
-  { h_x = *d_x; }
-
-#pragma omp target is_device_ptr(d_x)
-  { free(d_x); }
-
-#pragma omp target teams num_teams(64)
-#pragma omp parallel num_threads(32)
-  {
-    int *ptr = (int *)malloc(sizeof(int));
-    *ptr = 42;
-    free(ptr);
-  }
-
-  // CHECK: PASS
-  if (h_x == 1)
-    fputs("PASS\n", stdout);
-}

diff --git a/libomptarget/test/libc/puts.c b/libomptarget/test/libc/puts.c
deleted file mode 100644
index 0e363f5..0000000
--- a/libomptarget/test/libc/puts.c
+++ /dev/null

@@ -1,35 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// REQUIRES: libc
-
-#include <stdio.h>
-
-#pragma omp declare target to(stdout)
-
-int main() {
-// CHECK: PASS
-#pragma omp target
-  { fputs("PASS\n", stdout); }
-
-// CHECK: PASS
-#pragma omp target nowait
-  { fputs("PASS\n", stdout); }
-
-// CHECK: PASS
-#pragma omp target nowait
-  { fputs("PASS\n", stdout); }
-
-#pragma omp taskwait
-
-// CHECK: PASS
-// CHECK: PASS
-// CHECK: PASS
-// CHECK: PASS
-// CHECK: PASS
-// CHECK: PASS
-// CHECK: PASS
-// CHECK: PASS
-#pragma omp target teams num_teams(4)
-#pragma omp parallel num_threads(2)
-  { puts("PASS\n"); }
-}

diff --git a/libomptarget/test/lit.cfg b/libomptarget/test/lit.cfg
deleted file mode 100644
index 6c59060..0000000
--- a/libomptarget/test/lit.cfg
+++ /dev/null

@@ -1,399 +0,0 @@
-# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
-# Configuration file for the 'lit' test runner.
-
-import os
-import lit.formats
-
-# Tell pylint that we know config and lit_config exist somewhere.
-if 'PYLINT_IMPORT' in os.environ:
-    config = object()
-    lit_config = object()
-
-# Use the CUDA device as suggested by the env
-if 'CUDA_VISIBLE_DEVICES' in os.environ:
-    config.environment['CUDA_VISIBLE_DEVICES'] = os.environ['CUDA_VISIBLE_DEVICES']
-
-# Use the ROCR device as suggested by the env
-if 'ROCR_VISIBLE_DEVICES' in os.environ:
-    config.environment['ROCR_VISIBLE_DEVICES'] = os.environ['ROCR_VISIBLE_DEVICES']
-
-# Allow running the tests with omptarget debug output
-if 'LIBOMPTARGET_DEBUG' in os.environ:
-    config.environment['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG']
-
-# Allow running the tests with nextgen plugins when available
-if 'LIBOMPTARGET_NEXTGEN_PLUGINS' in os.environ:
-    config.environment['LIBOMPTARGET_NEXTGEN_PLUGINS'] = os.environ['LIBOMPTARGET_NEXTGEN_PLUGINS']
-
-if 'LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS' in os.environ:
-    config.environment['LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS'] = os.environ['LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS']
-
-if 'OMP_TARGET_OFFLOAD' in os.environ:
-    config.environment['OMP_TARGET_OFFLOAD'] = os.environ['OMP_TARGET_OFFLOAD']
-
-if 'HSA_ENABLE_SDMA' in os.environ:
-    config.environment['HSA_ENABLE_SDMA'] = os.environ['HSA_ENABLE_SDMA']
-
-# Architectures like gfx942 may or may not be APUs so an additional environment
-# variable is required as some tests can be APU specific.
-if 'IS_APU' in os.environ:
-    config.environment['IS_APU'] = os.environ['IS_APU']
-
-# set default environment variables for test
-if 'CHECK_OPENMP_ENV' in os.environ:
-    test_env = os.environ['CHECK_OPENMP_ENV'].split()
-    for env in test_env:
-        name = env.split('=')[0]
-        value = env.split('=')[1]
-        config.environment[name] = value
-
-def append_dynamic_library_path(name, value, sep):
-    if name in config.environment:
-        config.environment[name] = value + sep + config.environment[name]
-    else:
-        config.environment[name] = value
-
-# Evaluate the environment variable which is a string boolean value.
-def evaluate_bool_env(env):
-    env = env.lower()
-    possible_true_values = ["on", "true", "1"]
-    for v in possible_true_values:
-        if env == v:
-            return True
-    return False
-
-# name: The name of this test suite.
-config.name = 'libomptarget :: ' + config.libomptarget_current_target
-
-# suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.c', '.cpp', '.cc', '.f90']
-
-# excludes: A list of directories to exclude from the testuites.
-config.excludes = ['Inputs']
-
-# test_source_root: The root path where tests are located.
-config.test_source_root = os.path.dirname(__file__)
-
-# test_exec_root: The root object directory where output is placed
-config.test_exec_root = config.libomptarget_obj_root
-
-# test format
-config.test_format = lit.formats.ShTest()
-
-# compiler flags
-config.test_flags = " -I " + config.test_source_root + \
-    " -I " + config.omp_header_directory + \
-    " -L " + config.library_dir + \
-    " -L " + config.llvm_lib_directory
-
-# compiler specific flags
-config.test_flags_clang = ""
-config.test_flags_flang = ""
-
-if config.omp_host_rtl_directory:
-    config.test_flags = config.test_flags + " -L " + \
-        config.omp_host_rtl_directory
-
-config.test_flags = config.test_flags + " " + config.test_extra_flags
-
-# Allow REQUIRES / UNSUPPORTED / XFAIL to work
-config.target_triple = [ ]
-for feature in config.test_compiler_features:
-    config.available_features.add(feature)
-
-if config.libomptarget_debug:
-  config.available_features.add('libomptarget-debug')
-
-if config.has_libomptarget_ompt:
-  config.available_features.add('ompt')
-
-config.available_features.add(config.libomptarget_current_target)
-
-if config.libomptarget_has_libc:
-  config.available_features.add('libc')
-
-# Determine whether the test system supports unified memory.
-# For CUDA, this is the case with compute capability 70 (Volta) or higher.
-# For all other targets, we currently assume it is.
-supports_unified_shared_memory = True
-supports_apu = False
-if config.libomptarget_current_target.startswith('nvptx'):
-  try:
-    cuda_arch = int(config.cuda_test_arch[:3])
-    if cuda_arch < 70:
-      supports_unified_shared_memory = False
-  except ValueError:
-    # If the architecture is invalid, assume it is supported.
-    supports_unified_shared_memory = True
-elif config.libomptarget_current_target.startswith('amdgcn'):
-    # amdgpu_test_arch contains a list of AMD GPUs in the system
-    # only check the first one assuming that we will run the test on it.
-    if not (config.amdgpu_test_arch.startswith("gfx90a") or
-            config.amdgpu_test_arch.startswith("gfx940") or
-            config.amdgpu_test_arch.startswith("gfx942")):
-       supports_unified_shared_memory = False
-    # check if AMD architecture is an APU:
-    if (config.amdgpu_test_arch.startswith("gfx940") or
-        (config.amdgpu_test_arch.startswith("gfx942") and
-         evaluate_bool_env(config.environment['IS_APU']))):
-       supports_apu = True
-if supports_unified_shared_memory:
-   config.available_features.add('unified_shared_memory')
-if supports_apu:
-   config.available_features.add('apu')
-
-# Setup environment to find dynamic library at runtime
-if config.operating_system == 'Windows':
-    append_dynamic_library_path('PATH', config.library_dir, ";")
-    append_dynamic_library_path('PATH', config.omp_host_rtl_directory, ";")
-elif config.operating_system == 'Darwin':
-    append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":")
-    append_dynamic_library_path('DYLD_LIBRARY_PATH', \
-        config.omp_host_rtl_directory, ";")
-    config.test_flags += " -Wl,-rpath," + config.library_dir
-    config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
-else: # Unices
-    if config.libomptarget_current_target != "nvptx64-nvidia-cuda":
-        config.test_flags += " -nogpulib"
-    config.test_flags += " -Wl,-rpath," + config.library_dir
-    config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
-    config.test_flags += " -Wl,-rpath," + config.llvm_lib_directory
-    if config.cuda_libdir:
-        config.test_flags += " -Wl,-rpath," + config.cuda_libdir
-    if config.libomptarget_current_target.startswith('nvptx'):
-        config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.library_dir + '/DeviceRTL'
-    if config.libomptarget_current_target.endswith('-LTO'):
-        config.test_flags += " -foffload-lto"
-    if config.libomptarget_current_target.endswith('-JIT-LTO') and evaluate_bool_env(
-        config.environment['LIBOMPTARGET_NEXTGEN_PLUGINS']
-    ):
-        config.test_flags += " -foffload-lto"
-        config.test_flags += " -Wl,--embed-bitcode"
-
-def remove_suffix_if_present(name):
-    if name.endswith('-LTO'):
-        return name[:-4]
-    elif name.endswith('-JIT-LTO'):
-        return name[:-8]
-    else:
-        return name
-
-def add_libraries(source):
-    if config.libomptarget_has_libc:
-        if config.libomptarget_current_target.startswith('nvptx'):
-            return source + " " + config.llvm_library_dir + "/libcgpu-nvptx.a " + \
-                   config.llvm_library_intdir + "/libomptarget.devicertl.a"
-        elif config.libomptarget_current_target.startswith('amdgcn'):
-            return source + " " + config.llvm_library_dir + "/libcgpu-amdgpu.a " + \
-                   config.llvm_library_intdir + "/libomptarget.devicertl.a"
-    return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
-
-# substitutions
-# - for targets that exist in the system create the actual command.
-# - for valid targets that do not exist in the system, return false, so that the
-#   same test can be used for different targets.
-
-# Scan all the valid targets.
-for libomptarget_target in config.libomptarget_all_targets:
-    # Is this target in the current system? If so create a compile, run and test
-    # command. Otherwise create command that return false.
-    if libomptarget_target == config.libomptarget_current_target:
-        config.substitutions.append(("%libomptarget-compilexx-run-and-check-generic",
-            "%libomptarget-compilexx-run-and-check-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compile-run-and-check-generic",
-            "%libomptarget-compile-run-and-check-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compile-fortran-run-and-check-generic",
-            "%libomptarget-compile-fortran-run-and-check-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compilexx-and-run-generic",
-            "%libomptarget-compilexx-and-run-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compile-and-run-generic",
-            "%libomptarget-compile-and-run-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compilexx-generic",
-            "%libomptarget-compilexx-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compilexxx-generic-force-usm",
-            "%libomptarget-compilexxx-force-usm-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compile-generic",
-            "%libomptarget-compile-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compile-fortran-generic",
-            "%libomptarget-compile-fortran-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compileoptxx-run-and-check-generic",
-            "%libomptarget-compileoptxx-run-and-check-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compileopt-run-and-check-generic",
-            "%libomptarget-compileopt-run-and-check-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compileoptxx-and-run-generic",
-            "%libomptarget-compileoptxx-and-run-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compileopt-and-run-generic",
-            "%libomptarget-compileopt-and-run-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compileoptxx-generic",
-            "%libomptarget-compileoptxx-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compileopt-generic",
-            "%libomptarget-compileopt-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-run-generic",
-            "%libomptarget-run-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-run-fail-generic",
-            "%libomptarget-run-fail-" + libomptarget_target))
-        config.substitutions.append(("%clangxx-generic",
-            "%clangxx-" + libomptarget_target))
-        config.substitutions.append(("%clang-generic",
-            "%clang-" + libomptarget_target))
-        config.substitutions.append(("%fcheck-generic",
-            config.libomptarget_filecheck + " %s"))
-        config.substitutions.append(("%fcheck-plain-generic",
-            config.libomptarget_filecheck))
-
-
-        config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \
-            libomptarget_target, \
-            "%libomptarget-compilexx-and-run-" + libomptarget_target + \
-            " | " + config.libomptarget_filecheck + " %s"))
-        config.substitutions.append(("%libomptarget-compile-run-and-check-" + \
-            libomptarget_target, \
-            "%libomptarget-compile-and-run-" + libomptarget_target + \
-            " | " + config.libomptarget_filecheck + " %s"))
-        config.substitutions.append(("%libomptarget-compile-fortran-run-and-check-" + \
-            libomptarget_target, \
-            "%libomptarget-compile-fortran-and-run-" + libomptarget_target + \
-            " | " + config.libomptarget_filecheck + " %s"))
-        config.substitutions.append(("%libomptarget-compilexx-and-run-" + \
-            libomptarget_target, \
-            "%libomptarget-compilexx-" + libomptarget_target + " && " + \
-            "%libomptarget-run-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compile-and-run-" + \
-            libomptarget_target, \
-            "%libomptarget-compile-" + libomptarget_target + " && " + \
-            "%libomptarget-run-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compile-fortran-and-run-" + \
-            libomptarget_target, \
-            "%libomptarget-compile-fortran-" + libomptarget_target + " && " + \
-            "%libomptarget-run-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compilexx-" + \
-            libomptarget_target, \
-            "%clangxx-" + libomptarget_target + add_libraries(" %s -o %t")))
-        config.substitutions.append(("%libomptarget-compilexxx-force-usm-" +
-            libomptarget_target, "%clangxxx-force-usm-" + libomptarget_target + \
-                                     add_libraries(" %s -o %t")))
-        config.substitutions.append(("%libomptarget-compile-" + \
-            libomptarget_target, \
-            "%clang-" + libomptarget_target + add_libraries(" %s -o %t")))
-        config.substitutions.append(("%libomptarget-compile-fortran-" + \
-            libomptarget_target, \
-            "%flang-" + libomptarget_target + add_libraries(" %s -o %t")))
-        config.substitutions.append(("%libomptarget-compileoptxx-run-and-check-" + \
-            libomptarget_target, \
-            "%libomptarget-compileoptxx-and-run-" + libomptarget_target + \
-            " | " + config.libomptarget_filecheck + " %s"))
-        config.substitutions.append(("%libomptarget-compileopt-run-and-check-" + \
-            libomptarget_target, \
-            "%libomptarget-compileopt-and-run-" + libomptarget_target + \
-            " | " + config.libomptarget_filecheck + " %s"))
-        config.substitutions.append(("%libomptarget-compileoptxx-and-run-" + \
-            libomptarget_target, \
-            "%libomptarget-compileoptxx-" + libomptarget_target + " && " + \
-            "%libomptarget-run-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compileopt-and-run-" + \
-            libomptarget_target, \
-            "%libomptarget-compileopt-" + libomptarget_target + " && " + \
-            "%libomptarget-run-" + libomptarget_target))
-        config.substitutions.append(("%libomptarget-compileoptxx-" + \
-            libomptarget_target, \
-            "%clangxx-" + libomptarget_target + add_libraries(" -O3 %s -o %t")))
-        config.substitutions.append(("%libomptarget-compileopt-" + \
-            libomptarget_target, \
-            "%clang-" + libomptarget_target + add_libraries(" -O3 %s -o %t")))
-        config.substitutions.append(("%libomptarget-run-" + \
-            libomptarget_target, \
-            "%t"))
-        config.substitutions.append(("%libomptarget-run-fail-" + \
-            libomptarget_target, \
-            "%not --crash %t"))
-        config.substitutions.append(("%clangxx-" + libomptarget_target, \
-                                     "%clangxx %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
-                                     remove_suffix_if_present(libomptarget_target)))
-        config.substitutions.append(("%clangxxx-force-usm-" + libomptarget_target, \
-                                     "%clangxx %openmp_flags -fopenmp-force-usm  %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
-                                     remove_suffix_if_present(libomptarget_target)))
-        config.substitutions.append(("%clang-" + libomptarget_target, \
-                                     "%clang %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
-                                     remove_suffix_if_present(libomptarget_target)))
-        config.substitutions.append(("%flang-" + libomptarget_target, \
-                                     "%flang %openmp_flags %flags %flags_flang -fopenmp-targets=" +\
-                                     remove_suffix_if_present(libomptarget_target)))
-        config.substitutions.append(("%fcheck-" + libomptarget_target, \
-            config.libomptarget_filecheck + " %s"))
-    else:
-        config.substitutions.append(("%libomptarget-compile-run-and-check-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compile-fortran-run-and-check-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compile-and-run-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compile-fortran-and-run-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compilexx-and-run-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compilexx-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compile-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compile-fortran-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compileopt-run-and-check-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compileoptxx-run-and-check-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compileopt-and-run-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compileoptxx-and-run-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compileoptxx-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-compileopt-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-run-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%libomptarget-run-fail-" + \
-            libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%clang-" + libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%clangxx-" + libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%fcheck-" + libomptarget_target, \
-            "echo ignored-command"))
-        config.substitutions.append(("%flang-" + libomptarget_target, \
-            "echo ignored-command"))
-
-config.substitutions.append(("%clangxx", config.test_cxx_compiler))
-config.substitutions.append(("%clang", config.test_c_compiler))
-
-if config.test_fortran_compiler:
-    config.available_features.add('flang')
-    config.substitutions.append(("%flang", config.test_fortran_compiler))
-
-config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
-if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path:
-    config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path))
-else:
-    config.substitutions.append(("%cuda_flags", ""))
-config.substitutions.append(("%flags_clang", config.test_flags_clang))
-config.substitutions.append(("%flags_flang", config.test_flags_flang))
-config.substitutions.append(("%flags", config.test_flags))
-config.substitutions.append(("%not", config.libomptarget_not))

diff --git a/libomptarget/test/lit.site.cfg.in b/libomptarget/test/lit.site.cfg.in
deleted file mode 100644
index 7c75aaa..0000000
--- a/libomptarget/test/lit.site.cfg.in
+++ /dev/null

@@ -1,31 +0,0 @@
-@AUTO_GEN_COMMENT@
-
-config.bin_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin"
-config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
-config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
-config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@"
-config.test_compiler_features = @OPENMP_TEST_COMPILER_FEATURES@
-config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@"
-config.test_extra_flags = "@OPENMP_TEST_FLAGS@"
-config.cuda_path = "@CUDA_TOOLKIT_ROOT_DIR@"
-config.cuda_libdir = "@CUDA_LIBDIR@"
-config.cuda_test_arch = "@LIBOMPTARGET_DEP_CUDA_ARCH@"
-config.amdgpu_test_arch = "@LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST@"
-config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@/@CURRENT_TARGET@"
-config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@"
-config.llvm_library_dir = "@LIBOMPTARGET_LLVM_LIBRARY_DIR@"
-config.llvm_library_intdir = "@LIBOMPTARGET_LLVM_LIBRARY_INTDIR@"
-config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@"
-config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@"
-config.llvm_lib_directory = "@LIBOMPTARGET_LLVM_LIBRARY_DIR@"
-config.operating_system = "@CMAKE_SYSTEM_NAME@"
-config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split()
-config.libomptarget_current_target = "@CURRENT_TARGET@"
-config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
-config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@"
-config.libomptarget_debug = @LIBOMPTARGET_DEBUG@
-config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@
-config.libomptarget_has_libc = @LIBOMPTARGET_GPU_LIBC_SUPPORT@
-
-# Let the main config do the real work.
-lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")

diff --git a/libomptarget/test/mapping/alloc_fail.c b/libomptarget/test/mapping/alloc_fail.c
deleted file mode 100644
index c4ae70f..0000000
--- a/libomptarget/test/mapping/alloc_fail.c
+++ /dev/null

@@ -1,15 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// CHECK: omptarget message: explicit extension not allowed: host address specified is 0x{{.*}} (8 bytes), but device allocation maps to host at 0x{{.*}} (8 bytes)
-// CHECK: omptarget error: Call to getTargetPointer returned null pointer (device failure or illegal mapping).
-// CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-
-int main() {
-  int arr[4] = {0, 1, 2, 3};
-#pragma omp target data map(alloc : arr[0 : 2])
-#pragma omp target data map(alloc : arr[1 : 2])
-  ;
-  return 0;
-}

diff --git a/libomptarget/test/mapping/array_section_implicit_capture.c b/libomptarget/test/mapping/array_section_implicit_capture.c
deleted file mode 100644
index 210b7e5..0000000
--- a/libomptarget/test/mapping/array_section_implicit_capture.c
+++ /dev/null

@@ -1,58 +0,0 @@
-// RUN: %libomptarget-compile-generic 
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#define N 1024
-#define FROM 64
-#define LENGTH 128
-
-int main() {
-  float *A = (float *)malloc(N * sizeof(float));
-  float *B = (float *)malloc(N * sizeof(float));
-  float *C = (float *)malloc(N * sizeof(float));
-
-  for (int i = 0; i < N; i++) {
-    C[i] = 0.0;
-  }
-
-  for (int i = 0; i < N; i++) {
-    A[i] = i;
-    B[i] = 2 * i;
-  }
-
-#pragma omp target enter data map(to : A[FROM : LENGTH], B[FROM : LENGTH])
-#pragma omp target enter data map(alloc : C[FROM : LENGTH])
-
-// A, B and C have been mapped starting at index FROM, but inside the kernel
-// they are captured implicitly so the library must look them up using their
-// base address.
-#pragma omp target
-  {
-    for (int i = FROM; i < FROM + LENGTH; i++) {
-      C[i] = A[i] + B[i];
-    }
-  }
-
-#pragma omp target exit data map(from : C[FROM : LENGTH])
-#pragma omp target exit data map(delete : A[FROM : LENGTH], B[FROM : LENGTH])
-
-  int errors = 0;
-  for (int i = FROM; i < FROM + LENGTH; i++)
-    if (C[i] != A[i] + B[i])
-      ++errors;
-
-  // CHECK: Success
-  if (errors)
-    fprintf(stderr, "Failure\n");
-  else
-    fprintf(stderr, "Success\n");
-
-  free(A);
-  free(B);
-  free(C);
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/array_section_use_device_ptr.c b/libomptarget/test/mapping/array_section_use_device_ptr.c
deleted file mode 100644
index 86e2875..0000000
--- a/libomptarget/test/mapping/array_section_use_device_ptr.c
+++ /dev/null

@@ -1,35 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#define N 1024
-#define FROM 64
-#define LENGTH 128
-
-int main() {
-  float *A = (float *)malloc(N * sizeof(float));
-
-#pragma omp target enter data map(to : A[FROM : LENGTH])
-
-  // A, has been mapped starting at index FROM, but inside the use_device_ptr
-  // clause it is captured by base so the library must look it up using the
-  // base address.
-
-  float *A_dev = NULL;
-#pragma omp target data use_device_ptr(A)
-  { A_dev = A; }
-#pragma omp target exit data map(delete : A[FROM : LENGTH])
-
-  // CHECK: Success
-  if (A_dev == NULL || A_dev == A)
-    fprintf(stderr, "Failure\n");
-  else
-    fprintf(stderr, "Success\n");
-
-  free(A);
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/auto_zero_copy.cpp b/libomptarget/test/mapping/auto_zero_copy.cpp
deleted file mode 100644
index 4664120..0000000
--- a/libomptarget/test/mapping/auto_zero_copy.cpp
+++ /dev/null

@@ -1,59 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compilexx-generic
-// RUN: env OMPX_APU_MAPS=1 HSA_XNACK=1 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic -check-prefix=INFO_ZERO -check-prefix=CHECK
-
-// RUN: %libomptarget-compilexx-generic
-// RUN: env HSA_XNACK=0 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic -check-prefix=INFO_COPY -check-prefix=CHECK
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-// REQUIRES: unified_shared_memory
-
-// clang-format on
-
-#include <cstdio>
-
-int main() {
-  int n = 1024;
-
-  // test various mapping types
-  int *a = new int[n];
-  int k = 3;
-  int b[n];
-
-  for (int i = 0; i < n; i++)
-    b[i] = i;
-
-    // clang-format off
-  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
-  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
-
-  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
-  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
-  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
-  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
-// clang-format on
-#pragma omp target teams distribute parallel for map(tofrom : a[ : n])         \
-    map(to : b[ : n])
-  for (int i = 0; i < n; i++)
-    a[i] = i + b[i] + k;
-
-  int err = 0;
-  for (int i = 0; i < n; i++)
-    if (a[i] != i + b[i] + k)
-      err++;
-
-  // CHECK: PASS
-  if (err == 0)
-    printf("PASS\n");
-  return err;
-}

diff --git a/libomptarget/test/mapping/auto_zero_copy_apu.cpp b/libomptarget/test/mapping/auto_zero_copy_apu.cpp
deleted file mode 100644
index 48360e4..0000000
--- a/libomptarget/test/mapping/auto_zero_copy_apu.cpp
+++ /dev/null

@@ -1,57 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compilexx-generic
-// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic -check-prefix=INFO_ZERO -check-prefix=CHECK
-
-// RUN: %libomptarget-compilexx-generic
-// RUN: env HSA_XNACK=0 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic -check-prefix=INFO_COPY -check-prefix=CHECK
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-// REQUIRES: apu
-
-// clang-format on
-
-#include <cstdio>
-
-int main() {
-  int n = 1024;
-
-  // test various mapping types
-  int *a = new int[n];
-  int k = 3;
-  int b[n];
-
-  for (int i = 0; i < n; i++)
-    b[i] = i;
-
-    // clang-format off
-  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
-  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
-
-  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
-  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
-  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
-  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
-// clang-format on
-#pragma omp target teams distribute parallel for map(tofrom : a[ : n])         \
-    map(to : b[ : n])
-  for (int i = 0; i < n; i++)
-    a[i] = i + b[i] + k;
-
-  int err = 0;
-  for (int i = 0; i < n; i++)
-    if (a[i] != i + b[i] + k)
-      err++;
-
-  // CHECK: PASS
-  if (err == 0)
-    printf("PASS\n");
-  return err;
-}

diff --git a/libomptarget/test/mapping/auto_zero_copy_globals.cpp b/libomptarget/test/mapping/auto_zero_copy_globals.cpp
deleted file mode 100644
index 55dfb28..0000000
--- a/libomptarget/test/mapping/auto_zero_copy_globals.cpp
+++ /dev/null

@@ -1,87 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compilexx-generic
-// RUN: env OMPX_APU_MAPS=1 HSA_XNACK=1 LIBOMPTARGET_INFO=60 %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic -check-prefix=CHECK
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-// REQUIRES: unified_shared_memory
-
-// clang-format on
-
-#include <cstdint>
-#include <cstdio>
-
-/// Test for globals under automatic zero-copy.
-/// Because we are building without unified_shared_memory
-/// requirement pragma, all globals are allocated in the device
-/// memory of all used GPUs. To ensure those globals contain the intended
-/// values, we need to execute H2D and D2H memory copies even if we are running
-/// in automatic zero-copy. This only applies to globals. Local variables (their
-/// host pointers) are passed to the kernels by-value, according to the
-/// automatic zero-copy behavior.
-
-#pragma omp begin declare target
-int32_t x;     // 4 bytes
-int32_t z[10]; // 40 bytes
-int32_t *k;    // 20 bytes
-#pragma omp end declare target
-
-int main() {
-  int32_t *dev_k = nullptr;
-  x = 3;
-  int32_t y = -1;
-  for (size_t t = 0; t < 10; t++)
-    z[t] = t;
-  k = new int32_t[5];
-
-  printf("Host pointer for k = %p\n", k);
-  for (size_t t = 0; t < 5; t++)
-    k[t] = -t;
-
-/// target update to forces a copy between host and device global, which we must
-/// execute to keep the two global copies consistent. CHECK: Copying data from
-/// host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=40, Name=z
-#pragma omp target update to(z[ : 10])
-
-/// target map with always modifier (for x) forces a copy between host and
-/// device global, which we must execute to keep the two global copies
-/// consistent. k's content (host address) is passed by-value to the kernel
-/// (Size=20 case). y, being a local variable, is also passed by-value to the
-/// kernel (Size=4 case) CHECK: Return HstPtrBegin {{.*}} Size=4 for unified
-/// shared memory CHECK: Return HstPtrBegin {{.*}} Size=20 for unified shared
-/// memory CHECK: Copying data from host to device, HstPtr={{.*}},
-/// TgtPtr={{.*}}, Size=4, Name=x
-#pragma omp target map(to : k[ : 5]) map(always, tofrom : x) map(tofrom : y)   \
-    map(from : dev_k)
-  {
-    x++;
-    y++;
-    for (size_t t = 0; t < 10; t++)
-      z[t]++;
-    dev_k = k;
-  }
-/// CHECK-NOT: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}},
-/// Size=20, Name=k
-
-/// CHECK: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}},
-/// Size=4, Name=x
-
-/// CHECK: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}},
-/// Size=40, Name=z
-#pragma omp target update from(z[ : 10])
-
-  /// CHECK-NOT: k pointer not correctly passed to kernel
-  if (dev_k != k)
-    printf("k pointer not correctly passed to kernel\n");
-
-  delete[] k;
-  return 0;
-}

diff --git a/libomptarget/test/mapping/data_absent_at_exit.c b/libomptarget/test/mapping/data_absent_at_exit.c
deleted file mode 100644
index 5baaf93..0000000
--- a/libomptarget/test/mapping/data_absent_at_exit.c
+++ /dev/null

@@ -1,28 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <stdio.h>
-
-// OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
-// "If the map clause appears on a target, target data, or target exit data
-// construct and a corresponding list item of the original list item is not
-// present in the device data environment on exit from the region then the
-// list item is ignored."
-
-int main(void) {
-  int f = 5, r = 6, d = 7, af = 8;
-
-// Check exit from omp target data.
-// CHECK: f = 5, af = 8
-#pragma omp target data map(from : f) map(always, from : af)
-  {
-#pragma omp target exit data map(delete : f, af)
-  } printf("f = %d, af = %d\n", f, af);
-
-// Check omp target exit data.
-// CHECK: f = 5, r = 6, d = 7, af = 8
-#pragma omp target exit data map(from : f) map(release : r) map(delete : d)    \
-    map(always, from : af)
-  printf("f = %d, r = %d, d = %d, af = %d\n", f, r, d, af);
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/data_member_ref.cpp b/libomptarget/test/mapping/data_member_ref.cpp
deleted file mode 100644
index fdb8abc..0000000
--- a/libomptarget/test/mapping/data_member_ref.cpp
+++ /dev/null

@@ -1,78 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <stdio.h>
-
-struct View {
-  int Data;
-};
-
-struct ViewPtr {
-  int *Data;
-};
-
-template <typename T> struct Foo {
-  Foo(T &V) : VRef(V) {}
-  T &VRef;
-};
-
-int main() {
-  View V;
-  V.Data = 123456;
-  Foo<View> Bar(V);
-  ViewPtr V1;
-  int Data = 123456;
-  V1.Data = &Data;
-  Foo<ViewPtr> Baz(V1);
-  int D1, D2;
-
-  // CHECK: Host 123456.
-  printf("Host %d.\n", Bar.VRef.Data);
-#pragma omp target map(Bar.VRef) map(from : D1, D2)
-  {
-    // CHECK: Device 123456.
-    D1 = Bar.VRef.Data;
-    printf("Device %d.\n", D1);
-    V.Data = 654321;
-    // CHECK: Device 654321.
-    D2 = Bar.VRef.Data;
-    printf("Device %d.\n", D2);
-  }
-  printf("Device %d.\n", D1);
-  printf("Device %d.\n", D2);
-  // CHECK: Host 654321 654321.
-  printf("Host %d %d.\n", Bar.VRef.Data, V.Data);
-  V.Data = 123456;
-  // CHECK: Host 123456.
-  printf("Host %d.\n", Bar.VRef.Data);
-#pragma omp target map(Bar) map(Bar.VRef) map(from : D1, D2)
-  {
-    // CHECK: Device 123456.
-    D1 = Bar.VRef.Data;
-    printf("Device %d.\n", D1);
-    V.Data = 654321;
-    // CHECK: Device 654321.
-    D2 = Bar.VRef.Data;
-    printf("Device %d.\n", D2);
-  }
-  printf("Device %d.\n", D1);
-  printf("Device %d.\n", D2);
-  // CHECK: Host 654321 654321.
-  printf("Host %d %d.\n", Bar.VRef.Data, V.Data);
-  // CHECK: Host 123456.
-  printf("Host %d.\n", *Baz.VRef.Data);
-#pragma omp target map(*Baz.VRef.Data) map(from : D1, D2)
-  {
-    // CHECK: Device 123456.
-    D1 = *Baz.VRef.Data;
-    printf("Device %d.\n", D1);
-    *V1.Data = 654321;
-    // CHECK: Device 654321.
-    D2 = *Baz.VRef.Data;
-    printf("Device %d.\n", D2);
-  }
-  printf("Device %d.\n", D1);
-  printf("Device %d.\n", D2);
-  // CHECK: Host 654321 654321 654321.
-  printf("Host %d %d %d.\n", *Baz.VRef.Data, *V1.Data, Data);
-  return 0;
-}

diff --git a/libomptarget/test/mapping/declare_mapper_api.cpp b/libomptarget/test/mapping/declare_mapper_api.cpp
deleted file mode 100644
index 041c7f6..0000000
--- a/libomptarget/test/mapping/declare_mapper_api.cpp
+++ /dev/null

@@ -1,48 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <cinttypes>
-#include <cstdio>
-#include <cstdlib>
-#include <vector>
-
-// Data structure definitions copied from OpenMP RTL.
-struct MapComponentInfoTy {
-  void *Base;
-  void *Begin;
-  int64_t Size;
-  int64_t Type;
-  void *Name;
-  MapComponentInfoTy() = default;
-  MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type,
-                     void *Name)
-      : Base(Base), Begin(Begin), Size(Size), Type(Type), Name(Name) {}
-};
-
-struct MapperComponentsTy {
-  std::vector<MapComponentInfoTy> Components;
-};
-
-// OpenMP RTL interfaces
-#ifdef __cplusplus
-extern "C" {
-#endif
-int64_t __tgt_mapper_num_components(void *rt_mapper_handle);
-void __tgt_push_mapper_component(void *rt_mapper_handle, void *base,
-                                 void *begin, int64_t size, int64_t type,
-                                 void *name);
-#ifdef __cplusplus
-}
-#endif
-
-int main(int argc, char *argv[]) {
-  MapperComponentsTy MC;
-  void *base, *begin;
-  int64_t size, type;
-  // Push 2 elements into MC.
-  __tgt_push_mapper_component((void *)&MC, base, begin, size, type, nullptr);
-  __tgt_push_mapper_component((void *)&MC, base, begin, size, type, nullptr);
-  int64_t num = __tgt_mapper_num_components((void *)&MC);
-  // CHECK: num=2
-  printf("num=%" PRId64 "\n", num);
-  return 0;
-}

diff --git a/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp b/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
deleted file mode 100644
index c6c5657..0000000
--- a/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
+++ /dev/null

@@ -1,65 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <cstdio>
-#include <cstdlib>
-
-typedef struct {
-  int a;
-  double *b;
-} C1;
-#pragma omp declare mapper(C1 s) map(to : s.a) map(from : s.b[0 : 2])
-
-typedef struct {
-  int a;
-  double *b;
-  C1 c;
-} C;
-#pragma omp declare mapper(C s) map(to : s.a, s.c) map(from : s.b[0 : 2])
-
-typedef struct {
-  int e;
-  C f;
-  int h;
-} D;
-
-int main() {
-  constexpr int N = 10;
-  D s;
-  s.e = 111;
-  s.f.a = 222;
-  s.f.c.a = 777;
-  double x[2];
-  double x1[2];
-  x[1] = 20;
-  s.f.b = &x[0];
-  s.f.c.b = &x1[0];
-  s.h = N;
-
-  D *sp = &s;
-  D **spp = &sp;
-
-  printf("%d %d %d %4.5f %d\n", spp[0][0].e, spp[0][0].f.a, spp[0][0].f.c.a,
-         spp[0][0].f.b[1], spp[0][0].f.b == &x[0] ? 1 : 0);
-  // CHECK: 111 222 777 20.00000 1
-
-  int spp00fa = -1, spp00fca = -1, spp00fb_r = -1;
-  __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]);
-#pragma omp target map(tofrom: spp[0][0]) firstprivate(p)                           \
-                   map(from: spp00fa, spp00fca, spp00fb_r)
-  {
-    spp00fa = spp[0][0].f.a;
-    spp00fca = spp[0][0].f.c.a;
-    spp00fb_r = spp[0][0].f.b == reinterpret_cast<void *>(p) ? 1 : 0;
-    printf("%d %d %d\n", spp00fa, spp00fca, spp00fb_r);
-    // XCHECK: 222 777 0
-    spp[0][0].e = 333;
-    spp[0][0].f.a = 444;
-    spp[0][0].f.c.a = 555;
-    spp[0][0].f.b[1] = 40;
-  }
-  printf("%d %d %d\n", spp00fa, spp00fca, spp00fb_r);
-  // CHECK: 222 777 0
-  printf("%d %d %d %4.5f %d\n", spp[0][0].e, spp[0][0].f.a, spp[0][0].f.c.a,
-         spp[0][0].f.b[1], spp[0][0].f.b == &x[0] ? 1 : 0);
-  // CHECK: 333 222 777 40.00000 1
-}

diff --git a/libomptarget/test/mapping/declare_mapper_nested_default_mappers_array.cpp b/libomptarget/test/mapping/declare_mapper_nested_default_mappers_array.cpp
deleted file mode 100644
index dddbd33..0000000
--- a/libomptarget/test/mapping/declare_mapper_nested_default_mappers_array.cpp
+++ /dev/null

@@ -1,70 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
-
-// UNSUPPORTED: clang
-
-#include <cstdio>
-#include <cstdlib>
-
-typedef struct {
-  int a;
-  double *b;
-} C1;
-#pragma omp declare mapper(C1 s) map(to : s.a) map(from : s.b[0 : 2])
-
-typedef struct {
-  int a;
-  double *b;
-  C1 c;
-} C;
-#pragma omp declare mapper(C s) map(to : s.a, s.c) map(from : s.b[0 : 2])
-
-typedef struct {
-  int e;
-  C f;
-  int h;
-} D;
-
-int main() {
-  constexpr int N = 10;
-  D sa[2];
-  double x[2], y[2];
-  double x1[2], y1[2];
-  y[1] = x[1] = 20;
-
-  sa[0].e = 111;
-  sa[0].f.a = 222;
-  sa[0].f.c.a = 777;
-  sa[0].f.b = &x[0];
-  sa[0].f.c.b = &x1[0];
-  sa[0].h = N;
-
-  sa[1].e = 111;
-  sa[1].f.a = 222;
-  sa[1].f.c.a = 777;
-  sa[1].f.b = &y[0];
-  sa[1].f.c.b = &y1[0];
-  sa[1].h = N;
-
-  printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1],
-         sa[1].f.b == &x[0] ? 1 : 0);
-  // CHECK: 111 222 777 20.00000 1
-
-  __intptr_t p = reinterpret_cast<__intptr_t>(&y[0]);
-#pragma omp target map(tofrom : sa) firstprivate(p)
-  {
-    printf("%d %d %d\n", sa[1].f.a, sa[1].f.c.a,
-           sa[1].f.b == reinterpret_cast<void *>(p) ? 1 : 0);
-    // CHECK: 222 777 0
-    sa[1].e = 333;
-    sa[1].f.a = 444;
-    sa[1].f.c.a = 555;
-    sa[1].f.b[1] = 40;
-  }
-  printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1],
-         sa[1].f.b == &x[0] ? 1 : 0);
-  // CHECK: 333 222 777 40.00000 1
-}

diff --git a/libomptarget/test/mapping/declare_mapper_nested_default_mappers_array_subscript.cpp b/libomptarget/test/mapping/declare_mapper_nested_default_mappers_array_subscript.cpp
deleted file mode 100644
index bf51076..0000000
--- a/libomptarget/test/mapping/declare_mapper_nested_default_mappers_array_subscript.cpp
+++ /dev/null

@@ -1,60 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
-
-#include <cstdio>
-#include <cstdlib>
-
-typedef struct {
-  int a;
-  double *b;
-} C1;
-#pragma omp declare mapper(C1 s) map(to : s.a) map(from : s.b[0 : 2])
-
-typedef struct {
-  int a;
-  double *b;
-  C1 c;
-} C;
-#pragma omp declare mapper(C s) map(to : s.a, s.c) map(from : s.b[0 : 2])
-
-typedef struct {
-  int e;
-  C f;
-  int h;
-} D;
-
-int main() {
-  constexpr int N = 10;
-  D sa[10];
-  sa[1].e = 111;
-  sa[1].f.a = 222;
-  sa[1].f.c.a = 777;
-  double x[2];
-  double x1[2];
-  x[1] = 20;
-  sa[1].f.b = &x[0];
-  sa[1].f.c.b = &x1[0];
-  sa[1].h = N;
-
-  printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1],
-         sa[1].f.b == &x[0] ? 1 : 0);
-  // CHECK: 111 222 777 20.00000 1
-
-  __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]);
-#pragma omp target map(tofrom : sa[1]) firstprivate(p)
-  {
-    printf("%d %d %d\n", sa[1].f.a, sa[1].f.c.a,
-           sa[1].f.b == reinterpret_cast<void *>(p) ? 1 : 0);
-    // CHECK: 222 777 0
-    sa[1].e = 333;
-    sa[1].f.a = 444;
-    sa[1].f.c.a = 555;
-    sa[1].f.b[1] = 40;
-  }
-  printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1],
-         sa[1].f.b == &x[0] ? 1 : 0);
-  // CHECK: 333 222 777 40.00000 1
-}

diff --git a/libomptarget/test/mapping/declare_mapper_nested_default_mappers_complex_structure.cpp b/libomptarget/test/mapping/declare_mapper_nested_default_mappers_complex_structure.cpp
deleted file mode 100644
index d609221..0000000
--- a/libomptarget/test/mapping/declare_mapper_nested_default_mappers_complex_structure.cpp
+++ /dev/null

@@ -1,129 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
-
-#include <assert.h>
-#include <omp.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define N 2
-
-class MyObjectA {
-public:
-  MyObjectA() {
-    data1 = 1;
-    data2 = 2;
-  }
-  void show() {
-    printf("\t\tObject A Contents:\n");
-    printf("\t\t\tdata1 = %d  data2 = %d\n", data1, data2);
-  }
-  void foo() {
-    data1 += 10;
-    data2 += 20;
-  }
-  int data1;
-  int data2;
-};
-
-class MyObjectB {
-public:
-  MyObjectB() {
-    arr = new MyObjectA[N];
-    len = N;
-  }
-  void show() {
-    printf("\tObject B Contents:\n");
-    for (int i = 0; i < len; i++)
-      arr[i].show();
-  }
-  void foo() {
-    for (int i = 0; i < len; i++)
-      arr[i].foo();
-  }
-  MyObjectA *arr;
-  int len;
-};
-#pragma omp declare mapper(MyObjectB obj) map(obj, obj.arr[ : obj.len])
-
-class MyObjectC {
-public:
-  MyObjectC() {
-    arr = new MyObjectB[N];
-    len = N;
-  }
-  void show() {
-    printf("Object C Contents:\n");
-    for (int i = 0; i < len; i++)
-      arr[i].show();
-  }
-  void foo() {
-    for (int i = 0; i < len; i++)
-      arr[i].foo();
-  }
-  MyObjectB *arr;
-  int len;
-};
-#pragma omp declare mapper(MyObjectC obj) map(obj, obj.arr[ : obj.len])
-
-int main(void) {
-  MyObjectC *outer = new MyObjectC[N];
-
-  printf("Original data hierarchy:\n");
-  for (int i = 0; i < N; i++)
-    outer[i].show();
-
-  printf("Sending data to device...\n");
-#pragma omp target enter data map(to : outer[ : N])
-
-  printf("Calling foo()...\n");
-#pragma omp target teams distribute parallel for
-  for (int i = 0; i < N; i++)
-    outer[i].foo();
-
-  printf("foo() complete!\n");
-
-  printf("Sending data back to host...\n");
-#pragma omp target exit data map(from : outer[ : N])
-
-  printf("Modified Data Hierarchy:\n");
-  for (int i = 0; i < N; i++)
-    outer[i].show();
-
-  printf("Testing for correctness...\n");
-  for (int i = 0; i < N; ++i)
-    for (int j = 0; j < N; ++j)
-      for (int k = 0; k < N; ++k) {
-        printf("outer[%d].arr[%d].arr[%d].data1 = %d.\n", i, j, k,
-               outer[i].arr[j].arr[k].data1);
-        printf("outer[%d].arr[%d].arr[%d].data2 = %d.\n", i, j, k,
-               outer[i].arr[j].arr[k].data2);
-        assert(outer[i].arr[j].arr[k].data1 == 11 &&
-               outer[i].arr[j].arr[k].data2 == 22);
-      }
-  // CHECK: outer[0].arr[0].arr[0].data1 = 11.
-  // CHECK: outer[0].arr[0].arr[0].data2 = 22.
-  // CHECK: outer[0].arr[0].arr[1].data1 = 11.
-  // CHECK: outer[0].arr[0].arr[1].data2 = 22.
-  // CHECK: outer[0].arr[1].arr[0].data1 = 11.
-  // CHECK: outer[0].arr[1].arr[0].data2 = 22.
-  // CHECK: outer[0].arr[1].arr[1].data1 = 11.
-  // CHECK: outer[0].arr[1].arr[1].data2 = 22.
-  // CHECK: outer[1].arr[0].arr[0].data1 = 11.
-  // CHECK: outer[1].arr[0].arr[0].data2 = 22.
-  // CHECK: outer[1].arr[0].arr[1].data1 = 11.
-  // CHECK: outer[1].arr[0].arr[1].data2 = 22.
-  // CHECK: outer[1].arr[1].arr[0].data1 = 11.
-  // CHECK: outer[1].arr[1].arr[0].data2 = 22.
-  // CHECK: outer[1].arr[1].arr[1].data1 = 11.
-  // CHECK: outer[1].arr[1].arr[1].data2 = 22.
-  assert(outer[1].arr[1].arr[0].data1 == 11 &&
-         outer[1].arr[1].arr[0].data2 == 22 &&
-         outer[1].arr[1].arr[1].data1 == 11 &&
-         outer[1].arr[1].arr[1].data2 == 22);
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/declare_mapper_nested_default_mappers_ptr_subscript.cpp b/libomptarget/test/mapping/declare_mapper_nested_default_mappers_ptr_subscript.cpp
deleted file mode 100644
index 27ea747..0000000
--- a/libomptarget/test/mapping/declare_mapper_nested_default_mappers_ptr_subscript.cpp
+++ /dev/null

@@ -1,62 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
-
-#include <cstdio>
-#include <cstdlib>
-
-typedef struct {
-  int a;
-  double *b;
-} C1;
-#pragma omp declare mapper(C1 s) map(to : s.a) map(from : s.b[0 : 2])
-
-typedef struct {
-  int a;
-  double *b;
-  C1 c;
-} C;
-#pragma omp declare mapper(C s) map(to : s.a, s.c) map(from : s.b[0 : 2])
-
-typedef struct {
-  int e;
-  C f;
-  int h;
-} D;
-
-int main() {
-  constexpr int N = 10;
-  D s;
-  s.e = 111;
-  s.f.a = 222;
-  s.f.c.a = 777;
-  double x[2];
-  double x1[2];
-  x[1] = 20;
-  s.f.b = &x[0];
-  s.f.c.b = &x1[0];
-  s.h = N;
-
-  D *sp = &s;
-
-  printf("%d %d %d %4.5f %d\n", sp[0].e, sp[0].f.a, sp[0].f.c.a, sp[0].f.b[1],
-         sp[0].f.b == &x[0] ? 1 : 0);
-  // CHECK: 111 222 777 20.00000 1
-
-  __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]);
-#pragma omp target map(tofrom : sp[0]) firstprivate(p)
-  {
-    printf("%d %d %d\n", sp[0].f.a, sp[0].f.c.a,
-           sp[0].f.b == reinterpret_cast<void *>(p) ? 1 : 0);
-    // CHECK: 222 777 0
-    sp[0].e = 333;
-    sp[0].f.a = 444;
-    sp[0].f.c.a = 555;
-    sp[0].f.b[1] = 40;
-  }
-  printf("%d %d %d %4.5f %d\n", sp[0].e, sp[0].f.a, sp[0].f.c.a, sp[0].f.b[1],
-         sp[0].f.b == &x[0] ? 1 : 0);
-  // CHECK: 333 222 777 40.00000 1
-}

diff --git a/libomptarget/test/mapping/declare_mapper_nested_default_mappers_var.cpp b/libomptarget/test/mapping/declare_mapper_nested_default_mappers_var.cpp
deleted file mode 100644
index b7f6ef5..0000000
--- a/libomptarget/test/mapping/declare_mapper_nested_default_mappers_var.cpp
+++ /dev/null

@@ -1,62 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
-
-#include <cstdio>
-#include <cstdlib>
-
-typedef struct {
-  int a;
-  double *b;
-} C1;
-#pragma omp declare mapper(C1 s) map(to : s.a) map(from : s.b[0 : 2])
-
-typedef struct {
-  int a;
-  double *b;
-  C1 c;
-} C;
-#pragma omp declare mapper(C s) map(to : s.a, s.c) map(from : s.b[0 : 2])
-
-typedef struct {
-  int e;
-  C f;
-  int h;
-} D;
-
-int main() {
-  constexpr int N = 10;
-  D s;
-  s.e = 111;
-  s.f.a = 222;
-  s.f.c.a = 777;
-  double x[2];
-  double x1[2];
-  x[1] = 20;
-  s.f.b = &x[0];
-  s.f.c.b = &x1[0];
-  s.h = N;
-
-  printf("%d %d %d %4.5f %d\n", s.e, s.f.a, s.f.c.a, s.f.b[1],
-         s.f.b == &x[0] ? 1 : 0);
-  // CHECK: 111 222 777 20.00000 1
-
-  __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]);
-
-#pragma omp target map(tofrom : s) firstprivate(p)
-  {
-    printf("%d %d %d\n", s.f.a, s.f.c.a,
-           s.f.b == reinterpret_cast<void *>(p) ? 1 : 0);
-    // CHECK: 222 777 0
-    s.e = 333;
-    s.f.a = 444;
-    s.f.c.a = 555;
-    s.f.b[1] = 40;
-  }
-
-  printf("%d %d %d %4.5f %d\n", s.e, s.f.a, s.f.c.a, s.f.b[1],
-         s.f.b == &x[0] ? 1 : 0);
-  // CHECK: 333 222 777 40.00000 1
-}

diff --git a/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp b/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
deleted file mode 100644
index a9e3f05..0000000
--- a/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
+++ /dev/null

@@ -1,65 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <cstdio>
-#include <cstdlib>
-
-typedef struct {
-  int a;
-  double *b;
-} C;
-#pragma omp declare mapper(id1 : C s) map(to : s.a) map(from : s.b[0 : 2])
-
-typedef struct {
-  int e;
-  C f;
-  int h;
-  short *g;
-} D;
-#pragma omp declare mapper(default : D r) map(from : r.e)                      \
-    map(mapper(id1), tofrom : r.f) map(tofrom : r.g[0 : r.h])
-
-int main() {
-  constexpr int N = 10;
-  D s;
-  s.e = 111;
-  s.f.a = 222;
-  double x[2];
-  x[1] = 20;
-  short y[N];
-  y[1] = 30;
-  s.f.b = &x[0];
-  s.g = &y[0];
-  s.h = N;
-
-  D *sp = &s;
-  D **spp = &sp;
-
-  printf("%d %d %4.5f %d %d %d\n", spp[0][0].e, spp[0][0].f.a, spp[0][0].f.b[1],
-         spp[0][0].f.b == &x[0] ? 1 : 0, spp[0][0].g[1],
-         spp[0][0].g == &y[0] ? 1 : 0);
-  // CHECK: 111 222 20.00000 1 30 1
-
-  int spp00fa = -1, spp00fb_r = -1, spp00fg1 = -1, spp00fg_r = -1;
-  __intptr_t p = reinterpret_cast<__intptr_t>(&x[0]),
-             p1 = reinterpret_cast<__intptr_t>(&y[0]);
-#pragma omp target map(tofrom : spp[0][0]) firstprivate(p, p1)                  \
-                   map(from: spp00fa, spp00fb_r, spp00fg1, spp00fg_r)
-  {
-    spp00fa = spp[0][0].f.a;
-    spp00fb_r = spp[0][0].f.b == reinterpret_cast<void *>(p) ? 1 : 0;
-    spp00fg1 = spp[0][0].g[1];
-    spp00fg_r = spp[0][0].g == reinterpret_cast<void *>(p1) ? 1 : 0;
-    printf("%d %d %d %d\n", spp00fa, spp00fb_r, spp00fg1, spp00fg_r);
-    // XCHECK: 222 0 30 0
-    spp[0][0].e = 333;
-    spp[0][0].f.a = 444;
-    spp[0][0].f.b[1] = 40;
-    spp[0][0].g[1] = 50;
-  }
-    printf("%d %d %d %d\n", spp00fa, spp00fb_r, spp00fg1, spp00fg_r);
-  // CHECK: 222 0 30 0
-  printf("%d %d %4.5f %d %d %d\n", spp[0][0].e, spp[0][0].f.a, spp[0][0].f.b[1],
-         spp[0][0].f.b == &x[0] ? 1 : 0, spp[0][0].g[1],
-         spp[0][0].g == &y[0] ? 1 : 0);
-  // CHECK: 333 222 40.00000 1 50 1
-}

diff --git a/libomptarget/test/mapping/declare_mapper_target.cpp b/libomptarget/test/mapping/declare_mapper_target.cpp
deleted file mode 100644
index 4d7237e..0000000
--- a/libomptarget/test/mapping/declare_mapper_target.cpp
+++ /dev/null

@@ -1,32 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <cstdio>
-#include <cstdlib>
-
-#define NUM 1024
-
-class C {
-public:
-  int *a;
-};
-
-#pragma omp declare mapper(id : C s) map(s.a[0 : NUM])
-
-int main() {
-  C c;
-  c.a = (int *)malloc(sizeof(int) * NUM);
-  for (int i = 0; i < NUM; i++) {
-    c.a[i] = 1;
-  }
-#pragma omp target teams distribute parallel for map(mapper(id), tofrom : c)
-  for (int i = 0; i < NUM; i++) {
-    ++c.a[i];
-  }
-  int sum = 0;
-  for (int i = 0; i < NUM; i++) {
-    sum += c.a[i];
-  }
-  // CHECK: Sum = 2048
-  printf("Sum = %d\n", sum);
-  return 0;
-}

diff --git a/libomptarget/test/mapping/declare_mapper_target_data.cpp b/libomptarget/test/mapping/declare_mapper_target_data.cpp
deleted file mode 100644
index 7f09844..0000000
--- a/libomptarget/test/mapping/declare_mapper_target_data.cpp
+++ /dev/null

@@ -1,35 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <cstdio>
-#include <cstdlib>
-
-#define NUM 1024
-
-class C {
-public:
-  int *a;
-};
-
-#pragma omp declare mapper(id : C s) map(s.a[0 : NUM])
-
-int main() {
-  C c;
-  c.a = (int *)malloc(sizeof(int) * NUM);
-  for (int i = 0; i < NUM; i++) {
-    c.a[i] = 1;
-  }
-#pragma omp target data map(mapper(id), tofrom : c)
-  {
-#pragma omp target teams distribute parallel for
-    for (int i = 0; i < NUM; i++) {
-      ++c.a[i];
-    }
-  }
-  int sum = 0;
-  for (int i = 0; i < NUM; i++) {
-    sum += c.a[i];
-  }
-  // CHECK: Sum = 2048
-  printf("Sum = %d\n", sum);
-  return 0;
-}

diff --git a/libomptarget/test/mapping/declare_mapper_target_data_enter_exit.cpp b/libomptarget/test/mapping/declare_mapper_target_data_enter_exit.cpp
deleted file mode 100644
index f5fad8b..0000000
--- a/libomptarget/test/mapping/declare_mapper_target_data_enter_exit.cpp
+++ /dev/null

@@ -1,34 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <cstdio>
-#include <cstdlib>
-
-#define NUM 1024
-
-class C {
-public:
-  int *a;
-};
-
-#pragma omp declare mapper(id : C s) map(s.a[0 : NUM])
-
-int main() {
-  C c;
-  c.a = (int *)malloc(sizeof(int) * NUM);
-  for (int i = 0; i < NUM; i++) {
-    c.a[i] = 1;
-  }
-#pragma omp target enter data map(mapper(id), to : c)
-#pragma omp target teams distribute parallel for
-  for (int i = 0; i < NUM; i++) {
-    ++c.a[i];
-  }
-#pragma omp target exit data map(mapper(id), from : c)
-  int sum = 0;
-  for (int i = 0; i < NUM; i++) {
-    sum += c.a[i];
-  }
-  // CHECK: Sum = 2048
-  printf("Sum = %d\n", sum);
-  return 0;
-}

diff --git a/libomptarget/test/mapping/declare_mapper_target_update.cpp b/libomptarget/test/mapping/declare_mapper_target_update.cpp
deleted file mode 100644
index fe4597b..0000000
--- a/libomptarget/test/mapping/declare_mapper_target_update.cpp
+++ /dev/null

@@ -1,56 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <cstdio>
-#include <cstdlib>
-
-#define NUM 1024
-
-class C {
-public:
-  int *a;
-};
-
-#pragma omp declare mapper(id : C s) map(s.a[0 : NUM])
-
-int main() {
-  C c;
-  int sum = 0;
-  c.a = (int *)malloc(sizeof(int) * NUM);
-  for (int i = 0; i < NUM; i++) {
-    c.a[i] = 1;
-  }
-#pragma omp target enter data map(mapper(id), alloc : c)
-#pragma omp target teams distribute parallel for
-  for (int i = 0; i < NUM; i++) {
-    c.a[i] = 0;
-  }
-#pragma omp target update from(mapper(id) : c)
-  for (int i = 0; i < NUM; i++) {
-    sum += c.a[i];
-  }
-  // CHECK: Sum (after first update from) = 0
-  printf("Sum (after first update from) = %d\n", sum);
-  for (int i = 0; i < NUM; i++) {
-    c.a[i] = 1;
-  }
-#pragma omp target update to(mapper(id) : c)
-#pragma omp target teams distribute parallel for
-  for (int i = 0; i < NUM; i++) {
-    ++c.a[i];
-  }
-  sum = 0;
-  for (int i = 0; i < NUM; i++) {
-    sum += c.a[i];
-  }
-  // CHECK: Sum (after update to) = 1024
-  printf("Sum (after update to) = %d\n", sum);
-#pragma omp target update from(mapper(id) : c)
-  sum = 0;
-  for (int i = 0; i < NUM; i++) {
-    sum += c.a[i];
-  }
-  // CHECK: Sum (after second update from) = 2048
-  printf("Sum (after second update from) = %d\n", sum);
-#pragma omp target exit data map(mapper(id), delete : c)
-  return 0;
-}

diff --git a/libomptarget/test/mapping/delete_inf_refcount.c b/libomptarget/test/mapping/delete_inf_refcount.c
deleted file mode 100644
index f6016c1..0000000
--- a/libomptarget/test/mapping/delete_inf_refcount.c
+++ /dev/null

@@ -1,29 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-#pragma omp declare target
-int isHost;
-#pragma omp end declare target
-
-int main(void) {
-  isHost = -1;
-
-#pragma omp target enter data map(to : isHost)
-
-#pragma omp target
-  { isHost = omp_is_initial_device(); }
-#pragma omp target update from(isHost)
-
-  if (isHost < 0) {
-    printf("Runtime error, isHost=%d\n", isHost);
-  }
-
-#pragma omp target exit data map(delete : isHost)
-
-  // CHECK: Target region executed on the device
-  printf("Target region executed on the %s\n", isHost ? "host" : "device");
-
-  return isHost;
-}

diff --git a/libomptarget/test/mapping/device_ptr_update.c b/libomptarget/test/mapping/device_ptr_update.c
deleted file mode 100644
index c6719d2..0000000
--- a/libomptarget/test/mapping/device_ptr_update.c
+++ /dev/null

@@ -1,44 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: env LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic -check-prefix=DEBUG -check-prefix=CHECK
-// REQUIRES: libomptarget-debug
-
-#include <stdio.h>
-
-struct S {
-  int *p;
-};
-
-int main(void) {
-  int A[10];
-  struct S s1;
-
-  s1.p = A;
-
-// DEBUG: Update pointer ([[DEV_PTR:0x[^ ]+]]) -> {{\[}}[[DEV_OBJ_A:0x[^ ]+]]{{\]}}
-#pragma omp target enter data map(alloc : s1.p[0 : 10])
-
-// DEBUG-NOT: Update pointer ([[DEV_PTR]]) -> {{\[}}[[DEV_OBJ_A]]{{\]}}
-#pragma omp target map(alloc : s1.p[0 : 10])
-  {
-    for (int i = 0; i < 10; ++i)
-      s1.p[i] = i;
-  }
-
-#pragma omp target exit data map(from : s1.p[0 : 10])
-
-  int fail_A = 0;
-  for (int i = 0; i < 10; ++i) {
-    if (A[i] != i) {
-      fail_A = 1;
-      break;
-    }
-  }
-
-  // CHECK-NOT: Test A failed
-  if (fail_A) {
-    printf("Test A failed\n");
-  }
-
-  return fail_A;
-}

diff --git a/libomptarget/test/mapping/firstprivate_aligned.cpp b/libomptarget/test/mapping/firstprivate_aligned.cpp
deleted file mode 100644
index ae6be0f..0000000
--- a/libomptarget/test/mapping/firstprivate_aligned.cpp
+++ /dev/null

@@ -1,37 +0,0 @@
-// RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic
-
-#include <stdio.h>
-
-// CHECK: rx: 16, ry: 16;
-// CHECK: rx: 16, ry: 16;
-// CHECK: rx: 16, ry: 16;
-// CHECK: rx: 16, ry: 16;
-
-template <bool Aligned> void test() {
-  printf("Test %saligned firstprivate\n", Aligned ? "" : "non-");
-  char z1[3 + Aligned], z2[3 + Aligned];
-  int x[4];
-  int y[4];
-  y[0] = y[1] = y[2] = y[3] = 4;
-  x[0] = x[1] = x[2] = x[3] = 4;
-  int rx = -1, ry = -1;
-#pragma omp target firstprivate(z1, y, z2) map(from : ry, rx) map(to : x)
-  {
-    ry = (y[0] + y[1] + y[2] + y[3]);
-    rx = (x[0] + x[1] + x[2] + x[3]);
-  }
-  printf(" rx:%i, ry:%i\n", rx, ry);
-#pragma omp target firstprivate(z1, y, z2) map(from : ry, rx) map(to : x)
-  {
-    z1[2] += 5;
-    ry = (y[0] + y[1] + y[2] + y[3]);
-    rx = (x[0] + x[1] + x[2] + x[3]);
-    z2[2] += 7;
-  }
-  printf(" rx:%i, ry:%i\n", rx, ry);
-}
-
-int main() {
-  test<true>();
-  test<false>();
-}

diff --git a/libomptarget/test/mapping/has_device_addr.cpp b/libomptarget/test/mapping/has_device_addr.cpp
deleted file mode 100644
index 63fa313..0000000
--- a/libomptarget/test/mapping/has_device_addr.cpp
+++ /dev/null

@@ -1,33 +0,0 @@
-// RUN: %libomptarget-compilexx-generic -fopenmp-version=51
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <assert.h>
-#include <iostream>
-#include <omp.h>
-
-struct view {
-  const int size = 10;
-  int *data_host;
-  int *data_device;
-  void foo() {
-    std::size_t bytes = size * sizeof(int);
-    const int host_id = omp_get_initial_device();
-    const int device_id = omp_get_default_device();
-    data_host = (int *)malloc(bytes);
-    data_device = (int *)omp_target_alloc(bytes, device_id);
-#pragma omp target teams distribute parallel for has_device_addr(data_device[0])
-    for (int i = 0; i < size; ++i)
-      data_device[i] = i;
-    omp_target_memcpy(data_host, data_device, bytes, 0, 0, host_id, device_id);
-    for (int i = 0; i < size; ++i)
-      assert(data_host[i] == i);
-  }
-};
-
-int main() {
-  view a;
-  a.foo();
-  // CHECK: PASSED
-  printf("PASSED\n");
-}

diff --git a/libomptarget/test/mapping/implicit_device_ptr.c b/libomptarget/test/mapping/implicit_device_ptr.c
deleted file mode 100644
index baa75d2..0000000
--- a/libomptarget/test/mapping/implicit_device_ptr.c
+++ /dev/null

@@ -1,26 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-// OpenMP 5.1. sec 5.8.6 "Pointer Initialization for Device Data Environments"
-// p. 160 L32-33: "If a matching mapped list item is not found, the pointer
-// retains its original value as per the32 firstprivate semantics described in
-// Section 5.4.4."
-
-int main(void) {
-  int *A = (int *)omp_target_alloc(sizeof(int), omp_get_default_device());
-
-#pragma omp target
-  { *A = 1; }
-
-  int Result = 0;
-#pragma omp target map(from : Result)
-  { Result = *A; }
-
-  // CHECK: PASS
-  if (Result == 1)
-    printf("PASS\n");
-
-  omp_target_free(A, omp_get_default_device());
-}

diff --git a/libomptarget/test/mapping/is_device_ptr.cpp b/libomptarget/test/mapping/is_device_ptr.cpp
deleted file mode 100644
index 6433f82..0000000
--- a/libomptarget/test/mapping/is_device_ptr.cpp
+++ /dev/null

@@ -1,31 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <assert.h>
-#include <iostream>
-#include <omp.h>
-
-struct view {
-  const int size = 10;
-  int *data_host;
-  int *data_device;
-  void foo() {
-    std::size_t bytes = size * sizeof(int);
-    const int host_id = omp_get_initial_device();
-    const int device_id = omp_get_default_device();
-    data_host = (int *)malloc(bytes);
-    data_device = (int *)omp_target_alloc(bytes, device_id);
-#pragma omp target teams distribute parallel for is_device_ptr(data_device)
-    for (int i = 0; i < size; ++i)
-      data_device[i] = i;
-    omp_target_memcpy(data_host, data_device, bytes, 0, 0, host_id, device_id);
-    for (int i = 0; i < size; ++i)
-      assert(data_host[i] == i);
-  }
-};
-
-int main() {
-  view a;
-  a.foo();
-  // CHECK: PASSED
-  printf("PASSED\n");
-}

diff --git a/libomptarget/test/mapping/lambda_by_value.cpp b/libomptarget/test/mapping/lambda_by_value.cpp
deleted file mode 100644
index 5516ded..0000000
--- a/libomptarget/test/mapping/lambda_by_value.cpp
+++ /dev/null

@@ -1,49 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <stdint.h>
-#include <stdio.h>
-
-// CHECK: before: [[V1:111]] [[V2:222]] [[PX:0x[^ ]+]] [[PY:0x[^ ]+]]
-// CHECK: lambda: [[V1]] [[V2]] [[PX_TGT:0x[^ ]+]] 0x{{.*}}
-// CHECK: tgt   : [[V2]] [[PX_TGT]] 1
-// CHECK: out   : [[V2]] [[V2]] [[PX]] [[PY]]
-
-#pragma omp begin declare target
-int a = -1, *c;
-long b = -1;
-const long *d;
-int e = -1, *f, g = -1;
-#pragma omp end declare target
-
-int main() {
-  int x[10];
-  long y[8];
-  x[1] = 111;
-  y[1] = 222;
-
-  auto lambda = [&x, y]() {
-    a = x[1];
-    b = y[1];
-    c = &x[0];
-    d = &y[0];
-    printf("lambda: %d %ld %p %p\n", x[1], y[1], &x[0], &y[0]);
-    x[1] = y[1];
-  };
-  printf("before: %d %ld %p %p\n", x[1], y[1], &x[0], &y[0]);
-
-  intptr_t xp = (intptr_t)&x[0];
-#pragma omp target firstprivate(xp)
-  {
-    lambda();
-    e = x[1];
-    f = &x[0];
-    g = (&x[0] != (int *)xp);
-    printf("tgt   : %d %p %d\n", x[1], &x[0], (&x[0] != (int *)xp));
-  }
-#pragma omp target update from(a, b, c, d, e, f, g)
-  printf("lambda: %d %ld %p %p\n", a, b, c, d);
-  printf("tgt   : %d %p %d\n", e, f, g);
-  printf("out   : %d %ld %p %p\n", x[1], y[1], &x[0], &y[0]);
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/lambda_mapping.cpp b/libomptarget/test/mapping/lambda_mapping.cpp
deleted file mode 100644
index 63b1719..0000000
--- a/libomptarget/test/mapping/lambda_mapping.cpp
+++ /dev/null

@@ -1,53 +0,0 @@
-// Unonptimized, we need 24000000 bytes heap
-// RUN: %libomptarget-compilexx-generic
-// RUN: env LIBOMPTARGET_HEAP_SIZE=24000000 \
-// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
-// RUN: %libomptarget-compileoptxx-run-and-check-generic
-
-#include <iostream>
-
-template <typename LOOP_BODY>
-inline void forall(int Begin, int End, LOOP_BODY LoopBody) {
-#pragma omp target parallel for schedule(static)
-  for (int I = Begin; I < End; ++I) {
-    LoopBody(I);
-  }
-}
-
-#define N (1000)
-
-//
-// Demonstration of the RAJA abstraction using lambdas
-// Requires data mapping onto the target section
-//
-int main() {
-  double A[N], B[N], C[N];
-
-  for (int I = 0; I < N; I++) {
-    A[I] = I + 1;
-    B[I] = -I;
-    C[I] = -9;
-  }
-
-#pragma omp target data map(tofrom : C[0 : N]) map(to : A[0 : N], B[0 : N])
-  {
-    forall(0, N, [&](int I) { C[I] += A[I] + B[I]; });
-  }
-
-  int Fail = 0;
-  for (int I = 0; I < N; I++) {
-    if (C[I] != -8) {
-      std::cout << "Failed at " << I << " with val " << C[I] << std::endl;
-      Fail = 1;
-    }
-  }
-
-  // CHECK: Succeeded
-  if (Fail) {
-    std::cout << "Failed" << std::endl;
-  } else {
-    std::cout << "Succeeded" << std::endl;
-  }
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/low_alignment.c b/libomptarget/test/mapping/low_alignment.c
deleted file mode 100644
index 615a5a9..0000000
--- a/libomptarget/test/mapping/low_alignment.c
+++ /dev/null

@@ -1,49 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
-  struct S {
-    int i;
-    int j;
-  } s;
-  s.i = 20;
-  s.j = 30;
-#pragma omp target data map(tofrom : s)
-  {
-#pragma omp target map(from : s.i, s.j)
-    {
-      s.i = 21;
-      s.j = 31;
-    }
-  }
-  if (s.i == 21 && s.j == 31)
-    printf("PASS 1\n");
-  // CHECK: PASS 1
-
-  struct T {
-    int a;
-    int b;
-    int c;
-    int d;
-    int i;
-    int j;
-  } t;
-  t.a = 10;
-  t.i = 20;
-  t.j = 30;
-#pragma omp target data map(from : t.i, t.j)
-  {
-#pragma omp target map(from : t.a)
-    {
-      t.a = 11;
-      t.i = 21;
-      t.j = 31;
-    }
-  }
-  if (t.a == 11 && t.i == 21 && t.j == 31)
-    printf("PASS 2\n");
-  // CHECK: PASS 2
-  return 0;
-}

diff --git a/libomptarget/test/mapping/map_back_race.cpp b/libomptarget/test/mapping/map_back_race.cpp
deleted file mode 100644
index 8a988d3..0000000
--- a/libomptarget/test/mapping/map_back_race.cpp
+++ /dev/null

@@ -1,32 +0,0 @@
-// RUN: %libomptarget-compilexx-and-run-generic
-
-// Taken from https://github.com/llvm/llvm-project/issues/54216
-
-#include <algorithm>
-#include <cstdlib>
-#include <iostream>
-
-bool almost_equal(float x, float gold, float rel_tol = 1e-09,
-                  float abs_tol = 0.0) {
-  return std::abs(x - gold) <=
-         std::max(rel_tol * std::max(std::abs(x), std::abs(gold)), abs_tol);
-}
-void test_parallel_for__target() {
-  const int N0{32768};
-  const float expected_value{N0};
-  float counter_N0{};
-#pragma omp parallel for
-  for (int i0 = 0; i0 < N0; i0++) {
-#pragma omp target map(tofrom : counter_N0)
-    {
-#pragma omp atomic update
-      counter_N0 = counter_N0 + 1.;
-    }
-  }
-  if (!almost_equal(counter_N0, expected_value, 0.01)) {
-    std::cerr << "Expected: " << expected_value << " Got: " << counter_N0
-              << std::endl;
-    std::exit(112);
-  }
-}
-int main() { test_parallel_for__target(); }

diff --git a/libomptarget/test/mapping/ompx_hold/omp_target_disassociate_ptr.c b/libomptarget/test/mapping/ompx_hold/omp_target_disassociate_ptr.c
deleted file mode 100644
index 302bf4f..0000000
--- a/libomptarget/test/mapping/ompx_hold/omp_target_disassociate_ptr.c
+++ /dev/null

@@ -1,68 +0,0 @@
-// omp_target_disassociate_ptr should always fail if the hold reference count is
-// non-zero, regardless of the dynamic reference count.  When the latter is
-// finite, the implementation happens to choose to report the hold diagnostic.
-
-// RUN: %libomptarget-compile-generic -fopenmp-extensions
-// RUN: %not %libomptarget-run-generic 0   2>&1 | %fcheck-generic
-// RUN: %not %libomptarget-run-generic 1   2>&1 | %fcheck-generic
-// RUN: %not %libomptarget-run-generic inf 2>&1 | %fcheck-generic
-
-// RUN: %libomptarget-compile-generic -fopenmp-extensions -DHOLD_MORE
-// RUN: %not %libomptarget-run-generic 0   2>&1 | %fcheck-generic
-// RUN: %not %libomptarget-run-generic 1   2>&1 | %fcheck-generic
-// RUN: %not %libomptarget-run-generic inf 2>&1 | %fcheck-generic
-
-#include <limits.h>
-#include <omp.h>
-#include <stdio.h>
-#include <string.h>
-
-int main(int argc, char *argv[]) {
-  // Parse command line.
-  int DynRef;
-  if (argc != 2) {
-    fprintf(stderr, "bad arguments\n");
-    return 1;
-  }
-  if (0 == strcmp(argv[1], "inf"))
-    DynRef = INT_MAX;
-  else
-    DynRef = atoi(argv[1]);
-
-  // Allocate and set dynamic reference count as specified.
-  int DevNum = omp_get_default_device();
-  int X;
-  void *XDev = omp_target_alloc(sizeof X, DevNum);
-  if (!XDev) {
-    fprintf(stderr, "omp_target_alloc failed\n");
-    return 1;
-  }
-  if (DynRef == INT_MAX) {
-    if (omp_target_associate_ptr(&X, &XDev, sizeof X, 0, DevNum)) {
-      fprintf(stderr, "omp_target_associate_ptr failed\n");
-      return 1;
-    }
-  } else {
-    for (int I = 0; I < DynRef; ++I) {
-#pragma omp target enter data map(alloc : X)
-    }
-  }
-
-  // Disassociate while hold reference count > 0.
-  int Status = 0;
-#pragma omp target data map(ompx_hold, alloc : X)
-#if HOLD_MORE
-#pragma omp target data map(ompx_hold, alloc : X)
-#pragma omp target data map(ompx_hold, alloc : X)
-#endif
-  {
-    //      CHECK: omptarget error: Trying to disassociate a pointer with a
-    // CHECK-SAME: non-zero hold reference count
-    // CHECK-NEXT: omp_target_disassociate_ptr failed
-    if (omp_target_disassociate_ptr(&X, DevNum)) {
-      fprintf(stderr, "omp_target_disassociate_ptr failed\n");
-      Status = 1;
-    }
-  }
-  return Status;
-}

diff --git a/libomptarget/test/mapping/ompx_hold/struct.c b/libomptarget/test/mapping/ompx_hold/struct.c
deleted file mode 100644
index da2b380..0000000
--- a/libomptarget/test/mapping/ompx_hold/struct.c
+++ /dev/null

@@ -1,230 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-extensions
-// RUN: %libomptarget-run-generic | %fcheck-generic -strict-whitespace
-
-#include <omp.h>
-#include <stdio.h>
-
-#pragma omp begin declare target
-#define MAX_NAME_SIZE 100
-char N1[MAX_NAME_SIZE], N2[MAX_NAME_SIZE];
-int V1, V2;
-void copy_name(char *dst, char *src) {
-  int i;
-  for (i = 0; i < MAX_NAME_SIZE - 1 && src[i]; ++i)
-    dst[i] = src[i];
-  dst[i] = 0;
-}
-#pragma omp end declare target
-
-#define CHECK_PRESENCE(Var1, Var2, Var3)                                       \
-  printf("    presence of %s, %s, %s: %d, %d, %d\n", #Var1, #Var2, #Var3,      \
-         omp_target_is_present(&(Var1), omp_get_default_device()),             \
-         omp_target_is_present(&(Var2), omp_get_default_device()),             \
-         omp_target_is_present(&(Var3), omp_get_default_device()))
-
-#define CHECK_VALUES_HELPER(N1, N2, Var1, Var2)                                \
-  printf("    values of %s, %s: %d, %d\n", N1, N2, (Var1), (Var2))
-
-#define CHECK_VALUES_DELAYED(Var1, Var2)                                       \
-  copy_name(N1, #Var1);                                                        \
-  copy_name(N2, #Var2);                                                        \
-  V1 = (Var1);                                                                 \
-  V2 = (Var2);
-
-#define CHECK_DELAYED_VALUES()                                                 \
-  _Pragma("omp target update from(N1, N2, V1, V2)")                            \
-      CHECK_VALUES_HELPER(N1, N2, V1, V2)
-
-#define CHECK_VALUES(Var1, Var2)                                               \
-  CHECK_VALUES_HELPER(#Var1, #Var2, (Var1), (Var2))
-
-int main() {
-  struct S {
-    int i;
-    int j;
-  } s;
-  // CHECK: presence of s, s.i, s.j: 0, 0, 0
-  CHECK_PRESENCE(s, s.i, s.j);
-
-  // =======================================================================
-  // Check that ompx_hold keeps entire struct present.
-
-  // -----------------------------------------------------------------------
-  // CHECK-LABEL: check:{{.*}}
-  printf("check: ompx_hold only on first member\n");
-  s.i = 20;
-  s.j = 30;
-#pragma omp target data map(tofrom : s) map(ompx_hold, tofrom : s.i)           \
-    map(tofrom : s.j)
-  {
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    CHECK_PRESENCE(s, s.i, s.j);
-#pragma omp target map(tofrom : s)
-    {
-      s.i = 21;
-      s.j = 31;
-    }
-#pragma omp target exit data map(delete : s, s.i)
-    // ompx_hold on s.i applies to all of s.
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    // CHECK-NEXT: values of s.i, s.j: 20, 30
-    CHECK_PRESENCE(s, s.i, s.j);
-    CHECK_VALUES(s.i, s.j);
-  }
-  // CHECK-NEXT: presence of s, s.i, s.j: 0, 0, 0
-  // CHECK-NEXT: values of s.i, s.j: 21, 31
-  CHECK_PRESENCE(s, s.i, s.j);
-  CHECK_VALUES(s.i, s.j);
-
-  // -----------------------------------------------------------------------
-  // CHECK-LABEL: check:{{.*}}
-  printf("check: ompx_hold only on last member\n");
-  s.i = 20;
-  s.j = 30;
-#pragma omp target data map(tofrom : s) map(tofrom : s.i)                      \
-    map(ompx_hold, tofrom : s.j)
-  {
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    CHECK_PRESENCE(s, s.i, s.j);
-#pragma omp target map(tofrom : s)
-    {
-      s.i = 21;
-      s.j = 31;
-    }
-#pragma omp target exit data map(delete : s, s.i)
-    // ompx_hold on s.j applies to all of s.
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    // CHECK-NEXT: values of s.i, s.j: 20, 30
-    CHECK_PRESENCE(s, s.i, s.j);
-    CHECK_VALUES(s.i, s.j);
-  }
-  // CHECK-NEXT: presence of s, s.i, s.j: 0, 0, 0
-  // CHECK-NEXT: values of s.i, s.j: 21, 31
-  CHECK_PRESENCE(s, s.i, s.j);
-  CHECK_VALUES(s.i, s.j);
-
-  // -----------------------------------------------------------------------
-  // CHECK-LABEL: check:{{.*}}
-  printf("check: ompx_hold only on struct\n");
-  s.i = 20;
-  s.j = 30;
-#pragma omp target data map(ompx_hold, tofrom : s) map(tofrom : s.i)           \
-    map(tofrom : s.j)
-  {
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    CHECK_PRESENCE(s, s.i, s.j);
-#pragma omp target map(tofrom : s)
-    {
-      s.i = 21;
-      s.j = 31;
-    }
-#pragma omp target exit data map(delete : s, s.i)
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    // CHECK-NEXT: values of s.i, s.j: 20, 30
-    CHECK_PRESENCE(s, s.i, s.j);
-    CHECK_VALUES(s.i, s.j);
-  }
-  // CHECK-NEXT: presence of s, s.i, s.j: 0, 0, 0
-  // CHECK-NEXT: values of s.i, s.j: 21, 31
-  CHECK_PRESENCE(s, s.i, s.j);
-  CHECK_VALUES(s.i, s.j);
-
-  // =======================================================================
-  // Check that transfer to/from host checks reference count correctly.
-
-  // -----------------------------------------------------------------------
-  // CHECK-LABEL: check:{{.*}}
-  printf("check: parent DynRefCount=1 is not sufficient for transfer\n");
-  s.i = 20;
-  s.j = 30;
-#pragma omp target data map(ompx_hold, tofrom : s)
-#pragma omp target data map(ompx_hold, tofrom : s)
-  {
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    CHECK_PRESENCE(s, s.i, s.j);
-#pragma omp target map(from : s.i, s.j)
-    {
-      s.i = 21;
-      s.j = 31;
-    } // No transfer here even though parent's DynRefCount=1.
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    // CHECK-NEXT: values of s.i, s.j: 20, 30
-    CHECK_PRESENCE(s, s.i, s.j);
-    CHECK_VALUES(s.i, s.j);
-#pragma omp target map(to : s.i, s.j)
-    { // No transfer here even though parent's DynRefCount=1.
-      // CHECK-NEXT: values of s.i, s.j: 21, 31
-      CHECK_VALUES_DELAYED(s.i, s.j);
-    }
-    CHECK_DELAYED_VALUES();
-  }
-  // CHECK-NEXT: presence of s, s.i, s.j: 0, 0, 0
-  // CHECK-NEXT: values of s.i, s.j: 21, 31
-  CHECK_PRESENCE(s, s.i, s.j);
-  CHECK_VALUES(s.i, s.j);
-
-  // -----------------------------------------------------------------------
-  // CHECK-LABEL: check:{{.*}}
-  printf("check: parent HoldRefCount=1 is not sufficient for transfer\n");
-  s.i = 20;
-  s.j = 30;
-#pragma omp target data map(tofrom : s)
-#pragma omp target data map(tofrom : s)
-  {
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    CHECK_PRESENCE(s, s.i, s.j);
-#pragma omp target map(ompx_hold, from : s.i, s.j)
-    {
-      s.i = 21;
-      s.j = 31;
-    } // No transfer here even though parent's HoldRefCount=1.
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    // CHECK-NEXT: values of s.i, s.j: 20, 30
-    CHECK_PRESENCE(s, s.i, s.j);
-    CHECK_VALUES(s.i, s.j);
-#pragma omp target map(ompx_hold, to : s.i, s.j)
-    { // No transfer here even though parent's HoldRefCount=1.
-      // CHECK-NEXT: values of s.i, s.j: 21, 31
-      CHECK_VALUES_DELAYED(s.i, s.j);
-    }
-    CHECK_DELAYED_VALUES();
-  }
-  // CHECK-NEXT: presence of s, s.i, s.j: 0, 0, 0
-  // CHECK-NEXT: values of s.i, s.j: 21, 31
-  CHECK_PRESENCE(s, s.i, s.j);
-  CHECK_VALUES(s.i, s.j);
-
-  // -----------------------------------------------------------------------
-  // CHECK-LABEL: check:{{.*}}
-  //
-  // At the beginning of a region, if the parent's TotalRefCount=1, then the
-  // transfer should happen.
-  //
-  // At the end of a region, it also must be true that the reference count being
-  // decremented is the reference count that is 1.
-  printf("check: parent TotalRefCount=1 is not sufficient for transfer\n");
-  s.i = 20;
-  s.j = 30;
-#pragma omp target data map(ompx_hold, tofrom : s)
-  {
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    CHECK_PRESENCE(s, s.i, s.j);
-#pragma omp target map(ompx_hold, tofrom : s.i, s.j)
-    {
-      s.i = 21;
-      s.j = 31;
-    }
-#pragma omp target exit data map(from : s.i, s.j)
-    // No transfer here even though parent's TotalRefCount=1.
-    // CHECK-NEXT: presence of s, s.i, s.j: 1, 1, 1
-    // CHECK-NEXT: values of s.i, s.j: 20, 30
-    CHECK_PRESENCE(s, s.i, s.j);
-    CHECK_VALUES(s.i, s.j);
-  }
-  // CHECK-NEXT: presence of s, s.i, s.j: 0, 0, 0
-  // CHECK-NEXT: values of s.i, s.j: 21, 31
-  CHECK_PRESENCE(s, s.i, s.j);
-  CHECK_VALUES(s.i, s.j);
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/ompx_hold/target-data.c b/libomptarget/test/mapping/ompx_hold/target-data.c
deleted file mode 100644
index dd78459..0000000
--- a/libomptarget/test/mapping/ompx_hold/target-data.c
+++ /dev/null

@@ -1,229 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-extensions
-// RUN: %libomptarget-run-generic | %fcheck-generic -strict-whitespace
-
-#include <omp.h>
-#include <stdio.h>
-
-#define CHECK_PRESENCE(Var1, Var2, Var3)                                       \
-  printf("    presence of %s, %s, %s: %d, %d, %d\n", #Var1, #Var2, #Var3,      \
-         omp_target_is_present(&Var1, omp_get_default_device()),               \
-         omp_target_is_present(&Var2, omp_get_default_device()),               \
-         omp_target_is_present(&Var3, omp_get_default_device()))
-
-int main() {
-  int m, r, d;
-  // CHECK: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // -----------------------------------------------------------------------
-  // CHECK-NEXT: check:{{.*}}
-  printf("check: dyn>0, hold=0, dec/reset dyn=0\n");
-
-  // CHECK-NEXT: structured{{.*}}
-  printf("  structured dec of dyn\n");
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-    {
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-    }
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // CHECK-NEXT: dynamic{{.*}}
-  printf("  dynamic dec/reset of dyn\n");
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-    {
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-#pragma omp target exit data map(from : m) map(release : r)
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-#pragma omp target exit data map(from : m) map(release : r) map(delete : d)
-      // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-      CHECK_PRESENCE(m, r, d);
-    }
-    // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target exit data map(from : m) map(release : r) map(delete : d)
-    // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // -----------------------------------------------------------------------
-  // CHECK: check:{{.*}}
-  printf("check: dyn=0, hold>0, dec/reset dyn=0, dec hold=0\n");
-
-  // Structured dec of dyn would require dyn>0.
-
-  // CHECK-NEXT: dynamic{{.*}}
-  printf("  dynamic dec/reset of dyn\n");
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-    {
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-#pragma omp target exit data map(from : m) map(release : r)
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-#pragma omp target exit data map(from : m) map(release : r) map(delete : d)
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-    }
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target exit data map(from : m) map(release : r) map(delete : d)
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // -----------------------------------------------------------------------
-  // CHECK: check:{{.*}}
-  printf("check: dyn>0, hold>0, dec/reset dyn=0, dec hold=0\n");
-
-  // CHECK-NEXT: structured{{.*}}
-  printf("  structured dec of dyn\n");
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-    {
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-      {
-        // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-        CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-        {
-          // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-          CHECK_PRESENCE(m, r, d);
-        }
-        // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-        CHECK_PRESENCE(m, r, d);
-      }
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-    }
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // CHECK-NEXT: dynamic{{.*}}
-  printf("  dynamic dec/reset of dyn\n");
-#pragma omp target enter data map(to : m) map(alloc : r, d)
-  // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-  CHECK_PRESENCE(m, r, d);
-#pragma omp target enter data map(to : m) map(alloc : r, d)
-  // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-  CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-    {
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-#pragma omp target exit data map(from : m) map(release : r)
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-#pragma omp target exit data map(from : m) map(release : r) map(delete : d)
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-    }
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target exit data map(from : m) map(release : r) map(delete : d)
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // -----------------------------------------------------------------------
-  // CHECK: check:{{.*}}
-  printf("check: dyn>0, hold>0, dec hold=0, dec/reset dyn=0\n");
-
-  // CHECK-NEXT: structured{{.*}}
-  printf("  structured dec of dyn\n");
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-    {
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-      {
-        // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-        CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-        {
-          // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-          CHECK_PRESENCE(m, r, d);
-        }
-        // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-        CHECK_PRESENCE(m, r, d);
-      }
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-    }
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // CHECK-NEXT: dynamic{{.*}}
-  printf("  dynamic dec/reset of dyn\n");
-#pragma omp target enter data map(to : m) map(alloc : r, d)
-  // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-  CHECK_PRESENCE(m, r, d);
-#pragma omp target enter data map(to : m) map(alloc : r, d)
-  // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-  CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-    {
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-    }
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-  CHECK_PRESENCE(m, r, d);
-#pragma omp target exit data map(from : m) map(release : r)
-  // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-  CHECK_PRESENCE(m, r, d);
-#pragma omp target exit data map(from : m) map(release : r) map(delete : d)
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/ompx_hold/target.c b/libomptarget/test/mapping/ompx_hold/target.c
deleted file mode 100644
index 40fd14c..0000000
--- a/libomptarget/test/mapping/ompx_hold/target.c
+++ /dev/null

@@ -1,161 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-extensions
-// RUN: %libomptarget-run-generic | %fcheck-generic -strict-whitespace
-
-#include <omp.h>
-#include <stdio.h>
-
-#define CHECK_PRESENCE(Var1, Var2, Var3)                                       \
-  printf("    presence of %s, %s, %s: %d, %d, %d\n", #Var1, #Var2, #Var3,      \
-         omp_target_is_present(&Var1, omp_get_default_device()),               \
-         omp_target_is_present(&Var2, omp_get_default_device()),               \
-         omp_target_is_present(&Var3, omp_get_default_device()))
-
-int main() {
-  int m, r, d;
-  // CHECK: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // -----------------------------------------------------------------------
-  // CHECK-NEXT: check:{{.*}}
-  printf("check: dyn>0, hold=0, dec dyn=0\n");
-
-  // CHECK-NEXT: once
-  printf("  once\n");
-#pragma omp target map(tofrom : m) map(alloc : r, d)
-  ;
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // CHECK-NEXT: twice
-  printf("  twice\n");
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target map(tofrom : m) map(alloc : r, d)
-    ;
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // -----------------------------------------------------------------------
-  // CHECK: check:{{.*}}
-  printf("check: dyn=0, hold>0, dec hold=0\n");
-
-  // CHECK-NEXT: once
-  printf("  once\n");
-#pragma omp target map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-  ;
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // CHECK-NEXT: twice
-  printf("  twice\n");
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-    ;
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // -----------------------------------------------------------------------
-  // CHECK: check:{{.*}}
-  printf("check: dyn>0, hold>0, dec dyn=0, dec hold=0\n");
-
-  // CHECK-NEXT: once each
-  printf("  once each\n");
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target map(tofrom : m) map(alloc : r, d)
-    ;
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // CHECK-NEXT: twice each
-  printf("  twice each\n");
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-    {
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-      {
-        // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-        CHECK_PRESENCE(m, r, d);
-#pragma omp target map(tofrom : m) map(alloc : r, d)
-        ;
-        // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-        CHECK_PRESENCE(m, r, d);
-      }
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-    }
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // -----------------------------------------------------------------------
-  // CHECK: check:{{.*}}
-  printf("check: dyn>0, hold>0, dec hold=0, dec dyn=0\n");
-
-  // CHECK-NEXT: once each
-  printf("  once each\n");
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-    ;
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  // CHECK-NEXT: twice each
-  printf("  twice each\n");
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-  {
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(tofrom : m) map(alloc : r, d)
-    {
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-#pragma omp target data map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-      {
-        // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-        CHECK_PRESENCE(m, r, d);
-#pragma omp target map(ompx_hold, tofrom : m) map(ompx_hold, alloc : r, d)
-        ;
-        // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-        CHECK_PRESENCE(m, r, d);
-      }
-      // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-      CHECK_PRESENCE(m, r, d);
-    }
-    // CHECK-NEXT: presence of m, r, d: 1, 1, 1
-    CHECK_PRESENCE(m, r, d);
-  }
-  // CHECK-NEXT: presence of m, r, d: 0, 0, 0
-  CHECK_PRESENCE(m, r, d);
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/padding_not_mapped.c b/libomptarget/test/mapping/padding_not_mapped.c
deleted file mode 100644
index 3ee70ab..0000000
--- a/libomptarget/test/mapping/padding_not_mapped.c
+++ /dev/null

@@ -1,43 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-version=51
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// The host memory layout for the following program looks like this:
-//
-//   | 4 bytes | 4 bytes | 8 bytes |
-//   |   s.x   |   s.y   |   s.z   |
-//   `-----------------------------'
-//
-// s is always at least 8-byte aligned in host memory due to s.z, so
-// libomptarget's device padding for map(s.y,s.z) always maps to host memory
-// that includes s.x.  At one time, s.x appeared to be mapped as a result, but
-// libomptarget has since been fixed not to consider device padding as mapped to
-// host memory.
-
-#include <omp.h>
-#include <stdio.h>
-
-// Maps only s.y and s.z, then verifies that s.x (which shares the struct's
-// storage) is reported as not present and that a later 'present' mapping of
-// s.x produces the runtime diagnostics matched below.
-int main() {
-  struct S { int x; int y; double z; } s = {1, 2, 3};
-
-  // sizeof yields size_t, so it is printed with %zu; %p requires a void *.
-  // CHECK: &s.x = 0x[[#%x,HOST_ADDR:]], size = [[#%u,SIZE:]]
-  fprintf(stderr, "&s = %p\n", (void *)&s);
-  fprintf(stderr, "&s.x = %p, size = %zu\n", (void *)&s.x, sizeof s.x);
-  fprintf(stderr, "&s.y = %p\n", (void *)&s.y);
-  fprintf(stderr, "&s.z = %p\n", (void *)&s.z);
-
-  // CHECK: s.x is present: 0
-  // CHECK: s.x = 1{{$}}
-  #pragma omp target enter data map(alloc: s.y, s.z)
-  int dev = omp_get_default_device();
-  fprintf(stderr, "s.x is present: %d\n", omp_target_is_present(&s.x, dev));
-  #pragma omp target update from(s.x) // should have no effect
-  fprintf(stderr, "s.x = %d\n", s.x);
-
-  // CHECK: omptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes)
-  // CHECK: omptarget error: Call to getTargetPointer returned null pointer ('present' map type modifier).
-  // CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-  #pragma omp target enter data map(present, alloc: s.x)
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/power_of_two_alignment.c b/libomptarget/test/mapping/power_of_two_alignment.c
deleted file mode 100644
index faebe4f..0000000
--- a/libomptarget/test/mapping/power_of_two_alignment.c
+++ /dev/null

@@ -1,89 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-// Assuming the stack is allocated on the host starting at high addresses, the
-// host memory layout for the following program looks like this:
-//
-//   low addr <----------------------------------------------------- high addr
-//              |   16 bytes  | 16 bytes  |  16 bytes  | ? bytes  |
-//              | collidePost |     s     | collidePre | stackPad |
-//              |             | x | y | z |            |          |
-//              `-------------'
-//                    ^  `--------'
-//                    |      ^
-//                    |      |
-//                    |      `-- too much padding (< 16 bytes) for s maps here
-//                    |
-//                    `------------------array extension error maps here
-//
-// libomptarget used to add too much padding to the device allocation of s and
-// map it back to the host at the location indicated above when all of the
-// following conditions were true:
-// - Multiple members (s.y and s.z below) were mapped.  In this case, initial
-//   padding might be needed to ensure later mapped members (s.z) are aligned
-//   properly on the device.  (If the first member in the struct, s.x, were also
-//   mapped, then the correct initial padding would always be zero.)
-// - mod16 = &s % 16 was not a power of 2 (e.g., 0x7ffcce2b584e % 16 = 14).
-//   libomptarget then incorrectly assumed mod16 was the existing host memory
-//   alignment of s.  (The fix was to only look for alignments that are powers
-//   of 2.)
-// - &s.y % mod16 was > 1 (e.g., 0x7ffcce2b584f % 14 = 11).  libomptarget added
-//   padding of that size for s, but at most 1 byte is ever actually needed.
-//
-// Below, we try many sizes of stackPad to try to produce those conditions.
-//
-// When collidePost was then mapped to the same host memory as the unnecessary
-// padding for s, libomptarget reported an array extension error.  collidePost
-// is never fully contained within that padding (which would avoid the extension
-// error) because collidePost is 16 bytes while the padding is always less than
-// 16 bytes due to the modulo operations.  (Later, libomptarget was changed not
-// to consider padding to be mapped to the host, so it cannot be involved in
-// array extension errors.)
-
-#include <stdint.h>
-#include <stdio.h>
-
-// Places a StackPad-sized object on the stack ahead of three struct S
-// objects, prints the addresses involved, and then maps s.y/s.z followed by
-// collidePre/collidePost in nested target data regions.
-template <typename StackPad>
-void test() {
-  StackPad stackPad;
-  struct S { char x; char y[7]; char z[8]; };
-  struct S collidePre, s, collidePost;
-  uintptr_t mod16 = (uintptr_t)&s % 16;
-  // uintptr_t is not guaranteed to be unsigned long, so values are cast to
-  // match %lu; %p requires a void * argument.
-  fprintf(stderr, "&s = %p\n", (void *)&s);
-  fprintf(stderr, "&s %% 16 = %lu\n", (unsigned long)mod16);
-  if (mod16) {
-    fprintf(stderr, "&s.y = %p\n", (void *)&s.y);
-    fprintf(stderr, "&s.y %% %lu = %lu\n", (unsigned long)mod16,
-            (unsigned long)((uintptr_t)&s.y % mod16));
-  }
-  fprintf(stderr, "&collidePre = %p\n", (void *)&collidePre);
-  fprintf(stderr, "&collidePost = %p\n", (void *)&collidePost);
-  #pragma omp target data map(to:s.y, s.z)
-  #pragma omp target data map(to:collidePre, collidePost)
-  ;
-}
-
-#define TEST(StackPad)                                                         \
-  fprintf(stderr, "-------------------------------------\n");                  \
-  fprintf(stderr, "StackPad=%s\n", #StackPad);                                 \
-  test<StackPad>()
-
-int main() {
-  TEST(char[1]);
-  TEST(char[2]);
-  TEST(char[3]);
-  TEST(char[4]);
-  TEST(char[5]);
-  TEST(char[6]);
-  TEST(char[7]);
-  TEST(char[8]);
-  TEST(char[9]);
-  TEST(char[10]);
-  TEST(char[11]);
-  TEST(char[12]);
-  TEST(char[13]);
-  TEST(char[14]);
-  TEST(char[15]);
-  TEST(char[16]);
-  // CHECK: pass
-  printf("pass\n");
-  return 0;
-}

diff --git a/libomptarget/test/mapping/pr38704.c b/libomptarget/test/mapping/pr38704.c
deleted file mode 100644
index 72b4a17..0000000
--- a/libomptarget/test/mapping/pr38704.c
+++ /dev/null

@@ -1,44 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// Clang 6.0 doesn't use the new map interface, undefined behavior when
-// the compiler emits "old" interface code for structures.
-// UNSUPPORTED: clang-6
-
-#include <stdio.h>
-#include <stdlib.h>
-
-typedef struct {
-  int *ptr1;
-  int *ptr2;
-} StructWithPtrs;
-
-int main(int argc, char *argv[]) {
-  StructWithPtrs s, s2;
-  s.ptr1 = malloc(sizeof(int));
-  s.ptr2 = malloc(2 * sizeof(int));
-  s2.ptr1 = malloc(sizeof(int));
-  s2.ptr2 = malloc(2 * sizeof(int));
-
-#pragma omp target enter data map(to : s2.ptr2[0 : 1])
-#pragma omp target map(s.ptr1[0 : 1], s.ptr2[0 : 2])
-  {
-    s.ptr1[0] = 1;
-    s.ptr2[0] = 2;
-    s.ptr2[1] = 3;
-  }
-#pragma omp target exit data map(from : s2.ptr1[0 : 1], s2.ptr2[0 : 1])
-
-  // CHECK: s.ptr1[0] = 1
-  // CHECK: s.ptr2[0] = 2
-  // CHECK: s.ptr2[1] = 3
-  printf("s.ptr1[0] = %d\n", s.ptr1[0]);
-  printf("s.ptr2[0] = %d\n", s.ptr2[0]);
-  printf("s.ptr2[1] = %d\n", s.ptr2[1]);
-
-  free(s.ptr1);
-  free(s.ptr2);
-  free(s2.ptr1);
-  free(s2.ptr2);
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/prelock.cpp b/libomptarget/test/mapping/prelock.cpp
deleted file mode 100644
index d019108..0000000
--- a/libomptarget/test/mapping/prelock.cpp
+++ /dev/null

@@ -1,69 +0,0 @@
-// RUN: %libomptarget-compilexx-generic
-// RUN: %libomptarget-run-generic %fcheck-generic
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: amdgcn-amd-amdhsa
-
-#include <cstdio>
-
-#include <omp.h>
-
-extern "C" {
-void *llvm_omp_target_lock_mem(void *ptr, size_t size, int device_num);
-void llvm_omp_target_unlock_mem(void *ptr, int device_num);
-}
-
-// Locks a host buffer with llvm_omp_target_lock_mem, increments it from four
-// target regions (two through the original pointer, two through the pointer
-// returned by the lock call), unlocks it, and verifies the final values.
-// Returns the number of mismatching elements.
-int main() {
-  int n = 100;
-  int *unlocked = new int[n];
-
-  for (int i = 0; i < n; i++)
-    unlocked[i] = i;
-
-  int *locked = (int *)llvm_omp_target_lock_mem(unlocked, n * sizeof(int),
-                                                omp_get_default_device());
-  if (!locked)
-    return 0;
-
-#pragma omp target teams distribute parallel for map(tofrom : unlocked[ : n])
-  for (int i = 0; i < n; i++)
-    unlocked[i] += 1;
-
-#pragma omp target teams distribute parallel for map(tofrom : unlocked[10 : 10])
-  for (int i = 10; i < 20; i++)
-    unlocked[i] += 1;
-
-#pragma omp target teams distribute parallel for map(tofrom : locked[ : n])
-  for (int i = 0; i < n; i++)
-    locked[i] += 1;
-
-#pragma omp target teams distribute parallel for map(tofrom : locked[10 : 10])
-  for (int i = 10; i < 20; i++)
-    locked[i] += 1;
-
-  llvm_omp_target_unlock_mem(unlocked, omp_get_default_device());
-
-  int err = 0;
-  for (int i = 0; i < n; i++) {
-    // Elements outside [10, 20) are expected to have been incremented twice,
-    // elements inside it four times; the messages report the same expected
-    // value that the condition tests against.
-    if (i < 10 || i > 19) {
-      if (unlocked[i] != i + 2) {
-        printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 2);
-        err++;
-      }
-    } else if (unlocked[i] != i + 4) {
-      printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 4);
-      err++;
-    }
-  }
-
-  // CHECK: PASS
-  if (err == 0)
-    printf("PASS\n");
-
-  return err;
-}

diff --git a/libomptarget/test/mapping/present/target.c b/libomptarget/test/mapping/present/target.c
deleted file mode 100644
index 4344c42..0000000
--- a/libomptarget/test/mapping/present/target.c
+++ /dev/null

@@ -1,33 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-
-// Uses 'map(present, alloc : i)' on a target construct twice: once nested in
-// a target data region that maps i, and once with no enclosing mapping, where
-// the runtime diagnostics matched below are expected.
-int main() {
-  int i;
-
-  // sizeof yields size_t, so it is printed with %zu; %p requires a void *.
-  // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]]
-  fprintf(stderr, "addr=%p, size=%zu\n", (void *)&i, sizeof i);
-
-  // CHECK-NOT: omptarget
-#pragma omp target data map(alloc : i)
-#pragma omp target map(present, alloc : i)
-  ;
-
-  // CHECK: i is present
-  fprintf(stderr, "i is present\n");
-
-  // CHECK: omptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes)
-  // CHECK: omptarget error: Call to getTargetPointer returned null pointer ('present' map type modifier).
-  // CHECK: omptarget error: Call to targetDataBegin failed, abort target.
-  // CHECK: omptarget error: Failed to process data before launching the kernel.
-  // CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-#pragma omp target map(present, alloc : i)
-  ;
-
-  // CHECK-NOT: i is present
-  fprintf(stderr, "i is present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/target_array_extension.c b/libomptarget/test/mapping/present/target_array_extension.c
deleted file mode 100644
index 873b2b3..0000000
--- a/libomptarget/test/mapping/present/target_array_extension.c
+++ /dev/null

@@ -1,89 +0,0 @@
-// --------------------------------------------------
-// Check extends before
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DEXTENDS=BEFORE
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// --------------------------------------------------
-// Check extends after
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DEXTENDS=AFTER
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// END.
-
-#include <stdio.h>
-
-#define BEFORE 0
-#define AFTER 1
-
-#define SIZE 100
-
-#if EXTENDS == BEFORE
-#define SMALL_BEG (SIZE - 2)
-#define SMALL_END SIZE
-#define LARGE_BEG 0
-#define LARGE_END SIZE
-#elif EXTENDS == AFTER
-#define SMALL_BEG 0
-#define SMALL_END 2
-#define LARGE_BEG 0
-#define LARGE_END SIZE
-#else
-#error EXTENDS undefined
-#endif
-
-#define SMALL_SIZE (SMALL_END - SMALL_BEG)
-#define LARGE_SIZE (LARGE_END - LARGE_BEG)
-
-#define SMALL                                                                  \
-  SMALL_BEG:                                                                   \
-  SMALL_SIZE
-#define LARGE                                                                  \
-  LARGE_BEG:                                                                   \
-  LARGE_SIZE
-
-// Prints the addresses and byte sizes of the SMALL and LARGE sections of arr,
-// then applies 'map(present, tofrom : ...)' on a target construct: first with
-// SMALL inside a region mapping LARGE, then with LARGE inside a region
-// mapping only SMALL, where the diagnostics matched below are expected.
-int main() {
-  int arr[SIZE];
-
-  // The size operands are size_t, so they are printed with %zu; %p requires
-  // a void * argument.
-  // CHECK: addr=0x[[#%x,SMALL_ADDR:]], size=[[#%u,SMALL_BYTES:]]
-  fprintf(stderr, "addr=%p, size=%zu\n", (void *)&arr[SMALL_BEG],
-          SMALL_SIZE * sizeof arr[0]);
-
-  // CHECK: addr=0x[[#%x,LARGE_ADDR:]], size=[[#%u,LARGE_BYTES:]]
-  fprintf(stderr, "addr=%p, size=%zu\n", (void *)&arr[LARGE_BEG],
-          LARGE_SIZE * sizeof arr[0]);
-
-  // CHECK-NOT: omptarget
-#pragma omp target data map(alloc : arr[LARGE])
-  {
-#pragma omp target map(present, tofrom : arr[SMALL])
-    ;
-  }
-
-  // CHECK: arr is present
-  fprintf(stderr, "arr is present\n");
-
-  // CHECK: omptarget message: explicit extension not allowed: host address specified is 0x{{0*}}[[#LARGE_ADDR]] ([[#LARGE_BYTES]] bytes), but device allocation maps to host at 0x{{0*}}[[#SMALL_ADDR]] ([[#SMALL_BYTES]] bytes)
-  // CHECK: omptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#LARGE_ADDR]] ([[#LARGE_BYTES]] bytes)
-  // CHECK: omptarget error: Call to getTargetPointer returned null pointer ('present' map type modifier).
-  // CHECK: omptarget error: Call to targetDataBegin failed, abort target.
-  // CHECK: omptarget error: Failed to process data before launching the kernel.
-  // CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-#pragma omp target data map(alloc : arr[SMALL])
-  {
-#pragma omp target map(present, tofrom : arr[LARGE])
-    ;
-  }
-
-  // CHECK-NOT: arr is present
-  fprintf(stderr, "arr is present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/target_data.c b/libomptarget/test/mapping/present/target_data.c
deleted file mode 100644
index f894283..0000000
--- a/libomptarget/test/mapping/present/target_data.c
+++ /dev/null

@@ -1,30 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-
-// Uses 'map(present, alloc : i)' on a target data construct twice: once
-// nested in a region that maps i, and once with no enclosing mapping, where
-// the runtime diagnostics matched below are expected.
-int main() {
-  int i;
-
-  // sizeof yields size_t, so it is printed with %zu; %p requires a void *.
-  // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]]
-  fprintf(stderr, "addr=%p, size=%zu\n", (void *)&i, sizeof i);
-
-  // CHECK-NOT: omptarget
-#pragma omp target data map(alloc : i)
-#pragma omp target data map(present, alloc : i)
-  ;
-
-  // CHECK: i is present
-  fprintf(stderr, "i is present\n");
-
-  // CHECK: omptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes)
-  // CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-#pragma omp target data map(present, alloc : i)
-  ;
-
-  // CHECK-NOT: i is present
-  fprintf(stderr, "i is present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/target_data_array_extension.c b/libomptarget/test/mapping/present/target_data_array_extension.c
deleted file mode 100644
index 794543a..0000000
--- a/libomptarget/test/mapping/present/target_data_array_extension.c
+++ /dev/null

@@ -1,87 +0,0 @@
-// --------------------------------------------------
-// Check extends before
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DEXTENDS=BEFORE
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// --------------------------------------------------
-// Check extends after
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DEXTENDS=AFTER
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// END.
-
-#include <stdio.h>
-
-#define BEFORE 0
-#define AFTER 1
-
-#define SIZE 100
-
-#if EXTENDS == BEFORE
-#define SMALL_BEG (SIZE - 2)
-#define SMALL_END SIZE
-#define LARGE_BEG 0
-#define LARGE_END SIZE
-#elif EXTENDS == AFTER
-#define SMALL_BEG 0
-#define SMALL_END 2
-#define LARGE_BEG 0
-#define LARGE_END SIZE
-#else
-#error EXTENDS undefined
-#endif
-
-#define SMALL_SIZE (SMALL_END - SMALL_BEG)
-#define LARGE_SIZE (LARGE_END - LARGE_BEG)
-
-#define SMALL                                                                  \
-  SMALL_BEG:                                                                   \
-  SMALL_SIZE
-#define LARGE                                                                  \
-  LARGE_BEG:                                                                   \
-  LARGE_SIZE
-
-// Prints the addresses and byte sizes of the SMALL and LARGE sections of arr,
-// then applies 'map(present, tofrom : ...)' on a target data construct: first
-// with SMALL inside a region mapping LARGE, then with LARGE inside a region
-// mapping only SMALL, where the diagnostics matched below are expected.
-int main() {
-  int arr[SIZE];
-
-  // The size operands are size_t, so they are printed with %zu; %p requires
-  // a void * argument.
-  // CHECK: addr=0x[[#%x,SMALL_ADDR:]], size=[[#%u,SMALL_BYTES:]]
-  fprintf(stderr, "addr=%p, size=%zu\n", (void *)&arr[SMALL_BEG],
-          SMALL_SIZE * sizeof arr[0]);
-
-  // CHECK: addr=0x[[#%x,LARGE_ADDR:]], size=[[#%u,LARGE_BYTES:]]
-  fprintf(stderr, "addr=%p, size=%zu\n", (void *)&arr[LARGE_BEG],
-          LARGE_SIZE * sizeof arr[0]);
-
-  // CHECK-NOT: omptarget
-#pragma omp target data map(alloc : arr[LARGE])
-  {
-#pragma omp target data map(present, tofrom : arr[SMALL])
-    ;
-  }
-
-  // CHECK: arr is present
-  fprintf(stderr, "arr is present\n");
-
-  // CHECK: omptarget message: explicit extension not allowed: host address specified is 0x{{0*}}[[#LARGE_ADDR]] ([[#LARGE_BYTES]] bytes), but device allocation maps to host at 0x{{0*}}[[#SMALL_ADDR]] ([[#SMALL_BYTES]] bytes)
-  // CHECK: omptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#LARGE_ADDR]] ([[#LARGE_BYTES]] bytes)
-  // CHECK: omptarget error: Call to getTargetPointer returned null pointer ('present' map type modifier).
-  // CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-#pragma omp target data map(alloc : arr[SMALL])
-  {
-#pragma omp target data map(present, tofrom : arr[LARGE])
-    ;
-  }
-
-  // CHECK-NOT: arr is present
-  fprintf(stderr, "arr is present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/target_data_at_exit.c b/libomptarget/test/mapping/present/target_data_at_exit.c
deleted file mode 100644
index c1fbbae..0000000
--- a/libomptarget/test/mapping/present/target_data_at_exit.c
+++ /dev/null

@@ -1,25 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-
-int main() {
-  int i;
-
-#pragma omp target enter data map(alloc : i)
-
-  // i isn't present at the end of the target data region, but the "present"
-  // modifier is only checked at the beginning of a region.
-#pragma omp target data map(present, alloc : i)
-  {
-#pragma omp target exit data map(delete : i)
-  }
-
-  // CHECK-NOT: omptarget
-  // CHECK: success
-  // CHECK-NOT: omptarget
-  fprintf(stderr, "success\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/target_enter_data.c b/libomptarget/test/mapping/present/target_enter_data.c
deleted file mode 100644
index 871a052..0000000
--- a/libomptarget/test/mapping/present/target_enter_data.c
+++ /dev/null

@@ -1,30 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-
-// Uses 'target enter data map(present, alloc : i)' twice: once after i has
-// been mapped by a preceding enter data, and once after it has been deleted,
-// where the runtime diagnostics matched below are expected.
-int main() {
-  int i;
-
-  // sizeof yields size_t, so it is printed with %zu; %p requires a void *.
-  // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]]
-  fprintf(stderr, "addr=%p, size=%zu\n", (void *)&i, sizeof i);
-
-  // CHECK-NOT: omptarget
-#pragma omp target enter data map(alloc : i)
-#pragma omp target enter data map(present, alloc : i)
-#pragma omp target exit data map(delete : i)
-
-  // CHECK: i is present
-  fprintf(stderr, "i is present\n");
-
-  // CHECK: omptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes)
-  // CHECK: omptarget error: Call to getTargetPointer returned null pointer ('present' map type modifier).
-  // CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-#pragma omp target enter data map(present, alloc : i)
-
-  // CHECK-NOT: i is present
-  fprintf(stderr, "i is present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/target_exit_data_delete.c b/libomptarget/test/mapping/present/target_exit_data_delete.c
deleted file mode 100644
index 0fb812b..0000000
--- a/libomptarget/test/mapping/present/target_exit_data_delete.c
+++ /dev/null

@@ -1,28 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-
-// Uses 'target exit data map(present, delete : i)' twice: once while i is
-// mapped, and once after that mapping has been deleted, where the runtime
-// diagnostics matched below are expected.
-int main() {
-  int i;
-
-  // sizeof yields size_t, so it is printed with %zu; %p requires a void *.
-  // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]]
-  fprintf(stderr, "addr=%p, size=%zu\n", (void *)&i, sizeof i);
-
-// CHECK-NOT: omptarget
-#pragma omp target enter data map(alloc : i)
-#pragma omp target exit data map(present, delete : i)
-
-  // CHECK: i was present
-  fprintf(stderr, "i was present\n");
-
-// CHECK: omptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes)
-// CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-#pragma omp target exit data map(present, delete : i)
-
-  // CHECK-NOT: i was present
-  fprintf(stderr, "i was present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/target_exit_data_release.c b/libomptarget/test/mapping/present/target_exit_data_release.c
deleted file mode 100644
index 14be22f..0000000
--- a/libomptarget/test/mapping/present/target_exit_data_release.c
+++ /dev/null

@@ -1,28 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-
-// Uses 'target exit data map(present, release : i)' twice: once while i is
-// mapped by a preceding enter data, and once more afterwards, where the
-// runtime diagnostics matched below are expected.
-int main() {
-  int i;
-
-  // sizeof yields size_t, so it is printed with %zu; %p requires a void *.
-  // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]]
-  fprintf(stderr, "addr=%p, size=%zu\n", (void *)&i, sizeof i);
-
-// CHECK-NOT: omptarget
-#pragma omp target enter data map(alloc : i)
-#pragma omp target exit data map(present, release : i)
-
-  // CHECK: i was present
-  fprintf(stderr, "i was present\n");
-
-// CHECK: omptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes)
-// CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-#pragma omp target exit data map(present, release : i)
-
-  // CHECK-NOT: i was present
-  fprintf(stderr, "i was present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/target_update.c b/libomptarget/test/mapping/present/target_update.c
deleted file mode 100644
index 9f6783b..0000000
--- a/libomptarget/test/mapping/present/target_update.c
+++ /dev/null

@@ -1,43 +0,0 @@
-// --------------------------------------------------
-// Check 'to'
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DCLAUSE=to
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// --------------------------------------------------
-// Check 'from'
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DCLAUSE=from
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-
-// Uses 'target update CLAUSE(present : i)' twice (CLAUSE is supplied on the
-// compile line): once while i is mapped, and once after the mapping has been
-// deleted, where the runtime diagnostics matched below are expected.
-int main() {
-  int i;
-
-  // sizeof yields size_t, so it is printed with %zu; %p requires a void *.
-  // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]]
-  fprintf(stderr, "addr=%p, size=%zu\n", (void *)&i, sizeof i);
-
-  // CHECK-NOT: omptarget
-#pragma omp target enter data map(alloc : i)
-#pragma omp target update CLAUSE(present : i)
-#pragma omp target exit data map(delete : i)
-
-  // CHECK: i is present
-  fprintf(stderr, "i is present\n");
-
-  // CHECK: omptarget message: device mapping required by 'present' motion modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes)
-  // CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-#pragma omp target update CLAUSE(present : i)
-
-  // CHECK-NOT: i is present
-  fprintf(stderr, "i is present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/target_update_array_extension.c b/libomptarget/test/mapping/present/target_update_array_extension.c
deleted file mode 100644
index 11ad4a8..0000000
--- a/libomptarget/test/mapping/present/target_update_array_extension.c
+++ /dev/null

@@ -1,80 +0,0 @@
-// --------------------------------------------------
-// Check 'to' and extends before
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DCLAUSE=to -DEXTENDS=BEFORE
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// --------------------------------------------------
-// Check 'from' and extends before
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DCLAUSE=from -DEXTENDS=BEFORE
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// --------------------------------------------------
-// Check 'to' and extends after
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DCLAUSE=to -DEXTENDS=AFTER
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// --------------------------------------------------
-// Check 'from' and extends after
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DCLAUSE=from -DEXTENDS=AFTER
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// END.
-
-#include <stdio.h>
-
-#define BEFORE 0
-#define AFTER 1
-
-#if EXTENDS == BEFORE
-#define SMALL 2 : 3
-#define LARGE 0 : 5
-#elif EXTENDS == AFTER
-#define SMALL 0 : 3
-#define LARGE 0 : 5
-#else
-#error EXTENDS undefined
-#endif
-
-// Uses 'target update CLAUSE(present : ...)' on array sections (CLAUSE and
-// EXTENDS are supplied on the compile line): first on SMALL inside a region
-// mapping LARGE, then on LARGE inside a region mapping only SMALL, where the
-// runtime diagnostics matched below are expected.
-int main() {
-  int arr[5];
-
-  // sizeof yields size_t, so it is printed with %zu; %p requires a void *.
-  // CHECK: addr=0x[[#%x,HOST_ADDR:]], size=[[#%u,SIZE:]]
-  fprintf(stderr, "addr=%p, size=%zu\n", (void *)arr, sizeof arr);
-
-  // CHECK-NOT: omptarget
-#pragma omp target data map(alloc : arr[LARGE])
-  {
-#pragma omp target update CLAUSE(present : arr[SMALL])
-  }
-
-  // CHECK: arr is present
-  fprintf(stderr, "arr is present\n");
-
-  // CHECK: omptarget message: device mapping required by 'present' motion modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] ([[#SIZE]] bytes)
-  // CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-#pragma omp target data map(alloc : arr[SMALL])
-  {
-#pragma omp target update CLAUSE(present : arr[LARGE])
-  }
-
-  // CHECK-NOT: arr is present
-  fprintf(stderr, "arr is present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/unified_shared_memory.c b/libomptarget/test/mapping/present/unified_shared_memory.c
deleted file mode 100644
index ab6e3bd..0000000
--- a/libomptarget/test/mapping/present/unified_shared_memory.c
+++ /dev/null

@@ -1,31 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// REQUIRES: unified_shared_memory
-
-#include <stdio.h>
-
-// The runtime considers unified shared memory to be always present.
-#pragma omp requires unified_shared_memory
-
-int main() {
-  int i;
-
-  // CHECK-NOT: omptarget
-#pragma omp target data map(alloc : i)
-#pragma omp target map(present, alloc : i)
-  ;
-
-  // CHECK: i is present
-  fprintf(stderr, "i is present\n");
-
-  // CHECK-NOT: omptarget
-#pragma omp target map(present, alloc : i)
-  ;
-
-  // CHECK: is present
-  fprintf(stderr, "i is present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/zero_length_array_section.c b/libomptarget/test/mapping/present/zero_length_array_section.c
deleted file mode 100644
index e903a26..0000000
--- a/libomptarget/test/mapping/present/zero_length_array_section.c
+++ /dev/null

@@ -1,36 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-
-int main() {
-  int arr[5];
-
-  // CHECK: addr=0x[[#%x,HOST_ADDR:]]
-  fprintf(stderr, "addr=%p\n", arr);
-
-  // CHECK-NOT: omptarget
-#pragma omp target data map(alloc : arr[0 : 5])
-#pragma omp target map(present, alloc : arr[0 : 0])
-  ;
-
-  // CHECK: arr is present
-  fprintf(stderr, "arr is present\n");
-
-  // arr[0:0] doesn't create an actual mapping in the first directive.
-  //
-  // CHECK: omptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] (0 bytes)
-  // CHECK: omptarget error: Call to getTargetPointer returned null pointer ('present' map type modifier).
-  // CHECK: omptarget error: Call to targetDataBegin failed, abort target.
-  // CHECK: omptarget error: Failed to process data before launching the kernel.
-  // CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-#pragma omp target data map(alloc : arr[0 : 0])
-#pragma omp target map(present, alloc : arr[0 : 0])
-  ;
-
-  // CHECK-NOT: arr is present
-  fprintf(stderr, "arr is present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/present/zero_length_array_section_exit.c b/libomptarget/test/mapping/present/zero_length_array_section_exit.c
deleted file mode 100644
index 5a73605..0000000
--- a/libomptarget/test/mapping/present/zero_length_array_section_exit.c
+++ /dev/null

@@ -1,31 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-
-int main() {
-  int arr[5];
-
-  // CHECK: addr=0x[[#%x,HOST_ADDR:]]
-  fprintf(stderr, "addr=%p\n", arr);
-
-  // CHECK-NOT: omptarget
-#pragma omp target enter data map(alloc : arr[0 : 5])
-#pragma omp target exit data map(present, release : arr[0 : 0])
-
-  // CHECK: arr is present
-  fprintf(stderr, "arr is present\n");
-
-  // arr[0:0] doesn't create an actual mapping in the first directive.
-  //
-  // CHECK: omptarget message: device mapping required by 'present' map type modifier does not exist for host address 0x{{0*}}[[#HOST_ADDR]] (0 bytes)
-  // CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-#pragma omp target enter data map(alloc : arr[0 : 0])
-#pragma omp target exit data map(present, release : arr[0 : 0])
-
-  // CHECK-NOT: arr is present
-  fprintf(stderr, "arr is present\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/private_mapping.c b/libomptarget/test/mapping/private_mapping.c
deleted file mode 100644
index 1329a66..0000000
--- a/libomptarget/test/mapping/private_mapping.c
+++ /dev/null

@@ -1,44 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// UNSUPPORTED: amdgcn-amd-amdhsa
-
-#include <assert.h>
-#include <stdio.h>
-
-int main() {
-  int data1[3] = {1, 2, 5};
-  int data2[3] = {10, 20, 50};
-  int data3[3] = {100, 200, 500};
-  int sum[16] = {0};
-
-  for (int i=0; i<16; i++) sum[i] = 10000;
-
-#pragma omp target teams distribute parallel for map(tofrom : sum[:16])       \
-    firstprivate(data1, data2, data3)
-  for (int i = 0; i < 16; ++i) {
-    for (int j = 0; j < 3; ++j) {
-      sum[i] += data1[j];
-      sum[i] += data2[j];
-      sum[i] += data3[j];
-    }
-  }
-
-  int correct = 1;
-  for (int i = 0; i < 16; ++i) {
-    if (sum[i] != 10888) {
-      correct = 0;
-      printf("ERROR: The sum for index %d is %d\n", i, sum[i]);
-      printf("ERROR: data1 = {%d, %d, %d}\n", data1[0], data1[1], data1[2]);
-      printf("ERROR: data2 = {%d, %d, %d}\n", data2[0], data2[1], data2[2]);
-      printf("ERROR: data3 = {%d, %d, %d}\n", data3[0], data3[1], data3[2]);
-      break;
-    }
-  }
-  fflush(stdout);
-  assert(correct);
-
-  printf("PASS\n");
-
-  return 0;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/mapping/ptr_and_obj_motion.c b/libomptarget/test/mapping/ptr_and_obj_motion.c
deleted file mode 100644
index 8fa2c98..0000000
--- a/libomptarget/test/mapping/ptr_and_obj_motion.c
+++ /dev/null

@@ -1,43 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <stdio.h>
-
-typedef struct {
-  double *dataptr;
-  int dummy1;
-  int dummy2;
-} DV;
-
-void init(double vertexx[]) {
-#pragma omp target map(vertexx[0 : 100])
-  {
-    printf("In init: %lf, expected 100.0\n", vertexx[77]);
-    vertexx[77] = 77.0;
-  }
-}
-
-void change(DV *dvptr) {
-#pragma omp target map(dvptr->dataptr[0 : 100])
-  {
-    printf("In change: %lf, expected 77.0\n", dvptr->dataptr[77]);
-    dvptr->dataptr[77] += 1.0;
-  }
-}
-
-int main() {
-  double vertexx[100];
-  vertexx[77] = 100.0;
-
-  DV dv;
-  dv.dataptr = &vertexx[0];
-
-#pragma omp target enter data map(to : vertexx[0 : 100])
-
-  init(vertexx);
-  change(&dv);
-
-#pragma omp target exit data map(from : vertexx[0 : 100])
-
-  // CHECK: Final: 78.0
-  printf("Final: %lf\n", vertexx[77]);
-}

diff --git a/libomptarget/test/mapping/reduction_implicit_map.cpp b/libomptarget/test/mapping/reduction_implicit_map.cpp
deleted file mode 100644
index 329aa43..0000000
--- a/libomptarget/test/mapping/reduction_implicit_map.cpp
+++ /dev/null

@@ -1,22 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <stdio.h>
-
-void sum(int *input, int size, int *output) {
-#pragma omp target teams distribute parallel for reduction(+ : output[0])      \
-    map(to : input[0 : size])
-  for (int i = 0; i < size; i++)
-    output[0] += input[i];
-}
-int main() {
-  const int size = 100;
-  int *array = new int[size];
-  int result = 0;
-  for (int i = 0; i < size; i++)
-    array[i] = i + 1;
-  sum(array, size, &result);
-  // CHECK: Result=5050
-  printf("Result=%d\n", result);
-  delete[] array;
-  return 0;
-}

diff --git a/libomptarget/test/mapping/target_data_array_extension_at_exit.c b/libomptarget/test/mapping/target_data_array_extension_at_exit.c
deleted file mode 100644
index e300c80..0000000
--- a/libomptarget/test/mapping/target_data_array_extension_at_exit.c
+++ /dev/null

@@ -1,114 +0,0 @@
-// --------------------------------------------------
-// Check extends before
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DEXTENDS=BEFORE
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// --------------------------------------------------
-// Check extends after
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DEXTENDS=AFTER
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// END.
-
-#include <stdio.h>
-
-#define BEFORE 0
-#define AFTER 1
-
-#define SIZE 100
-
-#if EXTENDS == BEFORE
-#define SMALL_BEG (SIZE - 2)
-#define SMALL_END SIZE
-#define LARGE_BEG 0
-#define LARGE_END SIZE
-#elif EXTENDS == AFTER
-#define SMALL_BEG 0
-#define SMALL_END 2
-#define LARGE_BEG 0
-#define LARGE_END SIZE
-#else
-#error EXTENDS undefined
-#endif
-
-#define SMALL                                                                  \
-  SMALL_BEG:                                                                   \
-  (SMALL_END - SMALL_BEG)
-#define LARGE                                                                  \
-  LARGE_BEG:                                                                   \
-  (LARGE_END - LARGE_BEG)
-
-void check_not_present() {
-  int arr[SIZE];
-
-  for (int i = 0; i < SIZE; ++i)
-    arr[i] = 99;
-
-  // CHECK-LABEL: checking not present
-  fprintf(stderr, "checking not present\n");
-
-  // arr[LARGE] isn't (fully) present at the end of the target data region, so
-  // the device-to-host transfer should not be performed, or it might fail.
-#pragma omp target data map(tofrom : arr[LARGE])
-  {
-#pragma omp target exit data map(delete : arr[LARGE])
-#pragma omp target enter data map(alloc : arr[SMALL])
-#pragma omp target map(alloc : arr[SMALL])
-    for (int i = SMALL_BEG; i < SMALL_END; ++i)
-      arr[i] = 88;
-  }
-
-  // CHECK-NOT: omptarget
-  // CHECK-NOT: error
-  for (int i = 0; i < SIZE; ++i) {
-    if (arr[i] != 99)
-      fprintf(stderr, "error: arr[%d]=%d\n", i, arr[i]);
-  }
-}
-
-void check_is_present() {
-  int arr[SIZE];
-
-  for (int i = 0; i < SIZE; ++i)
-    arr[i] = 99;
-
-  // CHECK-LABEL: checking is present
-  fprintf(stderr, "checking is present\n");
-
-  // arr[SMALL] is (fully) present at the end of the target data region, and the
-  // device-to-host transfer should be performed only for it even though more
-  // of the array is then present.
-#pragma omp target data map(tofrom : arr[SMALL])
-  {
-#pragma omp target exit data map(delete : arr[SMALL])
-#pragma omp target enter data map(alloc : arr[LARGE])
-#pragma omp target map(alloc : arr[LARGE])
-    for (int i = LARGE_BEG; i < LARGE_END; ++i)
-      arr[i] = 88;
-  }
-
-  // CHECK-NOT: omptarget
-  // CHECK-NOT: error
-  for (int i = 0; i < SIZE; ++i) {
-    if (SMALL_BEG <= i && i < SMALL_END) {
-      if (arr[i] != 88)
-        fprintf(stderr, "error: arr[%d]=%d\n", i, arr[i]);
-    } else if (arr[i] != 99) {
-      fprintf(stderr, "error: arr[%d]=%d\n", i, arr[i]);
-    }
-  }
-}
-
-int main() {
-  check_not_present();
-  check_is_present();
-  return 0;
-}

diff --git a/libomptarget/test/mapping/target_derefence_array_pointrs.cpp b/libomptarget/test/mapping/target_derefence_array_pointrs.cpp
deleted file mode 100644
index a6dd406..0000000
--- a/libomptarget/test/mapping/target_derefence_array_pointrs.cpp
+++ /dev/null

@@ -1,110 +0,0 @@
-// RUN: %libomptarget-compilexx-generic -fopenmp-version=51
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// FIXME: This is currently broken on all GPU targets
-// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-
-#include <stdio.h>
-#include <stdlib.h>
-
-void foo(int **t1d) {
-  int ***t2d = &t1d;
-  int ****t3d = &t2d;
-  *t1d = (int *)malloc(3 * sizeof(int));
-  int j, a = 0, b = 0;
-
-  for (j = 0; j < 3; j++)
-    (*t1d)[j] = 0;
-#pragma omp target map(tofrom : (*t1d)[0 : 3])
-  { (*t1d)[1] = 1; }
-  // CHECK: 1
-  printf("%d\n", (*t1d)[1]);
-#pragma omp target map(tofrom : (**t2d)[0 : 3])
-  { (**t2d)[1] = 2; }
-  // CHECK: 2
-  printf("%d\n", (**t2d)[1]);
-#pragma omp target map(tofrom : (***t3d)[0 : 3])
-  { (***t3d)[1] = 3; }
-  // CHECK: 3
-  printf("%d\n", (***t3d)[1]);
-#pragma omp target map(tofrom : (**t1d))
-  { (*t1d)[0] = 4; }
-  // CHECK: 4
-  printf("%d\n", (*t1d)[0]);
-#pragma omp target map(tofrom : (*(*(t1d + a) + b)))
-  { *(*(t1d + a) + b) = 5; }
-  // CHECK: 5
-  printf("%d\n", *(*(t1d + a) + b));
-}
-
-typedef int(T)[3];
-void bar() {
-  T **a;
-  int b[2][3];
-  int(*p)[3] = b;
-  a = &p;
-  for (int i = 0; i < 3; i++) {
-    (**a)[1] = i;
-  }
-#pragma omp target map((**a)[ : 3])
-  {
-    (**a)[1] = 6;
-    // CHECK: 6
-    printf("%d\n", (**a)[1]);
-  }
-}
-
-struct SSA {
-  int i;
-  SSA *sa;
-  SSA() {
-    i = 1;
-    sa = this;
-  }
-};
-
-void zoo(int **f, SSA *sa) {
-  int *t = *f;
-  f = (int **)malloc(sa->i * 4 * sizeof(int));
-  t = (int *)malloc(sa->i * sizeof(int));
-  *(f + sa->i + 1) = t;
-  *(sa->sa->i + *(f + sa->i + 1)) = 4;
-  printf("%d\n", *(sa->sa->i + *(1 + sa->i + f)));
-#pragma omp target map(sa, *(sa->sa->i + *(1 + sa->i + f)))
-  { *(sa->sa->i + *(1 + sa->i + f)) = 7; }
-  // CHECK: 7
-  printf("%d\n", *(sa->sa->i + *(1 + sa->i + f)));
-}
-
-void xoo() {
-  int *x = 0;
-  SSA *sa = new SSA();
-  zoo(&x, sa);
-}
-
-void yoo(int **x) {
-  *x = (int *)malloc(2 * sizeof(int));
-#pragma omp target map(**x)
-  {
-    **x = 8;
-    // CHECK: 8
-    printf("%d\n", **x);
-  }
-#pragma omp target map(*(*x + 1))
-  {
-    *(*x + 1) = 9;
-    // CHECK: 9
-    printf("%d\n", *(*x + 1));
-  }
-}
-
-int main() {
-  int *data = 0;
-  foo(&data);
-  bar();
-  xoo();
-  yoo(&data);
-}

diff --git a/libomptarget/test/mapping/target_has_device_addr.c b/libomptarget/test/mapping/target_has_device_addr.c
deleted file mode 100644
index e8bfff8..0000000
--- a/libomptarget/test/mapping/target_has_device_addr.c
+++ /dev/null

@@ -1,101 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-version=51
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// UNSUPPORTED: amdgcn-amd-amdhsa
-
-#include <omp.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define N 1024
-#define FROM 64
-#define LENGTH 128
-
-void foo() {
-  const int device_id = omp_get_default_device();
-  float *A;
-  A = (float *)omp_target_alloc((FROM + LENGTH) * sizeof(float), device_id);
-
-  float *A_dev = NULL;
-#pragma omp target has_device_addr(A[FROM : LENGTH]) map(A_dev)
-  { A_dev = A; }
-  // CHECK: Success
-  if (A_dev == NULL || A_dev != A)
-    fprintf(stderr, "Failure %p %p \n", A_dev, A);
-  else
-    fprintf(stderr, "Success\n");
-}
-
-void bar() {
-  short x[10];
-  short *xp = &x[0];
-
-  x[1] = 111;
-#pragma omp target data map(tofrom : xp[0 : 2]) use_device_addr(xp[0 : 2])
-#pragma omp target has_device_addr(xp[0 : 2])
-  {
-    xp[1] = 222;
-    // CHECK: 222
-    printf("%d %p\n", xp[1], &xp[1]);
-  }
-  // CHECK: 222
-  printf("%d %p\n", xp[1], &xp[1]);
-}
-
-void moo() {
-  short *b = malloc(sizeof(short));
-  b = b - 1;
-
-  b[1] = 111;
-#pragma omp target data map(tofrom : b[1]) use_device_addr(b[1])
-#pragma omp target has_device_addr(b[1])
-  {
-    b[1] = 222;
-    // CHECK: 222
-    printf("%hd %p %p %p\n", b[1], b, &b[1], &b);
-  }
-  // CHECK: 222
-  printf("%hd %p %p %p\n", b[1], b, &b[1], &b);
-}
-
-void zoo() {
-  short x[10];
-  short *(xp[10]);
-  xp[1] = &x[0];
-  short **xpp = &xp[0];
-
-  x[1] = 111;
-#pragma omp target data map(tofrom : xpp[1][1]) use_device_addr(xpp[1][1])
-#pragma omp target has_device_addr(xpp[1][1])
-  {
-    xpp[1][1] = 222;
-    // CHECK: 222
-    printf("%d %p %p\n", xpp[1][1], xpp[1], &xpp[1][1]);
-  }
-  // CHECK: 222
-  printf("%d %p %p\n", xpp[1][1], xpp[1], &xpp[1][1]);
-}
-void xoo() {
-  short a[10], b[10];
-  a[1] = 111;
-  b[1] = 111;
-#pragma omp target data map(to : a[0 : 2], b[0 : 2]) use_device_addr(a, b)
-#pragma omp target has_device_addr(a) has_device_addr(b[0])
-  {
-    a[1] = 222;
-    b[1] = 222;
-    // CHECK: 222 222
-    printf("%hd %hd %p %p %p\n", a[1], b[1], &a, b, &b);
-  }
-  // CHECK:111
-  printf("%hd %hd %p %p %p\n", a[1], b[1], &a, b, &b); // 111 111 p1d p2d p3d
-}
-int main() {
-  foo();
-  bar();
-  moo();
-  zoo();
-  xoo();
-  return 0;
-}

diff --git a/libomptarget/test/mapping/target_implicit_partial_map.c b/libomptarget/test/mapping/target_implicit_partial_map.c
deleted file mode 100644
index a8b2cc8..0000000
--- a/libomptarget/test/mapping/target_implicit_partial_map.c
+++ /dev/null

@@ -1,39 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// END.
-
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
-  int arr[100];
-
-#pragma omp target data map(alloc : arr[50 : 2]) // partially mapped
-  {
-    // CHECK: arr[50] must present: 1
-    fprintf(stderr, "arr[50] must present: %d\n",
-            omp_target_is_present(&arr[50], omp_get_default_device()));
-
-    // CHECK: arr[0] should not present: 0
-    fprintf(stderr, "arr[0] should not present: %d\n",
-            omp_target_is_present(&arr[0], omp_get_default_device()));
-
-    // CHECK: arr[49] should not present: 0
-    fprintf(stderr, "arr[49] should not present: %d\n",
-            omp_target_is_present(&arr[49], omp_get_default_device()));
-
-#pragma omp target // would implicitly map with full size but already present
-    {
-      arr[50] = 5;
-      arr[51] = 6;
-    } // must treat as present (dec ref count) even though full size not present
-  }   // wouldn't delete if previous ref count dec didn't happen
-
-  // CHECK: arr[50] still present: 0
-  fprintf(stderr, "arr[50] still present: %d\n",
-          omp_target_is_present(&arr[50], omp_get_default_device()));
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/target_map_for_member_data.cpp b/libomptarget/test/mapping/target_map_for_member_data.cpp
deleted file mode 100644
index fafff6b..0000000
--- a/libomptarget/test/mapping/target_map_for_member_data.cpp
+++ /dev/null

@@ -1,98 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-version=51
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-extern "C" int printf(const char *, ...);
-template <typename T> class A {
-protected:
-  T X;
-  T Y;
-
-public:
-  A(T x, T y) : X{x}, Y{y} {};
-};
-
-template <typename T> class B : public A<T> {
-  using A<T>::X;
-  using A<T>::Y;
-
-public:
-  T res;
-
-  B(T x, T y) : A<T>(x, y), res{0} {};
-
-  void run(void) {
-#pragma omp target map(res)
-    { res = X + Y; }
-  }
-};
-
-class X {
-protected:
-  int A;
-
-public:
-  X(int a) : A{a} {};
-};
-class Y : public X {
-  using X::A;
-
-protected:
-  int B;
-
-public:
-  Y(int a, int b) : X(a), B{b} {};
-};
-class Z : public Y {
-  using X::A;
-  using Y::B;
-
-public:
-  int res;
-  Z(int a, int b) : Y(a, b), res{0} {};
-  void run(void) {
-#pragma omp target map(res)
-    { res = A + B; }
-  }
-};
-struct descriptor {
-  int A;
-  int C;
-};
-
-class BASE {};
-
-class C : public BASE {
-public:
-  void bar(descriptor &d) {
-    auto Asize = 4;
-    auto Csize = 4;
-
-#pragma omp target data map(from : d.C)
-    {
-#pragma omp target teams firstprivate(Csize)
-      d.C = 1;
-    }
-#pragma omp target map(from : d.A)
-    d.A = 3;
-  }
-};
-
-int main(int argc, char *argv[]) {
-  B<int> b(2, 3);
-  b.run();
-  // CHECK: 5
-  printf("b.res = %d \n", b.res);
-  Z c(2, 3);
-  c.run();
-  // CHECK: 5
-  printf("c.res = %d \n", c.res);
-
-  descriptor d;
-  C z;
-  z.bar(d);
-  // CHECK 1
-  printf("%d\n", d.C);
-  // CHECK 3
-  printf("%d\n", d.A);
-}

diff --git a/libomptarget/test/mapping/target_pointers_members_map.cpp b/libomptarget/test/mapping/target_pointers_members_map.cpp
deleted file mode 100644
index daf9819..0000000
--- a/libomptarget/test/mapping/target_pointers_members_map.cpp
+++ /dev/null

@@ -1,53 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
-// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
-
-#include <cstdio>
-#include <cstdlib>
-
-typedef struct {
-  short *a;
-  long d1, d2;
-} DV_A;
-
-typedef struct {
-  DV_A b;
-  long d3;
-} C;
-
-typedef struct {
-  C *c;
-  long d4, d5;
-} DV_B;
-
-int main() {
-
-  short arr1[10] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19};
-  short arr2[10] = {20, 31, 22, 23, 24, 25, 26, 27, 28, 29};
-
-  C c1[2];
-  c1[0].b.a = (short *)arr1;
-  c1[1].b.a = (short *)arr2;
-  c1[0].b.d1 = 111;
-
-  DV_B dvb1;
-  dvb1.c = (C *)&c1;
-
-  // CHECK: 10 111
-  printf("%d %ld %p %p %p %p\n", dvb1.c[0].b.a[0], dvb1.c[0].b.d1, &dvb1,
-         &dvb1.c[0], &dvb1.c[0].b, &dvb1.c[0].b.a[0]);
-#pragma omp target map(to : dvb1, dvb1.c[0 : 2])                               \
-    map(tofrom : dvb1.c[0].b.a[0 : 10], dvb1.c[1].b.a[0 : 10])
-  {
-    // CHECK: 10 111
-    printf("%d %ld %p %p %p %p\n", dvb1.c[0].b.a[0], dvb1.c[0].b.d1, &dvb1,
-           &dvb1.c[0], &dvb1.c[0].b, &dvb1.c[0].b.a[0]);
-    dvb1.c[0].b.a[0] = 333;
-    dvb1.c[0].b.d1 = 444;
-  }
-  // CHECK: 333 111
-  printf("%d %ld %p %p %p %p\n", dvb1.c[0].b.a[0], dvb1.c[0].b.d1, &dvb1,
-         &dvb1.c[0], &dvb1.c[0].b, &dvb1.c[0].b.a[0]);
-}

diff --git a/libomptarget/test/mapping/target_update_array_extension.c b/libomptarget/test/mapping/target_update_array_extension.c
deleted file mode 100644
index ee926fe..0000000
--- a/libomptarget/test/mapping/target_update_array_extension.c
+++ /dev/null

@@ -1,76 +0,0 @@
-// --------------------------------------------------
-// Check 'to' and extends before
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DCLAUSE=to -DEXTENDS=BEFORE
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// --------------------------------------------------
-// Check 'from' and extends before
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DCLAUSE=from -DEXTENDS=BEFORE
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// --------------------------------------------------
-// Check 'to' and extends after
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DCLAUSE=to -DEXTENDS=AFTER
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// --------------------------------------------------
-// Check 'from' and extends after
-// --------------------------------------------------
-
-// RUN: %libomptarget-compile-generic \
-// RUN:   -DCLAUSE=from -DEXTENDS=AFTER
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// END.
-
-#include <stdio.h>
-
-#define BEFORE 0
-#define AFTER 1
-
-#if EXTENDS == BEFORE
-#define SMALL 2 : 3
-#define LARGE 0 : 5
-#elif EXTENDS == AFTER
-#define SMALL 0 : 3
-#define LARGE 0 : 5
-#else
-#error EXTENDS undefined
-#endif
-
-int main() {
-  int arr[5];
-
-  // CHECK-NOT: omptarget
-#pragma omp target data map(alloc : arr[LARGE])
-  {
-#pragma omp target update CLAUSE(arr[SMALL])
-  }
-
-  // CHECK: success
-  fprintf(stderr, "success\n");
-
-  // CHECK-NOT: omptarget
-#pragma omp target data map(alloc : arr[SMALL])
-  {
-#pragma omp target update CLAUSE(arr[LARGE])
-  }
-
-  // CHECK: success
-  fprintf(stderr, "success\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/mapping/target_use_device_addr.c b/libomptarget/test/mapping/target_use_device_addr.c
deleted file mode 100644
index 5c2bb8a..0000000
--- a/libomptarget/test/mapping/target_use_device_addr.c
+++ /dev/null

@@ -1,18 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-version=51
-// RUN: %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-#include <stdio.h>
-int main() {
-  short x[10];
-  short *xp = &x[0];
-
-  x[1] = 111;
-
-  printf("%d, %p\n", xp[1], &xp[1]);
-#pragma omp target data use_device_addr(xp[1 : 3]) map(tofrom : x)
-#pragma omp target is_device_ptr(xp)
-  { xp[1] = 222; }
-  // CHECK: 222
-  printf("%d, %p\n", xp[1], &xp[1]);
-}

diff --git a/libomptarget/test/mapping/target_uses_allocator.c b/libomptarget/test/mapping/target_uses_allocator.c
deleted file mode 100755
index eb20e96..0000000
--- a/libomptarget/test/mapping/target_uses_allocator.c
+++ /dev/null

@@ -1,61 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// FIXME: https://github.com/llvm/llvm-project/issues/77841
-// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-
-#include <omp.h>
-#include <stdio.h>
-
-#define N 1024
-
-int test_omp_aligned_alloc_on_device() {
-  int errors = 0;
-
-  omp_memspace_handle_t memspace = omp_default_mem_space;
-  omp_alloctrait_t traits[2] = {{omp_atk_alignment, 64}, {omp_atk_access, 64}};
-  omp_allocator_handle_t alloc =
-      omp_init_allocator(omp_default_mem_space, 1, traits);
-
-#pragma omp target map(tofrom : errors) uses_allocators(alloc(traits))
-  {
-    int *x;
-    int not_correct_array_values = 0;
-
-    x = (int *)omp_aligned_alloc(64, N * sizeof(int), alloc);
-    if (x == NULL) {
-      errors++;
-    } else {
-#pragma omp parallel for simd simdlen(16) aligned(x : 64)
-      for (int i = 0; i < N; i++) {
-        x[i] = i;
-      }
-
-#pragma omp parallel for simd simdlen(16) aligned(x : 64)
-      for (int i = 0; i < N; i++) {
-        if (x[i] != i) {
-#pragma omp atomic write
-          not_correct_array_values = 1;
-        }
-      }
-      if (not_correct_array_values) {
-        errors++;
-      }
-      omp_free(x, alloc);
-    }
-  }
-
-  omp_destroy_allocator(alloc);
-
-  return errors;
-}
-
-int main() {
-  int errors = 0;
-  if (test_omp_aligned_alloc_on_device())
-    printf("FAILE\n");
-  else
-    // CHECK: PASSED
-    printf("PASSED\n");
-}

diff --git a/libomptarget/test/mapping/target_wrong_use_device_addr.c b/libomptarget/test/mapping/target_wrong_use_device_addr.c
deleted file mode 100644
index 7a5babd..0000000
--- a/libomptarget/test/mapping/target_wrong_use_device_addr.c
+++ /dev/null

@@ -1,30 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-version=51 -g
-// RUN: env LIBOMPTARGET_INFO=64 %libomptarget-run-fail-generic 2>&1 \
-// RUN: | %fcheck-generic
-
-// FIXME: Fails due to optimized debugging in 'ptxas'
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-
-#include <stdio.h>
-
-int main() {
-  float arr[10];
-  float *x = &arr[0];
-
-  // CHECK: host addr=0x[[#%x,HOST_ADDR:]]
-  fprintf(stderr, "host addr=%p\n", x);
-
-#pragma omp target data map(to : x [0:10])
-  {
-// CHECK: omptarget device 0 info: variable x does not have a valid device
-// counterpart
-#pragma omp target data use_device_addr(x)
-    {
-      // CHECK-NOT: device addr=0x[[#%x,HOST_ADDR:]]
-      fprintf(stderr, "device addr=%p\n", x);
-    }
-  }
-
-  return 0;
-}
-

diff --git a/libomptarget/test/offloading/assert.cpp b/libomptarget/test/offloading/assert.cpp
deleted file mode 100644
index 07716f4..0000000
--- a/libomptarget/test/offloading/assert.cpp
+++ /dev/null

@@ -1,9 +0,0 @@
-// RUN: %libomptarget-compilexx-generic && %libomptarget-run-fail-generic
-// RUN: %libomptarget-compileoptxx-generic && %libomptarget-run-fail-generic
-
-int main(int argc, char *argv[]) {
-#pragma omp target
-  { __builtin_trap(); }
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/atomic-compare-signedness.c b/libomptarget/test/offloading/atomic-compare-signedness.c
deleted file mode 100644
index d774b85..0000000
--- a/libomptarget/test/offloading/atomic-compare-signedness.c
+++ /dev/null

@@ -1,47 +0,0 @@
-// Check that omp atomic compare handles signedness of integer comparisons
-// correctly.
-//
-// At one time, a bug sometimes reversed the signedness.
-
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-run-generic | %fcheck-generic
-// RUN: %libomptarget-compileopt-generic -fopenmp-version=51
-// RUN: %libomptarget-run-generic | %fcheck-generic
-
-// High parallelism increases our chances of detecting a lack of atomicity.
-#define NUM_THREADS_TRY 256
-
-#include <limits.h>
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
-  //      CHECK: signed: num_threads=[[#NUM_THREADS:]]{{$}}
-  // CHECK-NEXT: signed: xs=[[#NUM_THREADS-1]]{{$}}
-  int xs = -1;
-  int numThreads;
-#pragma omp target parallel for num_threads(NUM_THREADS_TRY)                   \
-    map(tofrom : xs, numThreads)
-  for (int i = 0; i < omp_get_num_threads(); ++i) {
-#pragma omp atomic compare
-    if (xs < i) {
-      xs = i;
-    }
-    if (i == 0)
-      numThreads = omp_get_num_threads();
-  }
-  printf("signed: num_threads=%d\n", numThreads);
-  printf("signed: xs=%d\n", xs);
-
-  // CHECK-NEXT: unsigned: xu=0x0{{$}}
-  unsigned xu = UINT_MAX;
-#pragma omp target parallel for num_threads(NUM_THREADS_TRY) map(tofrom : xu)
-  for (int i = 0; i < omp_get_num_threads(); ++i) {
-#pragma omp atomic compare
-    if (xu > i) {
-      xu = i;
-    }
-  }
-  printf("unsigned: xu=0x%x\n", xu);
-  return 0;
-}

diff --git a/libomptarget/test/offloading/back2back_distribute.c b/libomptarget/test/offloading/back2back_distribute.c
deleted file mode 100644
index 63cabd0..0000000
--- a/libomptarget/test/offloading/back2back_distribute.c
+++ /dev/null

@@ -1,75 +0,0 @@
-// RUN: %libomptarget-compile-generic -O3 && %libomptarget-run-generic | %fcheck-generic
-
-#include <omp.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define MAX_N 25000
-
-void reset_input(double *a, double *a_h, double *b, double *c) {
-  for(int i = 0 ; i < MAX_N ; i++) {
-    a[i] = a_h[i] = i;
-    b[i] = i*2;
-    c[i] = i-3;
-  }
-}
-
-int main(int argc, char *argv[]) {
-  double *a = (double *)calloc(MAX_N, sizeof(double));
-  double *a_h = (double *)calloc(MAX_N, sizeof(double));
-  double *d = (double *)calloc(MAX_N, sizeof(double));
-  double *d_h = (double *)calloc(MAX_N, sizeof(double));
-  double *b = (double *)calloc(MAX_N, sizeof(double));
-  double *c = (double *)calloc(MAX_N, sizeof(double));
-
-#pragma omp target enter data map(to:a[:MAX_N],b[:MAX_N],c[:MAX_N],d[:MAX_N])
-
-  for (int n = 32 ; n < MAX_N ; n+=5000) {
-    reset_input(a, a_h, b, c);
-
-#pragma omp target update to(a[:n],b[:n],c[:n],d[:n])
-    int t = 0;
-    for (int tms = 1 ; tms <= 256 ; tms *= 2) { // 8 times
-      for (int ths = 32 ; ths <= 1024 ; ths *= 2) { // 6 times
-        t++;
-#pragma omp target
-#pragma omp teams num_teams(tms) thread_limit(ths)
-        {
-#pragma omp distribute parallel for
-          for (int i = 0; i < n; ++i) {
-            a[i] += b[i] + c[i];
-          }
-#pragma omp distribute parallel for
-          for (int i = 0; i < n; ++i) {
-           d[i] -= b[i] + c[i];
-          }
-        }
-      } // loop over 'ths'
-    } // loop over 'tms'
-
-    // check results for each 'n'
-    for (int times = 0 ; times < t ; times++) {
-      for (int i = 0; i < n; ++i) {
-        a_h[i] += b[i] + c[i];
-      }
-      for (int i = 0; i < n; ++i)
-        d_h[i] -= b[i] + c[i];
-    }
-#pragma omp target update from(a[:n],d[:n])
-
-    for (int i = 0; i < n; ++i) {
-      if (a_h[i] != a[i]) {
-        printf("A Error at n = %d, i = %d: host = %f, device = %f\n", n, i, a_h[i], a[i]);
-        return 1;
-      }
-      if (d_h[i] != d[i]) {
-        printf("D Error at n = %d, i = %d: host = %lf, device = %lf\n", n, i, d_h[i], d[i]);
-        return 1;
-      }
-    }
-  } // loop over 'n'
-
-  // CHECK: Succeeded
-  printf("Succeeded\n");
-  return 0;
-}

diff --git a/libomptarget/test/offloading/barrier_fence.c b/libomptarget/test/offloading/barrier_fence.c
deleted file mode 100644
index b9a8ca2..0000000
--- a/libomptarget/test/offloading/barrier_fence.c
+++ /dev/null

@@ -1,84 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-offload-mandatory -O3
-// RUN: %libomptarget-run-generic
-// RUN: %libomptarget-compileopt-generic -fopenmp-offload-mandatory -O3
-// RUN: %libomptarget-run-generic
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-#include <omp.h>
-#include <stdio.h>
-
-struct IdentTy;
-void __kmpc_barrier_simple_spmd(struct IdentTy *Loc, int32_t TId);
-void __kmpc_barrier_simple_generic(struct IdentTy *Loc, int32_t TId);
-
-#pragma omp begin declare target device_type(nohost)
-static int A[512] __attribute__((address_space(3), loader_uninitialized));
-static int B[512 * 32] __attribute__((loader_uninitialized));
-#pragma omp end declare target
-
-int main() {
-  printf("Testing simple spmd barrier\n");
-  for (int r = 0; r < 50; r++) {
-#pragma omp target teams distribute thread_limit(512) num_teams(440)
-    for (int j = 0; j < 512 * 32; ++j) {
-#pragma omp parallel firstprivate(j)
-      {
-        int TId = omp_get_thread_num();
-        int TeamId = omp_get_team_num();
-        int NT = omp_get_num_threads();
-        // Sequential
-        for (int i = 0; i < NT; ++i) {
-          // Test shared memory globals
-          if (TId == i)
-            A[i] = i + j;
-          __kmpc_barrier_simple_spmd(0, TId);
-          if (A[i] != i + j)
-            __builtin_trap();
-          __kmpc_barrier_simple_spmd(0, TId);
-          // Test generic globals
-          if (TId == i)
-            B[TeamId] = i;
-          __kmpc_barrier_simple_spmd(0, TId);
-          if (B[TeamId] != i)
-            __builtin_trap();
-          __kmpc_barrier_simple_spmd(0, TId);
-        }
-      }
-    }
-  }
-
-  printf("Testing simple generic barrier\n");
-  for (int r = 0; r < 50; r++) {
-#pragma omp target teams distribute thread_limit(512) num_teams(440)
-    for (int j = 0; j < 512 * 32; ++j) {
-#pragma omp parallel firstprivate(j)
-      {
-        int TId = omp_get_thread_num();
-        int TeamId = omp_get_team_num();
-        int NT = omp_get_num_threads();
-        // Sequential
-        for (int i = 0; i < NT; ++i) {
-          if (TId == i)
-            A[i] = i + j;
-          __kmpc_barrier_simple_generic(0, TId);
-          if (A[i] != i + j)
-            __builtin_trap();
-          __kmpc_barrier_simple_generic(0, TId);
-          if (TId == i)
-            B[TeamId] = i;
-          __kmpc_barrier_simple_generic(0, TId);
-          if (B[TeamId] != i)
-            __builtin_trap();
-          __kmpc_barrier_simple_generic(0, TId);
-        }
-      }
-    }
-  }
-  return 0;
-}

diff --git a/libomptarget/test/offloading/bug47654.cpp b/libomptarget/test/offloading/bug47654.cpp
deleted file mode 100644
index eba7904..0000000
--- a/libomptarget/test/offloading/bug47654.cpp
+++ /dev/null

@@ -1,26 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-// RUN: %libomptarget-compileoptxx-run-and-check-generic
-
-#include <cassert>
-#include <iostream>
-
-int main(int argc, char *argv[]) {
-  int i = 0, j = 0;
-
-#pragma omp target map(tofrom : i, j) nowait
-  {
-    i = 1;
-    j = 2;
-  }
-
-#pragma omp taskwait
-
-  assert(i == 1);
-  assert(j == 2);
-
-  std::cout << "PASS\n";
-
-  return 0;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/offloading/bug49021.cpp b/libomptarget/test/offloading/bug49021.cpp
deleted file mode 100644
index 37da8ce..0000000
--- a/libomptarget/test/offloading/bug49021.cpp
+++ /dev/null

@@ -1,82 +0,0 @@
-// RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic
-// RUN: %libomptarget-compilexx-generic -O3 -ffast-math && %libomptarget-run-generic
-// RUN: %libomptarget-compileoptxx-generic -O3 && %libomptarget-run-generic
-// RUN: %libomptarget-compileoptxx-generic -O3 -ffast-math && %libomptarget-run-generic
-
-#include <iostream>
-
-template <typename T> int test_map() {
-  std::cout << "map(T)" << std::endl;
-  T a(0.2), a_check;
-#pragma omp target map(from : a_check)
-  { a_check = a; }
-
-  if (a_check != a) {
-    std::cout << " wrong results";
-    return 1;
-  }
-
-  return 0;
-}
-
-template <typename T> int test_reduction() {
-  std::cout << "flat parallelism" << std::endl;
-  T sum(0), sum_host(0);
-  const int size = 100;
-  T array[size];
-  for (int i = 0; i < size; i++) {
-    array[i] = i;
-    sum_host += array[i];
-  }
-
-#pragma omp target teams distribute parallel for map(to : array[ : size])      \
-    reduction(+ : sum)
-  for (int i = 0; i < size; i++)
-    sum += array[i];
-
-  if (sum != sum_host)
-    std::cout << " wrong results " << sum << " host " << sum_host << std::endl;
-
-  std::cout << "hierarchical parallelism" << std::endl;
-  const int nblock(10), block_size(10);
-  T block_sum[nblock];
-#pragma omp target teams distribute map(to : array[ : size])                   \
-    map(from : block_sum[ : nblock])
-  for (int ib = 0; ib < nblock; ib++) {
-    T partial_sum = 0;
-    const int istart = ib * block_size;
-    const int iend = (ib + 1) * block_size;
-#pragma omp parallel for reduction(+ : partial_sum)
-    for (int i = istart; i < iend; i++)
-      partial_sum += array[i];
-    block_sum[ib] = partial_sum;
-  }
-
-  sum = 0;
-  for (int ib = 0; ib < nblock; ib++) {
-    sum += block_sum[ib];
-  }
-
-  if (sum != sum_host) {
-    std::cout << " wrong results " << sum << " host " << sum_host << std::endl;
-    return 1;
-  }
-
-  return 0;
-}
-
-template <typename T> int test_POD() {
-  int ret = 0;
-  ret |= test_map<T>();
-  ret |= test_reduction<T>();
-  return ret;
-}
-
-int main() {
-  int ret = 0;
-  std::cout << "Testing float" << std::endl;
-  ret |= test_POD<float>();
-  std::cout << "Testing double" << std::endl;
-  ret |= test_POD<double>();
-  return ret;
-}

diff --git a/libomptarget/test/offloading/bug49334.cpp b/libomptarget/test/offloading/bug49334.cpp
deleted file mode 100644
index 1f19dab..0000000
--- a/libomptarget/test/offloading/bug49334.cpp
+++ /dev/null

@@ -1,157 +0,0 @@
-// RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic
-// RUN: %libomptarget-compilexx-generic -O3 -ffast-math && \
-// RUN: %libomptarget-run-generic
-// RUN: %libomptarget-compileoptxx-generic -O3 && %libomptarget-run-generic
-// RUN: %libomptarget-compileoptxx-generic -O3 -ffast-math && \
-// RUN: %libomptarget-run-generic
-
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-
-#include <cassert>
-#include <cmath>
-#include <iostream>
-#include <limits>
-#include <memory>
-#include <vector>
-
-class BlockMatrix {
-private:
-  const int rowsPerBlock;
-  const int colsPerBlock;
-  const long nRows;
-  const long nCols;
-  const int nBlocksPerRow;
-  const int nBlocksPerCol;
-  std::vector<std::vector<std::unique_ptr<float[]>>> Blocks;
-
-public:
-  BlockMatrix(const int _rowsPerBlock, const int _colsPerBlock,
-              const long _nRows, const long _nCols)
-      : rowsPerBlock(_rowsPerBlock), colsPerBlock(_colsPerBlock), nRows(_nRows),
-        nCols(_nCols), nBlocksPerRow(_nRows / _rowsPerBlock),
-        nBlocksPerCol(_nCols / _colsPerBlock), Blocks(nBlocksPerCol) {
-    for (int i = 0; i < nBlocksPerCol; i++) {
-      for (int j = 0; j < nBlocksPerRow; j++) {
-        Blocks[i].emplace_back(new float[_rowsPerBlock * _colsPerBlock]);
-      }
-    }
-  };
-
-  // Initialize the BlockMatrix from 2D arrays
-  void Initialize(const std::vector<float> &matrix) {
-    for (int i = 0; i < nBlocksPerCol; i++)
-      for (int j = 0; j < nBlocksPerRow; j++) {
-        float *CurrBlock = GetBlock(i, j);
-        for (int ii = 0; ii < colsPerBlock; ++ii)
-          for (int jj = 0; jj < rowsPerBlock; ++jj) {
-            int curri = i * colsPerBlock + ii;
-            int currj = j * rowsPerBlock + jj;
-            CurrBlock[ii + jj * colsPerBlock] = matrix[curri + currj * nCols];
-          }
-      }
-  }
-
-  void Compare(const std::vector<float> &matrix) const {
-    for (int i = 0; i < nBlocksPerCol; i++)
-      for (int j = 0; j < nBlocksPerRow; j++) {
-        float *CurrBlock = GetBlock(i, j);
-        for (int ii = 0; ii < colsPerBlock; ++ii)
-          for (int jj = 0; jj < rowsPerBlock; ++jj) {
-            int curri = i * colsPerBlock + ii;
-            int currj = j * rowsPerBlock + jj;
-            float m_value = matrix[curri + currj * nCols];
-            float bm_value = CurrBlock[ii + jj * colsPerBlock];
-            assert(std::fabs(bm_value - m_value) <
-                   std::numeric_limits<float>::epsilon());
-          }
-      }
-  }
-
-  float *GetBlock(int i, int j) const {
-    assert(i < nBlocksPerCol && j < nBlocksPerRow && "Accessing outside block");
-    return Blocks[i][j].get();
-  }
-};
-
-constexpr const int BS = 16;
-constexpr const int N = 256;
-
-int BlockMatMul_TargetNowait(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) {
-#pragma omp parallel
-#pragma omp master
-  for (int i = 0; i < N / BS; ++i)
-    for (int j = 0; j < N / BS; ++j) {
-      float *BlockC = C.GetBlock(i, j);
-      for (int k = 0; k < N / BS; ++k) {
-        float *BlockA = A.GetBlock(i, k);
-        float *BlockB = B.GetBlock(k, j);
-// clang-format off
-#pragma omp target depend(in: BlockA[0], BlockB[0]) depend(inout: BlockC[0])   \
-            map(to: BlockA[:BS * BS], BlockB[:BS * BS])                        \
-            map(tofrom: BlockC[:BS * BS]) nowait
-// clang-format on
-#pragma omp parallel for
-        for (int ii = 0; ii < BS; ii++)
-          for (int jj = 0; jj < BS; jj++) {
-            for (int kk = 0; kk < BS; ++kk)
-              BlockC[ii + jj * BS] +=
-                  BlockA[ii + kk * BS] * BlockB[kk + jj * BS];
-          }
-      }
-    }
-  return 0;
-}
-
-void Matmul(const std::vector<float> &a, const std::vector<float> &b,
-            std::vector<float> &c) {
-  for (int i = 0; i < N; ++i) {
-    for (int j = 0; j < N; ++j) {
-      float sum = 0.0;
-      for (int k = 0; k < N; ++k) {
-        sum = sum + a[i * N + k] * b[k * N + j];
-      }
-      c[i * N + j] = sum;
-    }
-  }
-}
-
-int main(int argc, char *argv[]) {
-  std::vector<float> a(N * N);
-  std::vector<float> b(N * N);
-  std::vector<float> c(N * N, 0.0);
-
-  for (int i = 0; i < N; ++i) {
-    for (int j = 0; j < N; ++j) {
-      a[i * N + j] = b[i * N + j] = i + j % 100;
-    }
-  }
-
-  auto BlockedA = BlockMatrix(BS, BS, N, N);
-  auto BlockedB = BlockMatrix(BS, BS, N, N);
-  auto BlockedC = BlockMatrix(BS, BS, N, N);
-  BlockedA.Initialize(a);
-  BlockedB.Initialize(b);
-  BlockedC.Initialize(c);
-  BlockedA.Compare(a);
-  BlockedB.Compare(b);
-  BlockedC.Compare(c);
-
-  Matmul(a, b, c);
-  BlockMatMul_TargetNowait(BlockedA, BlockedB, BlockedC);
-
-  BlockedC.Compare(c);
-
-  std::cout << "PASS\n";
-
-  return 0;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/offloading/bug49779.cpp b/libomptarget/test/offloading/bug49779.cpp
deleted file mode 100644
index d036f95..0000000
--- a/libomptarget/test/offloading/bug49779.cpp
+++ /dev/null

@@ -1,38 +0,0 @@
-// RUN: %libomptarget-compilexx-generic && \
-// RUN:   env LIBOMPTARGET_STACK_SIZE=2048 %libomptarget-run-generic
-// RUN: %libomptarget-compileoptxx-generic && \
-// RUN:   env LIBOMPTARGET_STACK_SIZE=2048 %libomptarget-run-generic
-
-// We need malloc/global_alloc support
-// UNSUPPORTED: amdgcn-amd-amdhsa
-
-#include <cassert>
-#include <iostream>
-
-void work(int *C) {
-#pragma omp atomic
-  ++(*C);
-}
-
-void use(int *C) {
-#pragma omp parallel num_threads(2)
-  work(C);
-}
-
-int main() {
-  int C = 0;
-#pragma omp target map(C)
-  {
-    use(&C);
-#pragma omp parallel num_threads(2)
-    use(&C);
-  }
-
-  assert(C >= 2 && C <= 6);
-
-  std::cout << "PASS\n";
-
-  return 0;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/offloading/bug50022.cpp b/libomptarget/test/offloading/bug50022.cpp
deleted file mode 100644
index d2ae02b..0000000
--- a/libomptarget/test/offloading/bug50022.cpp
+++ /dev/null

@@ -1,40 +0,0 @@
-// RUN: %libomptarget-compilexx-and-run-generic
-// RUN: %libomptarget-compileoptxx-and-run-generic
-
-#include <cassert>
-#include <iostream>
-#include <stdexcept>
-
-int main(int argc, char *argv[]) {
-  int a = 0;
-  std::cout << "outside a = " << a << " addr " << &a << std::endl;
-#pragma omp target map(tofrom : a) depend(out : a) nowait
-  {
-    int sum = 0;
-    for (int i = 0; i < 100000; i++)
-      sum++;
-    a = 1;
-  }
-
-#pragma omp task depend(inout : a) shared(a)
-  {
-    std::cout << "a = " << a << " addr " << &a << std::endl;
-    if (a != 1)
-      throw std::runtime_error("wrong result!");
-    a = 2;
-  }
-
-#pragma omp task depend(inout : a) shared(a)
-  {
-    std::cout << "a = " << a << " addr " << &a << std::endl;
-    if (a != 2)
-      throw std::runtime_error("wrong result!");
-    a = 3;
-  }
-
-#pragma omp taskwait
-
-  assert(a == 3 && "wrong result!");
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/bug51781.c b/libomptarget/test/offloading/bug51781.c
deleted file mode 100644
index 35ecf55..0000000
--- a/libomptarget/test/offloading/bug51781.c
+++ /dev/null

@@ -1,53 +0,0 @@
-// Use the generic state machine.  On some architectures, other threads in the
-// main thread's warp must avoid barrier instructions.
-//
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// SPMDize.  There is no main thread, so there's no issue.
-//
-// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt > %t.spmd 2>&1
-// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=SPMD -input-file=%t.spmd
-// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=SPMD -input-file=%t.spmd
-// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
-//
-// SPMD: Transformed generic-mode kernel to SPMD-mode.
-
-// Use the custom state machine, which must avoid the same barrier problem as
-// the generic state machine.
-//
-// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt \
-// RUN:   -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1
-// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom
-// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom
-// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
-//
-// Repeat with reduction clause, which has managed to break the custom state
-// machine in the past.
-//
-// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt -DADD_REDUCTION \
-// RUN:   -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1
-// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom
-// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom
-// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
-//
-// CUSTOM: Rewriting generic-mode kernel with a customized state machine.
-
-#if ADD_REDUCTION
-#define REDUCTION(...) reduction(__VA_ARGS__)
-#else
-#define REDUCTION(...)
-#endif
-
-#include <stdio.h>
-int main() {
-  int x = 0, y = 1;
-#pragma omp target teams num_teams(1) map(tofrom : x, y) REDUCTION(+ : x)
-  {
-    x += 5;
-#pragma omp parallel
-    y = 6;
-  }
-  // CHECK: 5, 6
-  printf("%d, %d\n", x, y);
-  return 0;
-}

diff --git a/libomptarget/test/offloading/bug51982.c b/libomptarget/test/offloading/bug51982.c
deleted file mode 100644
index 91ce4a2..0000000
--- a/libomptarget/test/offloading/bug51982.c
+++ /dev/null

@@ -1,25 +0,0 @@
-// RUN: %libomptarget-compile-generic -O1 && %libomptarget-run-generic
-// -O1 to run openmp-opt
-// RUN: %libomptarget-compileopt-generic -O1 && %libomptarget-run-generic
-
-int main(void) {
-  long int aa = 0;
-
-  int ng = 12;
-  int nxyz = 5;
-
-  const long exp = ng * nxyz;
-
-#pragma omp target map(tofrom : aa)
-  for (int gid = 0; gid < nxyz; gid++) {
-#pragma omp parallel for
-    for (unsigned int g = 0; g < ng; g++) {
-#pragma omp atomic
-      aa += 1;
-    }
-  }
-  if (aa != exp) {
-    return 1;
-  }
-  return 0;
-}

diff --git a/libomptarget/test/offloading/bug53727.cpp b/libomptarget/test/offloading/bug53727.cpp
deleted file mode 100644
index 8ce8b7e..0000000
--- a/libomptarget/test/offloading/bug53727.cpp
+++ /dev/null

@@ -1,58 +0,0 @@
-// RUN: %libomptarget-compilexx-and-run-generic
-// RUN: %libomptarget-compileoptxx-and-run-generic
-
-#include <cassert>
-#include <iostream>
-
-constexpr const int N = 10;
-
-struct T {
-  int a;
-  int *p;
-};
-
-struct S {
-  int b;
-  T t;
-};
-
-int main(int argc, char *argv[]) {
-  S s;
-  s.t.p = new int[N];
-  for (int i = 0; i < N; ++i) {
-    s.t.p[i] = i;
-  }
-
-#pragma omp target enter data map(to : s, s.t.p[ : N])
-
-#pragma omp target
-  {
-    for (int i = 0; i < N; ++i) {
-      s.t.p[i] += i;
-    }
-  }
-
-#pragma omp target update from(s.t.p[ : N])
-
-  for (int i = 0; i < N; ++i) {
-    assert(s.t.p[i] == 2 * i);
-    s.t.p[i] += i;
-  }
-
-#pragma omp target update to(s.t.p[ : N])
-
-#pragma omp target
-  {
-    for (int i = 0; i < N; ++i) {
-      s.t.p[i] += i;
-    }
-  }
-
-#pragma omp target exit data map(from : s, s.t.p[ : N])
-
-  for (int i = 0; i < N; ++i) {
-    assert(s.t.p[i] == 4 * i);
-  }
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/bug64959.c b/libomptarget/test/offloading/bug64959.c
deleted file mode 100644
index eddc553..0000000
--- a/libomptarget/test/offloading/bug64959.c
+++ /dev/null

@@ -1,60 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// RUN: %libomptarget-compileopt-run-and-check-generic
-
-// TODO: This requires malloc support for the threads states.
-// FIXME: Flaky on all GPU targets.
-// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-
-#include <omp.h>
-#include <stdio.h>
-#define N 10
-
-int isCPU() { return 1; }
-
-#pragma omp begin declare variant match(device = {kind(gpu)})
-int isCPU() { return 0; }
-#pragma omp end declare variant
-
-int main(void) {
-  long int aa = 0;
-  int res = 0;
-
-  int ng = 12;
-  int cmom = 14;
-  int nxyz;
-
-#pragma omp target map(from : nxyz, ng, cmom)
-  {
-    nxyz = isCPU() ? 2 : 5000;
-    ng = isCPU() ? 2 : 12;
-    cmom = isCPU() ? 2 : 14;
-  }
-
-#pragma omp target teams distribute num_teams(nxyz)                            \
-    thread_limit(ng *(cmom - 1)) map(tofrom : aa)
-  for (int gid = 0; gid < nxyz; gid++) {
-#pragma omp parallel for collapse(2)
-    for (unsigned int g = 0; g < ng; g++) {
-      for (unsigned int l = 0; l < cmom - 1; l++) {
-        int a = 0;
-#pragma omp parallel for reduction(+ : a)
-        for (int i = 0; i < N; i++) {
-          a += i;
-        }
-#pragma omp atomic
-        aa += a;
-      }
-    }
-  }
-  long exp = (long)ng * (cmom - 1) * nxyz * (N * (N - 1) / 2);
-  printf("The result is = %ld exp:%ld!\n", aa, exp);
-  if (aa != exp) {
-    printf("Failed %ld\n", aa);
-    return 1;
-  }
-  // CHECK: Success
-  printf("Success\n");
-  return 0;
-}

diff --git a/libomptarget/test/offloading/bug64959_compile_only.c b/libomptarget/test/offloading/bug64959_compile_only.c
deleted file mode 100644
index 310942f..0000000
--- a/libomptarget/test/offloading/bug64959_compile_only.c
+++ /dev/null

@@ -1,40 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: %libomptarget-compileopt-generic
-
-#include <stdio.h>
-#define N 10
-
-int main(void) {
-  long int aa = 0;
-  int res = 0;
-
-  int ng = 12;
-  int cmom = 14;
-  int nxyz = 5000;
-
-#pragma omp target teams distribute num_teams(nxyz)                            \
-    thread_limit(ng *(cmom - 1)) map(tofrom : aa)
-  for (int gid = 0; gid < nxyz; gid++) {
-#pragma omp parallel for collapse(2)
-    for (unsigned int g = 0; g < ng; g++) {
-      for (unsigned int l = 0; l < cmom - 1; l++) {
-        int a = 0;
-#pragma omp parallel for reduction(+ : a)
-        for (int i = 0; i < N; i++) {
-          a += i;
-        }
-#pragma omp atomic
-        aa += a;
-      }
-    }
-  }
-  long exp = (long)ng * (cmom - 1) * nxyz * (N * (N - 1) / 2);
-  printf("The result is = %ld exp:%ld!\n", aa, exp);
-  if (aa != exp) {
-    printf("Failed %ld\n", aa);
-    return 1;
-  }
-  // CHECK: Success
-  printf("Success\n");
-  return 0;
-}

diff --git a/libomptarget/test/offloading/bug74582.c b/libomptarget/test/offloading/bug74582.c
deleted file mode 100644
index c6a283b..0000000
--- a/libomptarget/test/offloading/bug74582.c
+++ /dev/null

@@ -1,13 +0,0 @@
-// RUN: %libomptarget-compile-generic && %libomptarget-run-generic
-// RUN: %libomptarget-compileopt-generic && %libomptarget-run-generic
-
-// Verify we do not read bits in the image that are not there (nobits section).
-
-#pragma omp begin declare target
-char BigUninitializedBuffer[4096 * 64] __attribute__((loader_uninitialized));
-#pragma omp end declare target
-
-int main() {
-#pragma omp target
-  {}
-}

diff --git a/libomptarget/test/offloading/complex_reduction.cpp b/libomptarget/test/offloading/complex_reduction.cpp
deleted file mode 100644
index 3239f48..0000000
--- a/libomptarget/test/offloading/complex_reduction.cpp
+++ /dev/null

@@ -1,94 +0,0 @@
-// RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic
-// RUN: %libomptarget-compilexx-generic -O3 -ffast-math && %libomptarget-run-generic
-
-#include <complex>
-#include <iostream>
-
-bool failed = false;
-
-template <typename T> void test_map() {
-  std::cout << "map(complex<T>)" << std::endl;
-  std::complex<T> a(0.2, 1), a_check;
-#pragma omp target map(from : a_check)
-  { a_check = a; }
-
-  if (std::abs(a - a_check) > 1e-6) {
-    std::cout << "wrong map value check" << a_check << " correct value " << a
-              << std::endl;
-    failed = true;
-  }
-}
-
-#if !defined(__NO_UDR)
-#pragma omp declare reduction(+ : std::complex <float> : omp_out += omp_in)
-#pragma omp declare reduction(+ : std::complex <double> : omp_out += omp_in)
-#endif
-
-template <typename T> class initiator {
-public:
-  static T value(int i) { return T(i); }
-};
-
-template <typename T> class initiator<std::complex<T>> {
-public:
-  static std::complex<T> value(int i) { return {T(i), T(-i)}; }
-};
-
-template <typename T> void test_reduction() {
-  T sum(0), sum_host(0);
-  const int size = 100;
-  T array[size];
-  for (int i = 0; i < size; i++) {
-    array[i] = initiator<T>::value(i);
-    sum_host += array[i];
-  }
-
-#pragma omp target teams distribute parallel for map(to : array[:size]) reduction(+ : sum)
-  for (int i = 0; i < size; i++)
-    sum += array[i];
-
-  if (std::abs(sum - sum_host) > 1e-6) {
-    std::cout << "wrong reduction value check" << sum << " correct value "
-              << sum_host << std::endl;
-    failed = true;
-  }
-
-  const int nblock(10), block_size(10);
-  T block_sum[nblock];
-#pragma omp target teams distribute map(to                                     \
-                                        : array[:size])                        \
-    map(from                                                                   \
-        : block_sum[:nblock])
-  for (int ib = 0; ib < nblock; ib++) {
-    T partial_sum(0);
-    const int istart = ib * block_size;
-    const int iend = (ib + 1) * block_size;
-#pragma omp parallel for reduction(+ : partial_sum)
-    for (int i = istart; i < iend; i++)
-      partial_sum += array[i];
-    block_sum[ib] = partial_sum;
-  }
-
-  sum = 0;
-  for (int ib = 0; ib < nblock; ib++)
-    sum += block_sum[ib];
-  if (std::abs(sum - sum_host) > 1e-6) {
-    std::cout << "hierarchical parallelism wrong reduction value check" << sum
-              << " correct value " << sum_host << std::endl;
-    failed = true;
-  }
-}
-
-template <typename T> void test_complex() {
-  test_map<T>();
-  test_reduction<std::complex<T>>();
-}
-
-int main() {
-  std::cout << "Testing complex" << std::endl;
-  std::cout << "Testing float" << std::endl;
-  test_complex<float>();
-  std::cout << "Testing double" << std::endl;
-  test_complex<double>();
-  return failed;
-}

diff --git a/libomptarget/test/offloading/ctor_dtor.cpp b/libomptarget/test/offloading/ctor_dtor.cpp
deleted file mode 100644
index 7f08798..0000000
--- a/libomptarget/test/offloading/ctor_dtor.cpp
+++ /dev/null

@@ -1,24 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-// RUN: %libomptarget-compileoptxx-run-and-check-generic
-
-#include <cstdio>
-struct S {
-  S() : i(7) {}
-  ~S() { foo(); }
-  int foo() { return i; }
-
-private:
-  int i;
-};
-
-S s;
-#pragma omp declare target(s)
-
-int main() {
-  int r;
-#pragma omp target map(from : r)
-  r = s.foo();
-
-  // CHECK: 7
-  printf("%i\n", r);
-}

diff --git a/libomptarget/test/offloading/cuda_no_devices.c b/libomptarget/test/offloading/cuda_no_devices.c
deleted file mode 100644
index bbcbf6b..0000000
--- a/libomptarget/test/offloading/cuda_no_devices.c
+++ /dev/null

@@ -1,20 +0,0 @@
-// The CUDA plugin used to complain on stderr when no CUDA devices were enabled,
-// and then it let the application run anyway.  Check that there's no such
-// complaint anymore, especially when the user isn't targeting CUDA.
-
-// RUN: %libomptarget-compile-generic
-// RUN: env CUDA_VISIBLE_DEVICES= \
-// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic
-
-#include <stdio.h>
-
-// CHECK-NOT: {{.}}
-//     CHECK: Hello World: 4
-// CHECK-NOT: {{.}}
-int main() {
-  int x = 0;
-#pragma omp target teams num_teams(2) reduction(+ : x)
-  x += 2;
-  printf("Hello World: %d\n", x);
-  return 0;
-}

diff --git a/libomptarget/test/offloading/d2d_memcpy.c b/libomptarget/test/offloading/d2d_memcpy.c
deleted file mode 100644
index f26d0c7..0000000
--- a/libomptarget/test/offloading/d2d_memcpy.c
+++ /dev/null

@@ -1,69 +0,0 @@
-// RUN: %libomptarget-compile-generic && \
-// RUN: env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-generic | \
-// RUN: %fcheck-generic -allow-empty
-// RUN: %libomptarget-compileopt-generic && \
-// RUN: env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-generic | \
-// RUN: %fcheck-generic -allow-empty
-
-#include <assert.h>
-#include <omp.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-const int magic_num = 7;
-
-int main(int argc, char *argv[]) {
-  const int N = 128;
-  const int num_devices = omp_get_num_devices();
-
-  // No target device, just return
-  if (num_devices == 0) {
-    printf("PASS\n");
-    return 0;
-  }
-
-  const int src_device = 0;
-  int dst_device = num_devices - 1;
-
-  int length = N * sizeof(int);
-  int *src_ptr = omp_target_alloc(length, src_device);
-  int *dst_ptr = omp_target_alloc(length, dst_device);
-
-  assert(src_ptr && "src_ptr is NULL");
-  assert(dst_ptr && "dst_ptr is NULL");
-
-#pragma omp target teams distribute parallel for device(src_device)            \
-    is_device_ptr(src_ptr)
-  for (int i = 0; i < N; ++i) {
-    src_ptr[i] = magic_num;
-  }
-
-  int rc =
-      omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, src_device);
-
-  assert(rc == 0 && "error in omp_target_memcpy");
-
-  int *buffer = malloc(length);
-
-  assert(buffer && "failed to allocate host buffer");
-
-#pragma omp target teams distribute parallel for device(dst_device)            \
-    map(from : buffer[0 : N]) is_device_ptr(dst_ptr)
-  for (int i = 0; i < N; ++i) {
-    buffer[i] = dst_ptr[i] + magic_num;
-  }
-
-  for (int i = 0; i < N; ++i)
-    assert(buffer[i] == 2 * magic_num);
-
-  printf("PASS\n");
-
-  // Free host and device memory
-  free(buffer);
-  omp_target_free(src_ptr, src_device);
-  omp_target_free(dst_ptr, dst_device);
-
-  return 0;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/offloading/d2d_memcpy_sync.c b/libomptarget/test/offloading/d2d_memcpy_sync.c
deleted file mode 100644
index 6b9b765..0000000
--- a/libomptarget/test/offloading/d2d_memcpy_sync.c
+++ /dev/null

@@ -1,72 +0,0 @@
-// RUN: %libomptarget-compile-generic && \
-// RUN: env LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES=0 %libomptarget-run-generic | \
-// RUN: %fcheck-generic -allow-empty
-// REQUIRES: amdgcn-amd-amdhsa
-
-#include <assert.h>
-#include <omp.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-const int magic_num = 7;
-
-int main(int argc, char *argv[]) {
-  const int N = 128;
-  const int num_devices = omp_get_num_devices();
-
-  // No target device, just return
-  if (num_devices == 0) {
-    printf("PASS\n");
-    return 0;
-  }
-
-  const int src_device = 0;
-  int dst_device = num_devices - 1;
-
-  int length = N * sizeof(int);
-  int *src_ptr = omp_target_alloc(length, src_device);
-  int *dst_ptr = omp_target_alloc(length, dst_device);
-
-  if (!src_ptr || !dst_ptr) {
-    printf("FAIL\n");
-    return 1;
-  }
-
-#pragma omp target teams distribute parallel for device(src_device)            \
-    is_device_ptr(src_ptr)
-  for (int i = 0; i < N; ++i) {
-    src_ptr[i] = magic_num;
-  }
-
-  if (omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device,
-                        src_device)) {
-    printf("FAIL\n");
-    return 1;
-  }
-
-  int *buffer = malloc(length);
-  if (!buffer) {
-    printf("FAIL\n");
-    return 1;
-  }
-
-#pragma omp target teams distribute parallel for device(dst_device)            \
-    map(from : buffer[0 : N]) is_device_ptr(dst_ptr)
-  for (int i = 0; i < N; ++i) {
-    buffer[i] = dst_ptr[i] + magic_num;
-  }
-
-  for (int i = 0; i < N; ++i)
-    assert(buffer[i] == 2 * magic_num);
-
-  printf("PASS\n");
-
-  // Free host and device memory
-  free(buffer);
-  omp_target_free(src_ptr, src_device);
-  omp_target_free(dst_ptr, dst_device);
-
-  return 0;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/offloading/default_thread_limit.c b/libomptarget/test/offloading/default_thread_limit.c
deleted file mode 100644
index 4da02bb..0000000
--- a/libomptarget/test/offloading/default_thread_limit.c
+++ /dev/null

@@ -1,104 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compile-generic
-// RUN: env LIBOMPTARGET_INFO=16 \
-// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
-
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-__attribute__((optnone)) int optnone() { return 1; }
-
-int main() {
-  int N = optnone() * 4098 * 32;
-
-// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
-#pragma omp target teams distribute parallel for simd
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
-#pragma omp target teams distribute parallel for simd
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
-#pragma omp target teams distribute parallel for simd
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
-#pragma omp target 
-#pragma omp teams distribute parallel for
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: 42 (MaxFlatWorkGroupSize: 1024
-#pragma omp target thread_limit(optnone() * 42)
-#pragma omp teams distribute parallel for
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: 42 (MaxFlatWorkGroupSize: 42
-#pragma omp target thread_limit(optnone() * 42) ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
-#pragma omp teams distribute parallel for
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: 42 (MaxFlatWorkGroupSize: 42
-#pragma omp target ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
-#pragma omp teams distribute parallel for
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: MaxFlatWorkGroupSize: 1024
-#pragma omp target
-#pragma omp teams distribute parallel for num_threads(optnone() * 42)
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: MaxFlatWorkGroupSize: 1024
-#pragma omp target teams distribute parallel for thread_limit(optnone() * 42)
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: MaxFlatWorkGroupSize: 1024
-#pragma omp target teams distribute parallel for num_threads(optnone() * 42)
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: 9 (MaxFlatWorkGroupSize: 9
-#pragma omp target
-#pragma omp teams distribute parallel for num_threads(9)
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
-#pragma omp target thread_limit(4)
-#pragma omp teams distribute parallel for
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
-#pragma omp target
-#pragma omp teams distribute parallel for thread_limit(4)
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: 9 (MaxFlatWorkGroupSize: 9
-#pragma omp target teams distribute parallel for num_threads(9)
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
-#pragma omp target teams distribute parallel for simd thread_limit(4)
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-}
-

diff --git a/libomptarget/test/offloading/dynamic_module.c b/libomptarget/test/offloading/dynamic_module.c
deleted file mode 100644
index f1e9862..0000000
--- a/libomptarget/test/offloading/dynamic_module.c
+++ /dev/null

@@ -1,17 +0,0 @@
-// RUN: %libomptarget-compile-generic -DSHARED -fPIC -shared -o %t.so && \
-// RUN: %libomptarget-compile-generic %t.so && %libomptarget-run-generic 2>&1 | %fcheck-generic
-// RUN: %libomptarget-compileopt-generic -DSHARED -fPIC -shared -o %t.so && \
-// RUN: %libomptarget-compileopt-generic %t.so && %libomptarget-run-generic 2>&1 | %fcheck-generic
-
-#ifdef SHARED
-void foo() {}
-#else
-#include <stdio.h>
-int main() {
-#pragma omp target
-  ;
-  // CHECK: DONE.
-  printf("%s\n", "DONE.");
-  return 0;
-}
-#endif

diff --git a/libomptarget/test/offloading/dynamic_module_load.c b/libomptarget/test/offloading/dynamic_module_load.c
deleted file mode 100644
index 5393f33..0000000
--- a/libomptarget/test/offloading/dynamic_module_load.c
+++ /dev/null

@@ -1,34 +0,0 @@
-// RUN: %libomptarget-compile-generic -DSHARED -fPIC -shared -o %t.so && %clang %flags %s -o %t -ldl && %libomptarget-run-generic %t.so 2>&1 | %fcheck-generic
-
-#ifdef SHARED
-#include <stdio.h>
-int foo() {
-#pragma omp target
-  ;
-  printf("%s\n", "DONE.");
-  return 0;
-}
-#else
-#include <dlfcn.h>
-#include <stdio.h>
-int main(int argc, char **argv) {
-#pragma omp target
-  ;
-
-  void *Handle = dlopen(argv[1], RTLD_NOW);
-  int (*Foo)(void);
-
-  if (Handle == NULL) {
-    printf("dlopen() failed: %s\n", dlerror());
-    return 1;
-  }
-  Foo = (int (*)(void))dlsym(Handle, "foo");
-  if (Handle == NULL) {
-    printf("dlsym() failed: %s\n", dlerror());
-    return 1;
-  }
-  // CHECK: DONE.
-  // CHECK-NOT: {{abort|fault}}
-  return Foo();
-}
-#endif

diff --git a/libomptarget/test/offloading/extern.c b/libomptarget/test/offloading/extern.c
deleted file mode 100644
index 35cf905..0000000
--- a/libomptarget/test/offloading/extern.c
+++ /dev/null

@@ -1,27 +0,0 @@
-// RUN: %libomptarget-compile-generic -DVAR -c -o %t.o
-// RUN: %libomptarget-compile-generic %t.o && %libomptarget-run-generic | %fcheck-generic
-
-#ifdef VAR
-int x = 1;
-#else
-#include <stdio.h>
-#include <assert.h>
-extern int x;
-
-int main() {
-  int value = 0;
-#pragma omp target map(from : value)
-  value = x;
-  assert(value == 1);
-
-  x = 999;
-#pragma omp target update to(x)
-
-#pragma omp target map(from : value)
-  value = x;
-  assert(value == 999);
-
-  // CHECK: PASS
-  printf ("PASS\n");
-}
-#endif

diff --git a/libomptarget/test/offloading/force-usm.cpp b/libomptarget/test/offloading/force-usm.cpp
deleted file mode 100644
index a043ba4..0000000
--- a/libomptarget/test/offloading/force-usm.cpp
+++ /dev/null

@@ -1,61 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compilexx-generic
-// RUN: env LIBOMPTARGET_INFO=32 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=NO-USM
-//
-// RUN: %libomptarget-compilexxx-generic-force-usm
-// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=32 \
-// RUN:       %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=FORCE-USM
-//
-// REQUIRES: unified_shared_memory
-//
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-// clang-format on
-
-#include <cassert>
-#include <cstdio>
-#include <cstdlib>
-
-int GI;
-#pragma omp declare target
-int *pGI;
-#pragma omp end declare target
-
-int main(void) {
-
-  GI = 0;
-  // Implicit mappings
-  int alpha = 1;
-  int beta[3] = {2, 5, 8};
-
-  // Require map clauses for non-USM execution
-  pGI = (int *)malloc(sizeof(int));
-  *pGI = 42;
-
-#pragma omp target map(pGI[ : 1], GI)
-  {
-    GI = 1 * alpha;
-    *pGI = 2 * beta[1];
-  }
-
-  assert(GI == 1);
-  assert(*pGI == 10);
-
-  printf("SUCCESS\n");
-
-  return 0;
-}
-
-// clang-format off
-// NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
-// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=12
-// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
-// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=8, Name=pGI
-// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4
-// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=12
-// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4
-// NO-USM-NEXT: SUCCESS
-
-// FORCE-USM: SUCCESS
-//
-// clang-format on

diff --git a/libomptarget/test/offloading/fortran/basic-target-parallel-do.f90 b/libomptarget/test/offloading/fortran/basic-target-parallel-do.f90
deleted file mode 100644
index 0027f15..0000000
--- a/libomptarget/test/offloading/fortran/basic-target-parallel-do.f90
+++ /dev/null

@@ -1,33 +0,0 @@
-! Basic offloading test with a target region
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-generic
-! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
-program main
-   use omp_lib
-   integer :: x(100)
-   integer :: errors = 0
-   integer :: i
-
-   !$omp target parallel do map(from: x)
-   do i = 1, 100
-       x(i) = i
-   end do
-   !$omp end target parallel do
-   do i = 1, 100
-       if ( x(i) .ne. i ) then
-           errors = errors + 1
-       end if
-   end do
-
-   print *,"number of errors: ", errors
-
-end program main
-
-! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
-! CHECK:  number of errors: 0

diff --git a/libomptarget/test/offloading/fortran/basic-target-parallel-region.f90 b/libomptarget/test/offloading/fortran/basic-target-parallel-region.f90
deleted file mode 100644
index 54341f7..0000000
--- a/libomptarget/test/offloading/fortran/basic-target-parallel-region.f90
+++ /dev/null

@@ -1,21 +0,0 @@
-! Basic offloading test with a target region
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-   use omp_lib
-   integer :: x
-
-   !$omp target parallel map(from: x)
-         x = omp_get_num_threads()
-   !$omp end target parallel
-   print *,"parallel = ", (x .ne. 1)
-
-end program main
-
-! CHECK: parallel = T

diff --git a/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f90 b/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f90
deleted file mode 100644
index 476b77e..0000000
--- a/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f90
+++ /dev/null

@@ -1,27 +0,0 @@
-! Basic offloading test of arrays with provided lower 
-! and upper bounds as specified by OpenMP's sectioning
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    implicit none
-    integer :: write_arr(10) =  (/0,0,0,0,0,0,0,0,0,0/)
-    integer :: read_arr(10) = (/1,2,3,4,5,6,7,8,9,10/)
-    integer :: i = 2
-    integer :: j = 5
-    !$omp target map(to:read_arr(2:5)) map(from:write_arr(2:5)) map(to:i,j)
-        do while (i <= j)
-            write_arr(i) = read_arr(i)
-            i = i + 1
-        end do
-    !$omp end target
-    
-    print *, write_arr(:)
-end program
-
-! CHECK: 0 2 3 4 5 0 0 0 0 0

diff --git a/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f90 b/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f90
deleted file mode 100644
index 229798b..0000000
--- a/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f90
+++ /dev/null

@@ -1,44 +0,0 @@
-! Basic offloading test of a regular array explicitly
-! passed within a target region
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    implicit none
-    integer :: inArray(3,3,3)
-    integer :: outArray(3,3,3)
-    integer :: i, j, k 
-    integer :: j2 = 3, k2 = 3
-
-    do i = 1, 3
-      do j = 1, 3
-        do k = 1, 3
-            inArray(i, j, k) = 42
-            outArray(i, j, k) = 0
-        end do
-       end do
-    end do
-
-j = 1
-k = 1 
-!$omp target map(tofrom:inArray(1:3, 1:3, 2:2), outArray(1:3, 1:3, 1:3), j, k, j2, k2)
-    do while (j <= j2)
-      k = 1
-      do while (k <= k2)
-        outArray(k, j, 2) = inArray(k, j, 2)
-        k = k + 1
-      end do
-      j = j + 1
-    end do
-!$omp end target
-
- print *, outArray
-
-end program
-
-! CHECK:  0 0 0 0 0 0 0 0 0 42 42 42 42 42 42 42 42 42 0 0 0 0 0 0 0 0 0

diff --git a/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90 b/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90
deleted file mode 100644
index ea30481..0000000
--- a/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90
+++ /dev/null

@@ -1,60 +0,0 @@
-! Basic offloading test of a regular array explicitly
-! passed within a target region
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    integer :: x(2,2,2)
-    integer :: i, j, k
-    integer :: i2 = 2, j2 = 2, k2 = 2
-    integer :: counter = 1
-    do i = 1, 2
-        do j = 1, 2
-          do k = 1, 2
-            x(i, j, k) = 0
-          end do
-        end do
-    end do
-
-i = 1
-j = 1
-k = 1
-
-!$omp target map(tofrom:x, counter) map(to: i, j, k, i2, j2, k2)
-    do while (i <= i2)
-      j = 1
-        do while (j <= j2)
-          k = 1
-          do while (k <= k2)
-            x(i, j, k) = counter
-            counter = counter + 1
-            k = k + 1
-          end do
-          j = j + 1
-        end do
-        i = i + 1
-    end do
-!$omp end target
-
-     do i = 1, 2
-        do j = 1, 2
-          do k = 1, 2
-            print *, x(i, j, k)
-          end do
-        end do
-    end do
-end program main
-
-! CHECK: 1
-! CHECK: 2
-! CHECK: 3
-! CHECK: 4
-! CHECK: 5
-! CHECK: 6
-! CHECK: 7
-! CHECK: 8

diff --git a/libomptarget/test/offloading/fortran/basic_array.c b/libomptarget/test/offloading/fortran/basic_array.c
deleted file mode 100644
index 4daf1d4..0000000
--- a/libomptarget/test/offloading/fortran/basic_array.c
+++ /dev/null

@@ -1,43 +0,0 @@
-// Basic offloading test for function compiled with flang
-// REQUIRES: flang, amdgcn-amd-amdhsa
-
-// RUN: %flang -c -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
-// RUN:   %S/../../Inputs/basic_array.f90 -o basic_array.o
-// RUN: %libomptarget-compile-generic basic_array.o
-// RUN: %t | %fcheck-generic
-
-#include <stdio.h>
-#define TEST_ARR_LEN 10
-
-#pragma omp declare target
-void increment_at(int i, int *array);
-#pragma omp end declare target
-
-void increment_array(int *b, int n) {
-#pragma omp target map(tofrom : b [0:n])
-  for (int i = 0; i < n; i++) {
-    increment_at(i, b);
-  }
-}
-
-int main() {
-  int arr[TEST_ARR_LEN] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-
-  increment_array(arr, TEST_ARR_LEN);
-  for (int i = 0; i < TEST_ARR_LEN; i++) {
-    printf("%d = %d\n", i, arr[i]);
-  }
-
-  return 0;
-}
-
-// CHECK: 0 = 1
-// CHECK-NEXT: 1 = 2
-// CHECK-NEXT: 2 = 3
-// CHECK-NEXT: 3 = 4
-// CHECK-NEXT: 4 = 5
-// CHECK-NEXT: 5 = 6
-// CHECK-NEXT: 6 = 7
-// CHECK-NEXT: 7 = 8
-// CHECK-NEXT: 8 = 9
-// CHECK-NEXT: 9 = 10

diff --git a/libomptarget/test/offloading/fortran/basic_target_region.f90 b/libomptarget/test/offloading/fortran/basic_target_region.f90
deleted file mode 100644
index d856d42..0000000
--- a/libomptarget/test/offloading/fortran/basic_target_region.f90
+++ /dev/null

@@ -1,20 +0,0 @@
-! Basic offloading test with a target region
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-  integer :: x;
-  x = 0
-!$omp target map(from:x)
-    x = 5
-!$omp end target
-  print *, "x = ", x
-end program main
-
-! CHECK: x = 5
-

diff --git a/libomptarget/test/offloading/fortran/constant-arr-index.f90 b/libomptarget/test/offloading/fortran/constant-arr-index.f90
deleted file mode 100644
index 6696305..0000000
--- a/libomptarget/test/offloading/fortran/constant-arr-index.f90
+++ /dev/null

@@ -1,26 +0,0 @@
-! Basic offloading test with a target region
-! that checks constant indexing on device
-! correctly works (regression test for prior
-! bug).
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    INTEGER :: sp(10) = (/0,0,0,0,0,0,0,0,0,0/)
-
-  !$omp target map(tofrom:sp)
-     sp(1) = 20
-     sp(5) = 10
-  !$omp end target
-
-   print *, sp(1)
-   print *, sp(5)
-end program
-
-! CHECK: 20
-! CHECK: 10

diff --git a/libomptarget/test/offloading/fortran/declare-target-vars-in-target-region.f90 b/libomptarget/test/offloading/fortran/declare-target-vars-in-target-region.f90
deleted file mode 100644
index f524dea..0000000
--- a/libomptarget/test/offloading/fortran/declare-target-vars-in-target-region.f90
+++ /dev/null

@@ -1,81 +0,0 @@
-! Offloading test with a target region mapping a declare target
-! Fortran array writing some values to it and checking the host
-! correctly receives the updates made on the device.
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-module test_0
-  implicit none
-  INTEGER :: arr1(10) = (/0,0,0,0,0,0,0,0,0,0/)
-  INTEGER :: arr2(10) = (/0,0,0,0,0,0,0,0,0,0/)
-  !$omp declare target link(arr1) enter(arr2)
-  INTEGER :: scalar = 1
-  !$omp declare target link(scalar)
-end module test_0
-
-subroutine test_with_array_link_and_tofrom()
-  use test_0
-  integer :: i = 1
-  integer :: j = 11
-  !$omp target map(tofrom:arr1, i, j)
-  do while (i <= j)
-      arr1(i) = i;
-      i = i + 1
-  end do
-  !$omp end target
-
-  ! CHECK: 1 2 3 4 5 6 7 8 9 10
-  PRINT *, arr1(:)
-end subroutine test_with_array_link_and_tofrom
-
-subroutine test_with_array_link_only()
-  use test_0
-  integer :: i = 1
-  integer :: j = 11
-  !$omp target map(i, j)
-      do while (i <= j)
-          arr1(i) = i + 1;
-          i = i + 1
-      end do
-  !$omp end target
-
-  ! CHECK: 2 3 4 5 6 7 8 9 10 11
-  PRINT *, arr1(:)
-end subroutine test_with_array_link_only
-
-subroutine test_with_array_enter_only()
-  use test_0
-  integer :: i = 1
-  integer :: j = 11
-  !$omp target map(i, j)
-      do while (i <= j)
-          arr2(i) = i + 1;
-          i = i + 1
-      end do
-  !$omp end target
-
-  ! CHECK: 0 0 0 0 0 0 0 0 0 0
-  PRINT *, arr2(:)
-end subroutine test_with_array_enter_only
-
-subroutine test_with_scalar_link_only()
-  use test_0
-  !$omp target
-      scalar = 10
-  !$omp end target
-
-  ! CHECK: 10
-  PRINT *, scalar
-end subroutine test_with_scalar_link_only
-
-program main
-  call test_with_array_link_and_tofrom()
-  call test_with_array_link_only()
-  call test_with_array_enter_only()
-  call test_with_scalar_link_only()
-end program

diff --git a/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90 b/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90
deleted file mode 100644
index 56c9672..0000000
--- a/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90
+++ /dev/null

@@ -1,43 +0,0 @@
-! Offloading test with two target regions mapping the same
-! declare target Fortran array and writing some values to 
-! it before checking the host correctly receives the 
-! correct updates made on the device.
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-module test_0
-    implicit none
-    integer :: sp(10) = (/0,0,0,0,0,0,0,0,0,0/)
-    !$omp declare target link(sp)
-end module test_0
-
-program main
-    use test_0
-    integer :: i = 1
-    integer :: j = 11
-
-!$omp target map(tofrom:sp) map(to: i, j)
-    do while (i <= j)
-        sp(i) = i;
-        i = i + 1
-    end do
-!$omp end target
-
-!$omp target map(tofrom:sp) map(to: i, j)
-    do while (i <= j)
-        sp(i) = sp(i) + i;
-        i = i + 1
-    end do
-!$omp end target
-    
-print *, sp(:)
-
-end program
-
-! CHECK: 2 4 6 8 10 12 14 16 18 20

diff --git a/libomptarget/test/offloading/fortran/target-map-allocatable-array-section-1d-bounds.f90 b/libomptarget/test/offloading/fortran/target-map-allocatable-array-section-1d-bounds.f90
deleted file mode 100644
index 99dbe99..0000000
--- a/libomptarget/test/offloading/fortran/target-map-allocatable-array-section-1d-bounds.f90
+++ /dev/null

@@ -1,46 +0,0 @@
-! Offloading test checking interaction of a
-! two 1-D allocatable arrays with a target region
-! while providing the map upper and lower bounds
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    integer,  allocatable :: sp_read(:), sp_write(:)
-    allocate(sp_read(10))
-    allocate(sp_write(10))
-
-    do i = 1, 10
-        sp_read(i) = i
-        sp_write(i) = 0
-    end do
-
-    !$omp target map(tofrom:sp_read(2:6)) map(tofrom:sp_write(2:6))
-        do i = 1, 10
-            sp_write(i) = sp_read(i)
-        end do
-    !$omp end target
-
-    do i = 1, 10
-        print *, sp_write(i)
-    end do
-
-    deallocate(sp_read)
-    deallocate(sp_write)
-end program
-
-! CHECK: 0
-! CHECK: 2
-! CHECK: 3
-! CHECK: 4
-! CHECK: 5
-! CHECK: 6
-! CHECK: 0
-! CHECK: 0
-! CHECK: 0
-! CHECK: 0

diff --git a/libomptarget/test/offloading/fortran/target-map-allocatable-array-section-3d-bounds.f90 b/libomptarget/test/offloading/fortran/target-map-allocatable-array-section-3d-bounds.f90
deleted file mode 100644
index 0786e0f..0000000
--- a/libomptarget/test/offloading/fortran/target-map-allocatable-array-section-3d-bounds.f90
+++ /dev/null

@@ -1,44 +0,0 @@
-! Offloading test checking interaction of allocatables
-! with multi-dimensional bounds (3-D in this case) and
-! a target region
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    integer, allocatable :: inArray(:,:,:)
-    integer, allocatable :: outArray(:,:,:)
-
-    allocate(inArray(3,3,3))
-    allocate(outArray(3,3,3))
-
-    do i = 1, 3
-      do j = 1, 3
-        do k = 1, 3
-            inArray(i, j, k) = 42
-            outArray(i, j, k) = 0
-        end do
-       end do
-    end do
-
-!$omp target map(tofrom:inArray(1:3, 1:3, 2:2), outArray(1:3, 1:3, 1:3))
-    do j = 1, 3
-      do k = 1, 3
-        outArray(k, j, 2) = inArray(k, j, 2)
-      end do
-    end do
-!$omp end target
-
-print *, outArray
-
-deallocate(inArray)
-deallocate(outArray)
-
-end program
-
-! CHECK: 0 0 0 0 0 0 0 0 0 42 42 42 42 42 42 42 42 42 0 0 0 0 0 0 0 0 0

diff --git a/libomptarget/test/offloading/fortran/target-map-allocatable-map-scopes.f90 b/libomptarget/test/offloading/fortran/target-map-allocatable-map-scopes.f90
deleted file mode 100644
index bb47d3d..0000000
--- a/libomptarget/test/offloading/fortran/target-map-allocatable-map-scopes.f90
+++ /dev/null

@@ -1,66 +0,0 @@
-! Offloading test checking interaction of allocatables
-! with target in different scopes
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-module test
-    contains
-  subroutine func_arg(arg_alloc) 
-    integer,  allocatable, intent (inout) :: arg_alloc(:)
-  
-  !$omp target map(tofrom: arg_alloc) 
-    do index = 1, 10
-      arg_alloc(index) = arg_alloc(index) + index
-    end do
-  !$omp end target
-    
-    print *, arg_alloc
-  end subroutine func_arg
-end module 
-  
-subroutine func 
-    integer,  allocatable :: local_alloc(:) 
-    allocate(local_alloc(10))
-  
-  !$omp target map(tofrom: local_alloc) 
-    do index = 1, 10
-      local_alloc(index) = index
-    end do
-  !$omp end target
-  
-    print *, local_alloc
-  
-    deallocate(local_alloc)
-end subroutine func 
-  
-  
-program main 
-  use test 
-  integer,  allocatable :: map_ptr(:) 
-  
-  allocate(map_ptr(10))
-  
-  !$omp target map(tofrom: map_ptr) 
-    do index = 1, 10
-      map_ptr(index) = index
-    end do
-  !$omp end target
-
-   call func 
-
-   print *, map_ptr
-
-   call func_arg(map_ptr)
-
-   deallocate(map_ptr)
-end program 
-
-! CHECK: 1 2 3 4 5 6 7 8 9 10
-! CHECK: 1 2 3 4 5 6 7 8 9 10
-! CHECK: 2 4 6 8 10 12 14 16 18 20

diff --git a/libomptarget/test/offloading/fortran/target-map-enter-exit-allocatables.f90 b/libomptarget/test/offloading/fortran/target-map-enter-exit-allocatables.f90
deleted file mode 100644
index 865be95..0000000
--- a/libomptarget/test/offloading/fortran/target-map-enter-exit-allocatables.f90
+++ /dev/null

@@ -1,44 +0,0 @@
-! Offloading test checking interaction of allocatables
-! with enter, exit and target 
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    integer, allocatable :: A(:)    
-    allocate(A(10))
-    
-   !$omp target enter data map(alloc: A)
-
-    !$omp target
-        do I = 1, 10
-            A(I) = I
-        end do
-    !$omp end target
-
-    !$omp target exit data map(from: A)
-
-    !$omp target exit data map(delete: A)
-    
-    do i = 1, 10
-        print *, A(i)
-    end do
-    
-    deallocate(A)
-end program
-
-! CHECK: 1
-! CHECK: 2
-! CHECK: 3
-! CHECK: 4
-! CHECK: 5
-! CHECK: 6
-! CHECK: 7
-! CHECK: 8
-! CHECK: 9
-! CHECK: 10

diff --git a/libomptarget/test/offloading/fortran/target-map-enter-exit-array-2.f90 b/libomptarget/test/offloading/fortran/target-map-enter-exit-array-2.f90
deleted file mode 100644
index 489c253..0000000
--- a/libomptarget/test/offloading/fortran/target-map-enter-exit-array-2.f90
+++ /dev/null

@@ -1,39 +0,0 @@
-! Offloading test checking interaction of an
-! enter and exit map of an array of scalars
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    integer :: array(10)
-
-    do I = 1, 10
-      array(I) = I + I
-    end do
-
-    !$omp target enter data map(to: array)
-
-    ! Shouldn't overwrite data already locked in
-    ! on target via enter, this will then be 
-    ! overwritten by our exit
-    do I = 1, 10
-      array(I) = 10
-    end do
-
-   !$omp target
-    do i=1,10
-      array(i) = array(i) + i
-    end do
-  !$omp end target 
-
-  !$omp target exit data map(from: array)
-
-  print*, array
-end program
-
-!CHECK: 3 6 9 12 15 18 21 24 27 30

diff --git a/libomptarget/test/offloading/fortran/target-map-enter-exit-array-bounds.f90 b/libomptarget/test/offloading/fortran/target-map-enter-exit-array-bounds.f90
deleted file mode 100644
index 3c8c350..0000000
--- a/libomptarget/test/offloading/fortran/target-map-enter-exit-array-bounds.f90
+++ /dev/null

@@ -1,44 +0,0 @@
-! Offloading test checking interaction of an
-! enter and exit map of an array of scalars
-! with specified bounds
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-
-program main
-    integer :: array(10)
-
-    do I = 1, 10
-      array(I) = I + I
-    end do
-
-    !$omp target enter data map(to: array(3:6))
-
-    ! Shouldn't overwrite data already locked in
-    ! on target via enter, which will then be 
-    ! overwritten by our exit
-    do I = 1, 10
-      array(I) = 10
-    end do
-
-  ! The compiler/runtime is less lenient about read/write out of 
-  ! bounds when using enter and exit, we have to specifically loop
-  ! over the correctly mapped range
-   !$omp target
-    do i=3,6
-      array(i) = array(i) + i
-    end do
-  !$omp end target 
-
-  !$omp target exit data map(from: array(3:6))
-
-  print *, array
-end program
-
-!CHECK: 10 10 9 12 15 18 10 10 10 10

diff --git a/libomptarget/test/offloading/fortran/target-map-enter-exit-array.f90 b/libomptarget/test/offloading/fortran/target-map-enter-exit-array.f90
deleted file mode 100644
index 4a9fb6e..0000000
--- a/libomptarget/test/offloading/fortran/target-map-enter-exit-array.f90
+++ /dev/null

@@ -1,41 +0,0 @@
-! Offloading test checking interaction of fixed size
-! arrays with enter, exit and target 
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    integer :: A(10)
-    
-   !$omp target enter data map(alloc: A)
-
-    !$omp target
-        do I = 1, 10
-            A(I) = I
-        end do
-    !$omp end target
-
-    !$omp target exit data map(from: A)
-
-    !$omp target exit data map(delete: A)
-    
-    do i = 1, 10
-        print *, A(i)
-    end do
-end program
-
-! CHECK: 1
-! CHECK: 2
-! CHECK: 3
-! CHECK: 4
-! CHECK: 5
-! CHECK: 6
-! CHECK: 7
-! CHECK: 8
-! CHECK: 9
-! CHECK: 10

diff --git a/libomptarget/test/offloading/fortran/target-map-enter-exit-scalar.f90 b/libomptarget/test/offloading/fortran/target-map-enter-exit-scalar.f90
deleted file mode 100644
index 29a0b5e..0000000
--- a/libomptarget/test/offloading/fortran/target-map-enter-exit-scalar.f90
+++ /dev/null

@@ -1,33 +0,0 @@
-! Offloading test checking interaction of an
-! enter and exit map of an scalar
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    integer :: scalar
-    scalar = 10
-
-    !$omp target enter data map(to: scalar)
-
-    !ignored, as we've already attached
-    scalar = 20
-
-   !$omp target
-      scalar = scalar + 50
-   !$omp end target 
-
-  !$omp target exit data map(from: scalar)
-
-  ! not the answer one may expect, but it is the same 
-  ! answer Clang gives so we are correctly on par with 
-  ! Clang for the moment.
-  print *, scalar
-end program
-
-!CHECK: 10

diff --git a/libomptarget/test/offloading/fortran/target-map-pointer-scopes-enter-exit.f90 b/libomptarget/test/offloading/fortran/target-map-pointer-scopes-enter-exit.f90
deleted file mode 100644
index dee75af..0000000
--- a/libomptarget/test/offloading/fortran/target-map-pointer-scopes-enter-exit.f90
+++ /dev/null

@@ -1,83 +0,0 @@
-! Offloading test checking interaction of pointers
-! with target in different scopes
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-module test
-    contains
-  subroutine func_arg(arg_alloc) 
-    integer,  pointer, intent (inout) :: arg_alloc(:)
-    
-  !$omp target enter data map(alloc: arg_alloc)
-    
-  !$omp target 
-    do index = 1, 10
-      arg_alloc(index) = arg_alloc(index) + index
-    end do
-  !$omp end target
-    
-  !$omp target exit data map(from: arg_alloc)
-  
-  !$omp target exit data map(delete: arg_alloc)
-    
-    print *, arg_alloc
-  end subroutine func_arg
-end module 
-  
-subroutine func 
-  integer,  pointer :: local_alloc(:) 
-  allocate(local_alloc(10))
-
-  !$omp target enter data map(alloc: local_alloc)
-
-  !$omp target 
-    do index = 1, 10
-      local_alloc(index) = index
-    end do
-  !$omp end target
-  
-  !$omp target exit data map(from: local_alloc)
-
-  !$omp target exit data map(delete: local_alloc)
-
-  print *, local_alloc
-    
-  deallocate(local_alloc)
-end subroutine func 
-  
-  
-program main 
-  use test 
-  integer,  pointer :: map_ptr(:) 
-  allocate(map_ptr(10))
-  
-  !$omp target enter data map(alloc: map_ptr)
-
-  !$omp target
-    do index = 1, 10
-      map_ptr(index) = index
-    end do
-  !$omp end target
-  
-  !$omp target exit data map(from: map_ptr)
-
-  !$omp target exit data map(delete: map_ptr)
-
-  call func 
-
-  print *, map_ptr
-
-  call func_arg(map_ptr)
-
-  deallocate(map_ptr)
-end program
-
-! CHECK: 1 2 3 4 5 6 7 8 9 10
-! CHECK: 1 2 3 4 5 6 7 8 9 10
-! CHECK: 2 4 6 8 10 12 14 16 18 20

diff --git a/libomptarget/test/offloading/fortran/target-map-pointer-target-array-section-3d-bounds.f90 b/libomptarget/test/offloading/fortran/target-map-pointer-target-array-section-3d-bounds.f90
deleted file mode 100644
index ff2298c..0000000
--- a/libomptarget/test/offloading/fortran/target-map-pointer-target-array-section-3d-bounds.f90
+++ /dev/null

@@ -1,43 +0,0 @@
-! Offloading test checking interaction of pointer
-! and target with target where 3-D bounds have 
-! been specified
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    integer, pointer :: inArray(:,:,:)
-    integer, pointer :: outArray(:,:,:)
-    integer, target :: in(3,3,3)
-    integer, target :: out(3,3,3)
-    
-    inArray => in
-    outArray => out
-    
-    do i = 1, 3
-      do j = 1, 3
-        do k = 1, 3
-            inArray(i, j, k) = 42
-            outArray(i, j, k) = 0
-        end do
-       end do
-    end do
-
-!$omp target map(tofrom:inArray(1:3, 1:3, 2:2), outArray(1:3, 1:3, 1:3))
-    do j = 1, 3
-      do k = 1, 3
-        outArray(k, j, 2) = inArray(k, j, 2)
-      end do
-    end do
-!$omp end target
-
- print *, outArray
-
-end program
-
-! CHECK: 0 0 0 0 0 0 0 0 0 42 42 42 42 42 42 42 42 42 0 0 0 0 0 0 0 0 0

diff --git a/libomptarget/test/offloading/fortran/target-map-pointer-target-scopes.f90 b/libomptarget/test/offloading/fortran/target-map-pointer-target-scopes.f90
deleted file mode 100644
index d9a7000..0000000
--- a/libomptarget/test/offloading/fortran/target-map-pointer-target-scopes.f90
+++ /dev/null

@@ -1,64 +0,0 @@
-! Offloading test checking interaction of pointer
-! and target with target across multiple scopes 
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-module test
-    contains
-  subroutine func_arg(arg_alloc) 
-    integer,  pointer, intent (inout) :: arg_alloc(:)
-  
-  !$omp target map(tofrom: arg_alloc) 
-    do index = 1, 10
-      arg_alloc(index) = arg_alloc(index) + index
-    end do
-  !$omp end target
-    
-    print *, arg_alloc
-  end subroutine func_arg
-end module
-  
-subroutine func 
-   integer,  pointer :: local_alloc(:) 
-   integer, target :: b(10)
-   local_alloc => b
-  
-  !$omp target map(tofrom: local_alloc) 
-    do index = 1, 10
-      local_alloc(index) = index
-    end do
-  !$omp end target
-  
-    print *, local_alloc
-  end subroutine func 
-  
-  
-  program main 
-  use test 
-  integer,  pointer :: map_ptr(:) 
-  integer, target :: b(10)
-  
-  map_ptr => b
-  
-  !$omp target map(tofrom: map_ptr) 
-    do index = 1, 10
-      map_ptr(index) = index
-    end do
-  !$omp end target
-  
-   call func 
-
-   print *, map_ptr
-
-   call func_arg(map_ptr)
-end program
-
-!CHECK: 1 2 3 4 5 6 7 8 9 10
-!CHECK: 1 2 3 4 5 6 7 8 9 10
-!CHECK: 2 4 6 8 10 12 14 16 18 20

diff --git a/libomptarget/test/offloading/fortran/target-nested-target-data.f90 b/libomptarget/test/offloading/fortran/target-nested-target-data.f90
deleted file mode 100644
index a694f24..0000000
--- a/libomptarget/test/offloading/fortran/target-nested-target-data.f90
+++ /dev/null

@@ -1,31 +0,0 @@
-! Offloading test for target nested inside
-! a target data region
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-   integer :: A(10), B(10), C(10)
-
-   do I = 1, 10
-      A(I) = 1
-      B(I) = 2
-   end do
-   !$omp target data map(to: A, B) map(alloc: C)
-   !$omp target map(from: C)
-   do I = 1, 10
-      C(I) = A(I) + B(I) ! assigns 3, A:1 + B:2
-   end do
-   !$omp end target
-   !$omp target update from(C) ! updates C device -> host
-   !$omp end target data
-
-   print *, C ! should be all 3's
-
-end program
-
-! CHECK: 3 3 3 3 3 3 3 3 3 3

diff --git a/libomptarget/test/offloading/fortran/target-parallel-do-collapse.f90 b/libomptarget/test/offloading/fortran/target-parallel-do-collapse.f90
deleted file mode 100644
index e44d6a5..0000000
--- a/libomptarget/test/offloading/fortran/target-parallel-do-collapse.f90
+++ /dev/null

@@ -1,44 +0,0 @@
-! Basic offloading test with a target region
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-generic
-! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
-program main
-   use omp_lib
-   implicit none
-   integer :: i,j
-   integer :: array(10,10), errors = 0
-   do i = 1, 10
-      do j = 1, 10
-         array(j, i) = 0
-      end do
-   end do
-
-   !$omp target parallel do map(from:array) collapse(2)
-   do i = 1, 10
-      do j = 1, 10
-         array( j, i) = i + j
-      end do
-    end do
-    !$omp end target parallel do
-
-    do i = 1, 10
-       do j = 1, 10
-          if ( array( j, i) .ne. (i + j) ) then
-             errors = errors + 1
-          end if
-       end do
-   end do
-
-   print *,"number of errors: ", errors
-
-end program main
-
-! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
-! CHECK:  number of errors: 0
-

diff --git a/libomptarget/test/offloading/fortran/target-region-implicit-array.f90 b/libomptarget/test/offloading/fortran/target-region-implicit-array.f90
deleted file mode 100644
index ada6ef2..0000000
--- a/libomptarget/test/offloading/fortran/target-region-implicit-array.f90
+++ /dev/null

@@ -1,25 +0,0 @@
-! Basic offloading test of a regular array explicitly
-! passed within a target region
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program main
-    integer :: x(10) = (/0,0,0,0,0,0,0,0,0,0/)
-    integer :: i = 1
-    integer :: j = 11
-
-  !$omp target
-     do i = 1, j
-        x(i) = i;
-     end do
-  !$omp end target
-
-   PRINT *, x(:)
-end program main
-
-! CHECK: 1 2 3 4 5 6 7 8 9 10

diff --git a/libomptarget/test/offloading/fortran/target_map_common_block.f90 b/libomptarget/test/offloading/fortran/target_map_common_block.f90
deleted file mode 100644
index 402de27..0000000
--- a/libomptarget/test/offloading/fortran/target_map_common_block.f90
+++ /dev/null

@@ -1,128 +0,0 @@
-! Basic offloading test with a target region
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-
-! Testing simple variables in common block.
-program main
-  call check_device
-  call commonblock_simple_with_implicit_type_var
-  call commonblock_simple_with_integer
-  call commonblock_simple_with_real
-  call commonblock_simple_to_from
-  call set_commonblock_named
-  call use_commonblock_named
-end program main
-
-!-----
-
-subroutine check_device
-  use omp_lib
-  integer :: devices(2)
-  devices(1) = omp_get_device_num()
-  !$omp target map(tofrom:devices)
-    devices(2) = omp_get_device_num()
-  !$omp end target
-  print *, omp_get_num_devices()
-  !CHECK: [[ND:[0-9]+]]
-  print *, omp_get_default_device()
-  !CHECK: [[DD:[0-9]+]]
-  !CHECK: devices: [[ND]] [[DD]]
-  print *, "devices: ", devices
-end subroutine check_device
-
-!-----
-
-subroutine commonblock_simple_with_implicit_type_var
-  use omp_lib
-  common var1
-  var1 = 10
-  print *, "var1 before target = ", var1
-  !$omp target map(tofrom:var1)
-    var1 = 20
-  !$omp end target
-  print *, "var1 after target = ", var1
-end subroutine
-
-! CHECK: var1 before target = 10
-! CHECK: var1 after target = 20
-
-! -----
-
-subroutine commonblock_simple_with_integer
-  use omp_lib
-  integer :: var2
-  common var2
-  var2 = 10
-  print *, "var2 before target = ", var2
-  !$omp target map(tofrom:var2)
-    var2 = 20
-  !$omp end target
-  print *, "var2 after target = ", var2
-end subroutine
-
-! CHECK: var2 before target = 10
-! CHECK: var2 after target = 20
-
-! -----
-
-subroutine commonblock_simple_with_real
-  use omp_lib
-  real :: var3
-  common var3
-  var3 = 12.5
-  print *, "var3 before target = ", var3
-  !$omp target map(tofrom:var3)
-    var3 = 14.5
-  !$omp end target
-  print *, "var3 after target = ", var3
-end subroutine
-
-! CHECK: var3 before target = 12.5
-! CHECK: var3 after target = 14.5
-
-! -----
-
-subroutine commonblock_simple_to_from
-  use omp_lib
-  integer :: var4, tmp
-  common var4
-  var4 = 10
-  tmp = 20
-  !$omp target map(to:var4) map(from:tmp)
-    tmp = var4
-    var4 = 20
-  !$omp end target
-  print *, "var4 after target = ", var4
-  print *, "tmp after target = ", tmp
-end subroutine
-
-! CHECK: var4 after target = 10
-! CHECK: tmp after target = 10
-
-! -----
-
-subroutine set_commonblock_named
-  integer :: var6
-  common /my_common_block/ var6
-  var6 = 20
-end subroutine
-
-subroutine use_commonblock_named
-  integer :: var6
-  common /my_common_block/ var6
-  print *, "var6 before target = ", var6
-  !$omp target map(tofrom: var6)
-    var6 = 30
-  !$omp end target
-  print *, "var6 after target = ", var6
-end subroutine
-
-! CHECK: var6 before target = 20
-! CHECK: var6 after target = 30

diff --git a/libomptarget/test/offloading/fortran/target_map_common_block1.f90 b/libomptarget/test/offloading/fortran/target_map_common_block1.f90
deleted file mode 100644
index 6aaa66d..0000000
--- a/libomptarget/test/offloading/fortran/target_map_common_block1.f90
+++ /dev/null

@@ -1,28 +0,0 @@
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-
-program main
-  use omp_lib
-  integer :: devices(2), var1
-  common var1
-  var1 = 10
-  print *, "var1 before target = ", var1
-  devices(1) = omp_get_device_num()
-  !$omp target map(tofrom:devices) map(tofrom:var1)
-    var1 = 20
-    devices(2) = omp_get_device_num()
-  !$omp end target
-  print *, "var1 after target = ", var1
-  print *, "devices are different? ", (devices(1) /= devices(2))
-end program
-
-! CHECK: var1 before target =  10
-! CHECK: var1 after target =  20
-! CHECK: devices are different? T

diff --git a/libomptarget/test/offloading/fortran/target_map_common_block2.f90 b/libomptarget/test/offloading/fortran/target_map_common_block2.f90
deleted file mode 100644
index 24e3e2b..0000000
--- a/libomptarget/test/offloading/fortran/target_map_common_block2.f90
+++ /dev/null

@@ -1,26 +0,0 @@
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-
-program main
-  use omp_lib
-  integer :: tmp, var4
-  common var4
-  var4 = 24
-  tmp = 12
-  print *, "var4 before target =", var4
-  !$omp target map(tofrom:var4)
-    var4 = tmp
-  !$omp end target
-  print *, "var4 after target =", var4
-end program
-
-! CHECK: var4 before target = 24
-! CHECK: var4 after target = 12
-

diff --git a/libomptarget/test/offloading/fortran/target_update.f90 b/libomptarget/test/offloading/fortran/target_update.f90
deleted file mode 100644
index fb35c5a..0000000
--- a/libomptarget/test/offloading/fortran/target_update.f90
+++ /dev/null

@@ -1,50 +0,0 @@
-! Offloading test for the `target update` directive.
-
-! REQUIRES: flang, amdgcn-amd-amdhsa
-
-! RUN: %libomptarget-compile-fortran-run-and-check-generic
-program target_update
-    implicit none
-    integer :: x(1)
-    integer :: host_id
-    integer :: device_id(1)
-
-    INTERFACE
-        FUNCTION omp_get_device_num() BIND(C)
-            USE, INTRINSIC :: iso_c_binding, ONLY: C_INT
-            integer :: omp_get_device_num
-        END FUNCTION omp_get_device_num
-    END INTERFACE
-
-    x(1) = 5
-    host_id = omp_get_device_num()
-
-!$omp target enter data map(to:x, device_id)
-!$omp target
-    x(1) = 42
-!$omp end target
-
-    ! Test that without a `target update` directive, the target update to x is
-    ! not yet seen by the host.
-    ! CHECK: After first target regions and before target update: x = 5
-    print *, "After first target regions and before target update: x =", x(1)
-
-!$omp target
-    x(1) = 84
-    device_id(1) = omp_get_device_num()
-!$omp end target
-!$omp target update from(x, device_id)
-
-    ! Test that after the `target update`, the host can see the new x value.
-    ! CHECK: After second target regions and target update: x = 84
-    print *, "After second target regions and target update: x =", x(1)
-
-    ! Make sure that offloading to the device actually happened. This way we
-    ! verify that we didn't take the fallback host execution path.
-    ! CHECK: Offloading succeeded!
-    if (host_id /= device_id(1)) then
-        print *, "Offloading succeeded!"
-    else
-        print *, "Offloading failed!"
-    end if
-end program target_update

diff --git a/libomptarget/test/offloading/generic_multiple_parallel_regions.c b/libomptarget/test/offloading/generic_multiple_parallel_regions.c
deleted file mode 100644
index 33fe2ca..0000000
--- a/libomptarget/test/offloading/generic_multiple_parallel_regions.c
+++ /dev/null

@@ -1,29 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// RUN: %libomptarget-compileopt-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-__attribute__((optnone)) void optnone() {}
-
-int main() {
-  int i = 0;
-#pragma omp target teams num_teams(1) map(tofrom : i)
-  {
-    optnone();
-#pragma omp parallel
-    if (omp_get_thread_num() == 0)
-      ++i;
-#pragma omp parallel
-    if (omp_get_thread_num() == 0)
-      ++i;
-#pragma omp parallel
-    if (omp_get_thread_num() == 0)
-      ++i;
-#pragma omp parallel
-    if (omp_get_thread_num() == 0)
-      ++i;
-  }
-  // CHECK: 4
-  printf("%i\n", i);
-}

diff --git a/libomptarget/test/offloading/global_constructor.cpp b/libomptarget/test/offloading/global_constructor.cpp
deleted file mode 100644
index eb68c5f..0000000
--- a/libomptarget/test/offloading/global_constructor.cpp
+++ /dev/null

@@ -1,25 +0,0 @@
-// RUN: %libomptarget-compilexx-generic && %libomptarget-run-generic | %fcheck-generic
-
-#include <cstdio>
-
-int foo() { return 1; }
-
-class C {
-public:
-  C() : x(foo()) {}
-
-  int x;
-};
-
-C c;
-#pragma omp declare target(c)
-
-int main() {
-  int x = 0;
-#pragma omp target map(from : x)
-  { x = c.x; }
-
-  // CHECK: PASS
-  if (x == 1)
-    printf("PASS\n");
-}

diff --git a/libomptarget/test/offloading/host_as_target.c b/libomptarget/test/offloading/host_as_target.c
deleted file mode 100644
index 8ab991b..0000000
--- a/libomptarget/test/offloading/host_as_target.c
+++ /dev/null

@@ -1,151 +0,0 @@
-// Check that specifying device as omp_get_initial_device():
-// - Doesn't cause the runtime to fail.
-// - Offloads code to the host.
-// - Doesn't transfer data.  In this case, just check that neither host data nor
-//   default device data are affected by the specified transfers.
-// - Works whether it's specified directly or as the default device.
-
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-static void check(char *X, int Dev) {
-  printf("  host X = %c\n", *X);
-  char DV = -1;
-#pragma omp target device(Dev) map(from : DV)
-  DV = *X;
-  printf("device X = %c\n", DV);
-}
-
-#define CHECK_DATA() check(&X, DevDefault)
-
-int main(void) {
-  int DevDefault = omp_get_default_device();
-  int DevInit = omp_get_initial_device();
-
-  //--------------------------------------------------
-  // Initialize data on the host and default device.
-  //--------------------------------------------------
-
-  //      CHECK:   host X = h
-  // CHECK-NEXT: device X = d
-  char X = 'd';
-#pragma omp target enter data map(to : X)
-  X = 'h';
-  CHECK_DATA();
-
-//--------------------------------------------------
-// Check behavior when specifying host directly.
-//--------------------------------------------------
-
-// CHECK-NEXT: omp_is_initial_device() = 1
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target device(DevInit) map(always, tofrom : X)
-  printf("omp_is_initial_device() = %d\n", omp_is_initial_device());
-  CHECK_DATA();
-
-// CHECK-NEXT: omp_is_initial_device() = 1
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target teams device(DevInit) num_teams(1) map(always, tofrom : X)
-  printf("omp_is_initial_device() = %d\n", omp_is_initial_device());
-  CHECK_DATA();
-
-// Check that __kmpc_push_target_tripcount_mapper doesn't fail. I'm not sure
-// how to check that it actually pushes to the initial device.
-#pragma omp target teams device(DevInit) num_teams(1)
-#pragma omp distribute
-  for (int i = 0; i < 2; ++i)
-    ;
-
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target data device(DevInit) map(always, tofrom : X)
-  ;
-  CHECK_DATA();
-
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target enter data device(DevInit) map(always, to : X)
-  ;
-  CHECK_DATA();
-
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target exit data device(DevInit) map(always, from : X)
-  ;
-  CHECK_DATA();
-
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target update device(DevInit) to(X)
-  ;
-  CHECK_DATA();
-
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target update device(DevInit) from(X)
-  ;
-  CHECK_DATA();
-
-  //--------------------------------------------------
-  // Check behavior when device defaults to host.
-  //--------------------------------------------------
-
-  omp_set_default_device(DevInit);
-
-// CHECK-NEXT: omp_is_initial_device() = 1
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target map(always, tofrom : X)
-  printf("omp_is_initial_device() = %d\n", omp_is_initial_device());
-  CHECK_DATA();
-
-// CHECK-NEXT: omp_is_initial_device() = 1
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target teams num_teams(1) map(always, tofrom : X)
-  printf("omp_is_initial_device() = %d\n", omp_is_initial_device());
-  CHECK_DATA();
-
-// Check that __kmpc_push_target_tripcount_mapper doesn't fail. I'm not sure
-// how to check that it actually pushes to the initial device.
-#pragma omp target teams num_teams(1)
-#pragma omp distribute
-  for (int i = 0; i < 2; ++i)
-    ;
-
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target data map(always, tofrom : X)
-  ;
-  CHECK_DATA();
-
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target enter data map(always, to : X)
-  ;
-  CHECK_DATA();
-
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target exit data map(always, from : X)
-  ;
-  CHECK_DATA();
-
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target update to(X)
-  ;
-  CHECK_DATA();
-
-// CHECK-NEXT:   host X = h
-// CHECK-NEXT: device X = d
-#pragma omp target update from(X)
-  ;
-  CHECK_DATA();
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/indirect_fp_mapping.c b/libomptarget/test/offloading/indirect_fp_mapping.c
deleted file mode 100644
index a400349..0000000
--- a/libomptarget/test/offloading/indirect_fp_mapping.c
+++ /dev/null

@@ -1,37 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-version=51
-// RUN: %libomptarget-run-generic | %fcheck-generic
-// RUN: %libomptarget-compileopt-generic -fopenmp-version=51
-// RUN: %libomptarget-run-generic | %fcheck-generic
-
-#include <stdio.h>
-
-int square(int x) { return x * x; }
-#pragma omp declare target indirect to(square)
-
-typedef int (*fp_t)(int);
-
-int main() {
-  int i = 17, r;
-
-  fp_t fp = &square;
-  // CHECK: host: &square =
-  printf("host: &square = %p\n", fp);
-
-#pragma omp target map(from : fp)
-  fp = &square;
-  // CHECK: device: &square = [[DEV_FP:.*]]
-  printf("device: &square = %p\n", fp);
-
-  fp_t fp1 = square;
-  fp_t fp2 = 0;
-#pragma omp target map(from : fp2)
-  fp2 = fp1;
-  // CHECK: device: fp2 = [[DEV_FP]]
-  printf("device: fp2 = %p\n", fp2);
-
-#pragma omp target map(from : r)
-  { r = fp1(i); }
-
-  // CHECK: 17*17 = 289
-  printf("%i*%i = %i\n", i, i, r);
-}

diff --git a/libomptarget/test/offloading/info.c b/libomptarget/test/offloading/info.c
deleted file mode 100644
index da8e4c4..0000000
--- a/libomptarget/test/offloading/info.c
+++ /dev/null

@@ -1,75 +0,0 @@
-// RUN: %libomptarget-compile-generic \
-// RUN:     -gline-tables-only -fopenmp-extensions
-// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \
-// RUN:   %fcheck-generic -allow-empty -check-prefixes=INFO
-// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 | \
-// RUN:   %fcheck-amdgcn-amd-amdhsa -allow-empty -check-prefixes=INFO,AMDGPU
-
-// FIXME: Fails due to optimized debugging in 'ptxas'.
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-
-#include <omp.h>
-#include <stdio.h>
-
-#define N 64
-
-#pragma omp declare target
-int global;
-#pragma omp end declare target
-
-extern void __tgt_set_info_flag(unsigned);
-
-int main() {
-  int A[N];
-  int B[N];
-  int C[N];
-  int val = 1;
-
-// clang-format off
-// INFO: info: Entering OpenMP data region with being_mapper at info.c:{{[0-9]+}}:{{[0-9]+}} with 3 arguments:
-// INFO: info: alloc(A[0:64])[256]
-// INFO: info: tofrom(B[0:64])[256]
-// INFO: info: to(C[0:64])[256]
-// INFO: info: Creating new map entry with HstPtrBase={{.*}}, HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, DynRefCount=1, HoldRefCount=0, Name=A[0:64]
-// INFO: info: Creating new map entry with HstPtrBase={{.*}}, HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, DynRefCount=0, HoldRefCount=1, Name=B[0:64]
-// INFO: info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=256, Name=B[0:64]
-// INFO: info: Creating new map entry with HstPtrBase={{.*}}, HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, DynRefCount=1, HoldRefCount=0, Name=C[0:64]
-// INFO: info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=256, Name=C[0:64]
-// INFO: info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:{{[0-9]+}}:
-// INFO: info: Host Ptr           Target Ptr         Size (B) DynRefCount HoldRefCount Declaration
-// INFO: info: {{.*}}             {{.*}}             256      1           0            C[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}
-// INFO: info: {{.*}}             {{.*}}             256      0           1            B[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}
-// INFO: info: {{.*}}             {{.*}}             256      1           0            A[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}
-// INFO: info: Entering OpenMP kernel at info.c:{{[0-9]+}}:{{[0-9]+}} with 1 arguments:
-// INFO: info: firstprivate(val)[4]
-// INFO: info: Launching kernel __omp_offloading_{{.*}}main{{.*}} with {{[0-9]+}} blocks and {{[0-9]+}} threads in Generic mode
-// AMDGPU: AMDGPU device {{[0-9]}} info: #Args: {{[0-9]}} Teams x Thrds: {{[0-9]+}}x {{[0-9]+}} (MaxFlatWorkGroupSize: {{[0-9]+}}) LDS Usage: {{[0-9]+}}B #SGPRs/VGPRs: {{[0-9]+}}/{{[0-9]+}} #SGPR/VGPR Spills: {{[0-9]+}}/{{[0-9]+}} Tripcount: {{[0-9]+}}
-// INFO: info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:{{[0-9]+}}:
-// INFO: info: Host Ptr           Target Ptr         Size (B) DynRefCount HoldRefCount Declaration
-// INFO: info: {{.*}}             {{.*}}             256      1           0            C[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}
-// INFO: info: {{.*}}             {{.*}}             256      0           1            B[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}
-// INFO: info: {{.*}}             {{.*}}             256      1           0            A[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}
-// INFO: info: Exiting OpenMP data region with end_mapper at info.c:{{[0-9]+}}:{{[0-9]+}} with 3 arguments:
-// INFO: info: alloc(A[0:64])[256]
-// INFO: info: tofrom(B[0:64])[256]
-// INFO: info: to(C[0:64])[256]
-// INFO: info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=256, Name=B[0:64]
-// INFO: info: Removing map entry with HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, Name=C[0:64]
-// INFO: info: Removing map entry with HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, Name=B[0:64]
-// INFO: info: Removing map entry with HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, Name=A[0:64]
-// INFO: info: OpenMP Host-Device pointer mappings after block at info.c:[[#%u,]]:[[#%u,]]:
-// INFO: info: Host Ptr  Target Ptr Size (B) DynRefCount HoldRefCount Declaration
-// INFO: info: [[#%#x,]] [[#%#x,]]  4        INF         0            global at unknown:0:0
-// clang-format on
-#pragma omp target data map(alloc : A[0 : N])                                  \
-    map(ompx_hold, tofrom : B[0 : N]) map(to : C[0 : N])
-#pragma omp target firstprivate(val)
-  { val = 1; }
-
-  __tgt_set_info_flag(0x0);
-// INFO-NOT: omptarget device 0 info: {{.*}}
-#pragma omp target
-  {}
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/interop.c b/libomptarget/test/offloading/interop.c
deleted file mode 100644
index 26287e3..0000000
--- a/libomptarget/test/offloading/interop.c
+++ /dev/null

@@ -1,48 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: nvptx64-nvidia-cuda
-
-#include <assert.h>
-#include <omp.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-typedef void *cudaStream_t;
-
-int main() {
-
-  int device_id = omp_get_default_device();
-
-#pragma omp parallel master
-  {
-
-    double D0, D2;
-    omp_interop_t interop;
-
-#pragma omp interop init(targetsync : interop) device(device_id) nowait
-    assert(interop != NULL);
-
-    int err;
-    for (int i = omp_ipr_first; i < 0; i++) {
-      const char *n =
-          omp_get_interop_name(interop, (omp_interop_property_t)(i));
-      long int li =
-          omp_get_interop_int(interop, (omp_interop_property_t)(i), &err);
-      const void *p =
-          omp_get_interop_ptr(interop, (omp_interop_property_t)(i), &err);
-      const char *s =
-          omp_get_interop_str(interop, (omp_interop_property_t)(i), &err);
-      const char *n1 =
-          omp_get_interop_type_desc(interop, (omp_interop_property_t)(i));
-    }
-#pragma omp interop use(interop) depend(in : D0, D2)
-
-    cudaStream_t stream =
-        (omp_get_interop_ptr(interop, omp_ipr_targetsync, NULL));
-    assert(stream != NULL);
-
-#pragma omp interop destroy(interop) depend(in : D0, D2) device(device_id)
-  }
-  printf("PASS\n");
-}
-// CHECK: PASS

diff --git a/libomptarget/test/offloading/lone_target_exit_data.c b/libomptarget/test/offloading/lone_target_exit_data.c
deleted file mode 100644
index 73a5ffa..0000000
--- a/libomptarget/test/offloading/lone_target_exit_data.c
+++ /dev/null

@@ -1,14 +0,0 @@
-// Check that a target exit data directive behaves correctly when the runtime
-// has not yet been initialized.
-
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <stdio.h>
-
-int main() {
-  // CHECK: x = 98
-  int x = 98;
-#pragma omp target exit data map(from : x)
-  printf("x = %d\n", x);
-  return 0;
-}

diff --git a/libomptarget/test/offloading/looptripcnt.c b/libomptarget/test/offloading/looptripcnt.c
deleted file mode 100644
index d1a0950..0000000
--- a/libomptarget/test/offloading/looptripcnt.c
+++ /dev/null

@@ -1,32 +0,0 @@
-// RUN: %libomptarget-compile-generic && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 | %fcheck-generic -allow-empty -check-prefix=DEBUG
-// REQUIRES: libomptarget-debug
-
-/*
-  Test for looptripcount being popped from runtime stack.
-*/
-#include <omp.h>
-#include <stdio.h>
-int main() {
-  int N = 128;
-  int NN = 1024;
-  int num_teams[NN];
-  int num_threads[NN];
-
-  printf("#pragma omp target teams distribute parallel for thread_limit(4)\n");
-#pragma omp target teams distribute parallel for thread_limit(4)
-  for (int j = 0; j < N; j++) {
-    num_threads[j] = omp_get_num_threads();
-    num_teams[j] = omp_get_num_teams();
-  }
-  printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]);
-  // DEBUG: loop trip count is 128
-  printf("#pragma omp target teams distribute parallel for\n");
-#pragma omp target teams distribute parallel for
-  for (int j = 0; j < N; j++) {
-    num_threads[j] = omp_get_num_threads();
-    num_teams[j] = omp_get_num_teams();
-  }
-  printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]);
-  // DEBUG: loop trip count is 128
-  return 0;
-}

diff --git a/libomptarget/test/offloading/malloc.c b/libomptarget/test/offloading/malloc.c
deleted file mode 100644
index 7b98e1f..0000000
--- a/libomptarget/test/offloading/malloc.c
+++ /dev/null

@@ -1,37 +0,0 @@
-// RUN: %libomptarget-compile-generic && %libomptarget-run-generic
-// RUN: %libomptarget-compileopt-generic && %libomptarget-run-generic
-
-#include <stdio.h>
-#include <stdlib.h>
-
-int main() {
-  long unsigned *DP = 0;
-  int N = 32;
-  int Threads = 64;
-  int Teams = 10;
-
-  // Allocate ~55MB on the device.
-#pragma omp target map(from : DP)
-  DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams);
-
-#pragma omp target teams distribute parallel for num_teams(Teams)              \
-    thread_limit(Threads) is_device_ptr(DP)
-  for (int i = 0; i < Threads * Teams; ++i) {
-    for (int j = 0; j < N; ++j) {
-      DP[i * N + j] = i + j;
-    }
-  }
-
-  long unsigned s = 0;
-#pragma omp target teams distribute parallel for num_teams(Teams)              \
-    thread_limit(Threads) reduction(+ : s)
-  for (int i = 0; i < Threads * Teams; ++i) {
-    for (int j = 0; j < N; ++j) {
-      s += DP[i * N + j];
-    }
-  }
-
-  // CHECK: Sum: 6860800
-  printf("Sum: %li\n", s);
-  return 0;
-}

diff --git a/libomptarget/test/offloading/malloc_parallel.c b/libomptarget/test/offloading/malloc_parallel.c
deleted file mode 100644
index 076a7ba..0000000
--- a/libomptarget/test/offloading/malloc_parallel.c
+++ /dev/null

@@ -1,42 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// RUN: %libomptarget-compileopt-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-int main() {
-  long unsigned **DP = 0;
-  int N = 32;
-  int Threads = 64;
-  int Teams = 10;
-
-#pragma omp target map(from : DP)
-  DP = (long unsigned **)malloc(sizeof(long unsigned *) * Threads * Teams);
-
-#pragma omp target teams distribute parallel for num_teams(Teams)              \
-    thread_limit(Threads)
-  for (int i = 0; i < Threads * Teams; ++i)
-    DP[i] = (long unsigned *)malloc(sizeof(long unsigned) * N);
-
-#pragma omp target teams distribute parallel for num_teams(Teams)              \
-    thread_limit(Threads)
-  for (int i = 0; i < Threads * Teams; ++i) {
-    for (int j = 0; j < N; ++j) {
-      DP[i][j] = i + j;
-    }
-  }
-
-  long unsigned s = 0;
-#pragma omp target teams distribute parallel for num_teams(Teams)              \
-    thread_limit(Threads) reduction(+ : s)
-  for (int i = 0; i < Threads * Teams; ++i) {
-    for (int j = 0; j < N; ++j) {
-      s += DP[i][j];
-    }
-  }
-
-  // CHECK: Sum: 6860800
-  printf("Sum: %li\n", s);
-  return 0;
-}

diff --git a/libomptarget/test/offloading/mandatory_but_no_devices.c b/libomptarget/test/offloading/mandatory_but_no_devices.c
deleted file mode 100644
index ecdee72..0000000
--- a/libomptarget/test/offloading/mandatory_but_no_devices.c
+++ /dev/null

@@ -1,56 +0,0 @@
-// Check that mandatory offloading causes various offloading directives to fail
-// when omp_get_num_devices() == 0 even if the requested device is the initial
-// device.  This behavior is proposed for OpenMP 5.2 in OpenMP spec github
-// issue 2669.
-
-// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -DDIR=target
-// RUN: env OMP_TARGET_OFFLOAD=mandatory CUDA_VISIBLE_DEVICES= \
-// RUN:   %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \
-// RUN:   %fcheck-nvptx64-nvidia-cuda
-
-// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -DDIR='target teams'
-// RUN: env OMP_TARGET_OFFLOAD=mandatory CUDA_VISIBLE_DEVICES= \
-// RUN:   %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \
-// RUN:   %fcheck-nvptx64-nvidia-cuda
-
-// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -DDIR='target data map(X)'
-// RUN: env OMP_TARGET_OFFLOAD=mandatory CUDA_VISIBLE_DEVICES= \
-// RUN:   %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \
-// RUN:   %fcheck-nvptx64-nvidia-cuda
-
-// RUN: %libomptarget-compile-nvptx64-nvidia-cuda \
-// RUN:   -DDIR='target enter data map(to:X)'
-// RUN: env OMP_TARGET_OFFLOAD=mandatory CUDA_VISIBLE_DEVICES= \
-// RUN:   %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \
-// RUN:   %fcheck-nvptx64-nvidia-cuda
-
-// RUN: %libomptarget-compile-nvptx64-nvidia-cuda \
-// RUN:   -DDIR='target exit data map(from:X)'
-// RUN: env OMP_TARGET_OFFLOAD=mandatory CUDA_VISIBLE_DEVICES= \
-// RUN:   %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \
-// RUN:   %fcheck-nvptx64-nvidia-cuda
-
-// RUN: %libomptarget-compile-nvptx64-nvidia-cuda \
-// RUN:   -DDIR='target update to(X)'
-// RUN: env OMP_TARGET_OFFLOAD=mandatory CUDA_VISIBLE_DEVICES= \
-// RUN:   %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \
-// RUN:   %fcheck-nvptx64-nvidia-cuda
-
-// RUN: %libomptarget-compile-nvptx64-nvidia-cuda \
-// RUN:   -DDIR='target update from(X)'
-// RUN: env OMP_TARGET_OFFLOAD=mandatory CUDA_VISIBLE_DEVICES= \
-// RUN:   %libomptarget-run-fail-nvptx64-nvidia-cuda 2>&1 | \
-// RUN:   %fcheck-nvptx64-nvidia-cuda
-
-// REQUIRES: nvptx64-nvidia-cuda
-
-#include <omp.h>
-#include <stdio.h>
-
-// CHECK: omptarget fatal error 1: failure of target construct while offloading is mandatory
-int main(void) {
-  int X;
-#pragma omp DIR device(omp_get_initial_device())
-  ;
-  return 0;
-}

diff --git a/libomptarget/test/offloading/memory_manager.cpp b/libomptarget/test/offloading/memory_manager.cpp
deleted file mode 100644
index ac437c5..0000000
--- a/libomptarget/test/offloading/memory_manager.cpp
+++ /dev/null

@@ -1,46 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-#include <omp.h>
-
-#include <cassert>
-#include <iostream>
-
-int main(int argc, char *argv[]) {
-#pragma omp parallel for
-  for (int i = 0; i < 16; ++i) {
-    for (int n = 1; n < (1 << 13); n <<= 1) {
-      void *p = omp_target_alloc(n * sizeof(int), 0);
-      omp_target_free(p, 0);
-    }
-  }
-
-#pragma omp parallel for
-  for (int i = 0; i < 16; ++i) {
-    for (int n = 1; n < (1 << 13); n <<= 1) {
-      int *p = (int *)omp_target_alloc(n * sizeof(int), 0);
-#pragma omp target teams distribute parallel for is_device_ptr(p)
-      for (int j = 0; j < n; ++j) {
-        p[j] = i;
-      }
-      int buffer[n];
-#pragma omp target teams distribute parallel for is_device_ptr(p)              \
-    map(from : buffer)
-      for (int j = 0; j < n; ++j) {
-        buffer[j] = p[j];
-      }
-      for (int j = 0; j < n; ++j) {
-        assert(buffer[j] == i);
-      }
-      omp_target_free(p, 0);
-    }
-  }
-
-  std::cout << "PASS\n";
-  return 0;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/offloading/multiple_reductions_simple.c b/libomptarget/test/offloading/multiple_reductions_simple.c
deleted file mode 100644
index 54e55f9..0000000
--- a/libomptarget/test/offloading/multiple_reductions_simple.c
+++ /dev/null

@@ -1,17 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// RUN: %libomptarget-compileopt-run-and-check-generic
-
-#include <stdio.h>
-
-int main(int argc, char **argv) {
-
-  unsigned s1 = 0, s2 = 1;
-#pragma omp target teams distribute parallel for reduction(+ : s1, s2)
-  for (int i = 0; i < 10000; ++i) {
-    s1 += i;
-    s2 += i;
-  }
-
-  // CHECK: 49995000 : 49995001
-  printf("%i : %i\n", s1, s2);
-}

diff --git a/libomptarget/test/offloading/non_contiguous_update.cpp b/libomptarget/test/offloading/non_contiguous_update.cpp
deleted file mode 100644
index 609f0f9..0000000
--- a/libomptarget/test/offloading/non_contiguous_update.cpp
+++ /dev/null

@@ -1,93 +0,0 @@
-// RUN: %libomptarget-compile-generic && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 | %fcheck-generic -allow-empty -check-prefix=DEBUG
-// REQUIRES: libomptarget-debug
-
-#include <cassert>
-#include <cstdio>
-#include <cstdlib>
-
-// Data structure definitions copied from OpenMP RTL.
-struct __tgt_target_non_contig {
-  int64_t offset;
-  int64_t width;
-  int64_t stride;
-};
-
-enum tgt_map_type { OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000 };
-
-// OpenMP RTL interfaces
-#ifdef __cplusplus
-extern "C" {
-#endif
-void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
-                              void **args_base, void **args, int64_t *arg_sizes,
-                              int64_t *arg_types);
-#ifdef __cplusplus
-}
-#endif
-
-int main() {
-  // case 1
-  // int arr[3][4][5][6];
-  // #pragma omp target update to(arr[0:2][1:3][1:2][:])
-  // set up descriptor
-  __tgt_target_non_contig non_contig[5] = {
-      {0, 2, 480}, {1, 3, 120}, {1, 2, 24}, {0, 6, 4}, {0, 1, 4}};
-  int64_t size = 4, type = OMP_TGT_MAPTYPE_NON_CONTIG;
-
-  void *base;
-  void *begin = &non_contig;
-  int64_t *sizes = &size;
-  int64_t *types = &type;
-
-  // The below diagram is the visualization of the non-contiguous transfer after
-  // optimization. Note that each element represent the innermost dimension
-  // (unit size = 24) since the stride * count of last dimension is equal to the
-  // stride of second last dimension.
-  //
-  // OOOOO OOOOO OOOOO
-  // OXXOO OXXOO OOOOO
-  // OXXOO OXXOO OOOOO
-  // OXXOO OXXOO OOOOO
-  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
-                           sizes, types);
-  // DEBUG: offset 144
-  // DEBUG: offset 264
-  // DEBUG: offset 384
-  // DEBUG: offset 624
-  // DEBUG: offset 744
-  // DEBUG: offset 864
-
-  // case 2
-  // double darr[3][4][5];
-  // #pragma omp target update to(darr[0:2:2][2:2][:2:2])
-  // set up descriptor
-  __tgt_target_non_contig non_contig_2[4] = {
-      {0, 2, 320}, {2, 2, 40}, {0, 2, 16}, {0, 1, 8}};
-  int64_t size_2 = 4, type_2 = OMP_TGT_MAPTYPE_NON_CONTIG;
-
-  void *base_2;
-  void *begin_2 = &non_contig_2;
-  int64_t *sizes_2 = &size_2;
-  int64_t *types_2 = &type_2;
-
-  // The below diagram is the visualization of the non-contiguous transfer after
-  // optimization. Note that each element represent the innermost dimension
-  // (unit size = 24) since the stride * count of last dimension is equal to the
-  // stride of second last dimension.
-  //
-  // OOOOO OOOOO OOOOO
-  // OOOOO OOOOO OOOOO
-  // XOXOO OOOOO XOXOO
-  // XOXOO OOOOO XOXOO
-  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base_2, &begin_2,
-                           sizes_2, types_2);
-  // DEBUG: offset 80
-  // DEBUG: offset 96
-  // DEBUG: offset 120
-  // DEBUG: offset 136
-  // DEBUG: offset 400
-  // DEBUG: offset 416
-  // DEBUG: offset 440
-  // DEBUG: offset 456
-  return 0;
-}

diff --git a/libomptarget/test/offloading/offloading_success.c b/libomptarget/test/offloading/offloading_success.c
deleted file mode 100644
index f849cb4..0000000
--- a/libomptarget/test/offloading/offloading_success.c
+++ /dev/null

@@ -1,20 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-int main(void) {
-  int isHost = -1;
-
-#pragma omp target map(from : isHost)
-  { isHost = omp_is_initial_device(); }
-
-  if (isHost < 0) {
-    printf("Runtime error, isHost=%d\n", isHost);
-  }
-
-  // CHECK: Target region executed on the device
-  printf("Target region executed on the %s\n", isHost ? "host" : "device");
-
-  return isHost;
-}

diff --git a/libomptarget/test/offloading/offloading_success.cpp b/libomptarget/test/offloading/offloading_success.cpp
deleted file mode 100644
index d3103b8..0000000
--- a/libomptarget/test/offloading/offloading_success.cpp
+++ /dev/null

@@ -1,20 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-int main(void) {
-  int isHost = 0;
-
-#pragma omp target map(from : isHost)
-  { isHost = omp_is_initial_device(); }
-
-  if (isHost < 0) {
-    printf("Runtime error, isHost=%d\n", isHost);
-  }
-
-  // CHECK: Target region executed on the device
-  printf("Target region executed on the %s\n", isHost ? "host" : "device");
-
-  return isHost;
-}

diff --git a/libomptarget/test/offloading/ompx_bare.c b/libomptarget/test/offloading/ompx_bare.c
deleted file mode 100644
index 3dabdcd..0000000
--- a/libomptarget/test/offloading/ompx_bare.c
+++ /dev/null

@@ -1,41 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \
-// RUN:   %fcheck-generic
-//
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-#include <assert.h>
-#include <ompx.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-int main(int argc, char *argv[]) {
-  const int num_blocks = 64;
-  const int block_size = 64;
-  const int N = num_blocks * block_size;
-  int *data = (int *)malloc(N * sizeof(int));
-
-  // CHECK: "PluginInterface" device 0 info: Launching kernel __omp_offloading_{{.*}} with 64 blocks and 64 threads in SPMD mode
-
-#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) map(from: data[0:N])
-  {
-    int bid = ompx_block_id_x();
-    int bdim = ompx_block_dim_x();
-    int tid = ompx_thread_id_x();
-    int idx = bid * bdim + tid;
-    data[idx] = idx;
-  }
-
-  for (int i = 0; i < N; ++i)
-    assert(data[i] == i);
-
-  // CHECK: PASS
-  printf("PASS\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/ompx_coords.c b/libomptarget/test/offloading/ompx_coords.c
deleted file mode 100644
index 5e4e14b..0000000
--- a/libomptarget/test/offloading/ompx_coords.c
+++ /dev/null

@@ -1,65 +0,0 @@
-// RUN: %libomptarget-compileopt-run-and-check-generic
-//
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-#include <omp.h>
-#include <ompx.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-struct info {
-  int tid, bid, tdim;
-};
-
-int main(int argc, char **argv) {
-  int N = 1 << 20;
-  if (argc > 1)
-    N = atoi(argv[1]);
-
-  struct info *X = (struct info *)malloc(sizeof(*X) * N);
-  memset(X, '0', sizeof(*X) * N);
-
-  int TL = 256;
-  int NT = (N + TL - 1) / TL;
-
-#pragma omp target data map(tofrom : X [0:N])
-#pragma omp target teams num_teams(NT) thread_limit(TL)
-  {
-#pragma omp parallel
-    {
-      int tid = ompx_thread_id_x();
-      int bid = ompx_block_id_x();
-      int tdim = ompx_block_dim_x();
-      int gid = tid + bid * tdim;
-      if (gid < N) {
-        X[gid].tid = tid;
-        X[gid].bid = bid;
-        X[gid].tdim = tdim;
-      };
-    }
-  }
-
-  int tid = 0, bid = 0, tdim = 256;
-  for (int i = 0; i < N; i++) {
-    if (X[i].tid != tid || X[i].bid != bid || X[i].tdim != tdim) {
-      printf("%i: %i vs %i, %i vs %i, %i vs %i\n", i, X[i].tid, tid, X[i].bid,
-             bid, X[i].tdim, tdim);
-      return 1;
-    }
-    tid++;
-    if (tid == tdim) {
-      tid = 0;
-      bid++;
-    }
-  }
-
-  // CHECK: OK
-  printf("OK");
-  return 0;
-}

diff --git a/libomptarget/test/offloading/ompx_saxpy_mixed.c b/libomptarget/test/offloading/ompx_saxpy_mixed.c
deleted file mode 100644
index f479be8..0000000
--- a/libomptarget/test/offloading/ompx_saxpy_mixed.c
+++ /dev/null

@@ -1,59 +0,0 @@
-// RUN: %libomptarget-compileopt-run-and-check-generic
-//
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-#include <math.h>
-#include <omp.h>
-#include <ompx.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-int main(int argc, char **argv) {
-  int N = 1 << 29;
-  if (argc > 1)
-    N = atoi(argv[1]);
-  float a = 2.f;
-
-  float *X = (float *)malloc(sizeof(*X) * N);
-  float *Y = (float *)malloc(sizeof(*X) * N);
-
-  for (int i = 0; i < N; i++) {
-    X[i] = 1.0f;
-    Y[i] = 2.0f;
-  }
-
-  int TL = 256;
-  int NT = (N + TL - 1) / TL;
-
-#pragma omp target data map(to : X [0:N]) map(Y [0:N])
-#pragma omp target teams num_teams(NT) thread_limit(TL)
-  {
-#pragma omp parallel
-    {
-      int tid = ompx_thread_id_x();
-      int bid = ompx_block_id_x();
-      int tdim = ompx_block_dim_x();
-      int gid = tid + bid * tdim;
-      if (gid < N)
-        Y[gid] = a * X[gid] + Y[gid];
-    }
-  }
-
-  float maxError = 0.0f;
-  for (int i = 0; i < N; i++) {
-    maxError = fmax(maxError, fabs(Y[i] - 4.0f));
-    if (maxError) {
-      printf("%i %f %f\n", i, maxError, Y[i]);
-      break;
-    }
-  }
-  // CHECK: Max error: 0.00
-  printf("Max error: %f\n", maxError);
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/parallel_offloading_map.cpp b/libomptarget/test/offloading/parallel_offloading_map.cpp
deleted file mode 100644
index 55cdb45..0000000
--- a/libomptarget/test/offloading/parallel_offloading_map.cpp
+++ /dev/null

@@ -1,42 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-#include <cassert>
-#include <iostream>
-
-int main(int argc, char *argv[]) {
-  constexpr const int num_threads = 64, N = 128;
-  int array[num_threads] = {0};
-
-#pragma omp parallel for
-  for (int i = 0; i < num_threads; ++i) {
-    int tmp[N];
-
-    for (int j = 0; j < N; ++j) {
-      tmp[j] = i;
-    }
-
-#pragma omp target teams distribute parallel for map(tofrom : tmp)
-    for (int j = 0; j < N; ++j) {
-      tmp[j] += j;
-    }
-
-    for (int j = 0; j < N; ++j) {
-      array[i] += tmp[j];
-    }
-  }
-
-  // Verify
-  for (int i = 0; i < num_threads; ++i) {
-    const int ref = (0 + N - 1) * N / 2 + i * N;
-    assert(array[i] == ref);
-  }
-
-  std::cout << "PASS\n";
-
-  return 0;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/offloading/parallel_target_teams_reduction.cpp b/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
deleted file mode 100644
index 10e1b33..0000000
--- a/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
+++ /dev/null

@@ -1,38 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-// RUN: %libomptarget-compileoptxx-run-and-check-generic
-
-// FIXME: This is a bug in host offload, this should run fine.
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-#include <iostream>
-#include <vector>
-
-#define N 8
-
-int main() {
-  std::vector<int> avec(N);
-  int *a = avec.data();
-#pragma omp parallel for
-  for (int i = 0; i < N; i++) {
-    a[i] = 0;
-#pragma omp target teams distribute parallel for reduction(+ : a[i])
-    for (int j = 0; j < N; j++)
-      a[i] += 1;
-  }
-
-  // CHECK: 8
-  // CHECK: 8
-  // CHECK: 8
-  // CHECK: 8
-  // CHECK: 8
-  // CHECK: 8
-  // CHECK: 8
-  // CHECK: 8
-  for (int i = 0; i < N; i++)
-    std::cout << a[i] << std::endl;
-}

diff --git a/libomptarget/test/offloading/parallel_target_teams_reduction_max.cpp b/libomptarget/test/offloading/parallel_target_teams_reduction_max.cpp
deleted file mode 100644
index 8f06802..0000000
--- a/libomptarget/test/offloading/parallel_target_teams_reduction_max.cpp
+++ /dev/null

@@ -1,73 +0,0 @@
-// RUN: %libomptarget-compilexx-and-run-generic
-// RUN: %libomptarget-compileoptxx-and-run-generic
-
-// FIXME: This is a bug in host offload, this should run fine.
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-// This test validates that the OpenMP target reductions to find a maximum work
-// as indended for a few common data types.
-
-#include <algorithm>
-#include <cassert>
-#include <limits>
-#include <vector>
-
-template <class Tp> void test_max_idx_reduction() {
-  const Tp length = 1000;
-  const Tp nmaximas = 8;
-  std::vector<float> a(length, 3.0f);
-  const Tp step = length / nmaximas;
-  for (Tp i = 0; i < nmaximas; i++) {
-    a[i * step] += 1.0f;
-  }
-  for (Tp i = nmaximas; i > 0; i--) {
-    Tp idx = 0;
-    float *b = a.data();
-#pragma omp target teams distribute parallel for reduction(max : idx)          \
-    map(always, to : b[0 : length])
-    for (Tp j = 0; j < length - 1; j++) {
-      if (b[j] > b[j + 1]) {
-        idx = std::max(idx, j);
-      }
-    }
-    assert(idx == (i - 1) * step &&
-           "#pragma omp target teams distribute parallel for "
-           "reduction(max:<identifier list>) does not work as intended.");
-    a[idx] -= 1.0f;
-  }
-}
-
-template <class Tp> void test_max_val_reduction() {
-  const int length = 1000;
-  const int half = length / 2;
-  std::vector<Tp> a(length, (Tp)3);
-  a[half] += (Tp)1;
-  Tp max_val = std::numeric_limits<Tp>::lowest();
-  Tp *b = a.data();
-#pragma omp target teams distribute parallel for reduction(max : max_val)      \
-    map(always, to : b[0 : length])
-  for (int i = 0; i < length; i++) {
-    max_val = std::max(max_val, b[i]);
-  }
-  assert(std::abs(((double)a[half + 1]) - ((double)max_val) + 1.0) < 1e-6 &&
-         "#pragma omp target teams distribute parallel for "
-         "reduction(max:<identifier list>) does not work as intended.");
-}
-
-int main() {
-  // Reducing over indices
-  test_max_idx_reduction<int>();
-  test_max_idx_reduction<unsigned int>();
-  test_max_idx_reduction<long>();
-
-  // Reducing over values
-  test_max_val_reduction<int>();
-  test_max_val_reduction<unsigned int>();
-  test_max_val_reduction<long>();
-  test_max_val_reduction<float>();
-  test_max_val_reduction<double>();
-  return 0;
-}

diff --git a/libomptarget/test/offloading/parallel_target_teams_reduction_min.cpp b/libomptarget/test/offloading/parallel_target_teams_reduction_min.cpp
deleted file mode 100644
index 66c972f..0000000
--- a/libomptarget/test/offloading/parallel_target_teams_reduction_min.cpp
+++ /dev/null

@@ -1,73 +0,0 @@
-// RUN: %libomptarget-compilexx-and-run-generic
-// RUN: %libomptarget-compileoptxx-and-run-generic
-
-// FIXME: This is a bug in host offload, this should run fine.
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-// This test validates that the OpenMP target reductions to find a minimum work
-// as indended for a few common data types.
-
-#include <algorithm>
-#include <cassert>
-#include <limits>
-#include <vector>
-
-template <class Tp> void test_min_idx_reduction() {
-  const Tp length = 1000;
-  const Tp nminimas = 8;
-  std::vector<float> a(length, 3.0f);
-  const Tp step = length / nminimas;
-  for (Tp i = 0; i < nminimas; i++) {
-    a[i * step] -= 1.0f;
-  }
-  for (Tp i = 0; i < nminimas; i++) {
-    Tp idx = a.size();
-    float *b = a.data();
-#pragma omp target teams distribute parallel for reduction(min : idx)          \
-    map(always, to : b[0 : length])
-    for (Tp j = 0; j < length - 1; j++) {
-      if (b[j] < b[j + 1]) {
-        idx = std::min(idx, j);
-      }
-    }
-    assert(idx == i * step &&
-           "#pragma omp target teams distribute parallel for "
-           "reduction(min:<identifier list>) does not work as intended.");
-    a[idx] += 1.0f;
-  }
-}
-
-template <class Tp> void test_min_val_reduction() {
-  const int length = 1000;
-  const int half = length / 2;
-  std::vector<Tp> a(length, (Tp)3);
-  a[half] -= (Tp)1;
-  Tp min_val = std::numeric_limits<Tp>::max();
-  Tp *b = a.data();
-#pragma omp target teams distribute parallel for reduction(min : min_val)      \
-    map(always, to : b[0 : length])
-  for (int i = 0; i < length; i++) {
-    min_val = std::min(min_val, b[i]);
-  }
-  assert(std::abs(((double)a[half + 1]) - ((double)min_val) - 1.0) < 1e-6 &&
-         "#pragma omp target teams distribute parallel for "
-         "reduction(min:<identifier list>) does not work as intended.");
-}
-
-int main() {
-  // Reducing over indices
-  test_min_idx_reduction<int>();
-  test_min_idx_reduction<unsigned int>();
-  test_min_idx_reduction<long>();
-
-  // Reducing over values
-  test_min_val_reduction<int>();
-  test_min_val_reduction<unsigned int>();
-  test_min_val_reduction<long>();
-  test_min_val_reduction<float>();
-  test_min_val_reduction<double>();
-  return 0;
-}

diff --git a/libomptarget/test/offloading/requires.c b/libomptarget/test/offloading/requires.c
deleted file mode 100644
index 2a129a7..0000000
--- a/libomptarget/test/offloading/requires.c
+++ /dev/null

@@ -1,84 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compile-generic -DREQ=1 && %libomptarget-run-generic 2>&1 | %fcheck-generic -check-prefix=GOOD
-// RUN: %libomptarget-compile-generic -DREQ=2 && not %libomptarget-run-generic 2>&1 | %fcheck-generic -check-prefix=BAD
-// clang-format on
-
-/*
-  Test for the 'requires' clause check.
-  When a target region is used, the requires flags are set in the
-  runtime for the entire compilation unit. If the flags are set again,
-  (for whatever reason) the set must be consistent with previously
-  set values.
-*/
-#include <omp.h>
-#include <stdio.h>
-
-// ---------------------------------------------------------------------------
-// Various definitions copied from OpenMP RTL
-
-typedef struct {
-  void *addr;
-  char *name;
-  size_t size;
-  int32_t flags;
-  int32_t data;
-} __tgt_offload_entry;
-
-enum Flags {
-  OMP_REGISTER_REQUIRES = 0x10,
-};
-
-typedef struct {
-  void *ImageStart;
-  void *ImageEnd;
-  __tgt_offload_entry *EntriesBegin;
-  __tgt_offload_entry *EntriesEnd;
-} __tgt_device_image;
-
-typedef struct {
-  int32_t NumDeviceImages;
-  __tgt_device_image *DeviceImages;
-  __tgt_offload_entry *HostEntriesBegin;
-  __tgt_offload_entry *HostEntriesEnd;
-} __tgt_bin_desc;
-
-void __tgt_register_lib(__tgt_bin_desc *Desc);
-void __tgt_unregister_lib(__tgt_bin_desc *Desc);
-
-// End of definitions copied from OpenMP RTL.
-// ---------------------------------------------------------------------------
-
-void run_reg_requires() {
-  // Before the target region is registered, the requires registers the status
-  // of the requires clauses. Since there are no requires clauses in this file
-  // the flags state can only be OMP_REQ_NONE i.e. 1.
-
-  // This is the 2nd time this function is called so it should print SUCCESS if
-  // REQ is compatible with `1` and otherwise cause an error.
-  __tgt_offload_entry entries[] = {{NULL, "", 0, OMP_REGISTER_REQUIRES, 1},
-                                   {NULL, "", 0, OMP_REGISTER_REQUIRES, REQ}};
-  __tgt_device_image image = {NULL, NULL, &entries[0], &entries[1] + 1};
-  __tgt_bin_desc bin = {1, &image, &entries[0], &entries[1] + 1};
-
-  __tgt_register_lib(&bin);
-
-  printf("SUCCESS");
-
-  __tgt_unregister_lib(&bin);
-
-  // clang-format off
-  // GOOD: SUCCESS
-  // BAD: omptarget fatal error 2: '#pragma omp requires reverse_offload' not used consistently!
-  // clang-format on
-}
-
-// ---------------------------------------------------------------------------
-int main() {
-  run_reg_requires();
-
-// This also runs reg requires for the first time.
-#pragma omp target
-  {}
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/runtime_init.c b/libomptarget/test/offloading/runtime_init.c
deleted file mode 100644
index 96fd50f..0000000
--- a/libomptarget/test/offloading/runtime_init.c
+++ /dev/null

@@ -1,30 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN:   env LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 \
-// RUN: %fcheck-generic
-
-// REQUIRES: libomptarget-debug
-
-#include <omp.h>
-#include <stdio.h>
-
-extern void __tgt_rtl_init(void);
-extern void __tgt_rtl_deinit(void);
-
-// Sanity checks to make sure that this works and is thread safe.
-int main() {
-  // CHECK: Init offload library!
-  // CHECK: Deinit offload library!
-  __tgt_rtl_init();
-#pragma omp parallel num_threads(8)
-  {
-    __tgt_rtl_init();
-    __tgt_rtl_deinit();
-  }
-  __tgt_rtl_deinit();
-
-  __tgt_rtl_init();
-  __tgt_rtl_deinit();
-
-  // CHECK: PASS
-  printf("PASS\n");
-}

diff --git a/libomptarget/test/offloading/shared_lib_fp_mapping.c b/libomptarget/test/offloading/shared_lib_fp_mapping.c
deleted file mode 100644
index ffb4bc9..0000000
--- a/libomptarget/test/offloading/shared_lib_fp_mapping.c
+++ /dev/null

@@ -1,22 +0,0 @@
-// clang-format off
-// RUN: %clang-generic -fPIC -shared %S/../Inputs/declare_indirect_func.c -o %T/libslfm.so  -fopenmp-version=51
-// RUN: %libomptarget-compile-generic -rpath %T -L %T -l slfm -o %t  -fopenmp-version=51
-// RUN: env LIBOMPTARGET_INFO=32 %t 2>&1 | %fcheck-generic
-// clang-format on
-
-#include <stdio.h>
-
-extern int func(); // Provided in liba.so, returns 42
-typedef int (*fp_t)();
-
-int main() {
-  int x = 0;
-  fp_t fp = &func;
-  printf("TARGET\n");
-#pragma omp target map(from : x)
-  x = fp();
-  // CHECK: Copying data from device to host, {{.*}} Size=8
-  // CHECK: Copying data from device to host, {{.*}} Size=4
-  // CHECK: 42
-  printf("%i\n", x);
-}

diff --git a/libomptarget/test/offloading/small_trip_count.c b/libomptarget/test/offloading/small_trip_count.c
deleted file mode 100644
index 65f094f..0000000
--- a/libomptarget/test/offloading/small_trip_count.c
+++ /dev/null

@@ -1,45 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compile-generic
-// RUN: env LIBOMPTARGET_INFO=16 \
-// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
-// RUN: env LIBOMPTARGET_INFO=16 LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=8 \
-// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=EIGHT
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-#define N 128
-
-__attribute__((optnone)) void optnone() {}
-
-int main() {
-  // DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode
-  // EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode
-#pragma omp target teams distribute parallel for simd
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-  // DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode
-  // EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode
-#pragma omp target teams distribute parallel for simd
-  for (int i = 0; i < N - 1; ++i) {
-    optnone();
-  }
-  // DEFAULT: Launching kernel {{.+_main_.+}} with 5 blocks and 32 threads in SPMD mode
-  // EIGHT: Launching kernel {{.+_main_.+}} with 17 blocks and 8 threads in SPMD mode
-#pragma omp target teams distribute parallel for simd
-  for (int i = 0; i < N + 1; ++i) {
-    optnone();
-  }
-  // DEFAULT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads in SPMD mode
-  // EIGHT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads in SPMD mode
-#pragma omp target teams distribute parallel for simd thread_limit(4)
-  for (int i = 0; i < N; ++i) {
-    optnone();
-  }
-}
-

diff --git a/libomptarget/test/offloading/small_trip_count_thread_limit.cpp b/libomptarget/test/offloading/small_trip_count_thread_limit.cpp
deleted file mode 100644
index b7ae52a..0000000
--- a/libomptarget/test/offloading/small_trip_count_thread_limit.cpp
+++ /dev/null

@@ -1,33 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compilexx-generic
-// RUN: env LIBOMPTARGET_INFO=16 \
-// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-int main(int argc, char *argv[]) {
-  constexpr const int block_size = 256;
-  constexpr const int grid_size = 4;
-  constexpr const int count = block_size * grid_size;
-
-  int *data = new int[count];
-
-#pragma omp target teams distribute parallel for thread_limit(block_size) map(from: data[0:count])
-  for (int i = 0; i < count; ++i)
-    data[i] = i;
-
-  for (int i = 0; i < count; ++i)
-    if (data[i] != i)
-      return 1;
-
-  delete[] data;
-
-  return 0;
-}
-
-// CHECK: Launching kernel {{.*}} with 4 blocks and 256 threads in SPMD mode

diff --git a/libomptarget/test/offloading/spmdization.c b/libomptarget/test/offloading/spmdization.c
deleted file mode 100644
index 77913be..0000000
--- a/libomptarget/test/offloading/spmdization.c
+++ /dev/null

@@ -1,44 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compileopt-generic
-// RUN: env LIBOMPTARGET_INFO=16 \
-// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,SPMD
-// RUN: %libomptarget-compileopt-generic -mllvm --openmp-opt-disable-spmdization
-// RUN: env LIBOMPTARGET_INFO=16 \
-// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,GENERIC
-// clang-format on
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-#include <omp.h>
-#include <stdio.h>
-
-__attribute__((weak)) void noop() {}
-
-int main(void) {
-  int nthreads = 0, ip = 0, lvl = 0, alvl = 0, nested = 0, tid = 0, maxt = 0;
-
-#pragma omp target map(from : nthreads, ip, lvl, alvl, nested, tid, maxt)
-  {
-    nthreads = omp_get_num_threads();
-    ip = omp_in_parallel();
-    lvl = omp_get_level();
-    alvl = omp_get_active_level();
-    nested = omp_get_nested();
-    tid = omp_get_thread_num();
-    maxt = omp_get_max_threads();
-    #pragma omp parallel
-    noop();  
-  }
-  printf("NumThreads: %i, InParallel: %i, Level: %i, ActiveLevel: %i, Nested: %i, "
-         "ThreadNum: %i, MaxThreads: %i\n",
-         nthreads, ip, lvl, alvl, nested, tid, maxt);
-  // GENERIC: Generic mode
-  // SPMD: Generic-SPMD mode
-  // CHECK: NumThreads: 1, InParallel: 0, Level: 0, ActiveLevel: 0, Nested: 0, ThreadNum: 0, MaxThreads: 
-  return 0;
-}

diff --git a/libomptarget/test/offloading/static_linking.c b/libomptarget/test/offloading/static_linking.c
deleted file mode 100644
index 7be95a1..0000000
--- a/libomptarget/test/offloading/static_linking.c
+++ /dev/null

@@ -1,26 +0,0 @@
-// RUN: %libomptarget-compile-generic -DLIBRARY -c -o %t.o
-// RUN: ar rcs %t.a %t.o
-// RUN: %libomptarget-compile-generic %t.a && %libomptarget-run-generic 2>&1 | %fcheck-generic
-
-#ifdef LIBRARY
-int x = 42;
-#pragma omp declare target(x)
-
-int foo() {
-  int value;
-#pragma omp target map(from : value)
-  value = x;
-  return value;
-}
-#else
-#include <stdio.h>
-int foo();
-
-int main() {
-  int x = foo();
-
-  // CHECK: PASS
-  if (x == 42)
-    printf("PASS\n");
-}
-#endif

diff --git a/libomptarget/test/offloading/std_complex_arithmetic.cpp b/libomptarget/test/offloading/std_complex_arithmetic.cpp
deleted file mode 100644
index 641b849..0000000
--- a/libomptarget/test/offloading/std_complex_arithmetic.cpp
+++ /dev/null

@@ -1,89 +0,0 @@
-// RUN: %libomptarget-compilexx-generic && %libomptarget-run-generic
-// RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic
-// RUN: %libomptarget-compilexx-generic -O3 -ffast-math && \
-// RUN:   %libomptarget-run-generic
-
-// FIXME: This fails to link due to missing math symbols. We should provide the
-//        needed math functions in the GPU `libm` and require the GPU C library.
-// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-
-#include <cassert>
-#include <complex>
-#include <iostream>
-
-template <typename T> void test_map() {
-  std::complex<T> a(0.2, 1), a_check;
-#pragma omp target map(from : a_check)
-  { a_check = a; }
-
-  assert(std::abs(a - a_check) < 1e-6);
-}
-
-template <typename RT, typename AT, typename BT> void test_plus(AT a, BT b) {
-  std::complex<RT> c, c_host;
-
-  c_host = a + b;
-#pragma omp target map(from : c)
-  { c = a + b; }
-
-  assert(std::abs(c - c_host) < 1e-6);
-}
-
-template <typename RT, typename AT, typename BT> void test_minus(AT a, BT b) {
-  std::complex<RT> c, c_host;
-
-  c_host = a - b;
-#pragma omp target map(from : c)
-  { c = a - b; }
-
-  assert(std::abs(c - c_host) < 1e-6);
-}
-
-template <typename RT, typename AT, typename BT> void test_mul(AT a, BT b) {
-  std::complex<RT> c, c_host;
-
-  c_host = a * b;
-#pragma omp target map(from : c)
-  { c = a * b; }
-
-  assert(std::abs(c - c_host) < 1e-6);
-}
-
-template <typename RT, typename AT, typename BT> void test_div(AT a, BT b) {
-  std::complex<RT> c, c_host;
-
-  c_host = a / b;
-#pragma omp target map(from : c)
-  { c = a / b; }
-
-  assert(std::abs(c - c_host) < 1e-6);
-}
-
-template <typename T> void test_complex() {
-  test_map<T>();
-
-  test_plus<T>(std::complex<T>(0, 1), std::complex<T>(0.5, 0.3));
-  test_plus<T>(std::complex<T>(0, 1), T(0.5));
-  test_plus<T>(T(0.5), std::complex<T>(0, 1));
-
-  test_minus<T>(std::complex<T>(0, 1), std::complex<T>(0.5, 0.3));
-  test_minus<T>(std::complex<T>(0, 1), T(0.5));
-  test_minus<T>(T(0.5), std::complex<T>(0, 1));
-
-  test_mul<T>(std::complex<T>(0, 1), std::complex<T>(0.5, 0.3));
-  test_mul<T>(std::complex<T>(0, 1), T(0.5));
-  test_mul<T>(T(0.5), std::complex<T>(0, 1));
-
-  test_div<T>(std::complex<T>(0, 1), std::complex<T>(0.5, 0.3));
-  test_div<T>(std::complex<T>(0, 1), T(0.5));
-  test_div<T>(T(0.5), std::complex<T>(0, 1));
-}
-
-int main() {
-  std::cout << "Testing float" << std::endl;
-  test_complex<float>();
-  std::cout << "Testing double" << std::endl;
-  test_complex<double>();
-  return 0;
-}

diff --git a/libomptarget/test/offloading/struct_mapping_with_pointers.cpp b/libomptarget/test/offloading/struct_mapping_with_pointers.cpp
deleted file mode 100644
index f0fde50..0000000
--- a/libomptarget/test/offloading/struct_mapping_with_pointers.cpp
+++ /dev/null

@@ -1,120 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compilexx-generic && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 | %fcheck-generic
-// clang-format on
-
-// REQUIRES: libomptarget-debug
-
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-
-#include <stdio.h>
-#include <stdlib.h>
-
-struct Descriptor {
-  int *datum;
-  long int x;
-  int *more_datum;
-  int xi;
-  int val_datum, val_more_datum;
-  long int arr[1][30];
-  int val_arr;
-};
-
-int main() {
-  Descriptor dat = Descriptor();
-  dat.datum = (int *)malloc(sizeof(int) * 10);
-  dat.more_datum = (int *)malloc(sizeof(int) * 20);
-  dat.xi = 3;
-  dat.arr[0][0] = 1;
-
-  dat.datum[7] = 7;
-  dat.more_datum[17] = 17;
-  dat.datum[dat.arr[0][0]] = 0;
-
-  /// The struct is mapped with type 0x0 when the pointer fields are mapped.
-  /// The struct is also map explicitely by the user. The second mapping by
-  /// the user must not overwrite the mapping set up for the pointer fields
-  /// when mapping the struct happens after the mapping of the pointers.
-
-  // clang-format off
-  // CHECK: omptarget --> Entry  0: Base=[[DAT_HST_PTR_BASE:0x.*]], Begin=[[DAT_HST_PTR_BASE]], Size=288, Type=0x0, Name=unknown
-  // CHECK: omptarget --> Entry  1: Base=[[DAT_HST_PTR_BASE]], Begin=[[DAT_HST_PTR_BASE]], Size=288, Type=0x1000000000001, Name=unknown
-  // CHECK: omptarget --> Entry  2: Base=[[DAT_HST_PTR_BASE]], Begin=[[DATUM_HST_PTR_BASE:0x.*]], Size=40, Type=0x1000000000011, Name=unknown
-  // CHECK: omptarget --> Entry  3: Base=[[MORE_DATUM_HST_PTR_BASE:0x.*]], Begin=[[MORE_DATUM_HST_PTR_BEGIN:0x.*]], Size=80, Type=0x1000000000011, Name=unknown
-  // clang-format on
-
-  /// The struct will be mapped in the same order as the above entries.
-
-  /// First argument is the struct itself and it will be mapped once.
-
-  // clang-format off
-  // CHECK: omptarget --> Looking up mapping(HstPtrBegin=[[DAT_HST_PTR_BASE]], Size=288)...
-  // CHECK: PluginInterface --> MemoryManagerTy::allocate: size 288 with host pointer [[DAT_HST_PTR_BASE]].
-  // CHECK: omptarget --> Creating new map entry with HstPtrBase=[[DAT_HST_PTR_BASE]], HstPtrBegin=[[DAT_HST_PTR_BASE]], TgtAllocBegin=[[DAT_DEVICE_PTR_BASE:0x.*]], TgtPtrBegin=[[DAT_DEVICE_PTR_BASE]], Size=288, DynRefCount=1, HoldRefCount=0, Name=unknown
-  // CHECK: omptarget --> Moving 288 bytes (hst:[[DAT_HST_PTR_BASE]]) -> (tgt:[[DAT_DEVICE_PTR_BASE]])
-  // clang-format on
-
-  /// Second argument is dat.datum:
-  // clang-format off
-  // CHECK: omptarget --> Looking up mapping(HstPtrBegin=[[DATUM_HST_PTR_BASE]], Size=40)...
-  // CHECK: PluginInterface --> MemoryManagerTy::allocate: size 40 with host pointer [[DATUM_HST_PTR_BASE]].
-  // CHECK: omptarget --> Creating new map entry with HstPtrBase=[[DATUM_HST_PTR_BASE]], HstPtrBegin=[[DATUM_HST_PTR_BASE]], TgtAllocBegin=[[DATUM_DEVICE_PTR_BASE:0x.*]], TgtPtrBegin=[[DATUM_DEVICE_PTR_BASE]], Size=40, DynRefCount=1, HoldRefCount=0, Name=unknown
-  // CHECK: omptarget --> Moving 40 bytes (hst:[[DATUM_HST_PTR_BASE]]) -> (tgt:[[DATUM_DEVICE_PTR_BASE]])
-  // clang-format on
-
-  /// Third argument is dat.more_datum:
-  // clang-format off
-  // CHECK: omptarget --> Looking up mapping(HstPtrBegin=[[MORE_DATUM_HST_PTR_BEGIN]], Size=80)...
-  // CHECK: PluginInterface --> MemoryManagerTy::allocate: size 80 with host pointer [[MORE_DATUM_HST_PTR_BEGIN]].
-  // CHECK: omptarget --> Creating new map entry with HstPtrBase=[[MORE_DATUM_HST_PTR_BEGIN]], HstPtrBegin=[[MORE_DATUM_HST_PTR_BEGIN]], TgtAllocBegin=[[MORE_DATUM_DEVICE_PTR_BEGIN:0x.*]], TgtPtrBegin=[[MORE_DATUM_DEVICE_PTR_BEGIN]], Size=80, DynRefCount=1, HoldRefCount=0, Name=unknown
-  // CHECK: omptarget --> Moving 80 bytes (hst:[[MORE_DATUM_HST_PTR_BEGIN]]) -> (tgt:[[MORE_DATUM_DEVICE_PTR_BEGIN]])
-  // clang-format on
-
-#pragma omp target enter data map(to : dat.datum[ : 10])                       \
-    map(to : dat.more_datum[ : 20]) map(to : dat)
-
-  /// Checks induced by having a target region:
-  // clang-format off
-  // CHECK: omptarget --> Entry  0: Base=[[DAT_HST_PTR_BASE]], Begin=[[DAT_HST_PTR_BASE]], Size=288, Type=0x223, Name=unknown
-  // CHECK: omptarget --> Mapping exists (implicit) with HstPtrBegin=[[DAT_HST_PTR_BASE]], TgtPtrBegin=[[DAT_DEVICE_PTR_BASE]], Size=288, DynRefCount=2 (incremented), HoldRefCount=0, Name=unknown
-  // CHECK: omptarget --> Obtained target argument [[DAT_DEVICE_PTR_BASE]] from host pointer [[DAT_HST_PTR_BASE]]
-  // clang-format on
-
-#pragma omp target
-  {
-    dat.xi = 4;
-    dat.datum[7]++;
-    dat.more_datum[17]++;
-    dat.val_datum = dat.datum[7];
-    dat.val_more_datum = dat.more_datum[17];
-    dat.datum[dat.arr[0][0]] = dat.xi;
-    dat.val_arr = dat.datum[dat.arr[0][0]];
-  }
-
-  /// Post-target region checks:
-  // clang-format off
-  // CHECK: omptarget --> Mapping exists with HstPtrBegin=[[DAT_HST_PTR_BASE]], TgtPtrBegin=[[DAT_DEVICE_PTR_BASE]], Size=288, DynRefCount=1 (decremented), HoldRefCount=0
-  // clang-format on
-
-#pragma omp target exit data map(from : dat)
-
-  /// Target data end checks:
-  // clang-format off
-  // CHECK: omptarget --> Mapping exists with HstPtrBegin=[[DAT_HST_PTR_BASE]], TgtPtrBegin=[[DAT_DEVICE_PTR_BASE]], Size=288, DynRefCount=0 (decremented, delayed deletion), HoldRefCount=0
-  // CHECK: omptarget --> Moving 288 bytes (tgt:[[DAT_DEVICE_PTR_BASE]]) -> (hst:[[DAT_HST_PTR_BASE]])
-  // clang-format on
-
-  // CHECK: dat.xi = 4
-  // CHECK: dat.val_datum = 8
-  // CHECK: dat.val_more_datum = 18
-  // CHECK: dat.datum[dat.arr[0][0]] = 0
-  // CHECK: dat.val_arr = 4
-
-  printf("dat.xi = %d\n", dat.xi);
-  printf("dat.val_datum = %d\n", dat.val_datum);
-  printf("dat.val_more_datum = %d\n", dat.val_more_datum);
-  printf("dat.datum[dat.arr[0][0]] = %d\n", dat.datum[dat.arr[0][0]]);
-  printf("dat.val_arr = %d\n", dat.val_arr);
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/target-teams-atomic.c b/libomptarget/test/offloading/target-teams-atomic.c
deleted file mode 100644
index 9756342..0000000
--- a/libomptarget/test/offloading/target-teams-atomic.c
+++ /dev/null

@@ -1,50 +0,0 @@
-// Check that omp atomic is permitted and behaves when strictly nested within
-// omp target teams.  This is an extension to OpenMP 5.2 and is enabled by
-// default.
-
-// RUN: %libomptarget-compile-run-and-check-generic
-
-#include <omp.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <string.h>
-
-// High parallelism increases our chances of detecting a lack of atomicity.
-#define NUM_TEAMS_TRY 256
-
-int main() {
-  //      CHECK: update: num_teams=[[#NUM_TEAMS:]]{{$}}
-  // CHECK-NEXT: update: x=[[#NUM_TEAMS]]{{$}}
-  int x = 0;
-  int numTeams;
-#pragma omp target teams num_teams(NUM_TEAMS_TRY) map(tofrom : x, numTeams)
-  {
-#pragma omp atomic update
-    ++x;
-    if (omp_get_team_num() == 0)
-      numTeams = omp_get_num_teams();
-  }
-  printf("update: num_teams=%d\n", numTeams);
-  printf("update: x=%d\n", x);
-
-  // CHECK-NEXT: capture: x=[[#NUM_TEAMS]]{{$}}
-  // CHECK-NEXT: capture: xCapturedCount=[[#NUM_TEAMS]]{{$}}
-  bool xCaptured[numTeams];
-  memset(xCaptured, 0, sizeof xCaptured);
-  x = 0;
-#pragma omp target teams num_teams(NUM_TEAMS_TRY) map(tofrom : x, numTeams)
-  {
-    int v;
-#pragma omp atomic capture
-    v = x++;
-    xCaptured[v] = true;
-  }
-  printf("capture: x=%d\n", x);
-  int xCapturedCount = 0;
-  for (int i = 0; i < numTeams; ++i) {
-    if (xCaptured[i])
-      ++xCapturedCount;
-  }
-  printf("capture: xCapturedCount=%d\n", xCapturedCount);
-  return 0;
-}

diff --git a/libomptarget/test/offloading/target-tile.c b/libomptarget/test/offloading/target-tile.c
deleted file mode 100644
index 8460b43..0000000
--- a/libomptarget/test/offloading/target-tile.c
+++ /dev/null

@@ -1,62 +0,0 @@
-// Check that omp tile (introduced in OpenMP 5.1) is permitted and behaves when
-// strictly nested within omp target.
-
-// RUN: %libomptarget-compile-generic -fopenmp-version=51
-// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
-
-#include <stdio.h>
-
-#define I_NTILES 8
-#define J_NTILES 9
-#define I_NELEMS 2
-#define J_NELEMS 3
-
-int main() {
-  int order[I_NTILES][J_NTILES][I_NELEMS][J_NELEMS];
-  int i, j;
-  #pragma omp target map(tofrom: i, j)
-  {
-    int next = 0;
-    #pragma omp tile sizes(I_NELEMS, J_NELEMS)
-    for (i = 0; i < I_NTILES * I_NELEMS; ++i) {
-      for (j = 0; j < J_NTILES * J_NELEMS; ++j) {
-        int iTile = i / I_NELEMS;
-        int jTile = j / J_NELEMS;
-        int iElem = i % I_NELEMS;
-        int jElem = j % J_NELEMS;
-        order[iTile][jTile][iElem][jElem] = next++;
-      }
-    }
-  }
-  int expected = 0;
-  for (int iTile = 0; iTile < I_NTILES; ++iTile) {
-    for (int jTile = 0; jTile < J_NTILES; ++jTile) {
-      for (int iElem = 0; iElem < I_NELEMS; ++iElem) {
-        for (int jElem = 0; jElem < J_NELEMS; ++jElem) {
-          int actual = order[iTile][jTile][iElem][jElem];
-          if (expected != actual) {
-            printf("error: order[%d][%d][%d][%d] = %d, expected %d\n",
-                   iTile, jTile, iElem, jElem, actual, expected);
-            return 1;
-          }
-          ++expected;
-        }
-      }
-    }
-  }
-  // Tiling leaves the loop variables with their values from the final iteration
-  // rather than with the usual +1.
-  expected = I_NTILES * I_NELEMS - 1;
-  if (i != expected) {
-    printf("error: i = %d, expected %d\n", i, expected);
-    return 1;
-  }
-  expected = J_NTILES * J_NELEMS - 1;
-  if (j != expected) {
-    printf("error: j = %d, expected %d\n", j, expected);
-    return 1;
-  }
-  // CHECK: success
-  printf("success\n");
-  return 0;
-}

diff --git a/libomptarget/test/offloading/target_constexpr_mapping.cpp b/libomptarget/test/offloading/target_constexpr_mapping.cpp
deleted file mode 100644
index 14cf92a..0000000
--- a/libomptarget/test/offloading/target_constexpr_mapping.cpp
+++ /dev/null

@@ -1,34 +0,0 @@
-// RUN: %libomptarget-compileoptxx-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-#pragma omp declare target
-class A {
-public:
-  constexpr static double pi = 3.141592653589793116;
-  A() { ; }
-  ~A() { ; }
-};
-#pragma omp end declare target
-
-#pragma omp declare target
-constexpr static double anotherPi = 3.14;
-#pragma omp end declare target
-
-int main() {
-  double a[2];
-#pragma omp target map(tofrom : a[:2])
-  {
-    a[0] = A::pi;
-    a[1] = anotherPi;
-  }
-
-  // CHECK: pi = 3.141592653589793116
-  printf("pi = %.18f\n", a[0]);
-
-  // CHECK: anotherPi = 3.14
-  printf("anotherPi = %.2f\n", a[1]);
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/target_critical_region.cpp b/libomptarget/test/offloading/target_critical_region.cpp
deleted file mode 100644
index 495632b..0000000
--- a/libomptarget/test/offloading/target_critical_region.cpp
+++ /dev/null

@@ -1,41 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-// UNSUPPORTED: amdgcn-amd-amdhsa
-
-#include <omp.h>
-#include <stdio.h>
-
-#define N 1000000
-
-int A[N];
-int main() {
-  for (int i = 0; i < N; i++)
-    A[i] = 1;
-
-  int sum[1];
-  sum[0] = 0;
-
-#pragma omp target teams distribute parallel for num_teams(256)                \
-    schedule(static, 1) map(to                                                 \
-                            : A[:N]) map(tofrom                                \
-                                         : sum[:1])
-  {
-    for (int i = 0; i < N; i++) {
-#pragma omp critical
-      { sum[0] += A[i]; }
-    }
-  }
-
-  // CHECK: SUM = 1000000
-  printf("SUM = %d\n", sum[0]);
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/target_depend_nowait.cpp b/libomptarget/test/offloading/target_depend_nowait.cpp
deleted file mode 100644
index b0d5408..0000000
--- a/libomptarget/test/offloading/target_depend_nowait.cpp
+++ /dev/null

@@ -1,56 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-#define N 1024
-
-int A[N];
-int B[N];
-int C[N];
-int main() {
-  for (int i = 0; i < N; i++)
-    A[i] = B[i] = i;
-
-#pragma omp parallel num_threads(2)
-  {
-    if (omp_get_thread_num() == 1) {
-// map data A & B and move to
-#pragma omp target enter data map(to : A, B) depend(out : A[0]) nowait
-
-// no data move since already mapped
-#pragma omp target map(A, B) depend(out : A[0]) nowait
-      {
-        for (int i = 0; i < N; i++)
-          ++A[i];
-        for (int i = 0; i < N; i++)
-          ++B[i];
-      }
-
-// no data move since already mapped
-#pragma omp target teams num_teams(1) map(A, B) depend(out : A[0]) nowait
-      {
-        for (int i = 0; i < N; i++)
-          ++A[i];
-        for (int i = 0; i < N; i++)
-          ++B[i];
-      }
-
-// A updated via update
-#pragma omp target update from(A) depend(out : A[0]) nowait
-
-// B updated via exit, A just released
-#pragma omp target exit data map(release : A) map(from : B) depend(out : A[0]) \
-    nowait
-    } // if
-  }   // parallel
-
-  int Sum = 0;
-  for (int i = 0; i < N; i++)
-    Sum += A[i] + B[i];
-  // Sum is 2 * N * (2 + N - 1 + 2) / 2
-  // CHECK: Sum = 1051648.
-  printf("Sum = %d.\n", Sum);
-
-  return Sum != 2 * N * (2 + N - 1 + 2) / 2;
-}

diff --git a/libomptarget/test/offloading/target_map_for_member_data.cpp b/libomptarget/test/offloading/target_map_for_member_data.cpp
deleted file mode 100644
index 7fba71b..0000000
--- a/libomptarget/test/offloading/target_map_for_member_data.cpp
+++ /dev/null

@@ -1,25 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compilexx-generic && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 | %fcheck-generic
-// clang-format on
-
-// REQUIRES: libomptarget-debug
-
-struct DataTy {
-  float a;
-  float b[3];
-};
-
-int main(int argc, char **argv) {
-  DataTy D;
-#pragma omp target map(D.a) map(D.b[ : 2])
-  {
-    D.a = 0;
-    D.b[0] = 1;
-  }
-  return 0;
-}
-// clang-format off
-// CHECK: omptarget --> Entry  0: Base=[[DAT_HST_PTR_BASE:0x.*]], Begin=[[DAT_HST_PTR_BASE]], Size=12
-// CHECK: omptarget --> Entry  1: Base=[[DAT_HST_PTR_BASE]], Begin=[[DAT_HST_PTR_BASE]], Size=4,
-// CHECK: omptarget --> Entry  2: Base=[[DAT_HST_PTR_BASE]], Begin=[[DATUM_HST_PTR_BASE:0x.*]], Size=8,
-// clang-format on

diff --git a/libomptarget/test/offloading/target_nowait_target.cpp b/libomptarget/test/offloading/target_nowait_target.cpp
deleted file mode 100644
index 7c20e76..0000000
--- a/libomptarget/test/offloading/target_nowait_target.cpp
+++ /dev/null

@@ -1,31 +0,0 @@
-// RUN: %libomptarget-compilexx-and-run-generic
-// RUN: %libomptarget-compileoptxx-and-run-generic
-
-#include <cassert>
-
-int main(int argc, char *argv[]) {
-  int data[1024];
-  int sum = 0;
-
-  for (int i = 0; i < 1024; ++i)
-    data[i] = i;
-
-#pragma omp target map(tofrom : sum) map(to : data) depend(inout : data[0])    \
-    nowait
-  {
-    for (int i = 0; i < 1024; ++i) {
-      sum += data[i];
-    }
-  }
-
-#pragma omp target map(tofrom : sum) map(to : data) depend(inout : data[0])
-  {
-    for (int i = 0; i < 1024; ++i) {
-      sum += data[i];
-    }
-  }
-
-  assert(sum == 1023 * 1024);
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/task_in_reduction_target.c b/libomptarget/test/offloading/task_in_reduction_target.c
deleted file mode 100644
index b546d73..0000000
--- a/libomptarget/test/offloading/task_in_reduction_target.c
+++ /dev/null

@@ -1,31 +0,0 @@
-// RUN: %libomptarget-compile-generic && \
-// RUN: %libomptarget-run-generic
-
-#include <omp.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-int main(int argc, char *argv[]) {
-
-  int num_devices = omp_get_num_devices();
-
-  // No target devices, just return
-  if (num_devices == 0) {
-    printf("PASS\n");
-    return 0;
-  }
-
-  double sum = 999;
-  double A = 311;
-
-#pragma omp taskgroup task_reduction(+ : sum)
-  {
-#pragma omp target map(to : A) in_reduction(+ : sum) device(0) nowait
-    { sum += A; }
-  }
-
-  printf("PASS\n");
-  return EXIT_SUCCESS;
-}
-
-// CHECK: PASS

diff --git a/libomptarget/test/offloading/taskloop_offload_nowait.cpp b/libomptarget/test/offloading/taskloop_offload_nowait.cpp
deleted file mode 100644
index 7069580..0000000
--- a/libomptarget/test/offloading/taskloop_offload_nowait.cpp
+++ /dev/null

@@ -1,43 +0,0 @@
-// RUN: %libomptarget-compilexx-and-run-generic
-
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-#include <cmath>
-#include <cstdlib>
-#include <iostream>
-
-bool almost_equal(float x, float gold, float tol) {
-  if (std::signbit(x) != std::signbit(gold))
-    x = std::abs(gold) - std::abs(x);
-
-  return std::abs(gold) * (1 - tol) <= std::abs(x) &&
-         std::abs(x) <= std::abs(gold) * (1 + tol);
-}
-
-int main(int argc, char *argv[]) {
-  constexpr const int N0{2};
-  constexpr const int N1{182};
-  constexpr const float expected_value{N0 * N1};
-  float counter_N0{};
-
-#pragma omp target data map(tofrom : counter_N0)
-  {
-#pragma omp taskloop shared(counter_N0)
-    for (int i0 = 0; i0 < N0; i0++) {
-#pragma omp target teams distribute parallel for map(tofrom : counter_N0) nowait
-      for (int i1 = 0; i1 < N1; i1++) {
-#pragma omp atomic update
-        counter_N0 = counter_N0 + 1.;
-      }
-    }
-  }
-
-  if (!almost_equal(counter_N0, expected_value, 0.1)) {
-    std::cerr << "Expected: " << expected_value << " Got: " << counter_N0
-              << '\n';
-    return -1;
-  }
-
-  return 0;
-}

diff --git a/libomptarget/test/offloading/test_libc.cpp b/libomptarget/test/offloading/test_libc.cpp
deleted file mode 100644
index 859c2da..0000000
--- a/libomptarget/test/offloading/test_libc.cpp
+++ /dev/null

@@ -1,28 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-
-#include <algorithm>
-
-extern "C" int printf(const char *, ...);
-
-// std::equal is lowered to libc function memcmp.
-void test_memcpy() {
-  int r = 0;
-#pragma omp target map(from : r)
-  {
-    int x[2] = {0, 0};
-    int y[2] = {0, 0};
-    int z[2] = {0, 1};
-    bool eq1 = std::equal(x, x + 2, y);
-    bool eq2 = std::equal(x, x + 2, z);
-    r = eq1 && !eq2;
-  }
-  printf("memcmp: %s\n", r ? "PASS" : "FAIL");
-}
-
-int main(int argc, char *argv[]) {
-  test_memcpy();
-
-  return 0;
-}
-
-// CHECK: memcmp: PASS

diff --git a/libomptarget/test/offloading/thread_limit.c b/libomptarget/test/offloading/thread_limit.c
deleted file mode 100644
index a8cc51b..0000000
--- a/libomptarget/test/offloading/thread_limit.c
+++ /dev/null

@@ -1,31 +0,0 @@
-// clang-format off
-// RUN: %libomptarget-compile-generic
-// RUN: env LIBOMPTARGET_INFO=16 \
-// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
-
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-int main() {
-  int n = 1 << 20;
-  int th = 12;
-  int te = n / th;
-// DEFAULT: 12 (MaxFlatWorkGroupSize: 
-#pragma omp target
-#pragma omp teams loop num_teams(te), thread_limit(th)
-  for (int i = 0; i < n; i++) {
-  }
-
-// DEFAULT: 13 (MaxFlatWorkGroupSize: 
-  #pragma omp target
-  #pragma omp teams distribute parallel for simd num_teams(te), thread_limit(th+1) simdlen(64)
-  for(int i = 0; i < n; i++) {
-  }
-  return 0;
-}

diff --git a/libomptarget/test/offloading/thread_state_1.c b/libomptarget/test/offloading/thread_state_1.c
deleted file mode 100644
index 908b716..0000000
--- a/libomptarget/test/offloading/thread_state_1.c
+++ /dev/null

@@ -1,38 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// RUN: %libomptarget-compileopt-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
-  // TODO: Test all ICVs on all levels
-  int o_lvl = 111, i_lvl = 222, o_tid = 333, i_tid = 333, o_nt = 444,
-      i_nt = 555;
-#pragma omp target teams map(tofrom : o_lvl, i_lvl, o_tid, i_tid, o_nt, i_nt)  \
-    num_teams(2) thread_limit(64)
-  {
-    if (omp_get_team_num() == 0) {
-#pragma omp parallel num_threads(64)
-      if (omp_get_thread_num() == omp_get_num_threads() - 1) {
-        o_lvl = omp_get_level();
-        o_tid = omp_get_thread_num();
-        o_nt = omp_get_num_threads();
-#pragma omp parallel num_threads(64)
-        if (omp_get_thread_num() == omp_get_num_threads() - 1) {
-          i_lvl = omp_get_level();
-          i_tid = omp_get_thread_num();
-          i_nt = omp_get_num_threads();
-        }
-      }
-    }
-  }
-  if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt >= 1 && i_lvl == 2 &&
-      i_tid == 0 && i_nt == 1) {
-    // CHECK: Success
-    printf("Success\n");
-    return 0;
-  }
-  printf("outer: lvl: %i, tid: %i, nt: %i\n", o_lvl, o_tid, o_nt);
-  printf("inner: lvl: %i, tid: %i, nt: %i\n", i_lvl, i_tid, i_nt);
-  return 1;
-}

diff --git a/libomptarget/test/offloading/thread_state_2.c b/libomptarget/test/offloading/thread_state_2.c
deleted file mode 100644
index 38bc86b..0000000
--- a/libomptarget/test/offloading/thread_state_2.c
+++ /dev/null

@@ -1,40 +0,0 @@
-// This fails when optimized for now.
-// RUN: %libomptarget-compile-run-and-check-generic
-// XUN: %libomptarget-compileopt-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
-  // TODO: Test all ICVs on all levels
-  int o_lvl = 111, i_lvl = 222, o_tid = 333, i_tid = 333, o_nt = 444,
-      i_nt = 555;
-#pragma omp target teams map(tofrom : o_lvl, i_lvl, o_tid, i_tid, o_nt, i_nt)  \
-    num_teams(2) thread_limit(64)
-  {
-    omp_set_max_active_levels(1);
-    if (omp_get_team_num() == 0) {
-#pragma omp parallel num_threads(64)
-      if (omp_get_thread_num() == omp_get_num_threads() - 1) {
-        o_lvl = omp_get_level();
-        o_tid = omp_get_thread_num();
-        o_nt = omp_get_num_threads();
-#pragma omp parallel num_threads(64)
-        if (omp_get_thread_num() == omp_get_num_threads() - 1) {
-          i_lvl = omp_get_level();
-          i_tid = omp_get_thread_num();
-          i_nt = omp_get_num_threads();
-        }
-      }
-    }
-  }
-  if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt >= 1 && i_lvl == 2 &&
-      i_tid == 0 && i_nt == 1) {
-    // CHECK: Success
-    printf("Success\n");
-    return 0;
-  }
-  printf("outer: lvl: %i, tid: %i, nt: %i\n", o_lvl, o_tid, o_nt);
-  printf("inner: lvl: %i, tid: %i, nt: %i\n", i_lvl, i_tid, i_nt);
-  return 1;
-}

diff --git a/libomptarget/test/offloading/weak.c b/libomptarget/test/offloading/weak.c
deleted file mode 100644
index ca81db9..0000000
--- a/libomptarget/test/offloading/weak.c
+++ /dev/null

@@ -1,33 +0,0 @@
-// RUN: %libomptarget-compile-generic -DA -c -o %t-a.o
-// RUN: %libomptarget-compile-generic -DB -c -o %t-b.o
-// RUN: %libomptarget-compile-generic %t-a.o %t-b.o && \
-// RUN:   %libomptarget-run-generic | %fcheck-generic
-
-#if defined(A)
-__attribute__((weak)) int x = 999;
-#pragma omp declare target to(x)
-#elif defined(B)
-int x = 42;
-#pragma omp declare target to(x)
-__attribute__((weak)) int y = 42;
-#pragma omp declare target to(y)
-#else
-
-#include <stdio.h>
-
-extern int x;
-#pragma omp declare target to(x)
-extern int y;
-#pragma omp declare target to(y)
-
-int main() {
-  x = 0;
-
-#pragma omp target update from(x)
-#pragma omp target update from(y)
-
-  // CHECK: PASS
-  if (x == 42 && y == 42)
-    printf("PASS\n");
-}
-#endif

diff --git a/libomptarget/test/offloading/workshare_chunk.c b/libomptarget/test/offloading/workshare_chunk.c
deleted file mode 100644
index a8c60c0..0000000
--- a/libomptarget/test/offloading/workshare_chunk.c
+++ /dev/null

@@ -1,254 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// RUN: %libomptarget-compileopt-run-and-check-generic
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-// clang-format off
-
-#include <omp.h>
-#include <stdio.h>
-
-#define N 100
-#define BLOCK_SHIFT 8
-
-void print(int *A, int size) {
-  for (int i = 0; i < size; ++i) {
-    printf("B%dT%d ", A[i] >> BLOCK_SHIFT, A[i] % (1 << BLOCK_SHIFT));
-  }
-  printf("\n");
-}
-
-int main() {
-  int A[N];
-
-#pragma omp target parallel for map(from:A) num_threads(10) schedule(static, 2)
-  for (int i = 0; i < N; ++i) {
-     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
-  }
-  printf("omp target parallel for thread chunk size %d\n", 2);
-  print(A, N);
-
-#pragma omp target teams distribute map(from:A) num_teams(10) \
-        dist_schedule(static, 2)
-  for (int i = 0; i < N; ++i) {
-     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
-  }
-  printf("omp target teams distribute block chunk size %d\n", 2);
-  print(A, N);
-
-#pragma omp target teams distribute parallel for map(from:A) \
-        num_teams(10) dist_schedule(static, 2)
-  for (int i = 0; i < N; ++i) {
-     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
-  }
-  printf("omp target teams distribute parallel for block chunk size %d ", 2);
-  printf("thread chunk size default\n");
-  print(A, N);
-
-#pragma omp target teams distribute parallel for map(from:A) \
-        num_teams(10) dist_schedule(static, 2) schedule(static, 3)
-  for (int i = 0; i < N; ++i) {
-     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
-  }
-  printf("omp target teams distribute parallel for block chunk size %d ", 2);
-  printf("thread chunk size %d\n", 3);
-  print(A, N);
-
-#pragma omp target teams distribute parallel for map(from:A) \
-        num_teams(10) dist_schedule(static, 3) schedule(static, 2)
-  for (int i = 0; i < N; ++i) {
-     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
-  }
-  printf("omp target teams distribute parallel for block chunk size %d ", 3);
-  printf("thread chunk size %d\n", 2);
-  print(A, N);
-
-#pragma omp target teams distribute parallel for map(from:A) \
-        num_teams(10) dist_schedule(static, 5) schedule(static, 2)
-  for (int i = 0; i < N; ++i) {
-     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
-  }
-  printf("omp target teams distribute parallel for block chunk size %d ", 5);
-  printf("thread chunk size %d\n", 2);
-  print(A, N);
-
-#pragma omp target teams distribute parallel for map(from:A) num_teams(10) \
-        dist_schedule(static, 49) schedule(static, 2)
-  for (int i = 0; i < N; ++i) {
-     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
-  }
-  printf("omp target teams distribute parallel for block chunk size %d ", 49);
-  printf("thread chunk size %d\n", 2);
-  print(A, N);
-
-#pragma omp target teams distribute parallel for map(from:A) \
-        num_teams(10) num_threads(10) dist_schedule(static, 29)
-  for (int i = 0; i < N; ++i) {
-     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
-  }
-  printf("omp target teams distribute parallel for block chunk size %d ", 29);
-  printf("thread chunk size default\n");
-  print(A, N);
-
-#pragma omp target teams distribute parallel for map(from:A) \
-        num_teams(10) num_threads(10) dist_schedule(static, 101)
-  for (int i = 0; i < N; ++i) {
-     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
-  }
-  printf("omp target teams distribute parallel for block chunk size %d ", 101);
-  printf("thread chunk size default\n");
-  print(A, N);
-
-#pragma omp target teams distribute parallel for map(from:A) \
-        num_teams(9) num_threads(10) schedule(static, 101)
-  for (int i = 0; i < N; ++i) {
-     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
-  }
-  printf("omp target teams distribute parallel for default block chunk size ");
-  printf("thread chunk size %d\n", 101);
-  print(A, N);
-  return 0;
-}
-//CHECK:      omp target parallel for thread chunk size 2
-
-//CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
-//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
-//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
-//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
-//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
-//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
-//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
-//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
-//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
-//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
-
-//CHECK:      omp target teams distribute block chunk size 2
-
-//CHECK-NEXT: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
-//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
-//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
-//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
-//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
-//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
-//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
-//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
-//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
-//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
-
-//CHECK:      omp target teams distribute parallel for
-//CHECK-SAME: block chunk size 2 thread chunk size default
-
-//CHECK-NEXT: B0T0 B0T1 B1T0 B1T1 B2T0 B2T1 B3T0 B3T1 B4T0 B4T1
-//CHECK-SAME: B5T0 B5T1 B6T0 B6T1 B7T0 B7T1 B8T0 B8T1 B9T0 B9T1
-//CHECK-SAME: B0T0 B0T1 B1T0 B1T1 B2T0 B2T1 B3T0 B3T1 B4T0 B4T1
-//CHECK-SAME: B5T0 B5T1 B6T0 B6T1 B7T0 B7T1 B8T0 B8T1 B9T0 B9T1
-//CHECK-SAME: B0T0 B0T1 B1T0 B1T1 B2T0 B2T1 B3T0 B3T1 B4T0 B4T1
-//CHECK-SAME: B5T0 B5T1 B6T0 B6T1 B7T0 B7T1 B8T0 B8T1 B9T0 B9T1
-
-//CHECK:      omp target teams distribute parallel for
-//CHECK-SAME  block chunk size 2 thread chunk size 3
-
-//CHECK-NEXT: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
-//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
-//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
-//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
-//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
-//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
-//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
-//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
-//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
-//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
-
-//CHECK:      omp target teams distribute parallel for
-//CHECK-SAME: block chunk size 3 thread chunk size 2
-
-//CHECK-NEXT: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1
-//CHECK-SAME: B3T0 B3T0 B3T1 B4T0 B4T0 B4T1
-//CHECK-SAME: B5T0 B5T0 B5T1 B6T0 B6T0 B6T1 B7T0 B7T0 B7T1
-//CHECK-SAME: B8T0 B8T0 B8T1 B9T0 B9T0 B9T1
-//CHECK-SAME: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1
-//CHECK-SAME: B3T0 B3T0 B3T1 B4T0 B4T0 B4T1
-//CHECK-SAME: B5T0 B5T0 B5T1 B6T0 B6T0 B6T1 B7T0 B7T0 B7T1
-//CHECK-SAME: B8T0 B8T0 B8T1 B9T0 B9T0 B9T1
-//CHECK-SAME: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1
-//CHECK-SAME: B3T0 B3T0 B3T1 B4T0 B4T0 B4T1
-//CHECK-SAME: B5T0 B5T0 B5T1 B6T0 B6T0 B6T1 B7T0 B7T0 B7T1
-//CHECK-SAME: B8T0 B8T0 B8T1 B9T0 B9T0 B9T1
-//CHECK-SAME: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1 B3T0
-
-//CHECK:      omp target teams distribute parallel for
-//CHECK-SAME: block chunk size 5 thread chunk size 2
-
-//CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B1T0 B1T0 B1T1 B1T1 B1T2
-//CHECK-SAME: B2T0 B2T0 B2T1 B2T1 B2T2 B3T0 B3T0 B3T1 B3T1 B3T2
-//CHECK-SAME: B4T0 B4T0 B4T1 B4T1 B4T2 B5T0 B5T0 B5T1 B5T1 B5T2
-//CHECK-SAME: B6T0 B6T0 B6T1 B6T1 B6T2 B7T0 B7T0 B7T1 B7T1 B7T2
-//CHECK-SAME: B8T0 B8T0 B8T1 B8T1 B8T2 B9T0 B9T0 B9T1 B9T1 B9T2
-//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B1T0 B1T0 B1T1 B1T1 B1T2
-//CHECK-SAME: B2T0 B2T0 B2T1 B2T1 B2T2 B3T0 B3T0 B3T1 B3T1 B3T2
-//CHECK-SAME: B4T0 B4T0 B4T1 B4T1 B4T2 B5T0 B5T0 B5T1 B5T1 B5T2
-//CHECK-SAME: B6T0 B6T0 B6T1 B6T1 B6T2 B7T0 B7T0 B7T1 B7T1 B7T2
-//CHECK-SAME: B8T0 B8T0 B8T1 B8T1 B8T2 B9T0 B9T0 B9T1 B9T1 B9T2
-
-//CHECK:      omp target teams distribute parallel for
-//CHECK-SAME: block chunk size 49 thread chunk size 2
-
-//CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4 B0T5 B0T5
-//CHECK-SAME: B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9 B0T10 B0T10 B0T11 B0T11
-//CHECK-SAME: B0T12 B0T12 B0T13 B0T13 B0T14 B0T14 B0T15 B0T15 B0T16 B0T16
-//CHECK-SAME: B0T17 B0T17 B0T18 B0T18 B0T19 B0T19 B0T20 B0T20 B0T21 B0T21
-//CHECK-SAME: B0T22 B0T22 B0T23 B0T23 B0T24
-//CHECK-SAME: B1T0 B1T0 B1T1 B1T1 B1T2 B1T2 B1T3 B1T3 B1T4 B1T4 B1T5 B1T5
-//CHECK-SAME: B1T6 B1T6 B1T7 B1T7 B1T8 B1T8 B1T9 B1T9 B1T10 B1T10 B1T11 B1T11
-//CHECK-SAME: B1T12 B1T12 B1T13 B1T13 B1T14 B1T14 B1T15 B1T15 B1T16 B1T16
-//CHECK-SAME: B1T17 B1T17 B1T18 B1T18 B1T19 B1T19 B1T20 B1T20 B1T21 B1T21
-//CHECK-SAME: B1T22 B1T22 B1T23 B1T23 B1T24
-//CHECK-SAME: B2T0 B2T0
-
-//CHECK:      omp target teams distribute parallel for
-//CHECK-SAME: block chunk size 29 thread chunk size default
-
-//CHECK-NEXT: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8
-//CHECK-SAME: B1T0 B1T1 B1T2 B1T3 B1T4 B1T5 B1T6 B1T7 B1T8 B1T9
-//CHECK-SAME: B1T0 B1T1 B1T2 B1T3 B1T4 B1T5 B1T6 B1T7 B1T8 B1T9
-//CHECK-SAME: B1T0 B1T1 B1T2 B1T3 B1T4 B1T5 B1T6 B1T7 B1T8
-//CHECK-SAME: B2T0 B2T1 B2T2 B2T3 B2T4 B2T5 B2T6 B2T7 B2T8 B2T9
-//CHECK-SAME: B2T0 B2T1 B2T2 B2T3 B2T4 B2T5 B2T6 B2T7 B2T8 B2T9
-//CHECK-SAME: B2T0 B2T1 B2T2 B2T3 B2T4 B2T5 B2T6 B2T7 B2T8
-//CHECK-SAME: B3T0 B3T1 B3T2 B3T3 B3T4 B3T5 B3T6 B3T7 B3T8 B3T9
-//CHECK-SAME: B3T0 B3T1 B3T2
-
-//CHECK:      omp target teams distribute parallel for
-//CHECK-SAME: block chunk size 101 thread chunk size default
-
-//CHECK-NEXT: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
-
-//CHECK:      omp target teams distribute parallel for
-//CHECK-SAME: default block chunk size thread chunk size 101
-
-//CHECK-NEXT: B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0
-//CHECK-SAME: B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0
-//CHECK-SAME: B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0
-//CHECK-SAME: B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0
-//CHECK-SAME: B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0
-//CHECK-SAME: B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0
-//CHECK-SAME: B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0
-//CHECK-SAME: B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0
-//CHECK-SAME: B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0
-//CHECK-SAME: B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0

diff --git a/libomptarget/test/offloading/wtime.c b/libomptarget/test/offloading/wtime.c
deleted file mode 100644
index 8a6e206..0000000
--- a/libomptarget/test/offloading/wtime.c
+++ /dev/null

@@ -1,27 +0,0 @@
-// RUN: %libomptarget-compileopt-and-run-generic
-
-// UNSUPPORTED: amdgcn-amd-amdhsa
-
-#include <assert.h>
-#include <omp.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define N (1024 * 1024 * 256)
-
-int main(int argc, char *argv[]) {
-  int *data = (int *)malloc(N * sizeof(int));
-  double duration = 0.0;
-
-#pragma omp target map(from : data[0 : N]) map(from : duration)
-  {
-    double start = omp_get_wtime();
-    for (int i = 0; i < N; ++i)
-      data[i] = i;
-    double end = omp_get_wtime();
-    duration = end - start;
-  }
-  assert(duration > 0.0);
-  free(data);
-  return 0;
-}

diff --git a/libomptarget/test/ompt/callbacks.h b/libomptarget/test/ompt/callbacks.h
deleted file mode 100644
index 95437d9..0000000
--- a/libomptarget/test/ompt/callbacks.h
+++ /dev/null

@@ -1,129 +0,0 @@
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-// Tool related code below
-#include <omp-tools.h>
-
-// For EMI callbacks
-ompt_id_t next_op_id = 0x8000000000000001;
-
-// OMPT callbacks
-
-// Synchronous callbacks
-static void on_ompt_callback_device_initialize(int device_num, const char *type,
-                                               ompt_device_t *device,
-                                               ompt_function_lookup_t lookup,
-                                               const char *documentation) {
-  printf("Callback Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n",
-         device_num, type, device, lookup, documentation);
-}
-
-static void on_ompt_callback_device_finalize(int device_num) {
-  printf("Callback Fini: device_num=%d\n", device_num);
-}
-
-static void on_ompt_callback_device_load(int device_num, const char *filename,
-                                         int64_t offset_in_file,
-                                         void *vma_in_file, size_t bytes,
-                                         void *host_addr, void *device_addr,
-                                         uint64_t module_id) {
-  printf("Callback Load: device_num:%d module_id:%lu filename:%s host_adddr:%p "
-         "device_addr:%p bytes:%lu\n",
-         device_num, module_id, filename, host_addr, device_addr, bytes);
-}
-
-static void on_ompt_callback_target_data_op(
-    ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype,
-    void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
-    size_t bytes, const void *codeptr_ra) {
-  assert(codeptr_ra != 0 && "Unexpected null codeptr");
-  printf("  Callback DataOp: target_id=%lu host_op_id=%lu optype=%d src=%p "
-         "src_device_num=%d "
-         "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
-         target_id, host_op_id, optype, src_addr, src_device_num, dest_addr,
-         dest_device_num, bytes, codeptr_ra);
-}
-
-static void on_ompt_callback_target(ompt_target_t kind,
-                                    ompt_scope_endpoint_t endpoint,
-                                    int device_num, ompt_data_t *task_data,
-                                    ompt_id_t target_id,
-                                    const void *codeptr_ra) {
-  assert(codeptr_ra != 0 && "Unexpected null codeptr");
-  printf("Callback Target: target_id=%lu kind=%d endpoint=%d device_num=%d "
-         "code=%p\n",
-         target_id, kind, endpoint, device_num, codeptr_ra);
-}
-
-static void on_ompt_callback_target_submit(ompt_id_t target_id,
-                                           ompt_id_t host_op_id,
-                                           unsigned int requested_num_teams) {
-  printf("  Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
-         target_id, host_op_id, requested_num_teams);
-}
-
-static void on_ompt_callback_target_map(ompt_id_t target_id,
-                                        unsigned int nitems, void **host_addr,
-                                        void **device_addr, size_t *bytes,
-                                        unsigned int *mapping_flags,
-                                        const void *codeptr_ra) {
-  printf("Target map callback is unimplemented\n");
-  abort();
-}
-
-static void on_ompt_callback_target_data_op_emi(
-    ompt_scope_endpoint_t endpoint, ompt_data_t *target_task_data,
-    ompt_data_t *target_data, ompt_id_t *host_op_id,
-    ompt_target_data_op_t optype, void *src_addr, int src_device_num,
-    void *dest_addr, int dest_device_num, size_t bytes,
-    const void *codeptr_ra) {
-  assert(codeptr_ra != 0 && "Unexpected null codeptr");
-  if (endpoint == ompt_scope_begin)
-    *host_op_id = next_op_id++;
-  // target_task_data may be null, avoid dereferencing it
-  uint64_t target_task_data_value =
-      (target_task_data) ? target_task_data->value : 0;
-  printf("  Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p "
-         "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p "
-         "src_device_num=%d "
-         "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
-         endpoint, optype, target_task_data, target_task_data_value,
-         target_data, target_data->value, host_op_id, *host_op_id, src_addr,
-         src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra);
-}
-
-static void on_ompt_callback_target_emi(ompt_target_t kind,
-                                        ompt_scope_endpoint_t endpoint,
-                                        int device_num, ompt_data_t *task_data,
-                                        ompt_data_t *target_task_data,
-                                        ompt_data_t *target_data,
-                                        const void *codeptr_ra) {
-  assert(codeptr_ra != 0 && "Unexpected null codeptr");
-  if (endpoint == ompt_scope_begin)
-    target_data->value = next_op_id++;
-  printf("Callback Target EMI: kind=%d endpoint=%d device_num=%d task_data=%p "
-         "(0x%lx) target_task_data=%p (0x%lx) target_data=%p (0x%lx) code=%p\n",
-         kind, endpoint, device_num, task_data, task_data->value,
-         target_task_data, target_task_data->value, target_data,
-         target_data->value, codeptr_ra);
-}
-
-static void on_ompt_callback_target_submit_emi(
-    ompt_scope_endpoint_t endpoint, ompt_data_t *target_data,
-    ompt_id_t *host_op_id, unsigned int requested_num_teams) {
-  printf("  Callback Submit EMI: endpoint=%d  req_num_teams=%d target_data=%p "
-         "(0x%lx) host_op_id=%p (0x%lx)\n",
-         endpoint, requested_num_teams, target_data, target_data->value,
-         host_op_id, *host_op_id);
-}
-
-static void on_ompt_callback_target_map_emi(ompt_data_t *target_data,
-                                            unsigned int nitems,
-                                            void **host_addr,
-                                            void **device_addr, size_t *bytes,
-                                            unsigned int *mapping_flags,
-                                            const void *codeptr_ra) {
-  printf("Target map emi callback is unimplemented\n");
-  abort();
-}

diff --git a/libomptarget/test/ompt/register_both.h b/libomptarget/test/ompt/register_both.h
deleted file mode 100644
index afdf094..0000000
--- a/libomptarget/test/ompt/register_both.h
+++ /dev/null

@@ -1,49 +0,0 @@
-#include <omp-tools.h>
-
-// From openmp/runtime/test/ompt/callback.h
-#define register_ompt_callback_t(name, type)                                   \
-  do {                                                                         \
-    type f_##name = &on_##name;                                                \
-    if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never)  \
-      printf("0: Could not register callback '" #name "'\n");                  \
-  } while (0)
-
-#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
-
-// OMPT entry point handles
-static ompt_set_callback_t ompt_set_callback = 0;
-
-// Init functions
-int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
-                    ompt_data_t *tool_data) {
-  ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
-
-  if (!ompt_set_callback)
-    return 0; // failed
-
-  register_ompt_callback(ompt_callback_device_initialize);
-  register_ompt_callback(ompt_callback_device_finalize);
-  register_ompt_callback(ompt_callback_device_load);
-  register_ompt_callback(ompt_callback_target_data_op_emi);
-  register_ompt_callback(ompt_callback_target_data_op);
-  register_ompt_callback(ompt_callback_target);
-  register_ompt_callback(ompt_callback_target_emi);
-  register_ompt_callback(ompt_callback_target_submit);
-
-  return 1; // success
-}
-
-void ompt_finalize(ompt_data_t *tool_data) {}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
-                                          const char *runtime_version) {
-  static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
-                                                            &ompt_finalize, 0};
-  return &ompt_start_tool_result;
-}
-#ifdef __cplusplus
-}
-#endif

diff --git a/libomptarget/test/ompt/register_emi.h b/libomptarget/test/ompt/register_emi.h
deleted file mode 100644
index 2c70c16..0000000
--- a/libomptarget/test/ompt/register_emi.h
+++ /dev/null

@@ -1,47 +0,0 @@
-#include <omp-tools.h>
-
-// From openmp/runtime/test/ompt/callback.h
-#define register_ompt_callback_t(name, type)                                   \
-  do {                                                                         \
-    type f_##name = &on_##name;                                                \
-    if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never)  \
-      printf("0: Could not register callback '" #name "'\n");                  \
-  } while (0)
-
-#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
-
-// OMPT entry point handles
-static ompt_set_callback_t ompt_set_callback = 0;
-
-// Init functions
-int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
-                    ompt_data_t *tool_data) {
-  ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
-
-  if (!ompt_set_callback)
-    return 0; // failed
-
-  register_ompt_callback(ompt_callback_device_initialize);
-  register_ompt_callback(ompt_callback_device_finalize);
-  register_ompt_callback(ompt_callback_device_load);
-  register_ompt_callback(ompt_callback_target_data_op_emi);
-  register_ompt_callback(ompt_callback_target_emi);
-  register_ompt_callback(ompt_callback_target_submit_emi);
-
-  return 1; // success
-}
-
-void ompt_finalize(ompt_data_t *tool_data) {}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
-                                          const char *runtime_version) {
-  static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
-                                                            &ompt_finalize, 0};
-  return &ompt_start_tool_result;
-}
-#ifdef __cplusplus
-}
-#endif

diff --git a/libomptarget/test/ompt/register_emi_map.h b/libomptarget/test/ompt/register_emi_map.h
deleted file mode 100644
index ccf6591..0000000
--- a/libomptarget/test/ompt/register_emi_map.h
+++ /dev/null

@@ -1,48 +0,0 @@
-#include <omp-tools.h>
-
-// From openmp/runtime/test/ompt/callback.h
-#define register_ompt_callback_t(name, type)                                   \
-  do {                                                                         \
-    type f_##name = &on_##name;                                                \
-    if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never)  \
-      printf("0: Could not register callback '" #name "'\n");                  \
-  } while (0)
-
-#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
-
-// OMPT entry point handles
-static ompt_set_callback_t ompt_set_callback = 0;
-
-// Init functions
-int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
-                    ompt_data_t *tool_data) {
-  ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
-
-  if (!ompt_set_callback)
-    return 0; // failed
-
-  register_ompt_callback(ompt_callback_device_initialize);
-  register_ompt_callback(ompt_callback_device_finalize);
-  register_ompt_callback(ompt_callback_device_load);
-  register_ompt_callback(ompt_callback_target_data_op_emi);
-  register_ompt_callback(ompt_callback_target_emi);
-  register_ompt_callback(ompt_callback_target_submit_emi);
-  register_ompt_callback(ompt_callback_target_map_emi);
-
-  return 1; // success
-}
-
-void ompt_finalize(ompt_data_t *tool_data) {}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
-                                          const char *runtime_version) {
-  static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
-                                                            &ompt_finalize, 0};
-  return &ompt_start_tool_result;
-}
-#ifdef __cplusplus
-}
-#endif

diff --git a/libomptarget/test/ompt/register_no_device_init.h b/libomptarget/test/ompt/register_no_device_init.h
deleted file mode 100644
index 874e3d3..0000000
--- a/libomptarget/test/ompt/register_no_device_init.h
+++ /dev/null

@@ -1,47 +0,0 @@
-#include <omp-tools.h>
-
-// From openmp/runtime/test/ompt/callback.h
-#define register_ompt_callback_t(name, type)                                   \
-  do {                                                                         \
-    type f_##name = &on_##name;                                                \
-    if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never)  \
-      printf("0: Could not register callback '" #name "'\n");                  \
-  } while (0)
-
-#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
-
-// OMPT entry point handles
-static ompt_set_callback_t ompt_set_callback = 0;
-
-// Init functions
-int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
-                    ompt_data_t *tool_data) {
-  ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
-
-  if (!ompt_set_callback)
-    return 0; // failed
-
-  // If no device init callback is registered, the other callbacks won't be
-  // activated.
-  register_ompt_callback(ompt_callback_device_load);
-  register_ompt_callback(ompt_callback_target_data_op);
-  register_ompt_callback(ompt_callback_target);
-  register_ompt_callback(ompt_callback_target_submit);
-
-  return 1; // success
-}
-
-void ompt_finalize(ompt_data_t *tool_data) {}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
-                                          const char *runtime_version) {
-  static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
-                                                            &ompt_finalize, 0};
-  return &ompt_start_tool_result;
-}
-#ifdef __cplusplus
-}
-#endif

diff --git a/libomptarget/test/ompt/register_non_emi.h b/libomptarget/test/ompt/register_non_emi.h
deleted file mode 100644
index 66c8aaa..0000000
--- a/libomptarget/test/ompt/register_non_emi.h
+++ /dev/null

@@ -1,47 +0,0 @@
-#include <omp-tools.h>
-
-// From openmp/runtime/test/ompt/callback.h
-#define register_ompt_callback_t(name, type)                                   \
-  do {                                                                         \
-    type f_##name = &on_##name;                                                \
-    if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never)  \
-      printf("0: Could not register callback '" #name "'\n");                  \
-  } while (0)
-
-#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
-
-// OMPT entry point handles
-static ompt_set_callback_t ompt_set_callback = 0;
-
-// Init functions
-int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
-                    ompt_data_t *tool_data) {
-  ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
-
-  if (!ompt_set_callback)
-    return 0; // failed
-
-  register_ompt_callback(ompt_callback_device_initialize);
-  register_ompt_callback(ompt_callback_device_finalize);
-  register_ompt_callback(ompt_callback_device_load);
-  register_ompt_callback(ompt_callback_target_data_op);
-  register_ompt_callback(ompt_callback_target);
-  register_ompt_callback(ompt_callback_target_submit);
-
-  return 1; // success
-}
-
-void ompt_finalize(ompt_data_t *tool_data) {}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
-                                          const char *runtime_version) {
-  static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
-                                                            &ompt_finalize, 0};
-  return &ompt_start_tool_result;
-}
-#ifdef __cplusplus
-}
-#endif

diff --git a/libomptarget/test/ompt/register_non_emi_map.h b/libomptarget/test/ompt/register_non_emi_map.h
deleted file mode 100644
index b5c6634..0000000
--- a/libomptarget/test/ompt/register_non_emi_map.h
+++ /dev/null

@@ -1,48 +0,0 @@
-#include <omp-tools.h>
-
-// From openmp/runtime/test/ompt/callback.h
-#define register_ompt_callback_t(name, type)                                   \
-  do {                                                                         \
-    type f_##name = &on_##name;                                                \
-    if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never)  \
-      printf("0: Could not register callback '" #name "'\n");                  \
-  } while (0)
-
-#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
-
-// OMPT entry point handles
-static ompt_set_callback_t ompt_set_callback = 0;
-
-// Init functions
-int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
-                    ompt_data_t *tool_data) {
-  ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
-
-  if (!ompt_set_callback)
-    return 0; // failed
-
-  register_ompt_callback(ompt_callback_device_initialize);
-  register_ompt_callback(ompt_callback_device_finalize);
-  register_ompt_callback(ompt_callback_device_load);
-  register_ompt_callback(ompt_callback_target_data_op);
-  register_ompt_callback(ompt_callback_target);
-  register_ompt_callback(ompt_callback_target_submit);
-  register_ompt_callback(ompt_callback_target_map);
-
-  return 1; // success
-}
-
-void ompt_finalize(ompt_data_t *tool_data) {}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
-                                          const char *runtime_version) {
-  static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
-                                                            &ompt_finalize, 0};
-  return &ompt_start_tool_result;
-}
-#ifdef __cplusplus
-}
-#endif

diff --git a/libomptarget/test/ompt/register_wrong_return.h b/libomptarget/test/ompt/register_wrong_return.h
deleted file mode 100644
index 7de3fee..0000000
--- a/libomptarget/test/ompt/register_wrong_return.h
+++ /dev/null

@@ -1,47 +0,0 @@
-#include <omp-tools.h>
-
-// From openmp/runtime/test/ompt/callback.h
-#define register_ompt_callback_t(name, type)                                   \
-  do {                                                                         \
-    type f_##name = &on_##name;                                                \
-    if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never)  \
-      printf("0: Could not register callback '" #name "'\n");                  \
-  } while (0)
-
-#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
-
-// OMPT entry point handles
-static ompt_set_callback_t ompt_set_callback = 0;
-
-// Init functions
-int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
-                    ompt_data_t *tool_data) {
-  ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
-
-  if (!ompt_set_callback)
-    return 1; // failed but wrongly returning 1
-
-  register_ompt_callback(ompt_callback_device_initialize);
-  register_ompt_callback(ompt_callback_device_finalize);
-  register_ompt_callback(ompt_callback_device_load);
-  register_ompt_callback(ompt_callback_target_data_op);
-  register_ompt_callback(ompt_callback_target);
-  register_ompt_callback(ompt_callback_target_submit);
-
-  return 0; // success but should return 1 according to the spec
-}
-
-void ompt_finalize(ompt_data_t *tool_data) {}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
-                                          const char *runtime_version) {
-  static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
-                                                            &ompt_finalize, 0};
-  return &ompt_start_tool_result;
-}
-#ifdef __cplusplus
-}
-#endif

diff --git a/libomptarget/test/ompt/target_memcpy.c b/libomptarget/test/ompt/target_memcpy.c
deleted file mode 100644
index 3224c5a..0000000
--- a/libomptarget/test/ompt/target_memcpy.c
+++ /dev/null

@@ -1,76 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: ompt
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-/*
- * Verify that for the target OpenMP APIs, the return address is non-null and
- * distinct.
- */
-
-#include <omp.h>
-#include <stdlib.h>
-
-#include "callbacks.h"
-#include "register_non_emi.h"
-
-int main() {
-  int dev = omp_get_default_device();
-  int host = omp_get_initial_device();
-
-  int host_var1 = 42;
-  int host_var2 = 0;
-  void *dev_ptr = NULL;
-
-  // Allocate space on the device
-  dev_ptr = omp_target_alloc(sizeof(int), dev);
-  if (dev_ptr == NULL)
-    abort();
-
-  // H2D transfer
-  if (omp_target_memcpy(dev_ptr, &host_var1, sizeof(int), 0, 0, dev, host))
-    abort();
-
-  // D2D transfer
-  if (omp_target_memcpy(dev_ptr, dev_ptr, sizeof(int), 0, 0, dev, dev))
-    abort();
-
-  // D2H transfer
-  if (omp_target_memcpy(&host_var2, dev_ptr, sizeof(int), 0, 0, host, dev))
-    abort();
-
-  // Free the device location
-  omp_target_free(dev_ptr, dev);
-
-  // Both host variables should have the same value.
-  return host_var1 != host_var2;
-}
-
-// clang-format off
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-SAME: src_device_num=[[HOST:[0-9]+]]
-/// CHECK-SAME: dest_device_num=[[DEVICE:[0-9]+]]
-/// CHECK-NOT: code=(nil)
-/// CHECK: code=[[CODE1:0x[0-f]+]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK-SAME: src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
-/// CHECK-NOT: code=(nil)
-/// CHECK-NOT: code=[[CODE1]]
-/// CHECK: code=[[CODE2:0x[0-f]+]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-SAME: src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]]
-/// CHECK-NOT: code=(nil)
-/// CHECK-NOT: code=[[CODE2]]
-/// CHECK: code=[[CODE3:0x[0-f]+]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-SAME: src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]]
-/// CHECK-NOT: code=(nil)
-/// CHECK-NOT: code=[[CODE3]]
-/// CHECK: code=[[CODE4:0x[0-f]+]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: code=(nil)
-/// CHECK-NOT: code=[[CODE4]]

diff --git a/libomptarget/test/ompt/target_memcpy_emi.c b/libomptarget/test/ompt/target_memcpy_emi.c
deleted file mode 100644
index fd7da13..0000000
--- a/libomptarget/test/ompt/target_memcpy_emi.c
+++ /dev/null

@@ -1,87 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: ompt
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-/*
- * Verify all three data transfer directions: H2D, D2D and D2H
- */
-
-#include <omp.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "callbacks.h"
-#include "register_emi.h"
-
-int main(void) {
-  int NumDevices = omp_get_num_devices();
-  assert(NumDevices > 0 && "No device(s) present.");
-  int Device = omp_get_default_device();
-  int Host = omp_get_initial_device();
-  // Note: Zero value depicts an OFFLOAD_SUCCESS
-  int Status;
-
-  printf("Allocating Memory on Device\n");
-  int *DevPtr = (int *)omp_target_alloc(sizeof(int), Device);
-  assert(DevPtr && "Could not allocate memory on device.");
-  int *HstPtr = (int *)malloc(sizeof(int));
-  *HstPtr = 42;
-
-  printf("Testing: Host to Device\n");
-  Status = omp_target_memcpy(DevPtr, HstPtr, sizeof(int), 0, 0, Device, Host);
-  assert(Status == 0 && "H2D memory copy operation failed.\n");
-
-  printf("Testing: Device to Device\n");
-  Status = omp_target_memcpy(DevPtr, DevPtr, sizeof(int), 0, 0, Device, Device);
-  assert(Status == 0 && "D2D memory copy operation failed.\n");
-
-  printf("Testing: Device to Host\n");
-  Status = omp_target_memcpy(HstPtr, DevPtr, sizeof(int), 0, 0, Host, Device);
-  assert(Status == 0 && "D2H memory copy operation failed.\n");
-
-  printf("Checking Correctness\n");
-  assert(*HstPtr == 42);
-
-  printf("Freeing Memory on Device\n");
-  free(HstPtr);
-  omp_target_free(DevPtr, Device);
-
-  return 0;
-}
-
-// clang-format off
-
-/// CHECK: Callback Init:
-
-/// CHECK: Allocating Memory on Device
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK-SAME: src_device_num=[[HOST:[0-9]+]]
-/// CHECK-SAME: dest_device_num=[[DEVICE:[0-9]+]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
-
-/// CHECK: Testing: Host to Device
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
-
-/// CHECK: Testing: Device to Device
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]]
-
-/// CHECK: Testing: Device to Host
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]]
-
-/// CHECK: Checking Correctness
-
-/// CHECK: Freeing Memory on Device
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 {{.+}} src_device_num=[[DEVICE]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 {{.+}} src_device_num=[[DEVICE]]
-
-/// CHECK: Callback Fini:
-
-// clang-format on

diff --git a/libomptarget/test/ompt/veccopy.c b/libomptarget/test/ompt/veccopy.c
deleted file mode 100644
index bc70f5a..0000000
--- a/libomptarget/test/ompt/veccopy.c
+++ /dev/null

@@ -1,106 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: ompt
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-/*
- * Example OpenMP program that registers non-EMI callbacks
- */
-
-#include <omp.h>
-#include <stdio.h>
-
-#include "callbacks.h"
-#include "register_non_emi.h"
-
-int main() {
-  int N = 100000;
-
-  int a[N];
-  int b[N];
-
-  int i;
-
-  for (i = 0; i < N; i++)
-    a[i] = 0;
-
-  for (i = 0; i < N; i++)
-    b[i] = i;
-
-#pragma omp target parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-#pragma omp target teams distribute parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-  int rc = 0;
-  for (i = 0; i < N; i++)
-    if (a[i] != b[i]) {
-      rc++;
-      printf("Wrong value: a[%d]=%d\n", i, a[i]);
-    }
-
-  if (!rc)
-    printf("Success\n");
-
-  return rc;
-}
-
-// clang-format off
-/// CHECK: Callback Init:
-/// CHECK: Callback Load:
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1 device_num=[[DEVICE_NUM:[0-9]+]]
-/// CHECK-NOT: code=(nil)
-/// CHECK: code=[[CODE1:.*]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 device_num=[[DEVICE_NUM]] code=[[CODE1]]
-
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// device_num=[[DEVICE_NUM]]
-/// CHECK-NOT: code=(nil)
-/// CHECK: code=[[CODE2:.*]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 device_num=[[DEVICE_NUM]] code=[[CODE2]]
-/// CHECK: Callback Fini:

diff --git a/libomptarget/test/ompt/veccopy_data.c b/libomptarget/test/ompt/veccopy_data.c
deleted file mode 100644
index 264b484..0000000
--- a/libomptarget/test/ompt/veccopy_data.c
+++ /dev/null

@@ -1,162 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: ompt
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-/*
- * Example OpenMP program that registers EMI callbacks.
- * Explicitly testing for an initialized device num and
- * #pragma omp target [data enter / data exit / update]
- * The latter with the addition of a nowait clause.
- */
-
-#include <omp.h>
-#include <stdio.h>
-
-#include "callbacks.h"
-#include "register_emi.h"
-
-#define N 100000
-
-#pragma omp declare target
-int c[N];
-#pragma omp end declare target
-
-int main() {
-  int a[N];
-  int b[N];
-
-  int i;
-
-  for (i = 0; i < N; i++)
-    a[i] = 0;
-
-  for (i = 0; i < N; i++)
-    b[i] = i;
-
-  for (i = 0; i < N; i++)
-    c[i] = 0;
-
-#pragma omp target enter data map(to : a)
-#pragma omp target parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-#pragma omp target exit data map(from : a)
-
-#pragma omp target parallel for map(alloc : c)
-  {
-    for (int j = 0; j < N; j++)
-      c[j] = 2 * j + 1;
-  }
-#pragma omp target update from(c) nowait
-#pragma omp barrier
-
-  int rc = 0;
-  for (i = 0; i < N; i++) {
-    if (a[i] != i) {
-      rc++;
-      printf("Wrong value: a[%d]=%d\n", i, a[i]);
-    }
-  }
-
-  for (i = 0; i < N; i++) {
-    if (c[i] != 2 * i + 1) {
-      rc++;
-      printf("Wrong value: c[%d]=%d\n", i, c[i]);
-    }
-  }
-
-  if (!rc)
-    printf("Success\n");
-
-  return rc;
-}
-
-/// CHECK-NOT: Callback Target EMI:
-/// CHECK-NOT: device_num=-1
-/// CHECK: Callback Init:
-/// CHECK: Callback Load:
-/// CHECK: Callback Target EMI: kind=2 endpoint=1
-/// CHECK-NOT: device_num=-1
-/// CHECK-NOT: code=(nil)
-/// CHECK: code=[[CODE1:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback Target EMI: kind=2 endpoint=2
-/// CHECK-NOT: device_num=-1
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK-NOT: device_num=-1
-/// CHECK-NOT: code=(nil)
-/// CHECK: code=[[CODE2:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback Submit EMI: endpoint=1  req_num_teams=1
-/// CHECK: Callback Submit EMI: endpoint=2  req_num_teams=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
-/// CHECK-NOT: device_num=-1
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback Target EMI: kind=3 endpoint=1
-/// CHECK-NOT: device_num=-1
-/// CHECK-NOT: code=(nil)
-/// CHECK: code=[[CODE3:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: code=[[CODE3]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: code=[[CODE3]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: code=[[CODE3]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: code=[[CODE3]]
-/// CHECK: Callback Target EMI: kind=3 endpoint=2
-/// CHECK-NOT: device_num=-1
-/// CHECK: code=[[CODE3]]
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK-NOT: device_num=-1
-/// CHECK-NOT: code=(nil)
-/// CHECK: code=[[CODE4:.*]]
-/// CHECK: Callback Submit EMI: endpoint=1  req_num_teams=1
-/// CHECK: Callback Submit EMI: endpoint=2  req_num_teams=1
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
-/// CHECK-NOT: device_num=-1
-/// CHECK: code=[[CODE4]]
-/// CHECK: Callback Target EMI: kind=4 endpoint=1
-/// CHECK-NOT: device_num=-1
-/// CHECK-NOT: code=(nil)
-/// CHECK: code=[[CODE5:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: code=[[CODE5]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: code=[[CODE5]]
-/// CHECK: Callback Target EMI: kind=4 endpoint=2
-/// CHECK-NOT: device_num=-1
-/// CHECK: code=[[CODE5]]
-/// CHECK: Callback Fini:

diff --git a/libomptarget/test/ompt/veccopy_disallow_both.c b/libomptarget/test/ompt/veccopy_disallow_both.c
deleted file mode 100644
index a011c21..0000000
--- a/libomptarget/test/ompt/veccopy_disallow_both.c
+++ /dev/null

@@ -1,106 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: ompt
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-/*
- * Example OpenMP program that shows that both EMI and non-EMI
- * callbacks cannot be registered for the same type. In the
- * current implementation, the EMI callback overrides the non-EMI
- * callback.
- */
-
-#include <omp.h>
-#include <stdio.h>
-
-#include "callbacks.h"
-#include "register_both.h"
-
-int main() {
-  int N = 100000;
-
-  int a[N];
-  int b[N];
-
-  int i;
-
-  for (i = 0; i < N; i++)
-    a[i] = 0;
-
-  for (i = 0; i < N; i++)
-    b[i] = i;
-
-#pragma omp target parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-#pragma omp target teams distribute parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-  int rc = 0;
-  for (i = 0; i < N; i++)
-    if (a[i] != b[i]) {
-      rc++;
-      printf("Wrong value: a[%d]=%d\n", i, a[i]);
-    }
-
-  if (!rc)
-    printf("Success\n");
-
-  return rc;
-}
-
-/// CHECK: Callback Init:
-/// CHECK: Callback Load:
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
-/// CHECK: Callback Fini:

diff --git a/libomptarget/test/ompt/veccopy_emi.c b/libomptarget/test/ompt/veccopy_emi.c
deleted file mode 100644
index 8718a39..0000000
--- a/libomptarget/test/ompt/veccopy_emi.c
+++ /dev/null

@@ -1,145 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: ompt
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-/*
- * Example OpenMP program that registers EMI callbacks
- */
-
-#include <assert.h>
-#include <omp.h>
-#include <stdio.h>
-
-#include "callbacks.h"
-#include "register_emi.h"
-
-int main() {
-  int N = 100000;
-
-  int a[N];
-  int b[N];
-
-  int i;
-
-  for (i = 0; i < N; i++)
-    a[i] = 0;
-
-  for (i = 0; i < N; i++)
-    b[i] = i;
-
-#pragma omp target parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-#pragma omp target teams distribute parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-  int rc = 0;
-  for (i = 0; i < N; i++)
-    if (a[i] != b[i]) {
-      rc++;
-      printf("Wrong value: a[%d]=%d\n", i, a[i]);
-    }
-
-  if (!rc)
-    printf("Success\n");
-
-  return rc;
-}
-
-/// CHECK: Callback Init:
-/// CHECK: Callback Load:
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK-NOT: code=(nil)
-/// CHECK: code=[[CODE1:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1
-/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: code=[[CODE1]]
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
-/// CHECK: code=[[CODE1]]
-
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK-NOT: code=(nil)
-/// CHECK: code=[[CODE2:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=0
-/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=0
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
-/// CHECK: code=[[CODE2]]
-/// CHECK: Callback Fini:

diff --git a/libomptarget/test/ompt/veccopy_emi_map.c b/libomptarget/test/ompt/veccopy_emi_map.c
deleted file mode 100644
index 2accba3..0000000
--- a/libomptarget/test/ompt/veccopy_emi_map.c
+++ /dev/null

@@ -1,107 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: ompt
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-/*
- * Example OpenMP program that shows that map-EMI callbacks are not supported.
- */
-
-#include <assert.h>
-#include <omp.h>
-#include <stdio.h>
-
-#include "callbacks.h"
-#include "register_emi_map.h"
-
-int main() {
-  int N = 100000;
-
-  int a[N];
-  int b[N];
-
-  int i;
-
-  for (i = 0; i < N; i++)
-    a[i] = 0;
-
-  for (i = 0; i < N; i++)
-    b[i] = i;
-
-#pragma omp target parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-#pragma omp target teams distribute parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-  int rc = 0;
-  for (i = 0; i < N; i++)
-    if (a[i] != b[i]) {
-      rc++;
-      printf("Wrong value: a[%d]=%d\n", i, a[i]);
-    }
-
-  if (!rc)
-    printf("Success\n");
-
-  return rc;
-}
-
-/// CHECK: 0: Could not register callback 'ompt_callback_target_map_emi'
-/// CHECK: Callback Init:
-/// CHECK: Callback Load:
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1
-/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
-/// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=0
-/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=0
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
-/// CHECK: Callback Fini:

diff --git a/libomptarget/test/ompt/veccopy_map.c b/libomptarget/test/ompt/veccopy_map.c
deleted file mode 100644
index 56a0dd4..0000000
--- a/libomptarget/test/ompt/veccopy_map.c
+++ /dev/null

@@ -1,86 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: ompt
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
-
-/*
- * Example OpenMP program that shows that map callbacks are not supported.
- */
-
-#include <omp.h>
-#include <stdio.h>
-
-#include "callbacks.h"
-#include "register_non_emi_map.h"
-
-int main() {
-  int N = 100000;
-
-  int a[N];
-  int b[N];
-
-  int i;
-
-  for (i = 0; i < N; i++)
-    a[i] = 0;
-
-  for (i = 0; i < N; i++)
-    b[i] = i;
-
-#pragma omp target parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-#pragma omp target teams distribute parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-  int rc = 0;
-  for (i = 0; i < N; i++)
-    if (a[i] != b[i]) {
-      rc++;
-      printf("Wrong value: a[%d]=%d\n", i, a[i]);
-    }
-
-  if (!rc)
-    printf("Success\n");
-
-  return rc;
-}
-
-
-/// CHECK: 0: Could not register callback 'ompt_callback_target_map'
-/// CHECK: Callback Init:
-/// CHECK: Callback Load:
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
-
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
-/// CHECK: Callback Fini:

diff --git a/libomptarget/test/ompt/veccopy_no_device_init.c b/libomptarget/test/ompt/veccopy_no_device_init.c
deleted file mode 100644
index c1a0319..0000000
--- a/libomptarget/test/ompt/veccopy_no_device_init.c
+++ /dev/null

@@ -1,79 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: ompt
-
-/*
- * Example OpenMP program that shows that if no device init callback
- * is registered, the other callbacks won't be activated.
- */
-
-#include <omp.h>
-#include <stdio.h>
-
-#include "callbacks.h"
-#include "register_no_device_init.h"
-
-int main() {
-  int N = 100000;
-
-  int a[N];
-  int b[N];
-
-  int i;
-
-  for (i = 0; i < N; i++)
-    a[i] = 0;
-
-  for (i = 0; i < N; i++)
-    b[i] = i;
-
-#pragma omp target parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-#pragma omp target teams distribute parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-  int rc = 0;
-  for (i = 0; i < N; i++)
-    if (a[i] != b[i]) {
-      rc++;
-      printf("Wrong value: a[%d]=%d\n", i, a[i]);
-    }
-
-  if (!rc)
-    printf("Success\n");
-
-  return rc;
-}
-
-/// CHECK-NOT: Callback Init:
-/// CHECK-NOT: Callback Load:
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK-NOT: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
-
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK-NOT: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
-/// CHECK-NOT: Callback Fini:

diff --git a/libomptarget/test/ompt/veccopy_wrong_return.c b/libomptarget/test/ompt/veccopy_wrong_return.c
deleted file mode 100644
index 2d07b4e..0000000
--- a/libomptarget/test/ompt/veccopy_wrong_return.c
+++ /dev/null

@@ -1,79 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: ompt
-
-/*
- * Example OpenMP program that shows that if the initialize function
- * returns the wrong status code, the callbacks won't be activated.
- */
-
-#include <omp.h>
-#include <stdio.h>
-
-#include "callbacks.h"
-#include "register_wrong_return.h"
-
-int main() {
-  int N = 100000;
-
-  int a[N];
-  int b[N];
-
-  int i;
-
-  for (i = 0; i < N; i++)
-    a[i] = 0;
-
-  for (i = 0; i < N; i++)
-    b[i] = i;
-
-#pragma omp target parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-#pragma omp target teams distribute parallel for
-  {
-    for (int j = 0; j < N; j++)
-      a[j] = b[j];
-  }
-
-  int rc = 0;
-  for (i = 0; i < N; i++)
-    if (a[i] != b[i]) {
-      rc++;
-      printf("Wrong value: a[%d]=%d\n", i, a[i]);
-    }
-
-  if (!rc)
-    printf("Success\n");
-
-  return rc;
-}
-
-/// CHECK-NOT: Callback Init:
-/// CHECK-NOT: Callback Load:
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK-NOT: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
-
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK-NOT: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
-/// CHECK-NOT: Callback Fini

diff --git a/libomptarget/test/unified_shared_memory/api.c b/libomptarget/test/unified_shared_memory/api.c
deleted file mode 100644
index c7ab055..0000000
--- a/libomptarget/test/unified_shared_memory/api.c
+++ /dev/null

@@ -1,169 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: env HSA_XNACK=1 \
-// RUN: %libomptarget-run-generic | %fcheck-generic
-// XFAIL: nvptx64-nvidia-cuda
-// XFAIL: nvptx64-nvidia-cuda-LTO
-
-// REQUIRES: unified_shared_memory
-
-#include <omp.h>
-#include <stdio.h>
-
-// ---------------------------------------------------------------------------
-// Various definitions copied from OpenMP RTL
-
-extern void __tgt_register_requires(int64_t);
-
-// End of definitions copied from OpenMP RTL.
-// ---------------------------------------------------------------------------
-
-#pragma omp requires unified_shared_memory
-
-#define N 1024
-
-void init(int A[], int B[], int C[]) {
-  for (int i = 0; i < N; ++i) {
-    A[i] = 0;
-    B[i] = 1;
-    C[i] = i;
-  }
-}
-
-int main(int argc, char *argv[]) {
-  const int device = omp_get_default_device();
-
-  // Manual registration of requires flags for Clang versions
-  // that do not support requires.
-  __tgt_register_requires(8);
-
-  // CHECK: Initial device: [[INITIAL_DEVICE:[0-9]+]]
-  printf("Initial device: %d\n", omp_get_initial_device());
-  // CHECK: Num devices: [[INITIAL_DEVICE]]
-  printf("Num devices: %d\n", omp_get_num_devices());
-
-  //
-  // Target alloc & target memcpy
-  //
-  int A[N], B[N], C[N];
-
-  // Init
-  init(A, B, C);
-
-  int *pA, *pB, *pC;
-
-  // map ptrs
-  pA = &A[0];
-  pB = &B[0];
-  pC = &C[0];
-
-  int *d_A = (int *)omp_target_alloc(N * sizeof(int), device);
-  int *d_B = (int *)omp_target_alloc(N * sizeof(int), device);
-  int *d_C = (int *)omp_target_alloc(N * sizeof(int), device);
-
-  // CHECK: omp_target_alloc succeeded
-  printf("omp_target_alloc %s\n", d_A && d_B && d_C ? "succeeded" : "failed");
-
-  omp_target_memcpy(d_B, pB, N * sizeof(int), 0, 0, device,
-                    omp_get_initial_device());
-  omp_target_memcpy(d_C, pC, N * sizeof(int), 0, 0, device,
-                    omp_get_initial_device());
-
-#pragma omp target is_device_ptr(d_A, d_B, d_C) device(device)
-  {
-#pragma omp parallel for schedule(static, 1)
-    for (int i = 0; i < N; i++) {
-      d_A[i] = d_B[i] + d_C[i] + 1;
-    }
-  }
-
-  omp_target_memcpy(pA, d_A, N * sizeof(int), 0, 0, omp_get_initial_device(),
-                    device);
-
-  // CHECK: Test omp_target_memcpy: Succeeded
-  int fail = 0;
-  for (int i = 0; i < N; ++i) {
-    if (A[i] != i + 2)
-      fail++;
-  }
-  if (fail) {
-    printf("Test omp_target_memcpy: Failed\n");
-  } else {
-    printf("Test omp_target_memcpy: Succeeded\n");
-  }
-
-  //
-  // target_is_present and target_associate/disassociate_ptr
-  //
-  init(A, B, C);
-
-  // CHECK: B is not present, associating it...
-  // CHECK: omp_target_associate_ptr B succeeded
-  if (!omp_target_is_present(B, device)) {
-    printf("B is not present, associating it...\n");
-    int rc = omp_target_associate_ptr(B, d_B, N * sizeof(int), 0, device);
-    printf("omp_target_associate_ptr B %s\n", !rc ? "succeeded" : "failed");
-  }
-
-  // CHECK: C is not present, associating it...
-  // CHECK: omp_target_associate_ptr C succeeded
-  if (!omp_target_is_present(C, device)) {
-    printf("C is not present, associating it...\n");
-    int rc = omp_target_associate_ptr(C, d_C, N * sizeof(int), 0, device);
-    printf("omp_target_associate_ptr C %s\n", !rc ? "succeeded" : "failed");
-  }
-
-// CHECK: Inside target data: A is not present
-// CHECK: Inside target data: B is present
-// CHECK: Inside target data: C is present
-#pragma omp target data map(from : B, C) device(device)
-  {
-    printf("Inside target data: A is%s present\n",
-           omp_target_is_present(A, device) ? "" : " not");
-    printf("Inside target data: B is%s present\n",
-           omp_target_is_present(B, device) ? "" : " not");
-    printf("Inside target data: C is%s present\n",
-           omp_target_is_present(C, device) ? "" : " not");
-
-#pragma omp target map(from : A) device(device)
-    {
-#pragma omp parallel for schedule(static, 1)
-      for (int i = 0; i < N; i++)
-        A[i] = B[i] + C[i] + 1;
-    }
-  }
-
-  // CHECK: B is present, disassociating it...
-  // CHECK: omp_target_disassociate_ptr B succeeded
-  // CHECK: C is present, disassociating it...
-  // CHECK: omp_target_disassociate_ptr C succeeded
-  if (omp_target_is_present(B, device)) {
-    printf("B is present, disassociating it...\n");
-    int rc = omp_target_disassociate_ptr(B, device);
-    printf("omp_target_disassociate_ptr B %s\n", !rc ? "succeeded" : "failed");
-  }
-  if (omp_target_is_present(C, device)) {
-    printf("C is present, disassociating it...\n");
-    int rc = omp_target_disassociate_ptr(C, device);
-    printf("omp_target_disassociate_ptr C %s\n", !rc ? "succeeded" : "failed");
-  }
-
-  // CHECK: Test omp_target_associate_ptr: Succeeded
-  fail = 0;
-  for (int i = 0; i < N; ++i) {
-    if (A[i] != i + 2)
-      fail++;
-  }
-  if (fail) {
-    printf("Test omp_target_associate_ptr: Failed\n");
-  } else {
-    printf("Test omp_target_associate_ptr: Succeeded\n");
-  }
-
-  omp_target_free(d_A, device);
-  omp_target_free(d_B, device);
-  omp_target_free(d_C, device);
-
-  printf("Done!\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/unified_shared_memory/associate_ptr.c b/libomptarget/test/unified_shared_memory/associate_ptr.c
deleted file mode 100644
index 5f795dd..0000000
--- a/libomptarget/test/unified_shared_memory/associate_ptr.c
+++ /dev/null

@@ -1,33 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// REQUIRES: unified_shared_memory
-// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9
-
-#include <assert.h>
-#include <omp.h>
-#include <stdio.h>
-
-#pragma omp requires unified_shared_memory
-
-// Associates the host variable x with a device allocation, increments x inside
-// a target region, then prints x and whether x is present on the device.
-int main(int argc, char *argv[]) {
-  int dev = omp_get_default_device();
-  int x = 10;
-  // Allocate device storage for x and associate the host address with it.
-  int *x_dev = (int *)omp_target_alloc(sizeof x, dev);
-  assert(x_dev && "expected omp_target_alloc to succeed");
-  int rc = omp_target_associate_ptr(&x, x_dev, sizeof x, 0, dev);
-  assert(!rc && "expected omp_target_associate_ptr to succeed");
-
-// To determine whether x needs to be transferred, the runtime cannot simply
-// check whether unified shared memory is enabled and the 'close' modifier is
-// specified.  It must check whether x was previously placed in device memory
-// by, for example, omp_target_associate_ptr.
-#pragma omp target map(always, tofrom : x)
-  x += 1;
-
-  // CHECK: x=11
-  printf("x=%d\n", x);
-  // CHECK: present: 1
-  printf("present: %d\n", omp_target_is_present(&x, dev));
-
-  return 0;
-}

diff --git a/libomptarget/test/unified_shared_memory/close_enter_exit.c b/libomptarget/test/unified_shared_memory/close_enter_exit.c
deleted file mode 100644
index b116c65..0000000
--- a/libomptarget/test/unified_shared_memory/close_enter_exit.c
+++ /dev/null

@@ -1,111 +0,0 @@
-// RUN: %libomptarget-compile-generic
-// RUN: env HSA_XNACK=1 \
-// RUN: %libomptarget-run-generic | %fcheck-generic
-
-// REQUIRES: unified_shared_memory
-// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9
-
-// Fails on nvptx with error: an illegal memory access was encountered
-// XFAIL: nvptx64-nvidia-cuda
-// XFAIL: nvptx64-nvidia-cuda-LTO
-
-#include <omp.h>
-#include <stdio.h>
-
-#pragma omp requires unified_shared_memory
-
-#define N 1024
-
-// Exercises mappings of the heap array a with and without the 'close' map
-// modifier under unified shared memory.  Each section records the address of
-// a[0] observed inside the mapped region in device_alloc and compares it with
-// the host address stored in host_alloc.
-int main(int argc, char *argv[]) {
-  // NOTE(review): fails is never read or written in this function.
-  int fails;
-  void *host_alloc = 0, *device_alloc = 0;
-  int *a = (int *)malloc(N * sizeof(int));
-  int dev = omp_get_default_device();
-
-  // Init
-  for (int i = 0; i < N; ++i) {
-    a[i] = 10;
-  }
-  host_alloc = &a[0];
-
-  //
-  // map + target no close
-  //
-#pragma omp target data map(tofrom : a[ : N]) map(tofrom : device_alloc)
-  {
-#pragma omp target map(tofrom : device_alloc)
-    { device_alloc = &a[0]; }
-  }
-
-  // CHECK: a used from unified memory.
-  if (device_alloc == host_alloc)
-    printf("a used from unified memory.\n");
-
-  //
-  // map + target with close
-  //
-  device_alloc = 0;
-#pragma omp target data map(close, tofrom : a[ : N]) map(tofrom : device_alloc)
-  {
-#pragma omp target map(tofrom : device_alloc)
-    { device_alloc = &a[0]; }
-  }
-  // CHECK: a copied to device.
-  if (device_alloc != host_alloc)
-    printf("a copied to device.\n");
-
-  //
-  // map + use_device_ptr no close
-  //
-  device_alloc = 0;
-#pragma omp target data map(tofrom : a[ : N]) use_device_ptr(a)
-  { device_alloc = &a[0]; }
-
-  // CHECK: a used from unified memory with use_device_ptr.
-  if (device_alloc == host_alloc)
-    printf("a used from unified memory with use_device_ptr.\n");
-
-  //
-  // map + use_device_ptr close
-  //
-  device_alloc = 0;
-#pragma omp target data map(close, tofrom : a[ : N]) use_device_ptr(a)
-  { device_alloc = &a[0]; }
-
-  // CHECK: a used from device memory with use_device_ptr.
-  if (device_alloc != host_alloc)
-    printf("a used from device memory with use_device_ptr.\n");
-
-  //
-  // map enter/exit + close
-  //
-  device_alloc = 0;
-#pragma omp target enter data map(close, to : a[ : N])
-
-#pragma omp target map(from : device_alloc)
-  {
-    device_alloc = &a[0];
-    a[0] = 99;
-  }
-
-  // 'close' is missing, so the runtime must check whether a is actually in
-  // shared memory in order to determine whether to transfer data and delete the
-  // allocation.
-#pragma omp target exit data map(from : a[ : N])
-
-  // CHECK: a has been mapped to the device.
-  if (device_alloc != host_alloc)
-    printf("a has been mapped to the device.\n");
-
-  // After the exit data directive the host copy must hold the value written in
-  // the target region and a must no longer be present on the device.
-  // CHECK: a[0]=99
-  // CHECK: a is present: 0
-  printf("a[0]=%d\n", a[0]);
-  printf("a is present: %d\n", omp_target_is_present(a, dev));
-
-  free(a);
-
-  // CHECK: Done!
-  printf("Done!\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/unified_shared_memory/close_manual.c b/libomptarget/test/unified_shared_memory/close_manual.c
deleted file mode 100644
index 9985e82..0000000
--- a/libomptarget/test/unified_shared_memory/close_manual.c
+++ /dev/null

@@ -1,85 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// REQUIRES: unified_shared_memory
-
-#include <omp.h>
-#include <stdio.h>
-
-// ---------------------------------------------------------------------------
-// Various definitions copied from OpenMP RTL
-
-extern void __tgt_register_requires(int64_t);
-
-extern void __tgt_target_data_begin(int64_t device_id, int32_t arg_num,
-                                    void **args_base, void **args,
-                                    int64_t *arg_sizes, int64_t *arg_types);
-
-extern void __tgt_target_data_end(int64_t device_id, int32_t arg_num,
-                                  void **args_base, void **args,
-                                  int64_t *arg_sizes, int64_t *arg_types);
-
-// End of definitions copied from OpenMP RTL.
-// ---------------------------------------------------------------------------
-
-#pragma omp requires unified_shared_memory
-
-#define N 1024
-
-// Calls __tgt_target_data_begin/__tgt_target_data_end by hand with a map type
-// of CLOSE | TO for the array a, and uses use_device_ptr in between to read
-// the address the runtime reports for a.
-int main(int argc, char *argv[]) {
-  // NOTE(review): fails is never read or written in this function.
-  int fails;
-  void *host_alloc = 0, *device_alloc = 0;
-  int *a = (int *)malloc(N * sizeof(int));
-
-  // Manual registration of requires flags for Clang versions
-  // that do not support requires.
-  __tgt_register_requires(8);
-
-  // Init
-  for (int i = 0; i < N; ++i) {
-    a[i] = 10;
-  }
-  host_alloc = &a[0];
-
-// Dummy target region that ensures the runtime library is loaded when
-// the target data begin/end functions are manually called below.
-#pragma omp target
-  {}
-
-  // Manual calls
-  int device_id = omp_get_default_device();
-  int arg_num = 1;
-  // Both the base and the begin pointer of the single argument are the address
-  // of the pointer variable a.
-  void **args_base = (void **)&a;
-  void **args = (void **)&a;
-  int64_t arg_sizes[arg_num];
-
-  arg_sizes[0] = sizeof(int) * N;
-
-  int64_t arg_types[arg_num];
-
-  // 0x400 enables the CLOSE map type in the runtime:
-  // OMP_TGT_MAPTYPE_CLOSE = 0x400
-  // OMP_TGT_MAPTYPE_TO    = 0x001
-  arg_types[0] = 0x400 | 0x001;
-
-  // Start from the host address so the comparison below only succeeds if the
-  // use_device_ptr region overwrites device_alloc with a different address.
-  device_alloc = host_alloc;
-
-  __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes,
-                          arg_types);
-
-#pragma omp target data use_device_ptr(a)
-  { device_alloc = a; }
-
-  __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes,
-                        arg_types);
-
-  // CHECK: a was copied to the device
-  if (device_alloc != host_alloc)
-    printf("a was copied to the device\n");
-
-  free(a);
-
-  // CHECK: Done!
-  printf("Done!\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/unified_shared_memory/close_member.c b/libomptarget/test/unified_shared_memory/close_member.c
deleted file mode 100644
index e5a15e3..0000000
--- a/libomptarget/test/unified_shared_memory/close_member.c
+++ /dev/null

@@ -1,41 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// REQUIRES: unified_shared_memory
-// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9
-
-#include <omp.h>
-#include <stdio.h>
-
-#pragma omp requires unified_shared_memory
-
-// Two-member aggregate; main maps it as a whole on entry and member by member
-// on exit.
-struct S {
-  int x;
-  int y;
-};
-
-// Maps struct s to the device with the 'close' modifier, updates both members
-// in a target region, copies them back member by member without 'close', and
-// prints the members and whether s is still present on the device.
-int main(int argc, char *argv[]) {
-  int dev = omp_get_default_device();
-  struct S s = {10, 20};
-
-#pragma omp target enter data map(close, to : s)
-#pragma omp target map(alloc : s)
-  {
-    s.x = 11;
-    s.y = 21;
-  }
-// To determine whether x needs to be transferred or deleted, the runtime
-// cannot simply check whether unified shared memory is enabled and the
-// 'close' modifier is specified.  It must check whether x was previously
-// placed in device memory by, for example, a 'close' modifier that isn't
-// specified here.  The following struct member case checks a special code
-// path in the runtime implementation where members are transferred before
-// deletion of the struct.
-#pragma omp target exit data map(from : s.x, s.y)
-
-  // CHECK: s.x=11, s.y=21
-  printf("s.x=%d, s.y=%d\n", s.x, s.y);
-  // CHECK: present: 0
-  printf("present: %d\n", omp_target_is_present(&s, dev));
-
-  return 0;
-}

diff --git a/libomptarget/test/unified_shared_memory/close_modifier.c b/libomptarget/test/unified_shared_memory/close_modifier.c
deleted file mode 100644
index 7ca815d..0000000
--- a/libomptarget/test/unified_shared_memory/close_modifier.c
+++ /dev/null

@@ -1,137 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// REQUIRES: unified_shared_memory
-// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9
-
-// amdgpu runtime crash
-// Fails on nvptx with error: an illegal memory access was encountered
-// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-
-#include <omp.h>
-#include <stdio.h>
-
-#pragma omp requires unified_shared_memory
-
-#define N 1024
-
-// Verifies the 'close' map modifier under unified shared memory for a heap
-// array (alloc) and a stack array (data): first with a to-only mapping, where
-// device-side updates must not reach the host, then with a tofrom mapping,
-// where they must.
-int main(int argc, char *argv[]) {
-  int fails;
-  void *host_alloc, *device_alloc;
-  void *host_data, *device_data;
-  int *alloc = (int *)malloc(N * sizeof(int));
-  int data[N];
-
-  for (int i = 0; i < N; ++i) {
-    alloc[i] = 10;
-    data[i] = 1;
-  }
-
-  host_data = &data[0];
-  host_alloc = &alloc[0];
-
-//
-// Test that updates on the device are not visible to host
-// when only a TO mapping is used.
-//
-#pragma omp target map(tofrom : device_data, device_alloc)                     \
-    map(close, to : alloc[ : N], data[ : N])
-  {
-    device_data = &data[0];
-    device_alloc = &alloc[0];
-
-    for (int i = 0; i < N; i++) {
-      alloc[i] += 1;
-      data[i] += 1;
-    }
-  }
-
-  // CHECK: Address of alloc on device different from host address.
-  if (device_alloc != host_alloc)
-    printf("Address of alloc on device different from host address.\n");
-
-  // CHECK: Address of data on device different from host address.
-  if (device_data != host_data)
-    printf("Address of data on device different from host address.\n");
-
-  // On the host, check that the arrays have not been updated.
-  // CHECK: Alloc host values not updated: Succeeded
-  fails = 0;
-  for (int i = 0; i < N; i++) {
-    if (alloc[i] != 10)
-      fails++;
-  }
-  printf("Alloc host values not updated: %s\n",
-         (fails == 0) ? "Succeeded" : "Failed");
-
-  // CHECK: Data host values not updated: Succeeded
-  fails = 0;
-  for (int i = 0; i < N; i++) {
-    if (data[i] != 1)
-      fails++;
-  }
-  printf("Data host values not updated: %s\n",
-         (fails == 0) ? "Succeeded" : "Failed");
-
-  //
-  // Test that updates on the device are visible on host
-  // when a from is used.
-  //
-
-  // Bump the host values first so the device must observe 11 and 2.
-  for (int i = 0; i < N; i++) {
-    alloc[i] += 1;
-    data[i] += 1;
-  }
-
-#pragma omp target map(close, tofrom : alloc[ : N], data[ : N])
-  {
-    // CHECK: Alloc device values are correct: Succeeded
-    fails = 0;
-    for (int i = 0; i < N; i++) {
-      if (alloc[i] != 11)
-        fails++;
-    }
-    printf("Alloc device values are correct: %s\n",
-           (fails == 0) ? "Succeeded" : "Failed");
-    // CHECK: Data device values are correct: Succeeded
-    fails = 0;
-    for (int i = 0; i < N; i++) {
-      if (data[i] != 2)
-        fails++;
-    }
-    printf("Data device values are correct: %s\n",
-           (fails == 0) ? "Succeeded" : "Failed");
-
-    // Update values on the device
-    for (int i = 0; i < N; i++) {
-      alloc[i] += 1;
-      data[i] += 1;
-    }
-  }
-
-  // CHECK: Alloc host values updated: Succeeded
-  fails = 0;
-  for (int i = 0; i < N; i++) {
-    if (alloc[i] != 12)
-      fails++;
-  }
-  printf("Alloc host values updated: %s\n",
-         (fails == 0) ? "Succeeded" : "Failed");
-
-  // CHECK: Data host values updated: Succeeded
-  fails = 0;
-  for (int i = 0; i < N; i++) {
-    if (data[i] != 3)
-      fails++;
-  }
-  printf("Data host values updated: %s\n",
-         (fails == 0) ? "Succeeded" : "Failed");
-
-  free(alloc);
-
-  // CHECK: Done!
-  printf("Done!\n");
-
-  return 0;
-}

diff --git a/libomptarget/test/unified_shared_memory/shared_update.c b/libomptarget/test/unified_shared_memory/shared_update.c
deleted file mode 100644
index 65db9e4..0000000
--- a/libomptarget/test/unified_shared_memory/shared_update.c
+++ /dev/null

@@ -1,119 +0,0 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-
-// REQUIRES: unified_shared_memory
-
-// amdgpu runtime crash
-// Fails on nvptx with error: an illegal memory access was encountered
-// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-
-#include <omp.h>
-#include <stdio.h>
-
-// ---------------------------------------------------------------------------
-// Various definitions copied from OpenMP RTL
-
-extern void __tgt_register_requires(int64_t);
-
-// End of definitions copied from OpenMP RTL.
-// ---------------------------------------------------------------------------
-
-#pragma omp requires unified_shared_memory
-
-#define N 1024
-
-// Verifies that, with unified shared memory required and no explicit array
-// mappings, a heap array (alloc) and a stack array (data) have the same
-// address on host and device and that updates made on either side are seen by
-// the other.
-int main(int argc, char *argv[]) {
-  int fails;
-  void *host_alloc, *device_alloc;
-  void *host_data, *device_data;
-  int *alloc = (int *)malloc(N * sizeof(int));
-  int data[N];
-
-  // Manual registration of requires flags for Clang versions
-  // that do not support requires.
-  __tgt_register_requires(8);
-
-  for (int i = 0; i < N; ++i) {
-    alloc[i] = 10;
-    data[i] = 1;
-  }
-
-  host_data = &data[0];
-  host_alloc = &alloc[0];
-
-// implicit mapping of data
-#pragma omp target map(tofrom : device_data, device_alloc)
-  {
-    device_data = &data[0];
-    device_alloc = &alloc[0];
-
-    for (int i = 0; i < N; i++) {
-      alloc[i] += 1;
-      data[i] += 1;
-    }
-  }
-
-  // CHECK: Address of alloc on device matches host address.
-  if (device_alloc == host_alloc)
-    printf("Address of alloc on device matches host address.\n");
-
-  // CHECK: Address of data on device matches host address.
-  if (device_data == host_data)
-    printf("Address of data on device matches host address.\n");
-
-  // On the host, check that the arrays have been updated.
-  // CHECK: Alloc device values updated: Succeeded
-  fails = 0;
-  for (int i = 0; i < N; i++) {
-    if (alloc[i] != 11)
-      fails++;
-  }
-  printf("Alloc device values updated: %s\n",
-         (fails == 0) ? "Succeeded" : "Failed");
-
-  // CHECK: Data device values updated: Succeeded
-  fails = 0;
-  for (int i = 0; i < N; i++) {
-    if (data[i] != 2)
-      fails++;
-  }
-  printf("Data device values updated: %s\n",
-         (fails == 0) ? "Succeeded" : "Failed");
-
-  //
-  // Test that updates on the host and on the device are both visible.
-  //
-
-  // Update on the host.
-  for (int i = 0; i < N; ++i) {
-    alloc[i] += 1;
-    data[i] += 1;
-  }
-
-  // On the device, check that the host-side increments are visible.
-#pragma omp target
-  {
-    // CHECK: Alloc host values updated: Succeeded
-    fails = 0;
-    for (int i = 0; i < N; i++) {
-      if (alloc[i] != 12)
-        fails++;
-    }
-    printf("Alloc host values updated: %s\n",
-           (fails == 0) ? "Succeeded" : "Failed");
-    // CHECK: Data host values updated: Succeeded
-    fails = 0;
-    for (int i = 0; i < N; i++) {
-      if (data[i] != 3)
-        fails++;
-    }
-    printf("Data host values updated: %s\n",
-           (fails == 0) ? "Succeeded" : "Failed");
-  }
-
-  free(alloc);
-
-  printf("Done!\n");
-
-  return 0;
-}

diff --git a/libomptarget/tools/CMakeLists.txt b/libomptarget/tools/CMakeLists.txt
deleted file mode 100644
index a850647..0000000
--- a/libomptarget/tools/CMakeLists.txt
+++ /dev/null

@@ -1,28 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Adding omptarget tools
-#
-##===----------------------------------------------------------------------===##
-
-# Cache entry naming the install subdirectory for the tools.  It defaults to
-# CMAKE_INSTALL_BINDIR and is marked as advanced.
-set(OPENMP_TOOLS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH
-    "Path for binary subdirectory (defaults to '${CMAKE_INSTALL_BINDIR}')")
-mark_as_advanced(OPENMP_TOOLS_INSTALL_DIR)
-
-# Move these macros to AddOpenMP if such a CMake module is ever created.
-
-# add_openmp_tool(<name> [args...])
-# Forwards every argument to llvm_add_tool, with OPENMP prepended as the first
-# argument.
-macro(add_openmp_tool name)
-  llvm_add_tool(OPENMP ${ARGV})
-endmacro()
-
-# add_openmp_tool_symlink(<name> [args...])
-# Forwards every argument to llvm_add_tool_symlink, with OPENMP prepended as
-# the first argument.
-macro(add_openmp_tool_symlink name)
-  llvm_add_tool_symlink(OPENMP ${ARGV})
-endmacro()
-
-# Each tool lives in its own subdirectory.
-add_subdirectory(deviceinfo)
-add_subdirectory(kernelreplay)

diff --git a/libomptarget/tools/deviceinfo/CMakeLists.txt b/libomptarget/tools/deviceinfo/CMakeLists.txt
deleted file mode 100644
index 62d36d7..0000000
--- a/libomptarget/tools/deviceinfo/CMakeLists.txt
+++ /dev/null

@@ -1,25 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build llvm-omp-device-info tool
-#
-##===----------------------------------------------------------------------===##
-
-libomptarget_say("Building the llvm-omp-device-info tool")
-
-# The tool is built from a single source file.
-add_openmp_tool(llvm-omp-device-info llvm-omp-device-info.cpp)
-
-llvm_update_compile_flags(llvm-omp-device-info)
-
-# The source includes omptarget.h, so the libomptarget include directory is
-# added privately.
-target_include_directories(llvm-omp-device-info PRIVATE
-  ${LIBOMPTARGET_INCLUDE_DIR}
-)
-# Link against the omp and omptarget targets.
-target_link_libraries(llvm-omp-device-info PRIVATE
-  omp
-  omptarget
-)

diff --git a/libomptarget/tools/deviceinfo/llvm-omp-device-info.cpp b/libomptarget/tools/deviceinfo/llvm-omp-device-info.cpp
deleted file mode 100644
index 7dd22ae..0000000
--- a/libomptarget/tools/deviceinfo/llvm-omp-device-info.cpp
+++ /dev/null

@@ -1,31 +0,0 @@
-//===- llvm-omp-device-info.cpp - Obtain device info as seen from OpenMP --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a command line utility that, by using Libomptarget, and the device
-// plugins, list devices information as seen from the OpenMP Runtime.
-//
-//===----------------------------------------------------------------------===//
-
-#include "omptarget.h"
-#include <cstdio>
-
-// Registers an empty binary descriptor, initializes all runtime plugins, and
-// prints the information each device reports, one "Device (N):" section per
-// device.  Always returns 0.
-int main(int argc, char **argv) {
-  // No device images or host entries are needed just to query devices.
-  __tgt_bin_desc EmptyDesc = {0, nullptr, nullptr, nullptr};
-  __tgt_register_lib(&EmptyDesc);
-  __tgt_init_all_rtls();
-
-  int DeviceNum = 0;
-  while (DeviceNum < omp_get_num_devices()) {
-    printf("Device (%d):\n", DeviceNum);
-    // A false/zero result means nothing was printed for this device.
-    if (!__tgt_print_device_info(DeviceNum)) {
-      printf("    print_device_info not implemented\n");
-    }
-    printf("\n");
-    ++DeviceNum;
-  }
-
-  __tgt_unregister_lib(&EmptyDesc);
-  return 0;
-}

diff --git a/libomptarget/tools/kernelreplay/CMakeLists.txt b/libomptarget/tools/kernelreplay/CMakeLists.txt
deleted file mode 100644
index 6f3dc33..0000000
--- a/libomptarget/tools/kernelreplay/CMakeLists.txt
+++ /dev/null

@@ -1,26 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build llvm-omp-kernel-replay tool
-#
-##===----------------------------------------------------------------------===##
-
-libomptarget_say("Building the llvm-omp-kernel-replay tool")
-
-# The tool is built from a single source file.
-add_openmp_tool(llvm-omp-kernel-replay llvm-omp-kernel-replay.cpp)
-
-llvm_update_compile_flags(llvm-omp-kernel-replay)
-
-# The source includes omptarget.h, so the libomptarget include directory is
-# added privately.
-target_include_directories(llvm-omp-kernel-replay PRIVATE
-  ${LIBOMPTARGET_INCLUDE_DIR}
-)
-# Link against LLVMSupport in addition to the omp and omptarget targets.
-target_link_libraries(llvm-omp-kernel-replay PRIVATE
-  LLVMSupport
-  omp
-  omptarget
-)

diff --git a/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
deleted file mode 100644
index 761e04e..0000000
--- a/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ /dev/null

@@ -1,201 +0,0 @@
-//===- llvm-omp-kernel-replay.cpp - Replay OpenMP offload kernel ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a command line utility to replay the execution of recorded OpenMP
-// offload kernels.
-//
-//===----------------------------------------------------------------------===//
-
-#include "omptarget.h"
-
-#include "Shared/PluginAPI.h"
-
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/JSON.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include <cstdint>
-#include <cstdlib>
-
-using namespace llvm;
-
-// Category grouping this tool's own options; main passes it to
-// cl::HideUnrelatedOptions.
-cl::OptionCategory ReplayOptions("llvm-omp-kernel-replay Options");
-
-// InputFilename - The filename to read the json description of the kernel.
-static cl::opt<std::string> InputFilename(cl::Positional,
-                                          cl::desc("<input kernel json file>"),
-                                          cl::Required);
-
-// --verify: off by default.
-static cl::opt<bool> VerifyOpt(
-    "verify",
-    cl::desc(
-        "Verify device memory post execution against the original output."),
-    cl::init(false), cl::cat(ReplayOptions));
-
-// --save-output: off by default.
-static cl::opt<bool> SaveOutputOpt(
-    "save-output",
-    cl::desc("Save the device memory output of the replayed kernel execution."),
-    cl::init(false), cl::cat(ReplayOptions));
-
-// --num-teams: the default of 0 makes main use the value recorded in the json
-// file instead.
-static cl::opt<unsigned> NumTeamsOpt("num-teams",
-                                     cl::desc("Set the number of teams."),
-                                     cl::init(0), cl::cat(ReplayOptions));
-
-// --num-threads: the default of 0 makes main use the value recorded in the
-// json file instead.
-static cl::opt<unsigned> NumThreadsOpt("num-threads",
-                                       cl::desc("Set the number of threads."),
-                                       cl::init(0), cl::cat(ReplayOptions));
-
-// --device-id: the default of -1 makes main use the value recorded in the json
-// file instead.
-static cl::opt<int32_t> DeviceIdOpt("device-id", cl::desc("Set the device id."),
-                                    cl::init(-1), cl::cat(ReplayOptions));
-
-// Replays one recorded OpenMP offload kernel.
-//
-// Reads the json kernel description named on the command line, loads
-// "<kernel name>.image" and "<kernel name>.memory", activates record/replay in
-// the runtime and launches the kernel.  With --verify, additionally compares
-// "<kernel name>.original.output" with "<kernel name>.replay.output" and
-// prints the outcome.  Any unreadable file or malformed description ends the
-// program through report_fatal_error; otherwise the function returns 0.
-int main(int argc, char **argv) {
-  cl::HideUnrelatedOptions(ReplayOptions);
-  cl::ParseCommandLineOptions(argc, argv, "llvm-omp-kernel-replay\n");
-
-  ErrorOr<std::unique_ptr<MemoryBuffer>> KernelInfoMB =
-      MemoryBuffer::getFile(InputFilename, /*isText=*/true,
-                            /*RequiresNullTerminator=*/true);
-  if (!KernelInfoMB)
-    report_fatal_error("Error reading the kernel info json file");
-  Expected<json::Value> JsonKernelInfo =
-      json::parse(KernelInfoMB.get()->getBuffer());
-  if (auto Err = JsonKernelInfo.takeError())
-    report_fatal_error("Cannot parse the kernel info json file");
-
-  // Every field is looked up in the top-level object, so fetch it once and
-  // reject documents whose root is not an object instead of dereferencing a
-  // null pointer.
-  const json::Object *KernelInfo = JsonKernelInfo->getAsObject();
-  if (!KernelInfo)
-    report_fatal_error("The kernel info json file does not hold an object");
-
-  // Returns the integer stored under Key or stops with a diagnostic naming
-  // the missing field.
-  auto GetRequiredInt = [KernelInfo](StringRef Key) -> int64_t {
-    if (auto Field = KernelInfo->getInteger(Key))
-      return *Field;
-    report_fatal_error(Twine("Missing or non-integer field '") + Key +
-                       "' in the kernel info json file");
-  };
-  // Returns the array stored under Key or stops with a diagnostic naming the
-  // missing field.
-  auto GetRequiredArray = [KernelInfo](StringRef Key) -> const json::Array & {
-    if (const json::Array *Field = KernelInfo->getArray(Key))
-      return *Field;
-    report_fatal_error(Twine("Missing or non-array field '") + Key +
-                       "' in the kernel info json file");
-  };
-  // Returns Element as an integer or stops with a diagnostic naming the array
-  // it came from.
-  auto ElementAsInt = [](const json::Value &Element,
-                         StringRef ArrayName) -> int64_t {
-    if (auto Field = Element.getAsInteger())
-      return *Field;
-    report_fatal_error(Twine("Non-integer element in '") + ArrayName +
-                       "' in the kernel info json file");
-  };
-
-  // A command line value of 0 means "use what was recorded".
-  unsigned NumTeams = NumTeamsOpt;
-  if (NumTeams == 0)
-    NumTeams = GetRequiredInt("NumTeamsClause");
-  unsigned NumThreads = NumThreadsOpt;
-  if (NumThreads == 0)
-    NumThreads = GetRequiredInt("ThreadLimitClause");
-  // TODO: Print a warning if number of teams/threads is explicitly set in the
-  // kernel info but overridden through command line options.
-  const int64_t LoopTripCount = GetRequiredInt("LoopTripCount");
-  auto KernelFunc = KernelInfo->getString("Name");
-  if (!KernelFunc)
-    report_fatal_error(
-        "Missing or non-string field 'Name' in the kernel info json file");
-
-  const int64_t NumArgs = GetRequiredInt("NumArgs");
-  SmallVector<void *> TgtArgs;
-  // Iterate by reference so the json values are not copied.
-  for (const json::Value &ArgPtr : GetRequiredArray("ArgPtrs"))
-    TgtArgs.push_back(
-        reinterpret_cast<void *>(ElementAsInt(ArgPtr, "ArgPtrs")));
-  SmallVector<ptrdiff_t> TgtArgOffsets;
-  for (const json::Value &ArgOffset : GetRequiredArray("ArgOffsets"))
-    TgtArgOffsets.push_back(
-        static_cast<ptrdiff_t>(ElementAsInt(ArgOffset, "ArgOffsets")));
-
-  void *BAllocStart =
-      reinterpret_cast<void *>(GetRequiredInt("BumpAllocVAStart"));
-
-  __tgt_offload_entry KernelEntry = {nullptr, nullptr, 0, 0, 0};
-  std::string KernelEntryName = KernelFunc->str();
-  KernelEntry.name = const_cast<char *>(KernelEntryName.c_str());
-  // Anything non-zero works to uniquely identify the kernel.
-  KernelEntry.addr = (void *)0x1;
-
-  ErrorOr<std::unique_ptr<MemoryBuffer>> ImageMB =
-      MemoryBuffer::getFile(KernelEntryName + ".image", /*isText=*/false,
-                            /*RequiresNullTerminator=*/false);
-  if (!ImageMB)
-    report_fatal_error("Error reading the kernel image.");
-
-  __tgt_device_image DeviceImage;
-  DeviceImage.ImageStart = const_cast<char *>(ImageMB.get()->getBufferStart());
-  DeviceImage.ImageEnd = const_cast<char *>(ImageMB.get()->getBufferEnd());
-  DeviceImage.EntriesBegin = &KernelEntry;
-  DeviceImage.EntriesEnd = &KernelEntry + 1;
-
-  __tgt_bin_desc Desc;
-  Desc.NumDeviceImages = 1;
-  Desc.HostEntriesBegin = &KernelEntry;
-  Desc.HostEntriesEnd = &KernelEntry + 1;
-  Desc.DeviceImages = &DeviceImage;
-
-  // The recorded size is already an integer and is used unchanged; no
-  // rounding is applied.
-  uint64_t DeviceMemorySize = GetRequiredInt("DeviceMemorySize");
-
-  // A negative command line value means "use what was recorded".
-  // TODO: Print warning if the user overrides the device id in the json file.
-  int32_t DeviceId = DeviceIdOpt;
-  if (DeviceId < 0)
-    DeviceId = GetRequiredInt("DeviceId");
-
-  // TODO: do we need requires?
-  //__tgt_register_requires(/*Flags=*/1);
-
-  __tgt_register_lib(&Desc);
-
-  uint64_t ReqPtrArgOffset = 0;
-  int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, BAllocStart,
-                                        false, VerifyOpt, ReqPtrArgOffset);
-
-  if (Rc != OMP_TGT_SUCCESS) {
-    report_fatal_error("Cannot activate record replay\n");
-  }
-
-  ErrorOr<std::unique_ptr<MemoryBuffer>> DeviceMemoryMB =
-      MemoryBuffer::getFile(KernelEntryName + ".memory", /*isText=*/false,
-                            /*RequiresNullTerminator=*/false);
-
-  if (!DeviceMemoryMB)
-    report_fatal_error("Error reading the kernel input device memory.");
-
-  // On AMD for currently unknown reasons we cannot copy memory mapped data to
-  // device. This is a work-around.
-  const size_t RecordedSize = DeviceMemoryMB.get()->getBufferSize();
-  uint8_t *RecordedData = new uint8_t[RecordedSize];
-  std::memcpy(RecordedData, DeviceMemoryMB.get()->getBufferStart(),
-              RecordedSize);
-
-  // If necessary, adjust pointer arguments.
-  if (ReqPtrArgOffset) {
-    for (auto *&Arg : TgtArgs) {
-      auto ArgInt = uintptr_t(Arg);
-      // Try to find pointer arguments: only values that fall inside the
-      // recorded allocation range are shifted.
-      if (ArgInt < uintptr_t(BAllocStart) ||
-          ArgInt >= uintptr_t(BAllocStart) + DeviceMemorySize)
-        continue;
-      Arg = reinterpret_cast<void *>(ArgInt - ReqPtrArgOffset);
-    }
-  }
-
-  __tgt_target_kernel_replay(
-      /*Loc=*/nullptr, DeviceId, KernelEntry.addr, (char *)RecordedData,
-      RecordedSize, TgtArgs.data(), TgtArgOffsets.data(), NumArgs, NumTeams,
-      NumThreads, LoopTripCount);
-
-  if (VerifyOpt) {
-    ErrorOr<std::unique_ptr<MemoryBuffer>> OriginalOutputMB =
-        MemoryBuffer::getFile(KernelEntryName + ".original.output",
-                              /*isText=*/false,
-                              /*RequiresNullTerminator=*/false);
-    if (!OriginalOutputMB)
-      report_fatal_error("Error reading the kernel original output file, make "
-                         "sure LIBOMPTARGET_SAVE_OUTPUT is set when recording");
-    ErrorOr<std::unique_ptr<MemoryBuffer>> ReplayOutputMB =
-        MemoryBuffer::getFile(KernelEntryName + ".replay.output",
-                              /*isText=*/false,
-                              /*RequiresNullTerminator=*/false);
-    if (!ReplayOutputMB)
-      report_fatal_error("Error reading the kernel replay output file");
-
-    StringRef OriginalOutput = OriginalOutputMB.get()->getBuffer();
-    StringRef ReplayOutput = ReplayOutputMB.get()->getBuffer();
-    if (OriginalOutput == ReplayOutput)
-      outs() << "[llvm-omp-kernel-replay] Replay device memory verified!\n";
-    else
-      outs() << "[llvm-omp-kernel-replay] Replay device memory failed to "
-                "verify!\n";
-  }
-
-  delete[] RecordedData;
-
-  return 0;
-}

diff --git a/libomptarget/unittests/CMakeLists.txt b/libomptarget/unittests/CMakeLists.txt
deleted file mode 100644
index 73c87b7..0000000
--- a/libomptarget/unittests/CMakeLists.txt
+++ /dev/null

@@ -1,8 +0,0 @@
-# Umbrella target that the unit tests registered below are added to.
-add_custom_target(LibomptUnitTests)
-set_target_properties(LibomptUnitTests PROPERTIES FOLDER "Tests/UnitTests")
-
-# add_libompt_unittest(<test_dirname> [args...])
-# Calls add_unittest with LibomptUnitTests as the first argument, followed by
-# <test_dirname> and all remaining arguments.
-function(add_libompt_unittest test_dirname)
-  add_unittest(LibomptUnitTests ${test_dirname} ${ARGN})
-endfunction()
-
-add_subdirectory(Plugins)

diff --git a/libomptarget/unittests/Plugins/CMakeLists.txt b/libomptarget/unittests/Plugins/CMakeLists.txt
deleted file mode 100644
index 67d4d9b..0000000
--- a/libomptarget/unittests/Plugins/CMakeLists.txt
+++ /dev/null

@@ -1,11 +0,0 @@
-# Settings shared by every per-plugin unit test: the common target to link,
-# the test source file, and the include directory.
-set(PLUGINS_TEST_COMMON omptarget)
-set(PLUGINS_TEST_SOURCES NextgenPluginsTest.cpp)
-set(PLUGINS_TEST_INCLUDE ${LIBOMPTARGET_INCLUDE_DIR})
-
-# Create one "<plugin>.unittests" test per entry of
-# LIBOMPTARGET_TESTED_PLUGINS, each built from the same source and linked
-# against that plugin plus the common target.
-foreach(PLUGIN IN LISTS LIBOMPTARGET_TESTED_PLUGINS)
-  libomptarget_say("Building plugin unit tests for ${PLUGIN}")
-  add_libompt_unittest("${PLUGIN}.unittests" ${PLUGINS_TEST_SOURCES})
-  add_dependencies("${PLUGIN}.unittests" ${PLUGINS_TEST_COMMON} ${PLUGIN})
-  target_link_libraries("${PLUGIN}.unittests" PRIVATE ${PLUGINS_TEST_COMMON} ${PLUGIN})
-  target_include_directories("${PLUGIN}.unittests" PRIVATE ${PLUGINS_TEST_INCLUDE})
-endforeach()

diff --git a/libomptarget/unittests/Plugins/NextgenPluginsTest.cpp b/libomptarget/unittests/Plugins/NextgenPluginsTest.cpp
deleted file mode 100644
index 635bd16..0000000
--- a/libomptarget/unittests/Plugins/NextgenPluginsTest.cpp
+++ /dev/null

@@ -1,168 +0,0 @@
-//===------- unittests/Plugins/NextgenPluginsTest.cpp - Plugin tests ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "Shared/PluginAPI.h"
-#include "omptarget.h"
-#include "gtest/gtest.h"
-
-#include <unordered_set>
-
-const int DEVICE_ID = 0;
-std::unordered_set<int> setup_map;
-
-int init_test_device(int ID) {
-  if (setup_map.find(ID) != setup_map.end()) {
-    return OFFLOAD_SUCCESS;
-  }
-  if (__tgt_rtl_init_plugin() == OFFLOAD_FAIL ||
-      __tgt_rtl_init_device(ID) == OFFLOAD_FAIL) {
-    return OFFLOAD_FAIL;
-  }
-  setup_map.insert(ID);
-  return OFFLOAD_SUCCESS;
-}
-
-// Test plugin initialization
-TEST(NextgenPluginsTest, PluginInit) {
-  EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_ID));
-}
-
-// Test GPU allocation and R/W
-TEST(NextgenPluginsTest, PluginAlloc) {
-  int32_t test_value = 23;
-  int32_t host_value = -1;
-  int64_t var_size = sizeof(int32_t);
-
-  // Init plugin and device
-  EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_ID));
-
-  // Allocate memory
-  void *device_ptr =
-      __tgt_rtl_data_alloc(DEVICE_ID, var_size, nullptr, TARGET_ALLOC_DEFAULT);
-
-  // Check that the result is not null
-  EXPECT_NE(device_ptr, nullptr);
-
-  // Submit data to device
-  EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_submit(DEVICE_ID, device_ptr,
-                                                   &test_value, var_size));
-
-  // Read data from device
-  EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_retrieve(DEVICE_ID, &host_value,
-                                                     device_ptr, var_size));
-
-  // Compare values
-  EXPECT_EQ(host_value, test_value);
-
-  // Cleanup data
-  EXPECT_EQ(OFFLOAD_SUCCESS,
-            __tgt_rtl_data_delete(DEVICE_ID, device_ptr, TARGET_ALLOC_DEFAULT));
-}
-
-// Test async GPU allocation and R/W
-TEST(NextgenPluginsTest, PluginAsyncAlloc) {
-  int32_t test_value = 47;
-  int32_t host_value = -1;
-  int64_t var_size = sizeof(int32_t);
-  __tgt_async_info *info;
-
-  // Init plugin and device
-  EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_ID));
-
-  // Check if device supports async
-  // Platforms like x86_64 don't support it
-  if (__tgt_rtl_init_async_info(DEVICE_ID, &info) == OFFLOAD_SUCCESS) {
-    // Allocate memory
-    void *device_ptr = __tgt_rtl_data_alloc(DEVICE_ID, var_size, nullptr,
-                                            TARGET_ALLOC_DEFAULT);
-
-    // Check that the result is not null
-    EXPECT_NE(device_ptr, nullptr);
-
-    // Submit data to device asynchronously
-    EXPECT_EQ(OFFLOAD_SUCCESS,
-              __tgt_rtl_data_submit_async(DEVICE_ID, device_ptr, &test_value,
-                                          var_size, info));
-
-    // Wait for async request to process
-    EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_synchronize(DEVICE_ID, info));
-
-    // Read data from device
-    EXPECT_EQ(OFFLOAD_SUCCESS,
-              __tgt_rtl_data_retrieve_async(DEVICE_ID, &host_value, device_ptr,
-                                            var_size, info));
-
-    // Wait for async request to process
-    EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_synchronize(DEVICE_ID, info));
-
-    // Compare values
-    EXPECT_EQ(host_value, test_value);
-
-    // Cleanup data
-    EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_delete(DEVICE_ID, device_ptr,
-                                                     TARGET_ALLOC_DEFAULT));
-  }
-}
-
-// Test GPU data exchange
-TEST(NextgenPluginsTest, PluginDataSwap) {
-  int32_t test_value = 23;
-  int32_t host_value = -1;
-  int64_t var_size = sizeof(int32_t);
-
-  // Look for compatible device
-  int DEVICE_TWO = -1;
-  for (int i = 1; i < __tgt_rtl_number_of_devices(); i++) {
-    if (__tgt_rtl_is_data_exchangable(DEVICE_ID, i)) {
-      DEVICE_TWO = i;
-      break;
-    }
-  }
-
-  // Only run test if we have multiple GPUs to test
-  // GPUs must be compatible for test to work
-  if (DEVICE_TWO >= 1) {
-    // Init both GPUs
-    EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_ID));
-    EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_TWO));
-
-    // Allocate memory on both GPUs
-    // DEVICE_ID will be the source
-    // DEVICE_TWO will be the destination
-    void *source_ptr = __tgt_rtl_data_alloc(DEVICE_ID, var_size, nullptr,
-                                            TARGET_ALLOC_DEFAULT);
-    void *dest_ptr = __tgt_rtl_data_alloc(DEVICE_TWO, var_size, nullptr,
-                                          TARGET_ALLOC_DEFAULT);
-
-    // Check for success in allocation
-    EXPECT_NE(source_ptr, nullptr);
-    EXPECT_NE(dest_ptr, nullptr);
-
-    // Write data to source
-    EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_submit(DEVICE_ID, source_ptr,
-                                                     &test_value, var_size));
-
-    // Transfer data between devices
-    EXPECT_EQ(OFFLOAD_SUCCESS,
-              __tgt_rtl_data_exchange(DEVICE_ID, source_ptr, DEVICE_TWO,
-                                      dest_ptr, var_size));
-
-    // Read from destination device (DEVICE_TWO) memory
-    EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_retrieve(DEVICE_TWO, &host_value,
-                                                       dest_ptr, var_size));
-
-    // Ensure match
-    EXPECT_EQ(host_value, test_value);
-
-    // Cleanup
-    EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_delete(DEVICE_ID, source_ptr,
-                                                     TARGET_ALLOC_DEFAULT));
-    EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_delete(DEVICE_TWO, dest_ptr,
-                                                     TARGET_ALLOC_DEFAULT));
-  }
-}

diff --git a/libomptarget/utils/generate_microtask_cases.py b/libomptarget/utils/generate_microtask_cases.py
deleted file mode 100755
index 1376c3e..0000000
--- a/libomptarget/utils/generate_microtask_cases.py
+++ /dev/null

@@ -1,38 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--max_args",
-        type=int,
-        help="Max number of arguments to generate case statements for",
-        required=True,
-    )
-    parser.add_argument("--output", help="Output header file to include", required=True)
-    args = parser.parse_args()
-
-    output = ""
-    for i in range(args.max_args + 1):
-        output += "case %d:\n" % (i)
-        output += "((void (*)(kmp_int32 *, kmp_int32 *\n"
-        for j in range(i):
-            output += ", void *"
-            if (j + 1) % 4 == 0:
-                output += "\n"
-        output += "))fn)(&global_tid, &bound_tid\n"
-        for j in range(i):
-            output += ", args[%d]" % (j)
-            if (j + 1) % 4 == 0:
-                output += "\n"
-        output += ");\n"
-        output += "break;\n"
-
-    with open(args.output, "w") as f:
-        print(output, file=f)
-
-
-if __name__ == "__main__":
-    main()

diff --git a/runtime/src/CMakeLists.txt b/runtime/src/CMakeLists.txt
index 701c351..a2468d0 100644
--- a/runtime/src/CMakeLists.txt
+++ b/runtime/src/CMakeLists.txt

@@ -229,6 +229,7 @@
 else()
   set(LIBOMP_LIBRARY_DIR ${LIBOMP_LIBRARY_DIR} PARENT_SCOPE)
 endif()
+set(LIBOMP_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR})
 set(LIBOMP_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
 
 # Add symbolic links to libomp
@@ -241,7 +242,12 @@
     WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR}
   )
 endif()
-set(LIBOMP_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
+
+# Definitions for testing, for reuse when testing libomptarget-nvptx.
+set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${LIBOMP_INCLUDE_DIR}" CACHE STRING
+  "Path to folder containing omp.h")
+set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${LIBOMP_LIBRARY_DIR}" CACHE STRING
+  "Path to folder containing libomp.so, and libLLVMSupport.so with profiling enabled")
 
 # Create *.inc before compiling any sources
 # objects depend on : .inc files
commit	f99c47ef437ce6919c614d51a876c850997ba2a1	[log] [tgz]
author	Johannes Doerfert <johannes@jdoerfert.de>	Mon Apr 22 09:51:33 2024 -0700
committer	Copybara-Service <copybara-worker@google.com>	Mon Apr 22 09:55:47 2024 -0700
tree	e2ba8c2db9e0f937fe06bbfb087449f45c6bea19
parent	c2462eaedc665b5fea5d6ef3576dc5f29f456fce [diff]