diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt index c1bc29faaf45d..541373133a909 100644 --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -1,85 +1,85 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build offloading library and related plugins. -# -##===----------------------------------------------------------------------===## - -if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") - message(FATAL_ERROR "Direct configuration not supported, please use parent directory!") -endif() - -# Add cmake directory to search for custom cmake functions. -set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH}) - -if(OPENMP_STANDALONE_BUILD) - # Build all libraries into a common place so that tests can find them. - set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) -endif() - -# Message utilities. -include(LibomptargetUtils) - -# Get dependencies for the different components of the project. -include(LibomptargetGetDependencies) - -# This is a list of all the targets that are supported/tested right now. -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu") -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu") -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu") -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu") -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda") - -# Once the plugins for the different targets are validated, they will be added to -# the list of supported targets in the current system. -set (LIBOMPTARGET_SYSTEM_TARGETS "") -set (LIBOMPTARGET_TESTED_PLUGINS "") - -# Check whether using debug mode. In debug mode, allow dumping progress -# messages at runtime by default. Otherwise, it can be enabled -# independently using the LIBOMPTARGET_ENABLE_DEBUG option. -string( TOLOWER "${CMAKE_BUILD_TYPE}" LIBOMPTARGET_CMAKE_BUILD_TYPE) -if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) - option(LIBOMPTARGET_ENABLE_DEBUG "Allow debug output with the environment variable LIBOMPTARGET_DEBUG=1" ON) -else() - option(LIBOMPTARGET_ENABLE_DEBUG "Allow debug output with the environment variable LIBOMPTARGET_DEBUG=1" OFF) -endif() -if(LIBOMPTARGET_ENABLE_DEBUG) - add_definitions(-DOMPTARGET_DEBUG) -endif() - -include_directories(include) - -# Build target agnostic offloading library. -add_subdirectory(src) - -# Retrieve the path to the resulting library so that it can be used for -# testing. -get_target_property(LIBOMPTARGET_LIBRARY_DIR omptarget LIBRARY_OUTPUT_DIRECTORY) -if(NOT LIBOMPTARGET_LIBRARY_DIR) - set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) -endif() - -# Definitions for testing, for reuse when testing libomptarget-nvptx. 
-if(OPENMP_STANDALONE_BUILD) - set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING - "Path to folder containing omp.h") - set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING - "Path to folder containing libomp.so") -else() - set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src") -endif() - - -# Build offloading plugins and device RTLs if they are available. -add_subdirectory(plugins) -add_subdirectory(deviceRTLs) - -# Add tests. -add_subdirectory(test) +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build offloading library and related plugins. +# +##===----------------------------------------------------------------------===## + +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + message(FATAL_ERROR "Direct configuration not supported, please use parent directory!") +endif() + +# Add cmake directory to search for custom cmake functions. +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH}) + +if(OPENMP_STANDALONE_BUILD) + # Build all libraries into a common place so that tests can find them. + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif() + +# Message utilities. +include(LibomptargetUtils) + +# Get dependencies for the different components of the project. +include(LibomptargetGetDependencies) + +# This is a list of all the targets that are supported/tested right now. +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda") + +# Once the plugins for the different targets are validated, they will be added to +# the list of supported targets in the current system. +set (LIBOMPTARGET_SYSTEM_TARGETS "") +set (LIBOMPTARGET_TESTED_PLUGINS "") + +# Check whether using debug mode. In debug mode, allow dumping progress +# messages at runtime by default. Otherwise, it can be enabled +# independently using the LIBOMPTARGET_ENABLE_DEBUG option. +string( TOLOWER "${CMAKE_BUILD_TYPE}" LIBOMPTARGET_CMAKE_BUILD_TYPE) +if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) + option(LIBOMPTARGET_ENABLE_DEBUG "Allow debug output with the environment variable LIBOMPTARGET_DEBUG=1" ON) +else() + option(LIBOMPTARGET_ENABLE_DEBUG "Allow debug output with the environment variable LIBOMPTARGET_DEBUG=1" OFF) +endif() +if(LIBOMPTARGET_ENABLE_DEBUG) + add_definitions(-DOMPTARGET_DEBUG) +endif() + +include_directories(include) + +# Build target agnostic offloading library. +add_subdirectory(src) + +# Retrieve the path to the resulting library so that it can be used for +# testing. 
+get_target_property(LIBOMPTARGET_LIBRARY_DIR omptarget LIBRARY_OUTPUT_DIRECTORY) +if(NOT LIBOMPTARGET_LIBRARY_DIR) + set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) +endif() + +# Definitions for testing, for reuse when testing libomptarget-nvptx. +if(OPENMP_STANDALONE_BUILD) + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING + "Path to folder containing omp.h") + set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING + "Path to folder containing libomp.so") +else() + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src") +endif() + + +# Build offloading plugins and device RTLs if they are available. +add_subdirectory(plugins) +add_subdirectory(deviceRTLs) + +# Add tests. +add_subdirectory(test) diff --git a/openmp/libomptarget/README.txt b/openmp/libomptarget/README.txt index 8c0a83729fdbe..18c7a0e970965 100644 --- a/openmp/libomptarget/README.txt +++ b/openmp/libomptarget/README.txt @@ -1,73 +1,73 @@ - - README for the LLVM* OpenMP* Offloading Runtime Library (libomptarget) - ====================================================================== - -How to Build the LLVM* OpenMP* Offloading Runtime Library (libomptarget) -======================================================================== -In-tree build: - -$ cd where-you-want-to-live -Check out openmp (libomptarget lives under ./libomptarget) into llvm/projects -$ cd where-you-want-to-build -$ mkdir build && cd build -$ cmake path/to/llvm -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= -$ make omptarget - -Out-of-tree build: - -$ cd where-you-want-to-live -Check out openmp (libomptarget lives under ./libomptarget) -$ cd where-you-want-to-live/openmp/libomptarget -$ mkdir build && cd build -$ cmake path/to/openmp -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= -$ make - -For details about building, please look at README.rst in the parent directory. - -Architectures Supported -======================= -The current library has been only tested in Linux operating system and the -following host architectures: -* Intel(R) 64 architecture -* IBM(R) Power architecture (big endian) -* IBM(R) Power architecture (little endian) -* ARM(R) AArch64 architecture (little endian) - -The currently supported offloading device architectures are: -* Intel(R) 64 architecture (generic 64-bit plugin - mostly for testing purposes) -* IBM(R) Power architecture (big endian) (generic 64-bit plugin - mostly for testing purposes) -* IBM(R) Power architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) -* ARM(R) AArch64 architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) -* CUDA(R) enabled 64-bit NVIDIA(R) GPU architectures - -Supported RTL Build Configurations -================================== -Supported Architectures: Intel(R) 64, IBM(R) Power 7 and Power 8 - - --------------------------- - | gcc | clang | ---------------|------------|------------| -| Linux* OS | Yes(1) | Yes(2) | ------------------------------------------ - -(1) gcc version 4.8.2 or later is supported. -(2) clang version 3.7 or later is supported. 
- - -Front-end Compilers that work with this RTL -=========================================== - -The following compilers are known to do compatible code generation for -this RTL: - - clang (from https://github.com/clang-ykt ) - - clang (development branch at http://clang.llvm.org - several features still - under development) - ------------------------------------------------------------------------ - -Notices -======= -This library and related compiler support is still under development, so the -employed interface is likely to change in the future. - -*Other names and brands may be claimed as the property of others. + + README for the LLVM* OpenMP* Offloading Runtime Library (libomptarget) + ====================================================================== + +How to Build the LLVM* OpenMP* Offloading Runtime Library (libomptarget) +======================================================================== +In-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) into llvm/projects +$ cd where-you-want-to-build +$ mkdir build && cd build +$ cmake path/to/llvm -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= +$ make omptarget + +Out-of-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) +$ cd where-you-want-to-live/openmp/libomptarget +$ mkdir build && cd build +$ cmake path/to/openmp -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= +$ make + +For details about building, please look at README.rst in the parent directory. + +Architectures Supported +======================= +The current library has been only tested in Linux operating system and the +following host architectures: +* Intel(R) 64 architecture +* IBM(R) Power architecture (big endian) +* IBM(R) Power architecture (little endian) +* ARM(R) AArch64 architecture (little endian) + +The currently supported offloading device architectures are: +* Intel(R) 64 architecture (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (big endian) (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) +* ARM(R) AArch64 architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) +* CUDA(R) enabled 64-bit NVIDIA(R) GPU architectures + +Supported RTL Build Configurations +================================== +Supported Architectures: Intel(R) 64, IBM(R) Power 7 and Power 8 + + --------------------------- + | gcc | clang | +--------------|------------|------------| +| Linux* OS | Yes(1) | Yes(2) | +----------------------------------------- + +(1) gcc version 4.8.2 or later is supported. +(2) clang version 3.7 or later is supported. + + +Front-end Compilers that work with this RTL +=========================================== + +The following compilers are known to do compatible code generation for +this RTL: + - clang (from https://github.com/clang-ykt ) + - clang (development branch at http://clang.llvm.org - several features still + under development) + +----------------------------------------------------------------------- + +Notices +======= +This library and related compiler support is still under development, so the +employed interface is likely to change in the future. + +*Other names and brands may be claimed as the property of others. 
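As a usage sketch only (not part of the patch above): the CMakeLists.txt changes keep the LIBOMPTARGET_ENABLE_DEBUG option and the LIBOMPTARGET_DEBUG=1 environment variable, and the README describes an out-of-tree build. Assuming clang/clang++ are the desired compilers and using a hypothetical application name, enabling runtime debug output in such a build might look like:

$ cd where-you-want-to-live/openmp/libomptarget
$ mkdir build && cd build
$ cmake path/to/openmp -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
    -DLIBOMPTARGET_ENABLE_DEBUG=ON
$ make
$ LIBOMPTARGET_DEBUG=1 ./offload_app    (hypothetical app; prints libomptarget progress messages at runtime)

Passing -DLIBOMPTARGET_ENABLE_DEBUG=ON explicitly is only needed for non-debug build types, since the option defaults to ON when CMAKE_BUILD_TYPE is a debug configuration.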
diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake index dbf8c381de139..ae05405e9f2fd 100644 --- a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake @@ -1,192 +1,192 @@ -# -#//===----------------------------------------------------------------------===// -#// -#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -#// See https://llvm.org/LICENSE.txt for license information. -#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#// -#//===----------------------------------------------------------------------===// -# - -# Try to detect in the system several dependencies required by the different -# components of libomptarget. These are the dependencies we have: -# -# libelf : required by some targets to handle the ELF files at runtime. -# libffi : required to launch target kernels given function and argument -# pointers. -# CUDA : required to control offloading to NVIDIA GPUs. - -include (FindPackageHandleStandardArgs) - -################################################################################ -# Looking for libelf... -################################################################################ - -find_path ( - LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR - NAMES - libelf.h - PATHS - /usr/include - /usr/local/include - /opt/local/include - /sw/include - ENV CPATH - PATH_SUFFIXES - libelf) - -find_library ( - LIBOMPTARGET_DEP_LIBELF_LIBRARIES - NAMES - elf - PATHS - /usr/lib - /usr/local/lib - /opt/local/lib - /sw/lib - ENV LIBRARY_PATH - ENV LD_LIBRARY_PATH) - -set(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) -find_package_handle_standard_args( - LIBOMPTARGET_DEP_LIBELF - DEFAULT_MSG - LIBOMPTARGET_DEP_LIBELF_LIBRARIES - LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS) - -mark_as_advanced( - LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS - LIBOMPTARGET_DEP_LIBELF_LIBRARIES) - -################################################################################ -# Looking for libffi... -################################################################################ -find_package(PkgConfig) - -pkg_check_modules(LIBOMPTARGET_SEARCH_LIBFFI QUIET libffi) - -find_path ( - LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR - NAMES - ffi.h - HINTS - ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDEDIR} - ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDE_DIRS} - PATHS - /usr/include - /usr/local/include - /opt/local/include - /sw/include - ENV CPATH) - -# Don't bother look for the library if the header files were not found. -if (LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR) - find_library ( - LIBOMPTARGET_DEP_LIBFFI_LIBRARIES - NAMES - ffi - HINTS - ${LIBOMPTARGET_SEARCH_LIBFFI_LIBDIR} - ${LIBOMPTARGET_SEARCH_LIBFFI_LIBRARY_DIRS} - PATHS - /usr/lib - /usr/local/lib - /opt/local/lib - /sw/lib - ENV LIBRARY_PATH - ENV LD_LIBRARY_PATH) -endif() - -set(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) -find_package_handle_standard_args( - LIBOMPTARGET_DEP_LIBFFI - DEFAULT_MSG - LIBOMPTARGET_DEP_LIBFFI_LIBRARIES - LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS) - -mark_as_advanced( - LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS - LIBOMPTARGET_DEP_LIBFFI_LIBRARIES) - -################################################################################ -# Looking for CUDA... 
-################################################################################ -if (CUDA_TOOLKIT_ROOT_DIR) - set(LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET TRUE) -endif() -find_package(CUDA QUIET) - -set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDA_FOUND}) -set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) - -mark_as_advanced( - LIBOMPTARGET_DEP_CUDA_FOUND - LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS) - -################################################################################ -# Looking for CUDA Driver API... (needed for CUDA plugin) -################################################################################ - -find_library ( - LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES - NAMES - cuda - PATHS - /lib64) - -# There is a libcuda.so in lib64/stubs that can be used for linking. -if (NOT LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES AND CUDA_FOUND) - # Since CMake 3.3 FindCUDA.cmake defaults to using static libraries. In this - # case CUDA_LIBRARIES contains additional linker arguments which breaks - # get_filename_component below. Fortunately, since that change the module - # exports CUDA_cudart_static_LIBRARY which points to a single file in the - # right directory. - set(cuda_library ${CUDA_LIBRARIES}) - if (DEFINED CUDA_cudart_static_LIBRARY) - set(cuda_library ${CUDA_cudart_static_LIBRARY}) - endif() - get_filename_component(CUDA_LIBDIR ${cuda_library} DIRECTORY) - find_library ( - LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES - NAMES - cuda - HINTS - "${CUDA_LIBDIR}/stubs") -endif() - -find_package_handle_standard_args( - LIBOMPTARGET_DEP_CUDA_DRIVER - DEFAULT_MSG - LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES) - -mark_as_advanced(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES) - -################################################################################ -# Looking for CUDA libdevice subdirectory -# -# Special case for Debian/Ubuntu to have nvidia-cuda-toolkit work -# out of the box. More info on http://bugs.debian.org/882505 -################################################################################ - -set(LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR nvvm/libdevice) - -# Don't alter CUDA_TOOLKIT_ROOT_DIR if the user specified it, if a value was -# already cached for it, or if it already has libdevice. Otherwise, on -# Debian/Ubuntu, look where the nvidia-cuda-toolkit package normally installs -# libdevice. -if (NOT LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET AND - NOT EXISTS - "${CUDA_TOOLKIT_ROOT_DIR}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}") - find_program(LSB_RELEASE lsb_release) - if (LSB_RELEASE) - execute_process(COMMAND ${LSB_RELEASE} -is - OUTPUT_VARIABLE LSB_RELEASE_ID - OUTPUT_STRIP_TRAILING_WHITESPACE) - set(candidate_dir /usr/lib/cuda) - if ((LSB_RELEASE_ID STREQUAL "Debian" OR LSB_RELEASE_ID STREQUAL "Ubuntu") - AND EXISTS "${candidate_dir}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}") - set(CUDA_TOOLKIT_ROOT_DIR "${candidate_dir}" CACHE PATH - "Toolkit location." FORCE) - endif() - endif() -endif() +# +#//===----------------------------------------------------------------------===// +#// +#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +#// See https://llvm.org/LICENSE.txt for license information. +#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#// +#//===----------------------------------------------------------------------===// +# + +# Try to detect in the system several dependencies required by the different +# components of libomptarget. These are the dependencies we have: +# +# libelf : required by some targets to handle the ELF files at runtime. 
+# libffi : required to launch target kernels given function and argument +# pointers. +# CUDA : required to control offloading to NVIDIA GPUs. + +include (FindPackageHandleStandardArgs) + +################################################################################ +# Looking for libelf... +################################################################################ + +find_path ( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR + NAMES + libelf.h + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH + PATH_SUFFIXES + libelf) + +find_library ( + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + NAMES + elf + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +set(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBELF + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBELF_LIBRARIES) + +################################################################################ +# Looking for libffi... +################################################################################ +find_package(PkgConfig) + +pkg_check_modules(LIBOMPTARGET_SEARCH_LIBFFI QUIET libffi) + +find_path ( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR + NAMES + ffi.h + HINTS + ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDEDIR} + ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDE_DIRS} + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH) + +# Don't bother look for the library if the header files were not found. +if (LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR) + find_library ( + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + NAMES + ffi + HINTS + ${LIBOMPTARGET_SEARCH_LIBFFI_LIBDIR} + ${LIBOMPTARGET_SEARCH_LIBFFI_LIBRARY_DIRS} + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) +endif() + +set(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBFFI + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES) + +################################################################################ +# Looking for CUDA... +################################################################################ +if (CUDA_TOOLKIT_ROOT_DIR) + set(LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET TRUE) +endif() +find_package(CUDA QUIET) + +set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDA_FOUND}) +set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) + +mark_as_advanced( + LIBOMPTARGET_DEP_CUDA_FOUND + LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS) + +################################################################################ +# Looking for CUDA Driver API... (needed for CUDA plugin) +################################################################################ + +find_library ( + LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES + NAMES + cuda + PATHS + /lib64) + +# There is a libcuda.so in lib64/stubs that can be used for linking. +if (NOT LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES AND CUDA_FOUND) + # Since CMake 3.3 FindCUDA.cmake defaults to using static libraries. In this + # case CUDA_LIBRARIES contains additional linker arguments which breaks + # get_filename_component below. 
Fortunately, since that change the module + # exports CUDA_cudart_static_LIBRARY which points to a single file in the + # right directory. + set(cuda_library ${CUDA_LIBRARIES}) + if (DEFINED CUDA_cudart_static_LIBRARY) + set(cuda_library ${CUDA_cudart_static_LIBRARY}) + endif() + get_filename_component(CUDA_LIBDIR ${cuda_library} DIRECTORY) + find_library ( + LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES + NAMES + cuda + HINTS + "${CUDA_LIBDIR}/stubs") +endif() + +find_package_handle_standard_args( + LIBOMPTARGET_DEP_CUDA_DRIVER + DEFAULT_MSG + LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES) + +mark_as_advanced(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES) + +################################################################################ +# Looking for CUDA libdevice subdirectory +# +# Special case for Debian/Ubuntu to have nvidia-cuda-toolkit work +# out of the box. More info on http://bugs.debian.org/882505 +################################################################################ + +set(LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR nvvm/libdevice) + +# Don't alter CUDA_TOOLKIT_ROOT_DIR if the user specified it, if a value was +# already cached for it, or if it already has libdevice. Otherwise, on +# Debian/Ubuntu, look where the nvidia-cuda-toolkit package normally installs +# libdevice. +if (NOT LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET AND + NOT EXISTS + "${CUDA_TOOLKIT_ROOT_DIR}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}") + find_program(LSB_RELEASE lsb_release) + if (LSB_RELEASE) + execute_process(COMMAND ${LSB_RELEASE} -is + OUTPUT_VARIABLE LSB_RELEASE_ID + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(candidate_dir /usr/lib/cuda) + if ((LSB_RELEASE_ID STREQUAL "Debian" OR LSB_RELEASE_ID STREQUAL "Ubuntu") + AND EXISTS "${candidate_dir}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}") + set(CUDA_TOOLKIT_ROOT_DIR "${candidate_dir}" CACHE PATH + "Toolkit location." FORCE) + endif() + endif() +endif() diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake index 6ec0cc2b61bc0..6128618bf487d 100644 --- a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -1,111 +1,111 @@ -# -#//===----------------------------------------------------------------------===// -#// -#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -#// See https://llvm.org/LICENSE.txt for license information. -#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#// -#//===----------------------------------------------------------------------===// -# - -# We use the compiler and linker provided by the user, attempt to use the one -# used to build libomptarget or just fail. -set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE) - -if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) -elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) -else() - return() -endif() - -# Get compiler directory to try to locate a suitable linker. -get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY) -set(llvm_link "${compiler_dir}/llvm-link") - -if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") - set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) -elseif (EXISTS "${llvm_link}") - # Use llvm-link from the compiler directory. 
- set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}") -else() - return() -endif() - -function(try_compile_bitcode output source) - set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu) - file(WRITE ${srcfile} "${source}\n") - set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc) - - # The remaining arguments are the flags to be tested. - # FIXME: Don't hardcode GPU version. This is currently required because - # Clang refuses to compile its default of sm_20 with CUDA 9. - execute_process( - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN} - --cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile} - RESULT_VARIABLE result - OUTPUT_QUIET ERROR_QUIET) - if (result EQUAL 0) - set(${output} TRUE PARENT_SCOPE) - else() - set(${output} FALSE PARENT_SCOPE) - endif() -endfunction() - -# Save for which compiler we are going to do the following checks so that we -# can discard cached values if the user specifies a different value. -set(discard_cached FALSE) -if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND - NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}")) - set(discard_cached TRUE) -endif() -set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE) - -function(check_bitcode_compilation output source) - if (${discard_cached} OR NOT DEFINED ${output}) - message(STATUS "Performing Test ${output}") - # Forward additional arguments which contain the flags. - try_compile_bitcode(result "${source}" ${ARGN}) - set(${output} ${result} CACHE INTERNAL "" FORCE) - if(${result}) - message(STATUS "Performing Test ${output} - Success") - else() - message(STATUS "Performing Test ${output} - Failed") - endif() - endif() -endfunction() - -# These flags are required to emit LLVM Bitcode. We check them together because -# if any of them are not supported, there is no point in finding out which are. -set(compiler_flags_required -emit-llvm -O1 --cuda-device-only -std=c++14 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) -set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }") -check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required}) - -# It makes no sense to continue given that the compiler doesn't support -# emitting basic LLVM Bitcode -if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED) - return() -endif() - -set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required}) - -# Declaring external shared device variables might need an additional flag -# since Clang 7.0 and was entirely unsupported since version 4.0. 
-set(extern_device_shared_src "extern __device__ __shared__ int test;") - -check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}) -if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED) - set(compiler_flag_fcuda_rdc -fcuda-rdc) - set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc}) - check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full}) - - if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC) - return() - endif() - - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}") -endif() - -# We can compile LLVM Bitcode from CUDA source code! -set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE) +# +#//===----------------------------------------------------------------------===// +#// +#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +#// See https://llvm.org/LICENSE.txt for license information. +#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#// +#//===----------------------------------------------------------------------===// +# + +# We use the compiler and linker provided by the user, attempt to use the one +# used to build libomptarget or just fail. +set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE) + +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) +elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) +else() + return() +endif() + +# Get compiler directory to try to locate a suitable linker. +get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY) +set(llvm_link "${compiler_dir}/llvm-link") + +if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) +elseif (EXISTS "${llvm_link}") + # Use llvm-link from the compiler directory. + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}") +else() + return() +endif() + +function(try_compile_bitcode output source) + set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu) + file(WRITE ${srcfile} "${source}\n") + set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc) + + # The remaining arguments are the flags to be tested. + # FIXME: Don't hardcode GPU version. This is currently required because + # Clang refuses to compile its default of sm_20 with CUDA 9. + execute_process( + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN} + --cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile} + RESULT_VARIABLE result + OUTPUT_QUIET ERROR_QUIET) + if (result EQUAL 0) + set(${output} TRUE PARENT_SCOPE) + else() + set(${output} FALSE PARENT_SCOPE) + endif() +endfunction() + +# Save for which compiler we are going to do the following checks so that we +# can discard cached values if the user specifies a different value. 
+set(discard_cached FALSE) +if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND + NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}")) + set(discard_cached TRUE) +endif() +set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE) + +function(check_bitcode_compilation output source) + if (${discard_cached} OR NOT DEFINED ${output}) + message(STATUS "Performing Test ${output}") + # Forward additional arguments which contain the flags. + try_compile_bitcode(result "${source}" ${ARGN}) + set(${output} ${result} CACHE INTERNAL "" FORCE) + if(${result}) + message(STATUS "Performing Test ${output} - Success") + else() + message(STATUS "Performing Test ${output} - Failed") + endif() + endif() +endfunction() + +# These flags are required to emit LLVM Bitcode. We check them together because +# if any of them are not supported, there is no point in finding out which are. +set(compiler_flags_required -emit-llvm -O1 --cuda-device-only -std=c++14 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) +set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }") +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required}) + +# It makes no sense to continue given that the compiler doesn't support +# emitting basic LLVM Bitcode +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED) + return() +endif() + +set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required}) + +# Declaring external shared device variables might need an additional flag +# since Clang 7.0 and was entirely unsupported since version 4.0. +set(extern_device_shared_src "extern __device__ __shared__ int test;") + +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}) +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED) + set(compiler_flag_fcuda_rdc -fcuda-rdc) + set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc}) + check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full}) + + if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC) + return() + endif() + + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}") +endif() + +# We can compile LLVM Bitcode from CUDA source code! +set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE) diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetUtils.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetUtils.cmake index 7339cc0b56edd..1f686067a838c 100644 --- a/openmp/libomptarget/cmake/Modules/LibomptargetUtils.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetUtils.cmake @@ -1,27 +1,27 @@ -# -#//===----------------------------------------------------------------------===// -#// -#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -#// See https://llvm.org/LICENSE.txt for license information. 
-#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#// -#//===----------------------------------------------------------------------===// -# - -# void libomptarget_say(string message_to_user); -# - prints out message_to_user -macro(libomptarget_say message_to_user) - message(STATUS "LIBOMPTARGET: ${message_to_user}") -endmacro() - -# void libomptarget_warning_say(string message_to_user); -# - prints out message_to_user with a warning -macro(libomptarget_warning_say message_to_user) - message(WARNING "LIBOMPTARGET: ${message_to_user}") -endmacro() - -# void libomptarget_error_say(string message_to_user); -# - prints out message_to_user with an error and exits cmake -macro(libomptarget_error_say message_to_user) - message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}") -endmacro() +# +#//===----------------------------------------------------------------------===// +#// +#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +#// See https://llvm.org/LICENSE.txt for license information. +#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#// +#//===----------------------------------------------------------------------===// +# + +# void libomptarget_say(string message_to_user); +# - prints out message_to_user +macro(libomptarget_say message_to_user) + message(STATUS "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_warning_say(string message_to_user); +# - prints out message_to_user with a warning +macro(libomptarget_warning_say message_to_user) + message(WARNING "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_error_say(string message_to_user); +# - prints out message_to_user with an error and exits cmake +macro(libomptarget_error_say message_to_user) + message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}") +endmacro() diff --git a/openmp/libomptarget/deviceRTLs/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/CMakeLists.txt index 3df94eac0727e..9761b8b262ee9 100644 --- a/openmp/libomptarget/deviceRTLs/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/CMakeLists.txt @@ -1,13 +1,13 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ##===----------------------------------------------------------------------===## -# -# Build a device RTL for each available machine. -# -##===----------------------------------------------------------------------===## - -add_subdirectory(nvptx) +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ##===----------------------------------------------------------------------===## +# +# Build a device RTL for each available machine. 
+# +##===----------------------------------------------------------------------===## + +add_subdirectory(nvptx) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt index 1a24bfd6f8876..47d8380ac1157 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -1,153 +1,153 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build the AMDGCN Device RTL if the ROCM tools are available -# -##===----------------------------------------------------------------------===## - -find_package(LLVM QUIET CONFIG - PATHS - $ENV{AOMP} - $ENV{HOME}/rocm/aomp - /opt/rocm/aomp - /usr/lib/rocm/aomp - ${LIBOMPTARGET_NVPTX_CUDA_COMPILER_DIR} - ${LIBOMPTARGET_NVPTX_CUDA_LINKER_DIR} - ${CMAKE_CXX_COMPILER_DIR} - NO_DEFAULT_PATH) - -if (LLVM_DIR) - libomptarget_say("Found LLVM ${LLVM_PACKAGE_VERSION}. Configure: ${LLVM_DIR}/LLVMConfig.cmake") -else() - libomptarget_say("Not building AMDGCN device RTL: AOMP not found") - return() -endif() - -set(AOMP_INSTALL_PREFIX ${LLVM_INSTALL_PREFIX}) - -if (AOMP_INSTALL_PREFIX) - set(AOMP_BINDIR ${AOMP_INSTALL_PREFIX}/bin) -else() - set(AOMP_BINDIR ${LLVM_BUILD_BINARY_DIR}/bin) -endif() - -libomptarget_say("Building AMDGCN device RTL. LLVM_COMPILER_PATH=${AOMP_BINDIR}") - -project(omptarget-amdgcn) - -add_custom_target(omptarget-amdgcn ALL) - -#optimization level -set(optimization_level 2) - -# Activate RTL message dumps if requested by the user. 
-if(LIBOMPTARGET_NVPTX_DEBUG) - set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1) -endif() - -get_filename_component(devicertl_base_directory - ${CMAKE_CURRENT_SOURCE_DIR} - DIRECTORY) - -set(cuda_sources - ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip - ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_locks.hip - ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip - ${devicertl_base_directory}/common/src/cancel.cu - ${devicertl_base_directory}/common/src/critical.cu - ${devicertl_base_directory}/common/src/data_sharing.cu - ${devicertl_base_directory}/common/src/libcall.cu - ${devicertl_base_directory}/common/src/loop.cu - ${devicertl_base_directory}/common/src/omp_data.cu - ${devicertl_base_directory}/common/src/omptarget.cu - ${devicertl_base_directory}/common/src/parallel.cu - ${devicertl_base_directory}/common/src/reduction.cu - ${devicertl_base_directory}/common/src/support.cu - ${devicertl_base_directory}/common/src/sync.cu - ${devicertl_base_directory}/common/src/task.cu) - -set(h_files - ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h - ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_atomics.h - ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h - ${devicertl_base_directory}/common/debug.h - ${devicertl_base_directory}/common/device_environment.h - ${devicertl_base_directory}/common/omptarget.h - ${devicertl_base_directory}/common/omptargeti.h - ${devicertl_base_directory}/common/state-queue.h - ${devicertl_base_directory}/common/target_atomic.h - ${devicertl_base_directory}/common/state-queuei.h - ${devicertl_base_directory}/common/support.h) - -# for both in-tree and out-of-tree build -if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY) - set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -else() - set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}) -endif() - -# create libraries -set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900) -if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST) - set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST}) -endif() - -macro(add_cuda_bc_library) - set(cu_cmd ${AOMP_BINDIR}/clang++ - -std=c++14 - -fcuda-rdc - -fvisibility=default - --cuda-device-only - -Wno-unused-value - -x hip - -O${optimization_level} - --cuda-gpu-arch=${mcpu} - ${CUDA_DEBUG} - -I${CMAKE_CURRENT_SOURCE_DIR}/src - -I${devicertl_base_directory}) - - set(bc1_files) - - foreach(file ${ARGN}) - get_filename_component(fname ${file} NAME_WE) - set(bc1_filename ${fname}.${mcpu}.bc) - - add_custom_command( - OUTPUT ${bc1_filename} - COMMAND ${cu_cmd} ${file} -o ${bc1_filename} - DEPENDS ${file} ${h_files}) - - list(APPEND bc1_files ${bc1_filename}) - endforeach() - - add_custom_command( - OUTPUT linkout.cuda.${mcpu}.bc - COMMAND ${AOMP_BINDIR}/llvm-link ${bc1_files} -o linkout.cuda.${mcpu}.bc - DEPENDS ${bc1_files}) - - list(APPEND bc_files linkout.cuda.${mcpu}.bc) -endmacro() - -set(libname "omptarget-amdgcn") - -foreach(mcpu ${mcpus}) - set(bc_files) - add_cuda_bc_library(${cuda_sources}) - - set(bc_libname lib${libname}-${mcpu}.bc) - add_custom_command( - OUTPUT ${bc_libname} - COMMAND ${AOMP_BINDIR}/llvm-link ${bc_files} | ${AOMP_BINDIR}/opt --always-inline -o ${OUTPUTDIR}/${bc_libname} - DEPENDS ${bc_files}) - - add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname}) - - install(FILES ${OUTPUTDIR}/${bc_libname} - DESTINATION "${OPENMP_INSTALL_LIBDIR}/libdevice" - ) -endforeach() +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the AMDGCN Device RTL if the ROCM tools are available +# +##===----------------------------------------------------------------------===## + +find_package(LLVM QUIET CONFIG + PATHS + $ENV{AOMP} + $ENV{HOME}/rocm/aomp + /opt/rocm/aomp + /usr/lib/rocm/aomp + ${LIBOMPTARGET_NVPTX_CUDA_COMPILER_DIR} + ${LIBOMPTARGET_NVPTX_CUDA_LINKER_DIR} + ${CMAKE_CXX_COMPILER_DIR} + NO_DEFAULT_PATH) + +if (LLVM_DIR) + libomptarget_say("Found LLVM ${LLVM_PACKAGE_VERSION}. Configure: ${LLVM_DIR}/LLVMConfig.cmake") +else() + libomptarget_say("Not building AMDGCN device RTL: AOMP not found") + return() +endif() + +set(AOMP_INSTALL_PREFIX ${LLVM_INSTALL_PREFIX}) + +if (AOMP_INSTALL_PREFIX) + set(AOMP_BINDIR ${AOMP_INSTALL_PREFIX}/bin) +else() + set(AOMP_BINDIR ${LLVM_BUILD_BINARY_DIR}/bin) +endif() + +libomptarget_say("Building AMDGCN device RTL. LLVM_COMPILER_PATH=${AOMP_BINDIR}") + +project(omptarget-amdgcn) + +add_custom_target(omptarget-amdgcn ALL) + +#optimization level +set(optimization_level 2) + +# Activate RTL message dumps if requested by the user. +if(LIBOMPTARGET_NVPTX_DEBUG) + set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1) +endif() + +get_filename_component(devicertl_base_directory + ${CMAKE_CURRENT_SOURCE_DIR} + DIRECTORY) + +set(cuda_sources + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_locks.hip + ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip + ${devicertl_base_directory}/common/src/cancel.cu + ${devicertl_base_directory}/common/src/critical.cu + ${devicertl_base_directory}/common/src/data_sharing.cu + ${devicertl_base_directory}/common/src/libcall.cu + ${devicertl_base_directory}/common/src/loop.cu + ${devicertl_base_directory}/common/src/omp_data.cu + ${devicertl_base_directory}/common/src/omptarget.cu + ${devicertl_base_directory}/common/src/parallel.cu + ${devicertl_base_directory}/common/src/reduction.cu + ${devicertl_base_directory}/common/src/support.cu + ${devicertl_base_directory}/common/src/sync.cu + ${devicertl_base_directory}/common/src/task.cu) + +set(h_files + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h + ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_atomics.h + ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h + ${devicertl_base_directory}/common/debug.h + ${devicertl_base_directory}/common/device_environment.h + ${devicertl_base_directory}/common/omptarget.h + ${devicertl_base_directory}/common/omptargeti.h + ${devicertl_base_directory}/common/state-queue.h + ${devicertl_base_directory}/common/target_atomic.h + ${devicertl_base_directory}/common/state-queuei.h + ${devicertl_base_directory}/common/support.h) + +# for both in-tree and out-of-tree build +if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY) + set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR}) +else() + set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}) +endif() + +# create libraries +set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900) +if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST) + set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST}) +endif() + +macro(add_cuda_bc_library) + set(cu_cmd ${AOMP_BINDIR}/clang++ + -std=c++14 + -fcuda-rdc + -fvisibility=default + --cuda-device-only + -Wno-unused-value + -x hip + -O${optimization_level} + --cuda-gpu-arch=${mcpu} + ${CUDA_DEBUG} + -I${CMAKE_CURRENT_SOURCE_DIR}/src + -I${devicertl_base_directory}) + + set(bc1_files) + + foreach(file ${ARGN}) + get_filename_component(fname ${file} NAME_WE) + set(bc1_filename ${fname}.${mcpu}.bc) + + 
add_custom_command( + OUTPUT ${bc1_filename} + COMMAND ${cu_cmd} ${file} -o ${bc1_filename} + DEPENDS ${file} ${h_files}) + + list(APPEND bc1_files ${bc1_filename}) + endforeach() + + add_custom_command( + OUTPUT linkout.cuda.${mcpu}.bc + COMMAND ${AOMP_BINDIR}/llvm-link ${bc1_files} -o linkout.cuda.${mcpu}.bc + DEPENDS ${bc1_files}) + + list(APPEND bc_files linkout.cuda.${mcpu}.bc) +endmacro() + +set(libname "omptarget-amdgcn") + +foreach(mcpu ${mcpus}) + set(bc_files) + add_cuda_bc_library(${cuda_sources}) + + set(bc_libname lib${libname}-${mcpu}.bc) + add_custom_command( + OUTPUT ${bc_libname} + COMMAND ${AOMP_BINDIR}/llvm-link ${bc_files} | ${AOMP_BINDIR}/opt --always-inline -o ${OUTPUTDIR}/${bc_libname} + DEPENDS ${bc_files}) + + add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname}) + + install(FILES ${OUTPUTDIR}/${bc_libname} + DESTINATION "${OPENMP_INSTALL_LIBDIR}/libdevice" + ) +endforeach() diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h index f7c75c09362a2..e1042e0367217 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h @@ -1,18 +1,18 @@ -//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _AMDGCN_INTERFACE_H_ -#define _AMDGCN_INTERFACE_H_ - -#include - -#define EXTERN extern "C" __attribute__((device)) -typedef uint64_t __kmpc_impl_lanemask_t; -typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ - -#endif +//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _AMDGCN_INTERFACE_H_ +#define _AMDGCN_INTERFACE_H_ + +#include + +#define EXTERN extern "C" __attribute__((device)) +typedef uint64_t __kmpc_impl_lanemask_t; +typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ + +#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip index 4163a14f50bf1..c64200d4289fa 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip @@ -1,28 +1,28 @@ -//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock -// cannot be implemented - if one thread gets the lock, it can't continue on to -// the next instruction in order to do anything as the other threads are waiting -// to take the lock. 
-// These functions will be implemented to provide the documented semantics for -// a SIMD => wavefront mapping once that is implemented. -// -//===----------------------------------------------------------------------===// - -#include "common/debug.h" - -static DEVICE void warn() { - PRINT0(LD_ALL, "Locks are not supported in this thread mapping model"); -} - -DEVICE void __kmpc_impl_init_lock(omp_lock_t *) { warn(); } -DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); } -DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); } -DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); } -DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); } +//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock +// cannot be implemented - if one thread gets the lock, it can't continue on to +// the next instruction in order to do anything as the other threads are waiting +// to take the lock. +// These functions will be implemented to provide the documented semantics for +// a SIMD => wavefront mapping once that is implemented. +// +//===----------------------------------------------------------------------===// + +#include "common/debug.h" + +static DEVICE void warn() { + PRINT0(LD_ALL, "Locks are not supported in this thread mapping model"); +} + +DEVICE void __kmpc_impl_init_lock(omp_lock_t *) { warn(); } +DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); } +DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); } +DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); } +DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); } diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip index 74d0d167137fb..87f02d51cfca7 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip @@ -1,61 +1,61 @@ -//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "target_impl.h" - -// Partially derived fom hcc_detail/device_functions.h - -// HW_ID Register bit structure -// WAVE_ID 3:0 Wave buffer slot number. 0-9. -// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. -// PIPE_ID 7:6 Pipeline from which the wave was dispatched. -// CU_ID 11:8 Compute Unit the wave is assigned to. -// SH_ID 12 Shader Array (within an SE) the wave is assigned to. -// SE_ID 14:13 Shader Engine the wave is assigned to. -// TG_ID 19:16 Thread-group ID -// VM_ID 23:20 Virtual Memory ID -// QUEUE_ID 26:24 Queue from which this wave was dispatched. -// STATE_ID 29:27 State ID (graphics only, not compute). -// ME_ID 31:30 Micro-engine ID. 
- -enum { - HW_ID = 4, // specify that the hardware register to read is HW_ID - - HW_ID_CU_ID_SIZE = 4, // size of CU_ID field in bits - HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register - - HW_ID_SE_ID_SIZE = 2, // sizeof SE_ID field in bits - HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register -}; - -// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit -// immediate and returns a 32 bit value. -// The encoding of the immediate parameter is: -// ID 5:0 Which register to read from -// OFFSET 10:6 Range: 0..31 -// WIDTH 15:11 Range: 1..32 - -// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width) -// where hwreg forms a 16 bit immediate encoded by the assembler thus: -// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { -// return (Id << 0_) | (Offset << 6) | ((Width - 1) << 11); -// } -#define ENCODE_HWREG(WIDTH, OFF, REG) (REG | (OFF << 6) | ((WIDTH - 1) << 11)) - -// Note: The results can be changed by a context switch -// Return value in [0 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper -// bound on how many compute units are available. Some values in this -// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs. - -DEVICE uint32_t __kmpc_impl_smid() { - uint32_t cu_id = __builtin_amdgcn_s_getreg( - ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID)); - uint32_t se_id = __builtin_amdgcn_s_getreg( - ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID)); - return (se_id << HW_ID_CU_ID_SIZE) + cu_id; -} +//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "target_impl.h" + +// Partially derived fom hcc_detail/device_functions.h + +// HW_ID Register bit structure +// WAVE_ID 3:0 Wave buffer slot number. 0-9. +// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. +// PIPE_ID 7:6 Pipeline from which the wave was dispatched. +// CU_ID 11:8 Compute Unit the wave is assigned to. +// SH_ID 12 Shader Array (within an SE) the wave is assigned to. +// SE_ID 14:13 Shader Engine the wave is assigned to. +// TG_ID 19:16 Thread-group ID +// VM_ID 23:20 Virtual Memory ID +// QUEUE_ID 26:24 Queue from which this wave was dispatched. +// STATE_ID 29:27 State ID (graphics only, not compute). +// ME_ID 31:30 Micro-engine ID. + +enum { + HW_ID = 4, // specify that the hardware register to read is HW_ID + + HW_ID_CU_ID_SIZE = 4, // size of CU_ID field in bits + HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register + + HW_ID_SE_ID_SIZE = 2, // sizeof SE_ID field in bits + HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register +}; + +// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit +// immediate and returns a 32 bit value. 
+// The encoding of the immediate parameter is: +// ID 5:0 Which register to read from +// OFFSET 10:6 Range: 0..31 +// WIDTH 15:11 Range: 1..32 + +// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width) +// where hwreg forms a 16 bit immediate encoded by the assembler thus: +// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { +// return (Id << 0_) | (Offset << 6) | ((Width - 1) << 11); +// } +#define ENCODE_HWREG(WIDTH, OFF, REG) (REG | (OFF << 6) | ((WIDTH - 1) << 11)) + +// Note: The results can be changed by a context switch +// Return value in [0 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper +// bound on how many compute units are available. Some values in this +// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs. + +DEVICE uint32_t __kmpc_impl_smid() { + uint32_t cu_id = __builtin_amdgcn_s_getreg( + ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID)); + uint32_t se_id = __builtin_amdgcn_s_getreg( + ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID)); + return (se_id << HW_ID_CU_ID_SIZE) + cu_id; +} diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h index df102c765925c..312003d902d0d 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h @@ -1,42 +1,42 @@ -//===---- hip_atomics.h - Declarations of hip atomic functions ---- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_AMDGCN_HIP_ATOMICS_H -#define OMPTARGET_AMDGCN_HIP_ATOMICS_H - -#include "target_impl.h" - -// inc requires an amdgcn specific intrinsic which is not yet available -DEVICE unsigned atomicInc(unsigned *address); -DEVICE unsigned atomicInc(unsigned *address, unsigned max); -DEVICE int atomicInc(int *address); - -namespace { - -template DEVICE T atomicAdd(T *address, T val) { - return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST); -} - -template DEVICE T atomicMax(T *address, T val) { - return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST); -} - -template DEVICE T atomicExch(T *address, T val) { - T r; - __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST); - return r; -} - -template DEVICE T atomicCAS(T *address, T compare, T val) { - (void)__atomic_compare_exchange(address, &compare, &val, false, - __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); - return compare; -} - -} // namespace -#endif +//===---- hip_atomics.h - Declarations of hip atomic functions ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_AMDGCN_HIP_ATOMICS_H +#define OMPTARGET_AMDGCN_HIP_ATOMICS_H + +#include "target_impl.h" + +// inc requires an amdgcn specific intrinsic which is not yet available +DEVICE unsigned atomicInc(unsigned *address); +DEVICE unsigned atomicInc(unsigned *address, unsigned max); +DEVICE int atomicInc(int *address); + +namespace { + +template DEVICE T atomicAdd(T *address, T val) { + return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST); +} + +template DEVICE T atomicMax(T *address, T val) { + return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST); +} + +template DEVICE T atomicExch(T *address, T val) { + T r; + __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST); + return r; +} + +template DEVICE T atomicCAS(T *address, T compare, T val) { + (void)__atomic_compare_exchange(address, &compare, &val, false, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); + return compare; +} + +} // namespace +#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index 04755a6a3e73f..94a12a248a2fc 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -1,155 +1,155 @@ -//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Declarations and definitions of target specific functions and constants -// -//===----------------------------------------------------------------------===// -#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H -#define OMPTARGET_AMDGCN_TARGET_IMPL_H - -#ifndef __AMDGCN__ -#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__" -#endif - -#include "amdgcn_interface.h" - -#include -#include -#include -#include - -#define DEVICE __attribute__((device)) -#define INLINE inline DEVICE -#define NOINLINE __attribute__((noinline)) DEVICE -#define SHARED __attribute__((shared)) -#define ALIGN(N) __attribute__((aligned(N))) - -#include "hip_atomics.h" - -//////////////////////////////////////////////////////////////////////////////// -// Kernel options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// The following def must match the absolute limit hardwired in the host RTL -// max number of threads per team -#define MAX_THREADS_PER_TEAM 1024 - -#define WARPSIZE 64 - -// The named barrier for active parallel threads of a team in an L1 parallel -// region to synchronize with each other. -#define L1_BARRIER (1) - -// Maximum number of preallocated arguments to an outlined parallel/simd -// function. Anything more requires dynamic memory allocation. -#define MAX_SHARED_ARGS 20 - -// Maximum number of omp state objects per SM allocated statically in global -// memory. -#define OMP_STATE_COUNT 32 -#define MAX_SM 64 - -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - -// Data sharing related quantities, need to match what is used in the compiler. -enum DATA_SHARING_SIZES { - // The maximum number of workers in a kernel. 
- DS_Max_Worker_Threads = 960, - // The size reserved for data in a shared memory slot. - DS_Slot_Size = 256, - // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, - // The maximum number of warps in use - DS_Max_Warp_Number = 16, -}; - -INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { - lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF)); - hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32); -} - -INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { - return (((uint64_t)hi) << 32) | (uint64_t)lo; -} - -static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = - UINT64_C(0xffffffffffffffff); - -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); - -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); - -DEVICE uint32_t __kmpc_impl_smid(); - -DEVICE double __kmpc_impl_get_wtick(); - -DEVICE double __kmpc_impl_get_wtime(); - -INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } - -INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } - -template INLINE T __kmpc_impl_min(T x, T y) { - return x < y ? x : y; -} - -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); - -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, - int32_t SrcLane); - -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, - uint32_t Delta, int32_t Width); - -INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } - -INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) { - // AMDGCN doesn't need to sync threads in a warp -} - -INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) { - // we have protected the master warp from releasing from its barrier - // due to a full workgroup barrier in the middle of a work function. - // So it is ok to issue a full workgroup barrier here. - __builtin_amdgcn_s_barrier(); -} - -DEVICE void __kmpc_impl_threadfence(void); -DEVICE void __kmpc_impl_threadfence_block(void); -DEVICE void __kmpc_impl_threadfence_system(void); - -// Calls to the AMDGCN layer (assuming 1D layout) -INLINE int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } -INLINE int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); } -DEVICE int GetNumberOfBlocksInKernel(); -DEVICE int GetNumberOfThreadsInBlock(); -DEVICE unsigned GetWarpId(); -DEVICE unsigned GetLaneId(); - -DEVICE bool __kmpc_impl_is_first_active_thread(); - -// Locks -DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); -DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock); -DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock); -DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock); -DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock); - -// Memory -DEVICE void *__kmpc_impl_malloc(size_t x); -DEVICE void __kmpc_impl_free(void *x); - -// DEVICE versions of part of libc -INLINE void __assert_fail(const char *, const char *, unsigned int, - const char *) { - __builtin_trap(); -} -EXTERN int printf(const char *, ...); - -#endif +//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declarations and definitions of target specific functions and constants +// +//===----------------------------------------------------------------------===// +#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H +#define OMPTARGET_AMDGCN_TARGET_IMPL_H + +#ifndef __AMDGCN__ +#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__" +#endif + +#include "amdgcn_interface.h" + +#include +#include +#include +#include + +#define DEVICE __attribute__((device)) +#define INLINE inline DEVICE +#define NOINLINE __attribute__((noinline)) DEVICE +#define SHARED __attribute__((shared)) +#define ALIGN(N) __attribute__((aligned(N))) + +#include "hip_atomics.h" + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// The following def must match the absolute limit hardwired in the host RTL +// max number of threads per team +#define MAX_THREADS_PER_TEAM 1024 + +#define WARPSIZE 64 + +// The named barrier for active parallel threads of a team in an L1 parallel +// region to synchronize with each other. +#define L1_BARRIER (1) + +// Maximum number of preallocated arguments to an outlined parallel/simd +// function. Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + +// Maximum number of omp state objects per SM allocated statically in global +// memory. +#define OMP_STATE_COUNT 32 +#define MAX_SM 64 + +#define OMP_ACTIVE_PARALLEL_LEVEL 128 + +// Data sharing related quantities, need to match what is used in the compiler. +enum DATA_SHARING_SIZES { + // The maximum number of workers in a kernel. + DS_Max_Worker_Threads = 960, + // The size reserved for data in a shared memory slot. + DS_Slot_Size = 256, + // The slot size that should be reserved for a working warp. + DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + // The maximum number of warps in use + DS_Max_Warp_Number = 16, +}; + +INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { + lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF)); + hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32); +} + +INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { + return (((uint64_t)hi) << 32) | (uint64_t)lo; +} + +static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = + UINT64_C(0xffffffffffffffff); + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); + +DEVICE uint32_t __kmpc_impl_smid(); + +DEVICE double __kmpc_impl_get_wtick(); + +DEVICE double __kmpc_impl_get_wtime(); + +INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } + +INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } + +template INLINE T __kmpc_impl_min(T x, T y) { + return x < y ? 
x : y; +} + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); + +DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, + int32_t SrcLane); + +DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, + uint32_t Delta, int32_t Width); + +INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } + +INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) { + // AMDGCN doesn't need to sync threads in a warp +} + +INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) { + // we have protected the master warp from releasing from its barrier + // due to a full workgroup barrier in the middle of a work function. + // So it is ok to issue a full workgroup barrier here. + __builtin_amdgcn_s_barrier(); +} + +DEVICE void __kmpc_impl_threadfence(void); +DEVICE void __kmpc_impl_threadfence_block(void); +DEVICE void __kmpc_impl_threadfence_system(void); + +// Calls to the AMDGCN layer (assuming 1D layout) +INLINE int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } +INLINE int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); } +DEVICE int GetNumberOfBlocksInKernel(); +DEVICE int GetNumberOfThreadsInBlock(); +DEVICE unsigned GetWarpId(); +DEVICE unsigned GetLaneId(); + +DEVICE bool __kmpc_impl_is_first_active_thread(); + +// Locks +DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock); +DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock); + +// Memory +DEVICE void *__kmpc_impl_malloc(size_t x); +DEVICE void __kmpc_impl_free(void *x); + +// DEVICE versions of part of libc +INLINE void __assert_fail(const char *, const char *, unsigned int, + const char *) { + __builtin_trap(); +} +EXTERN int printf(const char *, ...); + +#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip index 9807483d4c420..aca9daad12143 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -1,72 +1,72 @@ -//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Definitions of target specific functions -// -//===----------------------------------------------------------------------===// - -#include "target_impl.h" - -// Implementations initially derived from hcc - -// Initialized with a 64-bit mask with bits set in positions less than the -// thread's lane number in the warp -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { - uint32_t lane = GetLaneId(); - int64_t ballot = __kmpc_impl_activemask(); - uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1; - return mask & ballot; -} - -// Initialized with a 64-bit mask with bits set in positions greater than the -// thread's lane number in the warp -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { - uint32_t lane = GetLaneId(); - if (lane == (WARPSIZE - 1)) - return 0; - uint64_t ballot = __kmpc_impl_activemask(); - uint64_t mask = (~((uint64_t)0)) << (lane + 1); - return mask & ballot; -} - -DEVICE double __kmpc_impl_get_wtick() { return ((double)1E-9); } - -EXTERN uint64_t __clock64(); -DEVICE double __kmpc_impl_get_wtime() { - return ((double)1.0 / 745000000.0) * __clock64(); -} - -// Warp vote function -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { - return __builtin_amdgcn_read_exec(); -} - -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, - int32_t srcLane) { - int width = WARPSIZE; - int self = GetLaneId(); - int index = srcLane + (self & ~(width - 1)); - return __builtin_amdgcn_ds_bpermute(index << 2, var); -} - -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, - uint32_t laneDelta, int32_t width) { - int self = GetLaneId(); - int index = self + laneDelta; - index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index; - return __builtin_amdgcn_ds_bpermute(index << 2, var); -} - -EXTERN uint64_t __ockl_get_local_size(uint32_t); -EXTERN uint64_t __ockl_get_num_groups(uint32_t); -DEVICE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); } -DEVICE int GetNumberOfThreadsInBlock() { return __ockl_get_local_size(0); } -DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } -DEVICE unsigned GetLaneId() { - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); -} +//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definitions of target specific functions +// +//===----------------------------------------------------------------------===// + +#include "target_impl.h" + +// Implementations initially derived from hcc + +// Initialized with a 64-bit mask with bits set in positions less than the +// thread's lane number in the warp +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { + uint32_t lane = GetLaneId(); + int64_t ballot = __kmpc_impl_activemask(); + uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1; + return mask & ballot; +} + +// Initialized with a 64-bit mask with bits set in positions greater than the +// thread's lane number in the warp +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { + uint32_t lane = GetLaneId(); + if (lane == (WARPSIZE - 1)) + return 0; + uint64_t ballot = __kmpc_impl_activemask(); + uint64_t mask = (~((uint64_t)0)) << (lane + 1); + return mask & ballot; +} + +DEVICE double __kmpc_impl_get_wtick() { return ((double)1E-9); } + +EXTERN uint64_t __clock64(); +DEVICE double __kmpc_impl_get_wtime() { + return ((double)1.0 / 745000000.0) * __clock64(); +} + +// Warp vote function +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { + return __builtin_amdgcn_read_exec(); +} + +DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, + int32_t srcLane) { + int width = WARPSIZE; + int self = GetLaneId(); + int index = srcLane + (self & ~(width - 1)); + return __builtin_amdgcn_ds_bpermute(index << 2, var); +} + +DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, + uint32_t laneDelta, int32_t width) { + int self = GetLaneId(); + int index = self + laneDelta; + index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index; + return __builtin_amdgcn_ds_bpermute(index << 2, var); +} + +EXTERN uint64_t __ockl_get_local_size(uint32_t); +EXTERN uint64_t __ockl_get_num_groups(uint32_t); +DEVICE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); } +DEVICE int GetNumberOfThreadsInBlock() { return __ockl_get_local_size(0); } +DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +DEVICE unsigned GetLaneId() { + return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); +} diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h index 6539b7ad70cf6..b19f1bf3563d0 100644 --- a/openmp/libomptarget/deviceRTLs/common/debug.h +++ b/openmp/libomptarget/deviceRTLs/common/debug.h @@ -1,287 +1,287 @@ -//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains debug macros to be used in the application. -// -// Usage guide -// -// PRINT0(flag, str) : if debug flag is on, print (no arguments) -// PRINT(flag, str, args) : if debug flag is on, print (arguments) -// DON(flag) : return true if debug flag is on -// -// ASSERT(flag, cond, str, args): if test flag is on, test the condition -// if the condition is false, print str+args -// and assert. 
-// CAUTION: cond may be evaluate twice -// AON(flag) : return true if test flag is on -// -// WARNING(flag, str, args) : if warning flag is on, print the warning -// WON(flag) : return true if warning flag is on -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_NVPTX_DEBUG_H_ -#define _OMPTARGET_NVPTX_DEBUG_H_ - -#include "common/device_environment.h" - -//////////////////////////////////////////////////////////////////////////////// -// set desired level of debugging -//////////////////////////////////////////////////////////////////////////////// - -#define LD_SET_NONE 0ULL /* none */ -#define LD_SET_ALL -1ULL /* all */ - -// pos 1 -#define LD_SET_LOOP 0x1ULL /* basic loop */ -#define LD_SET_LOOPD 0x2ULL /* basic loop */ -#define LD_SET_PAR 0x4ULL /* basic parallel */ -#define LD_SET_PARD 0x8ULL /* basic parallel */ - -// pos 2 -#define LD_SET_SYNC 0x10ULL /* sync info */ -#define LD_SET_SYNCD 0x20ULL /* sync info */ -#define LD_SET_WAIT 0x40ULL /* state when waiting */ -#define LD_SET_TASK 0x80ULL /* print task info (high level) */ - -// pos 3 -#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */ -#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */ -#define LD_SET_ENV 0x400ULL /* env info */ -#define LD_SET_CANCEL 0x800ULL /* print cancel info */ - -// pos 4 -#define LD_SET_MEM 0x1000ULL /* malloc / free */ - -//////////////////////////////////////////////////////////////////////////////// -// set the desired flags to print selected output. - -// these are some examples of possible definitions that can be used for -// debugging. -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL) -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save -// on cuda buffer -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO) -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV) -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR) - -#ifndef OMPTARGET_NVPTX_DEBUG -#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE -#elif OMPTARGET_NVPTX_DEBUG -#warning debug is used, not good for measurements -#endif - -//////////////////////////////////////////////////////////////////////////////// -// set desired level of asserts -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// available flags - -#define LT_SET_NONE 0x0 /* unsafe */ -#define LT_SET_SAFETY \ - 0x1 /* check malloc type of stuff, input at creation, cheap */ -#define LT_SET_INPUT 0x2 /* check also all runtime inputs */ -#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */ - -//////////////////////////////////////////////////////////////////////////////// -// set the desired flags - -#ifndef OMPTARGET_NVPTX_TEST -#if OMPTARGET_NVPTX_DEBUG -#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY) -#else -#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY) -#endif -#endif - -//////////////////////////////////////////////////////////////////////////////// -// set desired level of warnings -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// available flags - -#define LW_SET_ALL -1 -#define LW_SET_NONE 0x0 -#define LW_SET_ENV 0x1 -#define LW_SET_INPUT 0x2 -#define LW_SET_FUSSY 0x4 - -//////////////////////////////////////////////////////////////////////////////// -// set the desired flags - -#if OMPTARGET_NVPTX_DEBUG -#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE) -#else 
-#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY) -#endif - -//////////////////////////////////////////////////////////////////////////////// -// implementation for debug -//////////////////////////////////////////////////////////////////////////////// - -#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING -#include "common/support.h" - -template -NOINLINE static void log(const char *fmt, Arguments... parameters) { - printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), - (int)GetWarpId(), (int)GetLaneId(), parameters...); -} - -#endif -#if OMPTARGET_NVPTX_TEST - -template -NOINLINE static void check(bool cond, const char *fmt, - Arguments... parameters) { - if (!cond) - printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), - (int)GetWarpId(), (int)GetLaneId(), parameters...); - assert(cond); -} - -NOINLINE static void check(bool cond) { assert(cond); } -#endif - -// set flags that are tested (inclusion properties) - -#define LD_ALL (LD_SET_ALL) - -#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD) -#define LD_LOOPD (LD_SET_LOOPD) -#define LD_PAR (LD_SET_PAR | LD_SET_PARD) -#define LD_PARD (LD_SET_PARD) - -// pos 2 -#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD) -#define LD_SYNCD (LD_SET_SYNCD) -#define LD_WAIT (LD_SET_WAIT) -#define LD_TASK (LD_SET_TASK) - -// pos 3 -#define LD_IO (LD_SET_IO | LD_SET_IOD) -#define LD_IOD (LD_SET_IOD) -#define LD_ENV (LD_SET_ENV) -#define LD_CANCEL (LD_SET_CANCEL) - -// pos 3 -#define LD_MEM (LD_SET_MEM) - -// implement -#if OMPTARGET_NVPTX_DEBUG - -#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag)) - -#define PRINT0(_flag, _str) \ - { \ - if (omptarget_device_environment.debug_level && DON(_flag)) { \ - log(": " _str); \ - } \ - } - -#define PRINT(_flag, _str, _args...) \ - { \ - if (omptarget_device_environment.debug_level && DON(_flag)) { \ - log(": " _str, _args); \ - } \ - } -#else - -#define DON(_flag) (0) -#define PRINT0(flag, str) -#define PRINT(flag, str, _args...) - -#endif - -// for printing without worrying about precision, pointers... -#define P64(_x) ((unsigned long long)(_x)) - -//////////////////////////////////////////////////////////////////////////////// -// early defs for test -//////////////////////////////////////////////////////////////////////////////// - -#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY) -#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY) -#define LT_FUSSY (LT_SET_FUSSY) - -#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY - -#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) -#define ASSERT0(_flag, _cond, _str) \ - { \ - if (TON(_flag)) { \ - check(_cond); \ - } \ - } -#define ASSERT(_flag, _cond, _str, _args...) \ - { \ - if (TON(_flag)) { \ - check(_cond); \ - } \ - } - -#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT - -#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) -#define ASSERT0(_flag, _cond, _str) \ - { \ - if (TON(_flag)) { \ - check((_cond), " ASSERT: " _str "\n"); \ - } \ - } -#define ASSERT(_flag, _cond, _str, _args...) \ - { \ - if (TON(_flag)) { \ - check((_cond), " ASSERT: " _str "\n", \ - _args); \ - } \ - } - -#else - -#define TON(_flag) (0) -#define ASSERT0(_flag, _cond, _str) -#define ASSERT(_flag, _cond, _str, _args...) 
- -#endif - -//////////////////////////////////////////////////////////////////////////////// -// early defs for warning - -#define LW_ALL (LW_SET_ALL) -#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV) -#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT) -#define LW_FUSSY (LW_SET_FUSSY) - -#if OMPTARGET_NVPTX_WARNING - -#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag)) -#define WARNING0(_flag, _str) \ - { \ - if (WON(_flag)) { \ - log(" WARNING: " _str); \ - } \ - } -#define WARNING(_flag, _str, _args...) \ - { \ - if (WON(_flag)) { \ - log(" WARNING: " _str, _args); \ - } \ - } - -#else - -#define WON(_flag) (0) -#define WARNING0(_flag, _str) -#define WARNING(_flag, _str, _args...) - -#endif - -#endif +//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. +// +// Usage guide +// +// PRINT0(flag, str) : if debug flag is on, print (no arguments) +// PRINT(flag, str, args) : if debug flag is on, print (arguments) +// DON(flag) : return true if debug flag is on +// +// ASSERT(flag, cond, str, args): if test flag is on, test the condition +// if the condition is false, print str+args +// and assert. +// CAUTION: cond may be evaluate twice +// AON(flag) : return true if test flag is on +// +// WARNING(flag, str, args) : if warning flag is on, print the warning +// WON(flag) : return true if warning flag is on +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_NVPTX_DEBUG_H_ +#define _OMPTARGET_NVPTX_DEBUG_H_ + +#include "common/device_environment.h" + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of debugging +//////////////////////////////////////////////////////////////////////////////// + +#define LD_SET_NONE 0ULL /* none */ +#define LD_SET_ALL -1ULL /* all */ + +// pos 1 +#define LD_SET_LOOP 0x1ULL /* basic loop */ +#define LD_SET_LOOPD 0x2ULL /* basic loop */ +#define LD_SET_PAR 0x4ULL /* basic parallel */ +#define LD_SET_PARD 0x8ULL /* basic parallel */ + +// pos 2 +#define LD_SET_SYNC 0x10ULL /* sync info */ +#define LD_SET_SYNCD 0x20ULL /* sync info */ +#define LD_SET_WAIT 0x40ULL /* state when waiting */ +#define LD_SET_TASK 0x80ULL /* print task info (high level) */ + +// pos 3 +#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */ +#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */ +#define LD_SET_ENV 0x400ULL /* env info */ +#define LD_SET_CANCEL 0x800ULL /* print cancel info */ + +// pos 4 +#define LD_SET_MEM 0x1000ULL /* malloc / free */ + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags to print selected output. + +// these are some examples of possible definitions that can be used for +// debugging. 
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save +// on cuda buffer +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR) + +#ifndef OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE +#elif OMPTARGET_NVPTX_DEBUG +#warning debug is used, not good for measurements +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of asserts +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LT_SET_NONE 0x0 /* unsafe */ +#define LT_SET_SAFETY \ + 0x1 /* check malloc type of stuff, input at creation, cheap */ +#define LT_SET_INPUT 0x2 /* check also all runtime inputs */ +#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */ + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#ifndef OMPTARGET_NVPTX_TEST +#if OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY) +#else +#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY) +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of warnings +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LW_SET_ALL -1 +#define LW_SET_NONE 0x0 +#define LW_SET_ENV 0x1 +#define LW_SET_INPUT 0x2 +#define LW_SET_FUSSY 0x4 + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#if OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE) +#else +#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY) +#endif + +//////////////////////////////////////////////////////////////////////////////// +// implementation for debug +//////////////////////////////////////////////////////////////////////////////// + +#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING +#include "common/support.h" + +template +NOINLINE static void log(const char *fmt, Arguments... parameters) { + printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), + (int)GetWarpId(), (int)GetLaneId(), parameters...); +} + +#endif +#if OMPTARGET_NVPTX_TEST + +template +NOINLINE static void check(bool cond, const char *fmt, + Arguments... 
parameters) { + if (!cond) + printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), + (int)GetWarpId(), (int)GetLaneId(), parameters...); + assert(cond); +} + +NOINLINE static void check(bool cond) { assert(cond); } +#endif + +// set flags that are tested (inclusion properties) + +#define LD_ALL (LD_SET_ALL) + +#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD) +#define LD_LOOPD (LD_SET_LOOPD) +#define LD_PAR (LD_SET_PAR | LD_SET_PARD) +#define LD_PARD (LD_SET_PARD) + +// pos 2 +#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD) +#define LD_SYNCD (LD_SET_SYNCD) +#define LD_WAIT (LD_SET_WAIT) +#define LD_TASK (LD_SET_TASK) + +// pos 3 +#define LD_IO (LD_SET_IO | LD_SET_IOD) +#define LD_IOD (LD_SET_IOD) +#define LD_ENV (LD_SET_ENV) +#define LD_CANCEL (LD_SET_CANCEL) + +// pos 3 +#define LD_MEM (LD_SET_MEM) + +// implement +#if OMPTARGET_NVPTX_DEBUG + +#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag)) + +#define PRINT0(_flag, _str) \ + { \ + if (omptarget_device_environment.debug_level && DON(_flag)) { \ + log(": " _str); \ + } \ + } + +#define PRINT(_flag, _str, _args...) \ + { \ + if (omptarget_device_environment.debug_level && DON(_flag)) { \ + log(": " _str, _args); \ + } \ + } +#else + +#define DON(_flag) (0) +#define PRINT0(flag, str) +#define PRINT(flag, str, _args...) + +#endif + +// for printing without worrying about precision, pointers... +#define P64(_x) ((unsigned long long)(_x)) + +//////////////////////////////////////////////////////////////////////////////// +// early defs for test +//////////////////////////////////////////////////////////////////////////////// + +#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY) +#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY) +#define LT_FUSSY (LT_SET_FUSSY) + +#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY + +#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) +#define ASSERT0(_flag, _cond, _str) \ + { \ + if (TON(_flag)) { \ + check(_cond); \ + } \ + } +#define ASSERT(_flag, _cond, _str, _args...) \ + { \ + if (TON(_flag)) { \ + check(_cond); \ + } \ + } + +#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT + +#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) +#define ASSERT0(_flag, _cond, _str) \ + { \ + if (TON(_flag)) { \ + check((_cond), " ASSERT: " _str "\n"); \ + } \ + } +#define ASSERT(_flag, _cond, _str, _args...) \ + { \ + if (TON(_flag)) { \ + check((_cond), " ASSERT: " _str "\n", \ + _args); \ + } \ + } + +#else + +#define TON(_flag) (0) +#define ASSERT0(_flag, _cond, _str) +#define ASSERT(_flag, _cond, _str, _args...) + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// early defs for warning + +#define LW_ALL (LW_SET_ALL) +#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV) +#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT) +#define LW_FUSSY (LW_SET_FUSSY) + +#if OMPTARGET_NVPTX_WARNING + +#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag)) +#define WARNING0(_flag, _str) \ + { \ + if (WON(_flag)) { \ + log(" WARNING: " _str); \ + } \ + } +#define WARNING(_flag, _str, _args...) \ + { \ + if (WON(_flag)) { \ + log(" WARNING: " _str, _args); \ + } \ + } + +#else + +#define WON(_flag) (0) +#define WARNING0(_flag, _str) +#define WARNING(_flag, _str, _args...) 
+ +#endif + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/device_environment.h b/openmp/libomptarget/deviceRTLs/common/device_environment.h index 68a7757d20472..a13454514055d 100644 --- a/openmp/libomptarget/deviceRTLs/common/device_environment.h +++ b/openmp/libomptarget/deviceRTLs/common/device_environment.h @@ -1,24 +1,24 @@ -//===---- device_environment.h - OpenMP GPU device environment --- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Global device environment -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_ -#define _OMPTARGET_DEVICE_ENVIRONMENT_H_ - -#include "target_impl.h" - -struct omptarget_device_environmentTy { - int32_t debug_level; -}; - -extern DEVICE omptarget_device_environmentTy omptarget_device_environment; - -#endif +//===---- device_environment.h - OpenMP GPU device environment --- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Global device environment +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_ +#define _OMPTARGET_DEVICE_ENVIRONMENT_H_ + +#include "target_impl.h" + +struct omptarget_device_environmentTy { + int32_t debug_level; +}; + +extern DEVICE omptarget_device_environmentTy omptarget_device_environment; + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h index 986eb3677dcf4..d8e610d34cd0d 100644 --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -1,382 +1,382 @@ -//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of all library macros, types, -// and functions. -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_H -#define OMPTARGET_H - -#include "target_impl.h" -#include "common/debug.h" // debug -#include "interface.h" // interfaces with omp, compiler, and user -#include "common/state-queue.h" -#include "common/support.h" - -#define OMPTARGET_NVPTX_VERSION 1.1 - -// used by the library for the interface with the app -#define DISPATCH_FINISHED 0 -#define DISPATCH_NOTFINISHED 1 - -// used by dynamic scheduling -#define FINISHED 0 -#define NOT_FINISHED 1 -#define LAST_CHUNK 2 - -#define BARRIER_COUNTER 0 -#define ORDERED_COUNTER 1 - -// arguments needed for L0 parallelism only. -class omptarget_nvptx_SharedArgs { -public: - // All these methods must be called by the master thread only. 
- INLINE void Init() { - args = buffer; - nArgs = MAX_SHARED_ARGS; - } - INLINE void DeInit() { - // Free any memory allocated for outlined parallel function with a large - // number of arguments. - if (nArgs > MAX_SHARED_ARGS) { - SafeFree(args, "new extended args"); - Init(); - } - } - INLINE void EnsureSize(size_t size) { - if (size > nArgs) { - if (nArgs > MAX_SHARED_ARGS) { - SafeFree(args, "new extended args"); - } - args = (void **)SafeMalloc(size * sizeof(void *), "new extended args"); - nArgs = size; - } - } - // Called by all threads. - INLINE void **GetArgs() const { return args; }; -private: - // buffer of pre-allocated arguments. - void *buffer[MAX_SHARED_ARGS]; - // pointer to arguments buffer. - // starts off as a pointer to 'buffer' but can be dynamically allocated. - void **args; - // starts off as MAX_SHARED_ARGS but can increase in size. - uint32_t nArgs; -}; - -extern DEVICE SHARED omptarget_nvptx_SharedArgs - omptarget_nvptx_globalArgs; - -// Data structure to keep in shared memory that traces the current slot, stack, -// and frame pointer as well as the active threads that didn't exit the current -// environment. -struct DataSharingStateTy { - __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; - void *StackPtr[DS_Max_Warp_Number]; - void * volatile FramePtr[DS_Max_Warp_Number]; - __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; -}; -// Additional worker slot type which is initialized with the default worker slot -// size of 4*32 bytes. -struct __kmpc_data_sharing_worker_slot_static { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Worker_Warp_Slot_Size]; -}; -// Additional master slot type which is initialized with the default master slot -// size of 4 bytes. 
-struct __kmpc_data_sharing_master_slot_static { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Slot_Size]; -}; -extern DEVICE SHARED DataSharingStateTy DataSharingState; - -//////////////////////////////////////////////////////////////////////////////// -// task ICV and (implicit & explicit) task state - -class omptarget_nvptx_TaskDescr { -public: - // methods for flags - INLINE omp_sched_t GetRuntimeSched() const; - INLINE void SetRuntimeSched(omp_sched_t sched); - INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; } - INLINE int InL2OrHigherParallelRegion() const { - return items.flags & TaskDescr_InParL2P; - } - INLINE int IsParallelConstruct() const { - return items.flags & TaskDescr_IsParConstr; - } - INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } - // methods for other fields - INLINE uint16_t &ThreadId() { return items.threadId; } - INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } - INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } - INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { - prev = taskDescr; - } - // init & copy - INLINE void InitLevelZeroTaskDescr(); - INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr); - INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); - INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, - uint16_t tid, uint16_t tnum); - INLINE void SaveLoopData(); - INLINE void RestoreLoopData() const; - -private: - // bits for flags: (6 used, 2 free) - // 3 bits (SchedMask) for runtime schedule - // 1 bit (InPar) if this thread has encountered one or more parallel region - // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) - // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel - // region - static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); - static const uint8_t TaskDescr_InPar = 0x10; - static const uint8_t TaskDescr_IsParConstr = 0x20; - static const uint8_t TaskDescr_InParL2P = 0x40; - - struct SavedLoopDescr_items { - int64_t loopUpperBound; - int64_t nextLowerBound; - int64_t chunk; - int64_t stride; - kmp_sched_t schedule; - } loopData; - - struct TaskDescr_items { - uint8_t flags; // 6 bit used (see flag above) - uint8_t unused; - uint16_t threadId; // thread id - uint64_t runtimeChunkSize; // runtime chunk size - } items; - omptarget_nvptx_TaskDescr *prev; -}; - -// build on kmp -typedef struct omptarget_nvptx_ExplicitTaskDescr { - omptarget_nvptx_TaskDescr - taskDescr; // omptarget_nvptx task description (must be first) - kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) -} omptarget_nvptx_ExplicitTaskDescr; - -//////////////////////////////////////////////////////////////////////////////// -// Descriptor of a parallel region (worksharing in general) - -class omptarget_nvptx_WorkDescr { - -public: - // access to data - INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } - -private: - omptarget_nvptx_TaskDescr masterTaskICV; -}; - 
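The flag constants in omptarget_nvptx_TaskDescr above pack the runtime schedule and three independent state bits into one byte. Below is a small stand-alone C++ sketch of that layout; the constant values are copied from the class, while the scenario in main (schedule id 3 for a first-level parallel construct) is a hypothetical example.

#include <cassert>
#include <cstdint>

static const uint8_t SchedMask   = 0x1 | 0x2 | 0x4; // TaskDescr_SchedMask
static const uint8_t InPar       = 0x10;            // TaskDescr_InPar
static const uint8_t IsParConstr = 0x20;            // TaskDescr_IsParConstr
static const uint8_t InParL2P    = 0x40;            // TaskDescr_InParL2P

int main() {
  uint8_t flags = 0;
  // Keep the runtime schedule id in the low three bits.
  flags = (uint8_t)((flags & ~SchedMask) | (3 & SchedMask));
  // Mark this descriptor as the ICV set of a parallel construct.
  flags |= InPar | IsParConstr;
  assert((flags & SchedMask) == 3);
  assert((flags & InPar) && (flags & IsParConstr));
  assert(!(flags & InParL2P)); // not nested inside an L2+ parallel region
  return 0;
}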
-//////////////////////////////////////////////////////////////////////////////// - -class omptarget_nvptx_TeamDescr { -public: - // access to data - INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { - return &levelZeroTaskDescr; - } - INLINE omptarget_nvptx_WorkDescr &WorkDescr() { - return workDescrForActiveParallel; - } - INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } - - // init - INLINE void InitTeamDescr(); - - INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) { - // If this is invoked by the master thread of the master warp then - // initialize it with a smaller slot. - if (IsMasterThread) { - // Do not initialize this slot again if it has already been initalized. - if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size) - return 0; - // Initialize the pointer to the end of the slot given the size of the - // data section. DataEnd is non-inclusive. - master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; - // We currently do not have a next slot. - master_rootS[0].Next = 0; - master_rootS[0].Prev = 0; - master_rootS[0].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&master_rootS[0]; - } - // Do not initialize this slot again if it has already been initalized. - if (worker_rootS[wid].DataEnd == - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size) - return 0; - // Initialize the pointer to the end of the slot given the size of the data - // section. DataEnd is non-inclusive. - worker_rootS[wid].DataEnd = - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; - // We currently do not have a next slot. - worker_rootS[wid].Next = 0; - worker_rootS[wid].Prev = 0; - worker_rootS[wid].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; - } - - INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) { - worker_rootS[wid].DataEnd = - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; - // We currently do not have a next slot. 
- worker_rootS[wid].Next = 0; - worker_rootS[wid].Prev = 0; - worker_rootS[wid].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; - } - -private: - omptarget_nvptx_TaskDescr - levelZeroTaskDescr; // icv for team master initial thread - omptarget_nvptx_WorkDescr - workDescrForActiveParallel; // one, ONLY for the active par - uint64_t lastprivateIterBuffer; - - ALIGN(16) - __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; - ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; -}; - -//////////////////////////////////////////////////////////////////////////////// -// thread private data (struct of arrays for better coalescing) -// tid refers here to the global thread id -// do not support multiple concurrent kernel a this time -class omptarget_nvptx_ThreadPrivateContext { -public: - // task - INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { - return &levelOneTaskDescr[tid]; - } - INLINE void SetTopLevelTaskDescr(int tid, - omptarget_nvptx_TaskDescr *taskICV) { - topTaskDescr[tid] = taskICV; - } - INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const; - // parallel - INLINE uint16_t &NumThreadsForNextParallel(int tid) { - return nextRegion.tnum[tid]; - } - // simd - INLINE uint16_t &SimdLimitForNextSimd(int tid) { - return nextRegion.slim[tid]; - } - // schedule (for dispatch) - INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } - INLINE int64_t &Chunk(int tid) { return chunk[tid]; } - INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } - INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } - INLINE int64_t &Stride(int tid) { return stride[tid]; } - - INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } - - INLINE void InitThreadPrivateContext(int tid); - INLINE uint64_t &Cnt() { return cnt; } - -private: - // team context for this team - omptarget_nvptx_TeamDescr teamContext; - // task ICV for implicit threads in the only parallel region - omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; - // pointer where to find the current task ICV (top of the stack) - omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; - union { - // Only one of the two is live at the same time. - // parallel - uint16_t tnum[MAX_THREADS_PER_TEAM]; - // simd limit - uint16_t slim[MAX_THREADS_PER_TEAM]; - } nextRegion; - // schedule (for dispatch) - kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for - int64_t chunk[MAX_THREADS_PER_TEAM]; - int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; - // state for dispatch with dyn/guided OR static (never use both at a time) - int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; - int64_t stride[MAX_THREADS_PER_TEAM]; - uint64_t cnt; -}; - -/// Memory manager for statically allocated memory. 
-class omptarget_nvptx_SimpleMemoryManager { -private: - ALIGN(128) struct MemDataTy { - volatile unsigned keys[OMP_STATE_COUNT]; - } MemData[MAX_SM]; - - INLINE static uint32_t hash(unsigned key) { - return key & (OMP_STATE_COUNT - 1); - } - -public: - INLINE void Release(); - INLINE const void *Acquire(const void *buf, size_t size); -}; - -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// global data tables -//////////////////////////////////////////////////////////////////////////////// - -extern DEVICE omptarget_nvptx_SimpleMemoryManager - omptarget_nvptx_simpleMemoryManager; -extern DEVICE SHARED uint32_t usedMemIdx; -extern DEVICE SHARED uint32_t usedSlotIdx; -extern DEVICE SHARED uint8_t - parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -extern DEVICE SHARED uint16_t threadLimit; -extern DEVICE SHARED uint16_t threadsInTeam; -extern DEVICE SHARED uint16_t nThreads; -extern DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; - -extern DEVICE SHARED uint32_t execution_param; -extern DEVICE SHARED void *ReductionScratchpadPtr; - -//////////////////////////////////////////////////////////////////////////////// -// work function (outlined parallel/simd functions) and arguments. -// needed for L1 parallelism only. -//////////////////////////////////////////////////////////////////////////////// - -typedef void *omptarget_nvptx_WorkFn; -extern volatile DEVICE SHARED omptarget_nvptx_WorkFn - omptarget_nvptx_workFn; - -//////////////////////////////////////////////////////////////////////////////// -// get private data structures -//////////////////////////////////////////////////////////////////////////////// - -INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); -INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); -INLINE omptarget_nvptx_TaskDescr * -getMyTopTaskDescriptor(bool isSPMDExecutionMode); -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); - -//////////////////////////////////////////////////////////////////////////////// -// inlined implementation -//////////////////////////////////////////////////////////////////////////////// - -#include "common/omptargeti.h" - -#endif +//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_H +#define OMPTARGET_H + +#include "target_impl.h" +#include "common/debug.h" // debug +#include "interface.h" // interfaces with omp, compiler, and user +#include "common/state-queue.h" +#include "common/support.h" + +#define OMPTARGET_NVPTX_VERSION 1.1 + +// used by the library for the interface with the app +#define DISPATCH_FINISHED 0 +#define DISPATCH_NOTFINISHED 1 + +// used by dynamic scheduling +#define FINISHED 0 +#define NOT_FINISHED 1 +#define LAST_CHUNK 2 + +#define BARRIER_COUNTER 0 +#define ORDERED_COUNTER 1 + +// arguments needed for L0 parallelism only. 
+class omptarget_nvptx_SharedArgs { +public: + // All these methods must be called by the master thread only. + INLINE void Init() { + args = buffer; + nArgs = MAX_SHARED_ARGS; + } + INLINE void DeInit() { + // Free any memory allocated for outlined parallel function with a large + // number of arguments. + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, "new extended args"); + Init(); + } + } + INLINE void EnsureSize(size_t size) { + if (size > nArgs) { + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, "new extended args"); + } + args = (void **)SafeMalloc(size * sizeof(void *), "new extended args"); + nArgs = size; + } + } + // Called by all threads. + INLINE void **GetArgs() const { return args; }; +private: + // buffer of pre-allocated arguments. + void *buffer[MAX_SHARED_ARGS]; + // pointer to arguments buffer. + // starts off as a pointer to 'buffer' but can be dynamically allocated. + void **args; + // starts off as MAX_SHARED_ARGS but can increase in size. + uint32_t nArgs; +}; + +extern DEVICE SHARED omptarget_nvptx_SharedArgs + omptarget_nvptx_globalArgs; + +// Data structure to keep in shared memory that traces the current slot, stack, +// and frame pointer as well as the active threads that didn't exit the current +// environment. +struct DataSharingStateTy { + __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; + void *StackPtr[DS_Max_Warp_Number]; + void * volatile FramePtr[DS_Max_Warp_Number]; + __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; +}; +// Additional worker slot type which is initialized with the default worker slot +// size of 4*32 bytes. +struct __kmpc_data_sharing_worker_slot_static { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[DS_Worker_Warp_Slot_Size]; +}; +// Additional master slot type which is initialized with the default master slot +// size of 4 bytes. 
+struct __kmpc_data_sharing_master_slot_static { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[DS_Slot_Size]; +}; +extern DEVICE SHARED DataSharingStateTy DataSharingState; + +//////////////////////////////////////////////////////////////////////////////// +// task ICV and (implicit & explicit) task state + +class omptarget_nvptx_TaskDescr { +public: + // methods for flags + INLINE omp_sched_t GetRuntimeSched() const; + INLINE void SetRuntimeSched(omp_sched_t sched); + INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; } + INLINE int InL2OrHigherParallelRegion() const { + return items.flags & TaskDescr_InParL2P; + } + INLINE int IsParallelConstruct() const { + return items.flags & TaskDescr_IsParConstr; + } + INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } + // methods for other fields + INLINE uint16_t &ThreadId() { return items.threadId; } + INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } + INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } + INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { + prev = taskDescr; + } + // init & copy + INLINE void InitLevelZeroTaskDescr(); + INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr); + INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); + INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, + uint16_t tid, uint16_t tnum); + INLINE void SaveLoopData(); + INLINE void RestoreLoopData() const; + +private: + // bits for flags: (6 used, 2 free) + // 3 bits (SchedMask) for runtime schedule + // 1 bit (InPar) if this thread has encountered one or more parallel region + // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) + // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel + // region + static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); + static const uint8_t TaskDescr_InPar = 0x10; + static const uint8_t TaskDescr_IsParConstr = 0x20; + static const uint8_t TaskDescr_InParL2P = 0x40; + + struct SavedLoopDescr_items { + int64_t loopUpperBound; + int64_t nextLowerBound; + int64_t chunk; + int64_t stride; + kmp_sched_t schedule; + } loopData; + + struct TaskDescr_items { + uint8_t flags; // 6 bit used (see flag above) + uint8_t unused; + uint16_t threadId; // thread id + uint64_t runtimeChunkSize; // runtime chunk size + } items; + omptarget_nvptx_TaskDescr *prev; +}; + +// build on kmp +typedef struct omptarget_nvptx_ExplicitTaskDescr { + omptarget_nvptx_TaskDescr + taskDescr; // omptarget_nvptx task description (must be first) + kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) +} omptarget_nvptx_ExplicitTaskDescr; + +//////////////////////////////////////////////////////////////////////////////// +// Descriptor of a parallel region (worksharing in general) + +class omptarget_nvptx_WorkDescr { + +public: + // access to data + INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } + +private: + omptarget_nvptx_TaskDescr masterTaskICV; +}; + 
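The worker and master slot structs above size their Data arrays with the DS_* constants, so the amount of statically reserved data-sharing space follows directly from the target's warp size. A quick stand-alone arithmetic check, assuming the AMDGCN values quoted earlier in this patch (WARPSIZE = 64, DS_Slot_Size = 256); the local names exist only for this sketch.

#include <cassert>
#include <cstddef>

int main() {
  const size_t WarpSize = 64;                        // WARPSIZE (AMDGCN)
  const size_t SlotSize = 256;                       // DS_Slot_Size
  const size_t WorkerWarpSlot = WarpSize * SlotSize; // DS_Worker_Warp_Slot_Size
  assert(WorkerWarpSlot == 16 * 1024);               // 16 KiB of Data per worker slot
  // The team descriptor below keeps one such slot per warp (worker_rootS[WARPSIZE]),
  // i.e. roughly 1 MiB of preallocated data-sharing storage per team.
  assert(WarpSize * WorkerWarpSlot == 1024 * 1024);
  return 0;
}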
+////////////////////////////////////////////////////////////////////////////////
+
+class omptarget_nvptx_TeamDescr {
+public:
+  // access to data
+  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
+    return &levelZeroTaskDescr;
+  }
+  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
+    return workDescrForActiveParallel;
+  }
+  INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
+
+  // init
+  INLINE void InitTeamDescr();
+
+  INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
+    // If this is invoked by the master thread of the master warp then
+    // initialize it with a smaller slot.
+    if (IsMasterThread) {
+      // Do not initialize this slot again if it has already been initialized.
+      if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
+        return 0;
+      // Initialize the pointer to the end of the slot given the size of the
+      // data section. DataEnd is non-inclusive.
+      master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
+      // We currently do not have a next slot.
+      master_rootS[0].Next = 0;
+      master_rootS[0].Prev = 0;
+      master_rootS[0].PrevSlotStackPtr = 0;
+      return (__kmpc_data_sharing_slot *)&master_rootS[0];
+    }
+    // Do not initialize this slot again if it has already been initialized.
+    if (worker_rootS[wid].DataEnd ==
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
+      return 0;
+    // Initialize the pointer to the end of the slot given the size of the data
+    // section. DataEnd is non-inclusive.
+    worker_rootS[wid].DataEnd =
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+    worker_rootS[wid].Next = 0;
+    worker_rootS[wid].Prev = 0;
+    worker_rootS[wid].PrevSlotStackPtr = 0;
+    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+  }
+
+  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
+    worker_rootS[wid].DataEnd =
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+ worker_rootS[wid].Next = 0; + worker_rootS[wid].Prev = 0; + worker_rootS[wid].PrevSlotStackPtr = 0; + return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; + } + +private: + omptarget_nvptx_TaskDescr + levelZeroTaskDescr; // icv for team master initial thread + omptarget_nvptx_WorkDescr + workDescrForActiveParallel; // one, ONLY for the active par + uint64_t lastprivateIterBuffer; + + ALIGN(16) + __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; + ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; +}; + +//////////////////////////////////////////////////////////////////////////////// +// thread private data (struct of arrays for better coalescing) +// tid refers here to the global thread id +// do not support multiple concurrent kernel a this time +class omptarget_nvptx_ThreadPrivateContext { +public: + // task + INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { + return &levelOneTaskDescr[tid]; + } + INLINE void SetTopLevelTaskDescr(int tid, + omptarget_nvptx_TaskDescr *taskICV) { + topTaskDescr[tid] = taskICV; + } + INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const; + // parallel + INLINE uint16_t &NumThreadsForNextParallel(int tid) { + return nextRegion.tnum[tid]; + } + // simd + INLINE uint16_t &SimdLimitForNextSimd(int tid) { + return nextRegion.slim[tid]; + } + // schedule (for dispatch) + INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } + INLINE int64_t &Chunk(int tid) { return chunk[tid]; } + INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } + INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } + INLINE int64_t &Stride(int tid) { return stride[tid]; } + + INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } + + INLINE void InitThreadPrivateContext(int tid); + INLINE uint64_t &Cnt() { return cnt; } + +private: + // team context for this team + omptarget_nvptx_TeamDescr teamContext; + // task ICV for implicit threads in the only parallel region + omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; + // pointer where to find the current task ICV (top of the stack) + omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; + union { + // Only one of the two is live at the same time. + // parallel + uint16_t tnum[MAX_THREADS_PER_TEAM]; + // simd limit + uint16_t slim[MAX_THREADS_PER_TEAM]; + } nextRegion; + // schedule (for dispatch) + kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for + int64_t chunk[MAX_THREADS_PER_TEAM]; + int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; + // state for dispatch with dyn/guided OR static (never use both at a time) + int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; + int64_t stride[MAX_THREADS_PER_TEAM]; + uint64_t cnt; +}; + +/// Memory manager for statically allocated memory. 
+class omptarget_nvptx_SimpleMemoryManager { +private: + ALIGN(128) struct MemDataTy { + volatile unsigned keys[OMP_STATE_COUNT]; + } MemData[MAX_SM]; + + INLINE static uint32_t hash(unsigned key) { + return key & (OMP_STATE_COUNT - 1); + } + +public: + INLINE void Release(); + INLINE const void *Acquire(const void *buf, size_t size); +}; + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +extern DEVICE omptarget_nvptx_SimpleMemoryManager + omptarget_nvptx_simpleMemoryManager; +extern DEVICE SHARED uint32_t usedMemIdx; +extern DEVICE SHARED uint32_t usedSlotIdx; +extern DEVICE SHARED uint8_t + parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +extern DEVICE SHARED uint16_t threadLimit; +extern DEVICE SHARED uint16_t threadsInTeam; +extern DEVICE SHARED uint16_t nThreads; +extern DEVICE SHARED + omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; + +extern DEVICE SHARED uint32_t execution_param; +extern DEVICE SHARED void *ReductionScratchpadPtr; + +//////////////////////////////////////////////////////////////////////////////// +// work function (outlined parallel/simd functions) and arguments. +// needed for L1 parallelism only. +//////////////////////////////////////////////////////////////////////////////// + +typedef void *omptarget_nvptx_WorkFn; +extern volatile DEVICE SHARED omptarget_nvptx_WorkFn + omptarget_nvptx_workFn; + +//////////////////////////////////////////////////////////////////////////////// +// get private data structures +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); +INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); +INLINE omptarget_nvptx_TaskDescr * +getMyTopTaskDescriptor(bool isSPMDExecutionMode); +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); + +//////////////////////////////////////////////////////////////////////////////// +// inlined implementation +//////////////////////////////////////////////////////////////////////////////// + +#include "common/omptargeti.h" + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h index 14faa59062aee..e20016eeaa0da 100644 --- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h +++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h @@ -1,228 +1,228 @@ -//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of all library macros, types, -// and functions. 
-// -//===----------------------------------------------------------------------===// - -#include "common/target_atomic.h" - -//////////////////////////////////////////////////////////////////////////////// -// Task Descriptor -//////////////////////////////////////////////////////////////////////////////// - -INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const { - // sched starts from 1..4; encode it as 0..3; so add 1 here - uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1; - return (omp_sched_t)rc; -} - -INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) { - // sched starts from 1..4; encode it as 0..3; so sub 1 here - uint8_t val = ((uint8_t)sched) - 1; - // clear current sched - items.flags &= ~TaskDescr_SchedMask; - // set new sched - items.flags |= val; -} - -INLINE void -omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { - // slow method - // flag: - // default sched is static, - // dyn is off (unused now anyway, but may need to sample from host ?) - // not in parallel - - items.flags = 0; - items.threadId = 0; // is master - items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 -} - -// This is called when all threads are started together in SPMD mode. -// OMP directives include target parallel, target distribute parallel for, etc. -INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( - omptarget_nvptx_TaskDescr *parentTaskDescr) { - // slow method - // flag: - // default sched is static, - // dyn is off (unused now anyway, but may need to sample from host ?) - // in L1 parallel - - items.flags = - TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel - items.threadId = - GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) - items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 - prev = parentTaskDescr; -} - -INLINE void omptarget_nvptx_TaskDescr::CopyData( - omptarget_nvptx_TaskDescr *sourceTaskDescr) { - items = sourceTaskDescr->items; -} - -INLINE void -omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) { - CopyData(sourceTaskDescr); - prev = sourceTaskDescr->prev; -} - -INLINE void omptarget_nvptx_TaskDescr::CopyParent( - omptarget_nvptx_TaskDescr *parentTaskDescr) { - CopyData(parentTaskDescr); - prev = parentTaskDescr; -} - -INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask( - omptarget_nvptx_TaskDescr *parentTaskDescr) { - CopyParent(parentTaskDescr); - items.flags = items.flags & ~TaskDescr_IsParConstr; - ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task"); -} - -INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( - omptarget_nvptx_TaskDescr *masterTaskDescr) { - CopyParent(masterTaskDescr); - // overwrite specific items; - items.flags |= - TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel -} - -INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( - omptarget_nvptx_TaskDescr *workTaskDescr) { - Copy(workTaskDescr); - // - // overwrite specific items; - // - // The threadID should be GetThreadIdInBlock() % GetMasterThreadID(). - // This is so that the serial master (first lane in the master warp) - // gets a threadId of 0. - // However, we know that this function is always called in a parallel - // region where only workers are active. The serial master thread - // never enters this region. When a parallel region is executed serially, - // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions - // are called, which never activate this region. 
- items.threadId = - GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) -} - -INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( - omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) { - CopyParent(parentTaskDescr); - items.flags |= TaskDescr_InParL2P; // In L2+ parallelism - items.threadId = tid; -} - -INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() { - loopData.loopUpperBound = - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId); - loopData.nextLowerBound = - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId); - loopData.schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId); - loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId); - loopData.stride = - omptarget_nvptx_threadPrivateContext->Stride(items.threadId); -} - -INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const { - omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) = - loopData.loopUpperBound; - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) = - loopData.nextLowerBound; - omptarget_nvptx_threadPrivateContext->Stride(items.threadId) = - loopData.stride; - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) = - loopData.schedule; -} - -//////////////////////////////////////////////////////////////////////////////// -// Thread Private Context -//////////////////////////////////////////////////////////////////////////////// - -INLINE omptarget_nvptx_TaskDescr * -omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const { - ASSERT0( - LT_FUSSY, tid < MAX_THREADS_PER_TEAM, - "Getting top level, tid is larger than allocated data structure size"); - return topTaskDescr[tid]; -} - -INLINE void -omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) { - // levelOneTaskDescr is init when starting the parallel region - // top task descr is NULL (team master version will be fixed separately) - topTaskDescr[tid] = NULL; - // no num threads value has been pushed - nextRegion.tnum[tid] = 0; - // the following don't need to be init here; they are init when using dyn - // sched - // current_Event, events_Number, chunk, num_Iterations, schedule -} - -//////////////////////////////////////////////////////////////////////////////// -// Team Descriptor -//////////////////////////////////////////////////////////////////////////////// - -INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() { - levelZeroTaskDescr.InitLevelZeroTaskDescr(); -} - -//////////////////////////////////////////////////////////////////////////////// -// Get private data structure for thread -//////////////////////////////////////////////////////////////////////////////// - -// Utility routines for CUDA threads -INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() { - return omptarget_nvptx_threadPrivateContext->TeamContext(); -} - -INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() { - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - return currTeamDescr.WorkDescr(); -} - -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) { - return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); -} - -INLINE omptarget_nvptx_TaskDescr * -getMyTopTaskDescriptor(bool isSPMDExecutionMode) { - return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode)); -} - 
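SaveLoopData and RestoreLoopData above exist because the dispatch state (chunk, bounds, stride, schedule) lives in per-thread-id slots of the shared thread-private context rather than in the task descriptor itself, so a descriptor snapshots those slots before an inner construct reuses them and writes them back afterwards. A minimal host-side C++ sketch of that round trip follows; kMaxThreads stands in for MAX_THREADS_PER_TEAM, the schedule field is omitted, and the type names are invented for the example.

// loop_state_sketch.cpp -- illustrative only.
#include <cassert>
#include <cstdint>

const int kMaxThreads = 1024; // stand-in for MAX_THREADS_PER_TEAM

// Struct-of-arrays dispatch state: one slot per thread id, as in
// omptarget_nvptx_ThreadPrivateContext, so neighbouring threads touch
// neighbouring memory.
struct DispatchState {
  int64_t chunk[kMaxThreads];
  int64_t loopUpperBound[kMaxThreads];
  int64_t nextLowerBound[kMaxThreads];
  int64_t stride[kMaxThreads];
};

// Per-descriptor snapshot, in the spirit of SaveLoopData/RestoreLoopData.
struct LoopSnapshot {
  int64_t chunk, loopUpperBound, nextLowerBound, stride;

  void save(const DispatchState &S, int tid) {
    chunk = S.chunk[tid];
    loopUpperBound = S.loopUpperBound[tid];
    nextLowerBound = S.nextLowerBound[tid];
    stride = S.stride[tid];
  }
  void restore(DispatchState &S, int tid) const {
    S.chunk[tid] = chunk;
    S.loopUpperBound[tid] = loopUpperBound;
    S.nextLowerBound[tid] = nextLowerBound;
    S.stride[tid] = stride;
  }
};

int main() {
  DispatchState S = {};
  const int tid = 3;
  S.chunk[tid] = 16;
  S.loopUpperBound[tid] = 999;
  S.nextLowerBound[tid] = 0;
  S.stride[tid] = 16;

  LoopSnapshot snap;
  snap.save(S, tid);           // outer loop state saved into the descriptor
  S.nextLowerBound[tid] = 512; // an inner construct clobbers the shared slot
  snap.restore(S, tid);        // outer loop resumes where it left off
  assert(S.nextLowerBound[tid] == 0);
  return 0;
}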
-//////////////////////////////////////////////////////////////////////////////// -// Memory management runtime functions. -//////////////////////////////////////////////////////////////////////////////// - -INLINE void omptarget_nvptx_SimpleMemoryManager::Release() { - ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM, - "SlotIdx is too big or uninitialized."); - ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT, - "MemIdx is too big or uninitialized."); - MemDataTy &MD = MemData[usedSlotIdx]; - __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u); -} - -INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf, - size_t size) { - ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM, - "SlotIdx is too big or uninitialized."); - const unsigned sm = usedSlotIdx; - MemDataTy &MD = MemData[sm]; - unsigned i = hash(GetBlockIdInKernel()); - while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) { - i = hash(i + 1); - } - usedSlotIdx = sm; - usedMemIdx = i; - return static_cast(buf) + (sm * OMP_STATE_COUNT + i) * size; -} +//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. +// +//===----------------------------------------------------------------------===// + +#include "common/target_atomic.h" + +//////////////////////////////////////////////////////////////////////////////// +// Task Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const { + // sched starts from 1..4; encode it as 0..3; so add 1 here + uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1; + return (omp_sched_t)rc; +} + +INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) { + // sched starts from 1..4; encode it as 0..3; so sub 1 here + uint8_t val = ((uint8_t)sched) - 1; + // clear current sched + items.flags &= ~TaskDescr_SchedMask; + // set new sched + items.flags |= val; +} + +INLINE void +omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { + // slow method + // flag: + // default sched is static, + // dyn is off (unused now anyway, but may need to sample from host ?) + // not in parallel + + items.flags = 0; + items.threadId = 0; // is master + items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 +} + +// This is called when all threads are started together in SPMD mode. +// OMP directives include target parallel, target distribute parallel for, etc. +INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( + omptarget_nvptx_TaskDescr *parentTaskDescr) { + // slow method + // flag: + // default sched is static, + // dyn is off (unused now anyway, but may need to sample from host ?) 
+ // in L1 parallel + + items.flags = + TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel + items.threadId = + GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) + items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 + prev = parentTaskDescr; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyData( + omptarget_nvptx_TaskDescr *sourceTaskDescr) { + items = sourceTaskDescr->items; +} + +INLINE void +omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) { + CopyData(sourceTaskDescr); + prev = sourceTaskDescr->prev; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyParent( + omptarget_nvptx_TaskDescr *parentTaskDescr) { + CopyData(parentTaskDescr); + prev = parentTaskDescr; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask( + omptarget_nvptx_TaskDescr *parentTaskDescr) { + CopyParent(parentTaskDescr); + items.flags = items.flags & ~TaskDescr_IsParConstr; + ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task"); +} + +INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( + omptarget_nvptx_TaskDescr *masterTaskDescr) { + CopyParent(masterTaskDescr); + // overwrite specific items; + items.flags |= + TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel +} + +INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( + omptarget_nvptx_TaskDescr *workTaskDescr) { + Copy(workTaskDescr); + // + // overwrite specific items; + // + // The threadID should be GetThreadIdInBlock() % GetMasterThreadID(). + // This is so that the serial master (first lane in the master warp) + // gets a threadId of 0. + // However, we know that this function is always called in a parallel + // region where only workers are active. The serial master thread + // never enters this region. When a parallel region is executed serially, + // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions + // are called, which never activate this region. 
+ items.threadId = + GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) +} + +INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( + omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) { + CopyParent(parentTaskDescr); + items.flags |= TaskDescr_InParL2P; // In L2+ parallelism + items.threadId = tid; +} + +INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() { + loopData.loopUpperBound = + omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId); + loopData.nextLowerBound = + omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId); + loopData.schedule = + omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId); + loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId); + loopData.stride = + omptarget_nvptx_threadPrivateContext->Stride(items.threadId); +} + +INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const { + omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) = + loopData.loopUpperBound; + omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) = + loopData.nextLowerBound; + omptarget_nvptx_threadPrivateContext->Stride(items.threadId) = + loopData.stride; + omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) = + loopData.schedule; +} + +//////////////////////////////////////////////////////////////////////////////// +// Thread Private Context +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TaskDescr * +omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const { + ASSERT0( + LT_FUSSY, tid < MAX_THREADS_PER_TEAM, + "Getting top level, tid is larger than allocated data structure size"); + return topTaskDescr[tid]; +} + +INLINE void +omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) { + // levelOneTaskDescr is init when starting the parallel region + // top task descr is NULL (team master version will be fixed separately) + topTaskDescr[tid] = NULL; + // no num threads value has been pushed + nextRegion.tnum[tid] = 0; + // the following don't need to be init here; they are init when using dyn + // sched + // current_Event, events_Number, chunk, num_Iterations, schedule +} + +//////////////////////////////////////////////////////////////////////////////// +// Team Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() { + levelZeroTaskDescr.InitLevelZeroTaskDescr(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Get private data structure for thread +//////////////////////////////////////////////////////////////////////////////// + +// Utility routines for CUDA threads +INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() { + return omptarget_nvptx_threadPrivateContext->TeamContext(); +} + +INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() { + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + return currTeamDescr.WorkDescr(); +} + +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) { + return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); +} + +INLINE omptarget_nvptx_TaskDescr * +getMyTopTaskDescriptor(bool isSPMDExecutionMode) { + return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode)); +} + 
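GetRuntimeSched and SetRuntimeSched above pack the runtime schedule into the low three bits of the descriptor's flags byte, storing omp_sched_t's 1..4 values as 0..3 so they coexist with the InPar/IsParConstr/InParL2P bits. The sketch below shows that encoding in plain C++; the constants mirror the TaskDescr_* masks declared earlier, while encodeSched/decodeSched are hypothetical helpers rather than runtime entry points.

// sched_flags_sketch.cpp -- illustrative only.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Stand-ins for the task-descriptor flag bits.
static const uint8_t kSchedMask   = 0x1 | 0x2 | 0x4; // low 3 bits: schedule
static const uint8_t kInPar       = 0x10;            // inside a parallel region
static const uint8_t kIsParConstr = 0x20;            // ICV for a parallel construct

// omp_sched_t values are 1..4; the descriptor stores them as 0..3.
static uint8_t encodeSched(int sched, uint8_t flags) {
  uint8_t val = static_cast<uint8_t>(sched - 1);   // 1..4 -> 0..3
  flags = static_cast<uint8_t>(flags & ~kSchedMask); // clear old schedule bits
  return static_cast<uint8_t>(flags | val);          // set the new ones
}

static int decodeSched(uint8_t flags) {
  return (flags & kSchedMask) + 1;                   // 0..3 -> 1..4
}

int main() {
  uint8_t flags = kInPar | kIsParConstr; // parallel construct, schedule bits 0
  flags = encodeSched(3, flags);         // omp_sched_guided is 3
  assert(decodeSched(flags) == 3);
  assert(flags & kInPar);                // unrelated bits stay untouched
  std::printf("flags = 0x%02x, sched = %d\n", flags, decodeSched(flags));
  return 0;
}

Clearing the mask before or-ing in the new value is what lets the same byte carry both the schedule and the region-nesting bits.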
+////////////////////////////////////////////////////////////////////////////////
+// Memory management runtime functions.
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
+          "MemIdx is too big or uninitialized.");
+  MemDataTy &MD = MemData[usedSlotIdx];
+  __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u);
+}
+
+INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
+                                                                size_t size) {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  const unsigned sm = usedSlotIdx;
+  MemDataTy &MD = MemData[sm];
+  unsigned i = hash(GetBlockIdInKernel());
+  while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) {
+    i = hash(i + 1);
+  }
+  usedSlotIdx = sm;
+  usedMemIdx = i;
+  return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
+}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
index 9540f5647699b..4a1a13cce2c28 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
@@ -1,28 +1,28 @@
-//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Interface to be used in the implementation of OpenMP cancel.
-//
-//===----------------------------------------------------------------------===//
-
-#include "interface.h"
-#include "common/debug.h"
-
-EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
-                                        int32_t cancelVal) {
-  PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal);
-  // disabled
-  return 0;
-}
-
-EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
-                             int32_t cancelVal) {
-  PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal);
-  // disabled
-  return 0;
-}
+//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface to be used in the implementation of OpenMP cancel.
+// +//===----------------------------------------------------------------------===// + +#include "interface.h" +#include "common/debug.h" + +EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, + int32_t cancelVal) { + PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal); + // disabled + return 0; +} + +EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, + int32_t cancelVal) { + PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal); + // disabled + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/critical.cu b/openmp/libomptarget/deviceRTLs/common/src/critical.cu index ee4b056ddad92..08fc053c33ce0 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/critical.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/critical.cu @@ -1,28 +1,28 @@ -//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of critical with KMPC interface -// -//===----------------------------------------------------------------------===// - -#include "interface.h" -#include "common/debug.h" - -EXTERN -void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *lck) { - PRINT0(LD_IO, "call to kmpc_critical()\n"); - omp_set_lock((omp_lock_t *)lck); -} - -EXTERN -void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *lck) { - PRINT0(LD_IO, "call to kmpc_end_critical()\n"); - omp_unset_lock((omp_lock_t *)lck); -} +//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of critical with KMPC interface +// +//===----------------------------------------------------------------------===// + +#include "interface.h" +#include "common/debug.h" + +EXTERN +void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *lck) { + PRINT0(LD_IO, "call to kmpc_critical()\n"); + omp_set_lock((omp_lock_t *)lck); +} + +EXTERN +void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *lck) { + PRINT0(LD_IO, "call to kmpc_end_critical()\n"); + omp_unset_lock((omp_lock_t *)lck); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu index f6523c8ce8aa2..0e10a6a2364d0 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -1,568 +1,568 @@ -//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of data sharing environments -// -//===----------------------------------------------------------------------===// -#include "common/omptarget.h" -#include "target_impl.h" - -// Return true if this is the master thread. -INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { - return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock(); -} - -/// Return the provided size aligned to the size of a pointer. -INLINE static size_t AlignVal(size_t Val) { - const size_t Align = (size_t)sizeof(void *); - if (Val & (Align - 1)) { - Val += Align; - Val &= ~(Align - 1); - } - return Val; -} - -#define DSFLAG 0 -#define DSFLAG_INIT 0 -#define DSPRINT(_flag, _str, _args...) \ - { \ - if (_flag) { \ - /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \ - } \ - } -#define DSPRINT0(_flag, _str) \ - { \ - if (_flag) { \ - /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \ - } \ - } - -// Initialize the shared data structures. This is expected to be called for the -// master thread and warp masters. \param RootS: A pointer to the root of the -// data sharing stack. \param InitialDataSize: The initial size of the data in -// the slot. -EXTERN void -__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS, - size_t InitialDataSize) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - DSPRINT0(DSFLAG_INIT, - "Entering __kmpc_initialize_data_sharing_environment\n"); - - unsigned WID = GetWarpId(); - DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID); - - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - __kmpc_data_sharing_slot *RootS = - teamDescr->RootS(WID, IsMasterThread(isSPMDMode())); - - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - - // We don't need to initialize the frame and active threads. - - DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize); - DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS); - DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n", - (unsigned long long)RootS->DataEnd); - DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", - (unsigned long long)RootS->Next); - DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n", - (unsigned long long)DataSharingState.SlotPtr[WID]); - DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n", - (unsigned long long)DataSharingState.StackPtr[WID]); - - DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n"); -} - -EXTERN void *__kmpc_data_sharing_environment_begin( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - size_t SharingDataSize, size_t SharingDefaultDataSize, - int16_t IsOMPRuntimeInitialized) { - - DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n"); - - // If the runtime has been elided, used shared memory for master-worker - // data sharing. 
- if (!IsOMPRuntimeInitialized) - return (void *)&DataSharingState; - - DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", - (unsigned long long)SharingDefaultDataSize); - - unsigned WID = GetWarpId(); - __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask(); - - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - void * volatile &FrameP = DataSharingState.FramePtr[WID]; - __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID]; - - DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); - // Save the current values. - *SavedSharedSlot = SlotP; - *SavedSharedStack = StackP; - *SavedSharedFrame = FrameP; - *SavedActiveThreads = ActiveT; - - DSPRINT(DSFLAG, "Warp ID: %u\n", WID); - DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP); - DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP); - DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP); - DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); - - // Only the warp active master needs to grow the stack. - if (__kmpc_impl_is_first_active_thread()) { - // Save the current active threads. - ActiveT = CurActiveThreads; - - // Make sure we use aligned sizes to avoid rematerialization of data. - SharingDataSize = AlignVal(SharingDataSize); - // FIXME: The default data size can be assumed to be aligned? - SharingDefaultDataSize = AlignVal(SharingDefaultDataSize); - - // Check if we have room for the data in the current slot. - const uintptr_t CurrentStartAddress = (uintptr_t)StackP; - const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequiredEndAddress = - CurrentStartAddress + (uintptr_t)SharingDataSize; - - DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", - (unsigned long long)SharingDefaultDataSize); - DSPRINT(DSFLAG, "Current Start Address %016llx\n", - (unsigned long long)CurrentStartAddress); - DSPRINT(DSFLAG, "Current End Address %016llx\n", - (unsigned long long)CurrentEndAddress); - DSPRINT(DSFLAG, "Required End Address %016llx\n", - (unsigned long long)RequiredEndAddress); - DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT); - - // If we require a new slot, allocate it and initialize it (or attempt to - // reuse one). Also, set the shared stack and slot pointers to the new - // place. If we do not need to grow the stack, just adapt the stack and - // frame pointers. - if (CurrentEndAddress < RequiredEndAddress) { - size_t NewSize = (SharingDataSize > SharingDefaultDataSize) - ? SharingDataSize - : SharingDefaultDataSize; - __kmpc_data_sharing_slot *NewSlot = 0; - - // Attempt to reuse an existing slot. 
- if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { - uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - - (uintptr_t)(&ExistingSlot->Data[0]); - if (ExistingSlotSize >= NewSize) { - DSPRINT(DSFLAG, "Reusing stack slot %016llx\n", - (unsigned long long)ExistingSlot); - NewSlot = ExistingSlot; - } else { - DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n", - (unsigned long long)SlotP->Next); - SafeFree(ExistingSlot, "Failed reuse"); - } - } - - if (!NewSlot) { - NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc( - sizeof(__kmpc_data_sharing_slot) + NewSize, - "Warp master slot allocation"); - DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n", - (unsigned long long)NewSlot, NewSize); - } - - NewSlot->Next = 0; - NewSlot->DataEnd = &NewSlot->Data[NewSize]; - - SlotP->Next = NewSlot; - SlotP = NewSlot; - StackP = &NewSlot->Data[SharingDataSize]; - FrameP = &NewSlot->Data[0]; - } else { - - // Clean up any old slot that we may still have. The slot producers, do - // not eliminate them because that may be used to return data. - if (SlotP->Next) { - DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n", - (unsigned long long)SlotP->Next); - SafeFree(SlotP->Next, "Old slot not required"); - SlotP->Next = 0; - } - - FrameP = StackP; - StackP = (void *)RequiredEndAddress; - } - } - - // FIXME: Need to see the impact of doing it here. - __kmpc_impl_threadfence_block(); - - DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n"); - - // All the threads in this warp get the frame they should work with. - return FrameP; -} - -EXTERN void __kmpc_data_sharing_environment_end( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - int32_t IsEntryPoint) { - - DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n"); - - unsigned WID = GetWarpId(); - - if (IsEntryPoint) { - if (__kmpc_impl_is_first_active_thread()) { - DSPRINT0(DSFLAG, "Doing clean up\n"); - - // The master thread cleans the saved slot, because this is an environment - // only for the master. - __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode()) - ? *SavedSharedSlot - : DataSharingState.SlotPtr[WID]; - - if (S->Next) { - SafeFree(S->Next, "Sharing environment end"); - S->Next = 0; - } - } - - DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n"); - return; - } - - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - - // Only the warp master can restore the stack and frame information, and only - // if there are no other threads left behind in this environment (i.e. the - // warp diverged and returns in different places). This only works if we - // assume that threads will converge right after the call site that started - // the environment. - if (__kmpc_impl_is_first_active_thread()) { - __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID]; - - DSPRINT0(DSFLAG, "Before restoring the stack\n"); - // Zero the bits in the mask. If it is still different from zero, then we - // have other threads that will return after the current ones. - ActiveT &= ~CurActive; - - DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", - (unsigned)CurActive, (unsigned)ActiveT); - - if (!ActiveT) { - // No other active threads? Great, lets restore the stack. 
- - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - void * volatile &FrameP = DataSharingState.FramePtr[WID]; - - SlotP = *SavedSharedSlot; - StackP = *SavedSharedStack; - FrameP = *SavedSharedFrame; - ActiveT = *SavedActiveThreads; - - DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", - (unsigned long long)SlotP); - DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", - (unsigned long long)StackP); - DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", - (unsigned long long)FrameP); - DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); - } - } - - // FIXME: Need to see the impact of doing it here. - __kmpc_impl_threadfence_block(); - - DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n"); - return; -} - -EXTERN void * -__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, - int16_t IsOMPRuntimeInitialized) { - DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n"); - - // If the runtime has been elided, use shared memory for master-worker - // data sharing. We're reusing the statically allocated data structure - // that is used for standard data sharing. - if (!IsOMPRuntimeInitialized) - return (void *)&DataSharingState; - - // Get the frame used by the requested thread. - - unsigned SourceWID = SourceThreadID / WARPSIZE; - - DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID); - - void * volatile P = DataSharingState.FramePtr[SourceWID]; - DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); - return P; -} - -//////////////////////////////////////////////////////////////////////////////// -// Runtime functions for trunk data sharing scheme. -//////////////////////////////////////////////////////////////////////////////// - -INLINE static void data_sharing_init_stack_common() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - - for (int WID = 0; WID < WARPSIZE; WID++) { - __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID); - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - } -} - -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called only by the MASTER thread of each -// team in non-SPMD mode. -EXTERN void __kmpc_data_sharing_init_stack() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. - data_sharing_init_stack_common(); - omptarget_nvptx_globalArgs.Init(); -} - -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called in SPMD mode only. -EXTERN void __kmpc_data_sharing_init_stack_spmd() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. 
- if (GetThreadIdInBlock() == 0) - data_sharing_init_stack_common(); - - __kmpc_impl_threadfence_block(); -} - -INLINE static void* data_sharing_push_stack_common(size_t PushSize) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - // Only warp active master threads manage the stack. - bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0; - - // Add worst-case padding to DataSize so that future stack allocations are - // correctly aligned. - const size_t Alignment = 8; - PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; - - // Frame pointer must be visible to all workers in the same warp. - const unsigned WID = GetWarpId(); - void *FrameP = 0; - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - - if (IsWarpMaster) { - // SlotP will point to either the shared memory slot or an existing - // global memory slot. - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - - // Check if we have room for the data in the current slot. - const uintptr_t StartAddress = (uintptr_t)StackP; - const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; - - // If we requested more data than there is room for in the rest - // of the slot then we need to either re-use the next slot, if one exists, - // or create a new slot. - if (EndAddress < RequestedEndAddress) { - __kmpc_data_sharing_slot *NewSlot = 0; - size_t NewSize = PushSize; - - // Allocate at least the default size for each type of slot. - // Master is a special case and even though there is only one thread, - // it can share more things with the workers. For uniformity, it uses - // the full size of a worker warp slot. - size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; - if (DefaultSlotSize > NewSize) - NewSize = DefaultSlotSize; - NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc( - sizeof(__kmpc_data_sharing_slot) + NewSize, - "Global memory slot allocation."); - - NewSlot->Next = 0; - NewSlot->Prev = SlotP; - NewSlot->PrevSlotStackPtr = StackP; - NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; - - // Make previous slot point to the newly allocated slot. - SlotP->Next = NewSlot; - // The current slot becomes the new slot. - SlotP = NewSlot; - // The stack pointer always points to the next free stack frame. - StackP = &NewSlot->Data[0] + PushSize; - // The frame pointer always points to the beginning of the frame. - FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0]; - } else { - // Add the data chunk to the current slot. The frame pointer is set to - // point to the start of the new frame held in StackP. - FrameP = DataSharingState.FramePtr[WID] = StackP; - // Reset stack pointer to the requested address. - StackP = (void *)RequestedEndAddress; - } - } - // Get address from lane 0. - int *FP = (int *)&FrameP; - FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0); - if (sizeof(FrameP) == 8) - FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0); - - return FrameP; -} - -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - return data_sharing_push_stack_common(DataSize); -} - -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. 
If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. -EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - // Compute the total memory footprint of the requested data. - // The master thread requires a stack only for itself. A worker - // thread (which at this point is a warp master) will require - // space for the variables of each thread in the warp, - // i.e. one DataSize chunk per warp lane. - // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) - ? DataSize - : WARPSIZE * DataSize; - - // Compute the start address of the frame of each thread in the warp. - uintptr_t FrameStartAddress = - (uintptr_t) data_sharing_push_stack_common(PushSize); - FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize); - return (void *)FrameStartAddress; -} - -// Pop the stack and free any memory which can be reclaimed. -// -// When the pop operation removes the last global memory slot, -// reclaim all outstanding global memory slots since it is -// likely we have reached the end of the kernel. -EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - __kmpc_impl_threadfence_block(); - - if (GetThreadIdInBlock() % WARPSIZE == 0) { - unsigned WID = GetWarpId(); - - // Current slot - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - - // Pointer to next available stack. - void *&StackP = DataSharingState.StackPtr[WID]; - - // Pop the frame. - StackP = FrameStart; - - // If the current slot is empty, we need to free the slot after the - // pop. - bool SlotEmpty = (StackP == &SlotP->Data[0]); - - if (SlotEmpty && SlotP->Prev) { - // Before removing the slot we need to reset StackP. - StackP = SlotP->PrevSlotStackPtr; - - // Remove the slot. - SlotP = SlotP->Prev; - SafeFree(SlotP->Next, "Free slot."); - SlotP->Next = 0; - } - } -} - -// Begin a data sharing context. Maintain a list of references to shared -// variables. This list of references to shared variables will be passed -// to one or more threads. -// In L0 data sharing this is called by master thread. -// In L1 data sharing this is called by active warp master thread. -EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) { - omptarget_nvptx_globalArgs.EnsureSize(nArgs); - *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); -} - -// End a data sharing context. There is no need to have a list of refs -// to shared variables because the context in which those variables were -// shared has now ended. This should clean-up the list of references only -// without affecting the actual global storage of the variables. -// In L0 data sharing this is called by master thread. -// In L1 data sharing this is called by active warp master thread. -EXTERN void __kmpc_end_sharing_variables() { - omptarget_nvptx_globalArgs.DeInit(); -} - -// This function will return a list of references to global variables. This -// is how the workers will get a reference to the globalized variable. The -// members of this list will be passed to the outlined parallel function -// preserving the order. -// Called by all workers. -EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { - *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); -} - -// This function is used to init static memory manager. 
This manager is used to -// manage statically allocated global memory. This memory is allocated by the -// compiler and used to correctly implement globalization of the variables in -// target, teams and distribute regions. -EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, - const void *buf, size_t size, - int16_t is_shared, - const void **frame) { - if (is_shared) { - *frame = buf; - return; - } - if (isSPMDExecutionMode) { - if (GetThreadIdInBlock() == 0) { - *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); - } - __kmpc_impl_syncthreads(); - return; - } - ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), - "Must be called only in the target master thread."); - *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); - __kmpc_impl_threadfence(); -} - -EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, - int16_t is_shared) { - if (is_shared) - return; - if (isSPMDExecutionMode) { - __kmpc_impl_syncthreads(); - if (GetThreadIdInBlock() == 0) { - omptarget_nvptx_simpleMemoryManager.Release(); - } - return; - } - __kmpc_impl_threadfence(); - ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), - "Must be called only in the target master thread."); - omptarget_nvptx_simpleMemoryManager.Release(); -} - +//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of data sharing environments +// +//===----------------------------------------------------------------------===// +#include "common/omptarget.h" +#include "target_impl.h" + +// Return true if this is the master thread. +INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { + return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock(); +} + +/// Return the provided size aligned to the size of a pointer. +INLINE static size_t AlignVal(size_t Val) { + const size_t Align = (size_t)sizeof(void *); + if (Val & (Align - 1)) { + Val += Align; + Val &= ~(Align - 1); + } + return Val; +} + +#define DSFLAG 0 +#define DSFLAG_INIT 0 +#define DSPRINT(_flag, _str, _args...) \ + { \ + if (_flag) { \ + /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \ + } \ + } +#define DSPRINT0(_flag, _str) \ + { \ + if (_flag) { \ + /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \ + } \ + } + +// Initialize the shared data structures. This is expected to be called for the +// master thread and warp masters. \param RootS: A pointer to the root of the +// data sharing stack. \param InitialDataSize: The initial size of the data in +// the slot. 
+EXTERN void +__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS, + size_t InitialDataSize) { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + DSPRINT0(DSFLAG_INIT, + "Entering __kmpc_initialize_data_sharing_environment\n"); + + unsigned WID = GetWarpId(); + DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID); + + omptarget_nvptx_TeamDescr *teamDescr = + &omptarget_nvptx_threadPrivateContext->TeamContext(); + __kmpc_data_sharing_slot *RootS = + teamDescr->RootS(WID, IsMasterThread(isSPMDMode())); + + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; + + // We don't need to initialize the frame and active threads. + + DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize); + DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS); + DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n", + (unsigned long long)RootS->DataEnd); + DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", + (unsigned long long)RootS->Next); + DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n", + (unsigned long long)DataSharingState.SlotPtr[WID]); + DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n", + (unsigned long long)DataSharingState.StackPtr[WID]); + + DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n"); +} + +EXTERN void *__kmpc_data_sharing_environment_begin( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, + size_t SharingDataSize, size_t SharingDefaultDataSize, + int16_t IsOMPRuntimeInitialized) { + + DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n"); + + // If the runtime has been elided, used shared memory for master-worker + // data sharing. + if (!IsOMPRuntimeInitialized) + return (void *)&DataSharingState; + + DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); + DSPRINT(DSFLAG, "Default Data Size %016llx\n", + (unsigned long long)SharingDefaultDataSize); + + unsigned WID = GetWarpId(); + __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask(); + + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; + __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID]; + + DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); + // Save the current values. + *SavedSharedSlot = SlotP; + *SavedSharedStack = StackP; + *SavedSharedFrame = FrameP; + *SavedActiveThreads = ActiveT; + + DSPRINT(DSFLAG, "Warp ID: %u\n", WID); + DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP); + DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP); + DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP); + DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); + + // Only the warp active master needs to grow the stack. + if (__kmpc_impl_is_first_active_thread()) { + // Save the current active threads. + ActiveT = CurActiveThreads; + + // Make sure we use aligned sizes to avoid rematerialization of data. + SharingDataSize = AlignVal(SharingDataSize); + // FIXME: The default data size can be assumed to be aligned? + SharingDefaultDataSize = AlignVal(SharingDefaultDataSize); + + // Check if we have room for the data in the current slot. 
+ const uintptr_t CurrentStartAddress = (uintptr_t)StackP; + const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd; + const uintptr_t RequiredEndAddress = + CurrentStartAddress + (uintptr_t)SharingDataSize; + + DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); + DSPRINT(DSFLAG, "Default Data Size %016llx\n", + (unsigned long long)SharingDefaultDataSize); + DSPRINT(DSFLAG, "Current Start Address %016llx\n", + (unsigned long long)CurrentStartAddress); + DSPRINT(DSFLAG, "Current End Address %016llx\n", + (unsigned long long)CurrentEndAddress); + DSPRINT(DSFLAG, "Required End Address %016llx\n", + (unsigned long long)RequiredEndAddress); + DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT); + + // If we require a new slot, allocate it and initialize it (or attempt to + // reuse one). Also, set the shared stack and slot pointers to the new + // place. If we do not need to grow the stack, just adapt the stack and + // frame pointers. + if (CurrentEndAddress < RequiredEndAddress) { + size_t NewSize = (SharingDataSize > SharingDefaultDataSize) + ? SharingDataSize + : SharingDefaultDataSize; + __kmpc_data_sharing_slot *NewSlot = 0; + + // Attempt to reuse an existing slot. + if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { + uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - + (uintptr_t)(&ExistingSlot->Data[0]); + if (ExistingSlotSize >= NewSize) { + DSPRINT(DSFLAG, "Reusing stack slot %016llx\n", + (unsigned long long)ExistingSlot); + NewSlot = ExistingSlot; + } else { + DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n", + (unsigned long long)SlotP->Next); + SafeFree(ExistingSlot, "Failed reuse"); + } + } + + if (!NewSlot) { + NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc( + sizeof(__kmpc_data_sharing_slot) + NewSize, + "Warp master slot allocation"); + DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n", + (unsigned long long)NewSlot, NewSize); + } + + NewSlot->Next = 0; + NewSlot->DataEnd = &NewSlot->Data[NewSize]; + + SlotP->Next = NewSlot; + SlotP = NewSlot; + StackP = &NewSlot->Data[SharingDataSize]; + FrameP = &NewSlot->Data[0]; + } else { + + // Clean up any old slot that we may still have. The slot producers, do + // not eliminate them because that may be used to return data. + if (SlotP->Next) { + DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n", + (unsigned long long)SlotP->Next); + SafeFree(SlotP->Next, "Old slot not required"); + SlotP->Next = 0; + } + + FrameP = StackP; + StackP = (void *)RequiredEndAddress; + } + } + + // FIXME: Need to see the impact of doing it here. + __kmpc_impl_threadfence_block(); + + DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n"); + + // All the threads in this warp get the frame they should work with. + return FrameP; +} + +EXTERN void __kmpc_data_sharing_environment_end( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, + int32_t IsEntryPoint) { + + DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n"); + + unsigned WID = GetWarpId(); + + if (IsEntryPoint) { + if (__kmpc_impl_is_first_active_thread()) { + DSPRINT0(DSFLAG, "Doing clean up\n"); + + // The master thread cleans the saved slot, because this is an environment + // only for the master. + __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode()) + ? 
*SavedSharedSlot
+                                      : DataSharingState.SlotPtr[WID];
+
+      if (S->Next) {
+        SafeFree(S->Next, "Sharing environment end");
+        S->Next = 0;
+      }
+    }
+
+    DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
+    return;
+  }
+
+  __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
+
+  // Only the warp master can restore the stack and frame information, and only
+  // if there are no other threads left behind in this environment (i.e. the
+  // warp diverged and returns in different places). This only works if we
+  // assume that threads will converge right after the call site that started
+  // the environment.
+  if (__kmpc_impl_is_first_active_thread()) {
+    __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID];
+
+    DSPRINT0(DSFLAG, "Before restoring the stack\n");
+    // Zero the bits in the mask. If it is still different from zero, then we
+    // have other threads that will return after the current ones.
+    ActiveT &= ~CurActive;
+
+    DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
+            (unsigned)CurActive, (unsigned)ActiveT);
+
+    if (!ActiveT) {
+      // No other active threads? Great, let's restore the stack.
+
+      __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+      void *&StackP = DataSharingState.StackPtr[WID];
+      void * volatile &FrameP = DataSharingState.FramePtr[WID];
+
+      SlotP = *SavedSharedSlot;
+      StackP = *SavedSharedStack;
+      FrameP = *SavedSharedFrame;
+      ActiveT = *SavedActiveThreads;
+
+      DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
+              (unsigned long long)SlotP);
+      DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
+              (unsigned long long)StackP);
+      DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
+              (unsigned long long)FrameP);
+      DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
+    }
+  }
+
+  // FIXME: Need to see the impact of doing it here.
+  __kmpc_impl_threadfence_block();
+
+  DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
+  return;
+}
+
+EXTERN void *
+__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
+                                          int16_t IsOMPRuntimeInitialized) {
+  DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");
+
+  // If the runtime has been elided, use shared memory for master-worker
+  // data sharing. We're reusing the statically allocated data structure
+  // that is used for standard data sharing.
+  if (!IsOMPRuntimeInitialized)
+    return (void *)&DataSharingState;
+
+  // Get the frame used by the requested thread.
+
+  unsigned SourceWID = SourceThreadID / WARPSIZE;
+
+  DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID);
+
+  void * volatile P = DataSharingState.FramePtr[SourceWID];
+  DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
+  return P;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Runtime functions for trunk data sharing scheme.
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE static void data_sharing_init_stack_common() {
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
+  omptarget_nvptx_TeamDescr *teamDescr =
+      &omptarget_nvptx_threadPrivateContext->TeamContext();
+
+  for (int WID = 0; WID < WARPSIZE; WID++) {
+    __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
+    DataSharingState.SlotPtr[WID] = RootS;
+    DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+  }
+}
+
+// Initialize data sharing data structure. 
This function needs to be called +// once at the beginning of a data sharing context (coincides with the kernel +// initialization). This function is called only by the MASTER thread of each +// team in non-SPMD mode. +EXTERN void __kmpc_data_sharing_init_stack() { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + // This function initializes the stack pointer with the pointer to the + // statically allocated shared memory slots. The size of a shared memory + // slot is pre-determined to be 256 bytes. + data_sharing_init_stack_common(); + omptarget_nvptx_globalArgs.Init(); +} + +// Initialize data sharing data structure. This function needs to be called +// once at the beginning of a data sharing context (coincides with the kernel +// initialization). This function is called in SPMD mode only. +EXTERN void __kmpc_data_sharing_init_stack_spmd() { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + // This function initializes the stack pointer with the pointer to the + // statically allocated shared memory slots. The size of a shared memory + // slot is pre-determined to be 256 bytes. + if (GetThreadIdInBlock() == 0) + data_sharing_init_stack_common(); + + __kmpc_impl_threadfence_block(); +} + +INLINE static void* data_sharing_push_stack_common(size_t PushSize) { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); + + // Only warp active master threads manage the stack. + bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0; + + // Add worst-case padding to DataSize so that future stack allocations are + // correctly aligned. + const size_t Alignment = 8; + PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; + + // Frame pointer must be visible to all workers in the same warp. + const unsigned WID = GetWarpId(); + void *FrameP = 0; + __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); + + if (IsWarpMaster) { + // SlotP will point to either the shared memory slot or an existing + // global memory slot. + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; + + // Check if we have room for the data in the current slot. + const uintptr_t StartAddress = (uintptr_t)StackP; + const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; + const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; + + // If we requested more data than there is room for in the rest + // of the slot then we need to either re-use the next slot, if one exists, + // or create a new slot. + if (EndAddress < RequestedEndAddress) { + __kmpc_data_sharing_slot *NewSlot = 0; + size_t NewSize = PushSize; + + // Allocate at least the default size for each type of slot. + // Master is a special case and even though there is only one thread, + // it can share more things with the workers. For uniformity, it uses + // the full size of a worker warp slot. + size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; + if (DefaultSlotSize > NewSize) + NewSize = DefaultSlotSize; + NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc( + sizeof(__kmpc_data_sharing_slot) + NewSize, + "Global memory slot allocation."); + + NewSlot->Next = 0; + NewSlot->Prev = SlotP; + NewSlot->PrevSlotStackPtr = StackP; + NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; + + // Make previous slot point to the newly allocated slot. + SlotP->Next = NewSlot; + // The current slot becomes the new slot. + SlotP = NewSlot; + // The stack pointer always points to the next free stack frame. 
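+      // Illustrative layout (hypothetical sizes): for a PushSize of 64 bytes
+      // in a freshly allocated 512-byte slot, the frame occupies Data[0..63],
+      // FrameP points at Data[0] and StackP is left at Data[64], the next
+      // free byte.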
+      StackP = &NewSlot->Data[0] + PushSize;
+      // The frame pointer always points to the beginning of the frame.
+      FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
+    } else {
+      // Add the data chunk to the current slot. The frame pointer is set to
+      // point to the start of the new frame held in StackP.
+      FrameP = DataSharingState.FramePtr[WID] = StackP;
+      // Reset stack pointer to the requested address.
+      StackP = (void *)RequestedEndAddress;
+    }
+  }
+  // Get address from lane 0.
+  int *FP = (int *)&FrameP;
+  FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
+  if (sizeof(FrameP) == 8)
+    FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);
+
+  return FrameP;
+}
+
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+                                                      int16_t UseSharedMemory) {
+  return data_sharing_push_stack_common(DataSize);
+}
+
+// Called at the time of the kernel initialization. This is used to initialize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
+                                            int16_t UseSharedMemory) {
+  // Compute the total memory footprint of the requested data.
+  // The master thread requires a stack only for itself. A worker
+  // thread (which at this point is a warp master) will require
+  // space for the variables of each thread in the warp,
+  // i.e. one DataSize chunk per warp lane.
+  // TODO: change WARPSIZE to the number of active threads in the warp.
+  size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
+                        ? DataSize
+                        : WARPSIZE * DataSize;
+
+  // Compute the start address of the frame of each thread in the warp.
+  uintptr_t FrameStartAddress =
+      (uintptr_t) data_sharing_push_stack_common(PushSize);
+  FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize);
+  return (void *)FrameStartAddress;
+}
+
+// Pop the stack and free any memory which can be reclaimed.
+//
+// When the pop operation removes the last global memory slot,
+// reclaim all outstanding global memory slots since it is
+// likely we have reached the end of the kernel.
+EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
+
+  __kmpc_impl_threadfence_block();
+
+  if (GetThreadIdInBlock() % WARPSIZE == 0) {
+    unsigned WID = GetWarpId();
+
+    // Current slot
+    __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+
+    // Pointer to next available stack.
+    void *&StackP = DataSharingState.StackPtr[WID];
+
+    // Pop the frame.
+    StackP = FrameStart;
+
+    // If the current slot is empty, we need to free the slot after the
+    // pop.
+    bool SlotEmpty = (StackP == &SlotP->Data[0]);
+
+    if (SlotEmpty && SlotP->Prev) {
+      // Before removing the slot we need to reset StackP.
+      StackP = SlotP->PrevSlotStackPtr;
+
+      // Remove the slot.
+      SlotP = SlotP->Prev;
+      SafeFree(SlotP->Next, "Free slot.");
+      SlotP->Next = 0;
+    }
+  }
+}
+
+// Begin a data sharing context. Maintain a list of references to shared
+// variables. This list of references to shared variables will be passed
+// to one or more threads.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread. 
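+//
+// A compiler-generated call sequence could look roughly like the sketch below
+// (hypothetical shared variables a and b, two arguments assumed):
+//
+//   void **args;
+//   __kmpc_begin_sharing_variables(&args, 2); // publisher obtains the list
+//   args[0] = &a;
+//   args[1] = &b;
+//   ... workers fetch the same list via __kmpc_get_shared_variables ...
+//   __kmpc_end_sharing_variables();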
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) { + omptarget_nvptx_globalArgs.EnsureSize(nArgs); + *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); +} + +// End a data sharing context. There is no need to have a list of refs +// to shared variables because the context in which those variables were +// shared has now ended. This should clean-up the list of references only +// without affecting the actual global storage of the variables. +// In L0 data sharing this is called by master thread. +// In L1 data sharing this is called by active warp master thread. +EXTERN void __kmpc_end_sharing_variables() { + omptarget_nvptx_globalArgs.DeInit(); +} + +// This function will return a list of references to global variables. This +// is how the workers will get a reference to the globalized variable. The +// members of this list will be passed to the outlined parallel function +// preserving the order. +// Called by all workers. +EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { + *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); +} + +// This function is used to init static memory manager. This manager is used to +// manage statically allocated global memory. This memory is allocated by the +// compiler and used to correctly implement globalization of the variables in +// target, teams and distribute regions. +EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + const void *buf, size_t size, + int16_t is_shared, + const void **frame) { + if (is_shared) { + *frame = buf; + return; + } + if (isSPMDExecutionMode) { + if (GetThreadIdInBlock() == 0) { + *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); + } + __kmpc_impl_syncthreads(); + return; + } + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "Must be called only in the target master thread."); + *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); + __kmpc_impl_threadfence(); +} + +EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + int16_t is_shared) { + if (is_shared) + return; + if (isSPMDExecutionMode) { + __kmpc_impl_syncthreads(); + if (GetThreadIdInBlock() == 0) { + omptarget_nvptx_simpleMemoryManager.Release(); + } + return; + } + __kmpc_impl_threadfence(); + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "Must be called only in the target master thread."); + omptarget_nvptx_simpleMemoryManager.Release(); +} + diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu index 89c481bcf8da3..c3cc51c7c3625 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -1,414 +1,414 @@ -//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the OpenMP runtime functions that can be -// invoked by the user in an OpenMP region -// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "common/target_atomic.h" -#include "target_impl.h" - -EXTERN double omp_get_wtick(void) { - double rc = __kmpc_impl_get_wtick(); - PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc); - return rc; -} - -EXTERN double omp_get_wtime(void) { - double rc = __kmpc_impl_get_wtime(); - PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc); - return rc; -} - -EXTERN void omp_set_num_threads(int num) { - // Ignore it for SPMD mode. - if (isSPMDMode()) - return; - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); - if (num <= 0) { - WARNING0(LW_INPUT, "expected positive num; ignore\n"); - } else if (parallelLevel[GetWarpId()] == 0) { - nThreads = num; - } -} - -EXTERN int omp_get_num_threads(void) { - int rc = GetNumberOfOmpThreads(isSPMDMode()); - PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); - return rc; -} - -EXTERN int omp_get_max_threads(void) { - if (parallelLevel[GetWarpId()] > 0) - // We're already in parallel region. - return 1; // default is 1 thread avail - // Not currently in a parallel region, return what was set. - int rc = 1; - if (parallelLevel[GetWarpId()] == 0) - rc = nThreads; - ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); - PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc); - return rc; -} - -EXTERN int omp_get_thread_limit(void) { - if (isSPMDMode()) - return GetNumberOfThreadsInBlock(); - int rc = threadLimit; - PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); - return rc; -} - -EXTERN int omp_get_thread_num() { - bool isSPMDExecutionMode = isSPMDMode(); - int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - int rc = GetOmpThreadId(tid, isSPMDExecutionMode); - PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); - return rc; -} - -EXTERN int omp_get_num_procs(void) { - int rc = GetNumberOfProcsInDevice(isSPMDMode()); - PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc); - return rc; -} - -EXTERN int omp_in_parallel(void) { - int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; - PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); - return rc; -} - -EXTERN int omp_in_final(void) { - // treat all tasks as final... Specs may expect runtime to keep - // track more precisely if a task was actively set by users... This - // is not explicitly specified; will treat as if runtime can - // actively decide to put a non-final task into a final one. 
- int rc = 1; - PRINT(LD_IO, "call omp_in_final() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_dynamic(int flag) { - PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag); -} - -EXTERN int omp_get_dynamic(void) { - int rc = 0; - PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_nested(int flag) { - PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n", - flag); -} - -EXTERN int omp_get_nested(void) { - int rc = 0; - PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_max_active_levels(int level) { - PRINT(LD_IO, - "call omp_set_max_active_levels(%d) is ignored (no nested support)\n", - level); -} - -EXTERN int omp_get_max_active_levels(void) { - int rc = 1; - PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc); - return rc; -} - -EXTERN int omp_get_level(void) { - int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); - PRINT(LD_IO, "call omp_get_level() returns %d\n", level); - return level; -} - -EXTERN int omp_get_active_level(void) { - int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; - PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) - return level; -} - -EXTERN int omp_get_ancestor_thread_num(int level) { - if (isSPMDMode()) - return level == 1 ? GetThreadIdInBlock() : 0; - int rc = -1; - // If level is 0 or all parallel regions are not active - return 0. - unsigned parLevel = parallelLevel[GetWarpId()]; - if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { - int totLevel = omp_get_level(); - if (level <= totLevel) { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); - int steps = totLevel - level; - PRINT(LD_IO, "backtrack %d steps\n", steps); - ASSERT0(LT_FUSSY, currTaskDescr, - "do not expect fct to be called in a non-active thread"); - do { - if (DON(LD_IOD)) { - // print current state - omp_sched_t sched = currTaskDescr->GetRuntimeSched(); - PRINT(LD_ALL, - "task descr %s %d: %s, in par %d, rt sched %d," - " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n", - "ancestor", steps, - (currTaskDescr->IsParallelConstruct() ? "par" : "task"), - (int)currTaskDescr->InParallelRegion(), (int)sched, - currTaskDescr->RuntimeChunkSize(), - (int)currTaskDescr->ThreadId(), (int)threadsInTeam, - (int)nThreads); - } - - if (currTaskDescr->IsParallelConstruct()) { - // found the level - if (!steps) { - rc = currTaskDescr->ThreadId(); - break; - } - steps--; - } - currTaskDescr = currTaskDescr->GetPrevTaskDescr(); - } while (currTaskDescr); - ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); - } - } else if (level == 0 || - (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && - level <= parLevel) || - (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && - level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { - rc = 0; - } - PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, - rc) - return rc; -} - -EXTERN int omp_get_team_size(int level) { - if (isSPMDMode()) - return level == 1 ? GetNumberOfThreadsInBlock() : 1; - int rc = -1; - unsigned parLevel = parallelLevel[GetWarpId()]; - // If level is 0 or all parallel regions are not active - return 1. 
- if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { - rc = threadsInTeam; - } else if (level == 0 || - (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && - level <= parLevel) || - (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && - level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { - rc = 1; - } - PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc) - return rc; -} - -EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), - "Expected SPMD mode only with uninitialized runtime."); - *kind = omp_sched_static; - *modifier = 1; - } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(isSPMDMode()); - *kind = currTaskDescr->GetRuntimeSched(); - *modifier = currTaskDescr->RuntimeChunkSize(); - } - PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n", - (int)*kind, *modifier); -} - -EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) { - PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind, - modifier); - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), - "Expected SPMD mode only with uninitialized runtime."); - return; - } - if (kind >= omp_sched_static && kind < omp_sched_auto) { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(isSPMDMode()); - currTaskDescr->SetRuntimeSched(kind); - currTaskDescr->RuntimeChunkSize() = modifier; - PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n", - (int)currTaskDescr->GetRuntimeSched(), - currTaskDescr->RuntimeChunkSize()); - } -} - -EXTERN omp_proc_bind_t omp_get_proc_bind(void) { - PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n"); - return omp_proc_bind_true; -} - -EXTERN int omp_get_num_places(void) { - PRINT0(LD_IO, "call omp_get_num_places() returns 0\n"); - return 0; -} - -EXTERN int omp_get_place_num_procs(int place_num) { - PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n"); - return 0; -} - -EXTERN void omp_get_place_proc_ids(int place_num, int *ids) { - PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n"); -} - -EXTERN int omp_get_place_num(void) { - PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n"); - return 0; -} - -EXTERN int omp_get_partition_num_places(void) { - PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n"); - return 0; -} - -EXTERN void omp_get_partition_place_nums(int *place_nums) { - PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n"); -} - -EXTERN int omp_get_cancellation(void) { - int rc = 0; - PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_default_device(int deviceId) { - PRINT0(LD_IO, "call omp_get_default_device() is undef on device\n"); -} - -EXTERN int omp_get_default_device(void) { - PRINT0(LD_IO, - "call omp_get_default_device() is undef on device, returns 0\n"); - return 0; -} - -EXTERN int omp_get_num_devices(void) { - PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n"); - return 0; -} - -EXTERN int omp_get_num_teams(void) { - int rc = GetNumberOfOmpTeams(); - PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc); - return rc; -} - -EXTERN int omp_get_team_num() { - int rc = GetOmpTeamId(); - PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc); - return rc; -} - -EXTERN int omp_is_initial_device(void) { - PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); - return 0; // 0 by def on device -} - -// Unspecified on the device. 
-EXTERN int omp_get_initial_device(void) { - PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n"); - return 0; -} - -// Unused for now. -EXTERN int omp_get_max_task_priority(void) { - PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n"); - return 0; -} - -//////////////////////////////////////////////////////////////////////////////// -// locks -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void omp_init_lock(omp_lock_t *lock) { - __kmpc_impl_init_lock(lock); - PRINT0(LD_IO, "call omp_init_lock()\n"); -} - -EXTERN void omp_destroy_lock(omp_lock_t *lock) { - __kmpc_impl_destroy_lock(lock); - PRINT0(LD_IO, "call omp_destroy_lock()\n"); -} - -EXTERN void omp_set_lock(omp_lock_t *lock) { - __kmpc_impl_set_lock(lock); - PRINT0(LD_IO, "call omp_set_lock()\n"); -} - -EXTERN void omp_unset_lock(omp_lock_t *lock) { - __kmpc_impl_unset_lock(lock); - PRINT0(LD_IO, "call omp_unset_lock()\n"); -} - -EXTERN int omp_test_lock(omp_lock_t *lock) { - int rc = __kmpc_impl_test_lock(lock); - PRINT(LD_IO, "call omp_test_lock() return %d\n", rc); - return rc; -} - -// for xlf Fortran -// Fortran, the return is LOGICAL type - -#define FLOGICAL long -EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() { - int ret = omp_is_initial_device(); - if (ret == 0) - return (FLOGICAL)0; - else - return (FLOGICAL)1; -} - -EXTERN int __xlf_omp_is_initial_device_i4() { - int ret = omp_is_initial_device(); - if (ret == 0) - return 0; - else - return 1; -} - -EXTERN long __xlf_omp_get_team_num_i4() { - int ret = omp_get_team_num(); - return (long)ret; -} - -EXTERN long __xlf_omp_get_num_teams_i4() { - int ret = omp_get_num_teams(); - return (long)ret; -} - -EXTERN void xlf_debug_print_int(int *p) { - printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_long(long *p) { - printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_float(float *p) { - printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_double(double *p) { - printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_addr(void *p) { - printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p); -} +//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the OpenMP runtime functions that can be +// invoked by the user in an OpenMP region +// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "common/target_atomic.h" +#include "target_impl.h" + +EXTERN double omp_get_wtick(void) { + double rc = __kmpc_impl_get_wtick(); + PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc); + return rc; +} + +EXTERN double omp_get_wtime(void) { + double rc = __kmpc_impl_get_wtime(); + PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc); + return rc; +} + +EXTERN void omp_set_num_threads(int num) { + // Ignore it for SPMD mode. 
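+  // (In SPMD mode the team size is fixed when the kernel is launched, so the
+  // request could not take effect anyway.)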
+ if (isSPMDMode()) + return; + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); + PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); + if (num <= 0) { + WARNING0(LW_INPUT, "expected positive num; ignore\n"); + } else if (parallelLevel[GetWarpId()] == 0) { + nThreads = num; + } +} + +EXTERN int omp_get_num_threads(void) { + int rc = GetNumberOfOmpThreads(isSPMDMode()); + PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_max_threads(void) { + if (parallelLevel[GetWarpId()] > 0) + // We're already in parallel region. + return 1; // default is 1 thread avail + // Not currently in a parallel region, return what was set. + int rc = 1; + if (parallelLevel[GetWarpId()] == 0) + rc = nThreads; + ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); + PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_thread_limit(void) { + if (isSPMDMode()) + return GetNumberOfThreadsInBlock(); + int rc = threadLimit; + PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_thread_num() { + bool isSPMDExecutionMode = isSPMDMode(); + int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + int rc = GetOmpThreadId(tid, isSPMDExecutionMode); + PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_num_procs(void) { + int rc = GetNumberOfProcsInDevice(isSPMDMode()); + PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc); + return rc; +} + +EXTERN int omp_in_parallel(void) { + int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; + PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); + return rc; +} + +EXTERN int omp_in_final(void) { + // treat all tasks as final... Specs may expect runtime to keep + // track more precisely if a task was actively set by users... This + // is not explicitly specified; will treat as if runtime can + // actively decide to put a non-final task into a final one. + int rc = 1; + PRINT(LD_IO, "call omp_in_final() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_dynamic(int flag) { + PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag); +} + +EXTERN int omp_get_dynamic(void) { + int rc = 0; + PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_nested(int flag) { + PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n", + flag); +} + +EXTERN int omp_get_nested(void) { + int rc = 0; + PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_max_active_levels(int level) { + PRINT(LD_IO, + "call omp_set_max_active_levels(%d) is ignored (no nested support)\n", + level); +} + +EXTERN int omp_get_max_active_levels(void) { + int rc = 1; + PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_level(void) { + int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); + PRINT(LD_IO, "call omp_get_level() returns %d\n", level); + return level; +} + +EXTERN int omp_get_active_level(void) { + int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; + PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) + return level; +} + +EXTERN int omp_get_ancestor_thread_num(int level) { + if (isSPMDMode()) + return level == 1 ? GetThreadIdInBlock() : 0; + int rc = -1; + // If level is 0 or all parallel regions are not active - return 0. 
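+  // parallelLevel appears to keep the nesting depth in its low bits and an
+  // "active" flag at OMP_ACTIVE_PARALLEL_LEVEL (see the masking in
+  // omp_get_level above); e.g. a single active parallel region would be
+  // stored as OMP_ACTIVE_PARALLEL_LEVEL + 1.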
+ unsigned parLevel = parallelLevel[GetWarpId()]; + if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { + int totLevel = omp_get_level(); + if (level <= totLevel) { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); + int steps = totLevel - level; + PRINT(LD_IO, "backtrack %d steps\n", steps); + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (DON(LD_IOD)) { + // print current state + omp_sched_t sched = currTaskDescr->GetRuntimeSched(); + PRINT(LD_ALL, + "task descr %s %d: %s, in par %d, rt sched %d," + " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n", + "ancestor", steps, + (currTaskDescr->IsParallelConstruct() ? "par" : "task"), + (int)currTaskDescr->InParallelRegion(), (int)sched, + currTaskDescr->RuntimeChunkSize(), + (int)currTaskDescr->ThreadId(), (int)threadsInTeam, + (int)nThreads); + } + + if (currTaskDescr->IsParallelConstruct()) { + // found the level + if (!steps) { + rc = currTaskDescr->ThreadId(); + break; + } + steps--; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); + } + } else if (level == 0 || + (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && + level <= parLevel) || + (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && + level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { + rc = 0; + } + PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, + rc) + return rc; +} + +EXTERN int omp_get_team_size(int level) { + if (isSPMDMode()) + return level == 1 ? GetNumberOfThreadsInBlock() : 1; + int rc = -1; + unsigned parLevel = parallelLevel[GetWarpId()]; + // If level is 0 or all parallel regions are not active - return 1. 
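+  // For example, with one active level-1 region of, say, 64 threads,
+  // omp_get_team_size(1) returns threadsInTeam (64 here); any other valid
+  // level reports a team size of 1.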
+ if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { + rc = threadsInTeam; + } else if (level == 0 || + (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && + level <= parLevel) || + (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && + level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { + rc = 1; + } + PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc) + return rc; +} + +EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + *kind = omp_sched_static; + *modifier = 1; + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + *kind = currTaskDescr->GetRuntimeSched(); + *modifier = currTaskDescr->RuntimeChunkSize(); + } + PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n", + (int)*kind, *modifier); +} + +EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) { + PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind, + modifier); + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + return; + } + if (kind >= omp_sched_static && kind < omp_sched_auto) { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + currTaskDescr->SetRuntimeSched(kind); + currTaskDescr->RuntimeChunkSize() = modifier; + PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n", + (int)currTaskDescr->GetRuntimeSched(), + currTaskDescr->RuntimeChunkSize()); + } +} + +EXTERN omp_proc_bind_t omp_get_proc_bind(void) { + PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n"); + return omp_proc_bind_true; +} + +EXTERN int omp_get_num_places(void) { + PRINT0(LD_IO, "call omp_get_num_places() returns 0\n"); + return 0; +} + +EXTERN int omp_get_place_num_procs(int place_num) { + PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n"); + return 0; +} + +EXTERN void omp_get_place_proc_ids(int place_num, int *ids) { + PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n"); +} + +EXTERN int omp_get_place_num(void) { + PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n"); + return 0; +} + +EXTERN int omp_get_partition_num_places(void) { + PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n"); + return 0; +} + +EXTERN void omp_get_partition_place_nums(int *place_nums) { + PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n"); +} + +EXTERN int omp_get_cancellation(void) { + int rc = 0; + PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_default_device(int deviceId) { + PRINT0(LD_IO, "call omp_get_default_device() is undef on device\n"); +} + +EXTERN int omp_get_default_device(void) { + PRINT0(LD_IO, + "call omp_get_default_device() is undef on device, returns 0\n"); + return 0; +} + +EXTERN int omp_get_num_devices(void) { + PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n"); + return 0; +} + +EXTERN int omp_get_num_teams(void) { + int rc = GetNumberOfOmpTeams(); + PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_team_num() { + int rc = GetOmpTeamId(); + PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc); + return rc; +} + +EXTERN int omp_is_initial_device(void) { + PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); + return 0; // 0 by def on device +} + +// Unspecified on the device. 
+EXTERN int omp_get_initial_device(void) { + PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n"); + return 0; +} + +// Unused for now. +EXTERN int omp_get_max_task_priority(void) { + PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n"); + return 0; +} + +//////////////////////////////////////////////////////////////////////////////// +// locks +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void omp_init_lock(omp_lock_t *lock) { + __kmpc_impl_init_lock(lock); + PRINT0(LD_IO, "call omp_init_lock()\n"); +} + +EXTERN void omp_destroy_lock(omp_lock_t *lock) { + __kmpc_impl_destroy_lock(lock); + PRINT0(LD_IO, "call omp_destroy_lock()\n"); +} + +EXTERN void omp_set_lock(omp_lock_t *lock) { + __kmpc_impl_set_lock(lock); + PRINT0(LD_IO, "call omp_set_lock()\n"); +} + +EXTERN void omp_unset_lock(omp_lock_t *lock) { + __kmpc_impl_unset_lock(lock); + PRINT0(LD_IO, "call omp_unset_lock()\n"); +} + +EXTERN int omp_test_lock(omp_lock_t *lock) { + int rc = __kmpc_impl_test_lock(lock); + PRINT(LD_IO, "call omp_test_lock() return %d\n", rc); + return rc; +} + +// for xlf Fortran +// Fortran, the return is LOGICAL type + +#define FLOGICAL long +EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() { + int ret = omp_is_initial_device(); + if (ret == 0) + return (FLOGICAL)0; + else + return (FLOGICAL)1; +} + +EXTERN int __xlf_omp_is_initial_device_i4() { + int ret = omp_is_initial_device(); + if (ret == 0) + return 0; + else + return 1; +} + +EXTERN long __xlf_omp_get_team_num_i4() { + int ret = omp_get_team_num(); + return (long)ret; +} + +EXTERN long __xlf_omp_get_num_teams_i4() { + int ret = omp_get_num_teams(); + return (long)ret; +} + +EXTERN void xlf_debug_print_int(int *p) { + printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_long(long *p) { + printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_float(float *p) { + printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_double(double *p) { + printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_addr(void *p) { + printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu index 417460db138a1..f8acadc8a0dbe 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -1,808 +1,808 @@ -//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the KMPC interface -// for the loop construct plus other worksharing constructs that use the same -// interface as loops. 
-// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "target_impl.h" -#include "common/target_atomic.h" - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -// template class that encapsulate all the helper functions -// -// T is loop iteration type (32 | 64) (unsigned | signed) -// ST is the signed version of T -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - -template class omptarget_nvptx_LoopSupport { -public: - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling with chunk - - // Generic implementation of OMP loop scheduling with static policy - /*! \brief Calculate initial bounds for static loop and stride - * @param[in] loc location in code of the call (not used here) - * @param[in] global_tid global thread id - * @param[in] schetype type of scheduling (see omptarget-nvptx.h) - * @param[in] plastiter pointer to last iteration - * @param[in,out] pointer to loop lower bound. it will contain value of - * lower bound of first chunk - * @param[in,out] pointer to loop upper bound. It will contain value of - * upper bound of first chunk - * @param[in,out] pointer to loop stride. It will contain value of stride - * between two successive chunks executed by the same thread - * @param[in] loop increment bump - * @param[in] chunk size - */ - - // helper function for static chunk - INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, - ST chunk, T entityId, T numberOfEntities) { - // each thread executes multiple chunks all of the same size, except - // the last one - - // distance between two successive chunks - stride = numberOfEntities * chunk; - lb = lb + entityId * chunk; - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - // Say ub' is the begining of the last chunk. Then who ever has a - // lower bound plus a multiple of the increment equal to ub' is - // the last one. - T beginingLastChunk = inputUb - (inputUb % chunk); - last = ((beginingLastChunk - lb) % stride) == 0; - } - - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling without chunk - - // helper function for static no chunk - INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, - ST &chunk, T entityId, - T numberOfEntities) { - // No chunk size specified. 
Each thread or warp gets at most one - // chunk; chunks are all almost of equal size - T loopSize = ub - lb + 1; - - chunk = loopSize / numberOfEntities; - T leftOver = loopSize - chunk * numberOfEntities; - - if (entityId < leftOver) { - chunk++; - lb = lb + entityId * chunk; - } else { - lb = lb + entityId * chunk + leftOver; - } - - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - last = lb <= inputUb && inputUb <= ub; - stride = loopSize; // make sure we only do 1 chunk per warp - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for Static Init - - INLINE static void for_static_init(int32_t gtid, int32_t schedtype, - int32_t *plastiter, T *plower, T *pupper, - ST *pstride, ST chunk, - bool IsSPMDExecutionMode) { - // When IsRuntimeUninitialized is true, we assume that the caller is - // in an L0 parallel region and that all worker threads participate. - - // Assume we are in teams region or that we use a single block - // per target region - ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode); - - // All warps that are in excess of the maximum requested, do - // not execute the loop - PRINT(LD_LOOP, - "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " - "%d, num tids %d\n", - (int)gtid, (int)schedtype, (long long)chunk, (int)gtid, - (int)numberOfActiveOMPThreads); - ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, - "current thread is not needed here; error"); - - // copy - int lastiter = 0; - T lb = *plower; - T ub = *pupper; - ST stride = *pstride; - // init - switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { - case kmp_sched_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } // note: if chunk <=0, use nochunk - case kmp_sched_static_balanced_chunk: { - if (chunk > 0) { - // round up to make sure the chunk is enough to cover all iterations - T tripCount = ub - lb + 1; // +1 because ub is inclusive - T span = (tripCount + numberOfActiveOMPThreads - 1) / - numberOfActiveOMPThreads; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - if (ub > oldUb) - ub = oldUb; - break; - } - } // note: if chunk <=0, use nochunk - case kmp_sched_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - case kmp_sched_distr_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), - GetNumberOfOmpTeams()); - break; - } // note: if chunk <=0, use nochunk - } - case kmp_sched_distr_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), - GetNumberOfOmpTeams()); - break; - } - case kmp_sched_distr_static_chunk_sched_static_chunkone: { - ForStaticChunk(lastiter, lb, ub, stride, chunk, - numberOfActiveOMPThreads * GetOmpTeamId() + gtid, - GetNumberOfOmpTeams() * numberOfActiveOMPThreads); - break; - } - default: { - ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); - PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", - (int)schedtype); - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } - // copy back - *plastiter = lastiter; - *plower = lb; - *pupper = ub; - *pstride = stride; - PRINT(LD_LOOP, - "Got sched: Active %d, total %d: lb %lld, 
ub %lld, stride %lld, last " - "%d\n", - (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), - (long long)(*plower), (long long)(*pupper), (long long)(*pstride), - (int)lastiter); - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch Init - - INLINE static int OrderedSchedule(kmp_sched_t schedule) { - return schedule >= kmp_sched_ordered_first && - schedule <= kmp_sched_ordered_last; - } - - INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId, - kmp_sched_t schedule, T lb, T ub, ST st, - ST chunk) { - if (checkRuntimeUninitialized(loc)) { - // In SPMD mode no need to check parallelism level - dynamic scheduling - // may appear only in L2 parallel regions with lightweight runtime. - ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode."); - return; - } - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); - T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc)); - T tripCount = ub - lb + 1; // +1 because ub is inclusive - ASSERT0(LT_FUSSY, threadId < tnum, - "current thread is not needed here; error"); - - /* Currently just ignore the monotonic and non-monotonic modifiers - * (the compiler isn't producing them * yet anyway). - * When it is we'll want to look at them somewhere here and use that - * information to add to our schedule choice. We shouldn't need to pass - * them on, they merely affect which schedule we can legally choose for - * various dynamic cases. (In particular, whether or not a stealing scheme - * is legal). - */ - schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); - - // Process schedule. - if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { - if (OrderedSchedule(schedule)) - __kmpc_barrier(loc, threadId); - PRINT(LD_LOOP, - "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", - (long)tnum, (long long)tripCount, (int)schedule); - schedule = kmp_sched_static_chunk; - chunk = tripCount; // one thread gets the whole loop - } else if (schedule == kmp_sched_runtime) { - // process runtime - omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); - chunk = currTaskDescr->RuntimeChunkSize(); - switch (rtSched) { - case omp_sched_static: { - if (chunk > 0) - schedule = kmp_sched_static_chunk; - else - schedule = kmp_sched_static_nochunk; - break; - } - case omp_sched_auto: { - schedule = kmp_sched_static_chunk; - chunk = 1; - break; - } - case omp_sched_dynamic: - case omp_sched_guided: { - schedule = kmp_sched_dynamic; - break; - } - } - PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - } else if (schedule == kmp_sched_auto) { - schedule = kmp_sched_static_chunk; - chunk = 1; - PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - } else { - PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - ASSERT(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "unknown schedule %d & chunk %lld\n", (int)schedule, - (long long)chunk); - } - - // init schedules - if (schedule == kmp_sched_static_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save 
computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_static_balanced_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute static chunk - ST stride; - int lastiter = 0; - // round up to make sure the chunk is enough to cover all iterations - T span = (tripCount + tnum - 1) / tnum; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - if (ub > oldUb) - ub = oldUb; - // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_static_nochunk) { - ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - // save data - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - if (chunk < 1) - chunk = 1; - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - __kmpc_barrier(loc, threadId); - if (tid == 0) { - omptarget_nvptx_threadPrivateContext->Cnt() = 0; - __kmpc_impl_threadfence_block(); - } - __kmpc_barrier(loc, threadId); - PRINT(LD_LOOP, - "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 - ", chunk %" PRIu64 "\n", - (int)tnum, - (unsigned long long) - 
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid)); - } - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch next - - INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val, - int leader) { - uint32_t lo, hi; - __kmpc_impl_unpack(val, lo, hi); - hi = __kmpc_impl_shfl_sync(active, hi, leader); - lo = __kmpc_impl_shfl_sync(active, lo, leader); - return __kmpc_impl_pack(lo, hi); - } - - INLINE static uint64_t NextIter() { - __kmpc_impl_lanemask_t active = __kmpc_impl_activemask(); - uint32_t leader = __kmpc_impl_ffs(active) - 1; - uint32_t change = __kmpc_impl_popc(active); - __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt(); - unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt); - uint64_t warp_res; - if (rank == 0) { - warp_res = __kmpc_atomic_add( - (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(), - (unsigned long long)change); - } - warp_res = Shuffle(active, warp_res, leader); - return warp_res + rank; - } - - INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, - T loopLowerBound, T loopUpperBound) { - T N = NextIter(); - lb = loopLowerBound + N * chunkSize; - ub = lb + chunkSize - 1; // Clang uses i <= ub - - // 3 result cases: - // a. lb and ub < loopUpperBound --> NOT_FINISHED - // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> - // NOT_FINISHED - // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED - // a. - if (lb <= loopUpperBound && ub < loopUpperBound) { - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", - (long long)lb, (long long)ub, (long long)loopUpperBound); - return NOT_FINISHED; - } - // b. - if (lb <= loopUpperBound) { - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", - (long long)lb, (long long)ub, (long long)loopUpperBound); - ub = loopUpperBound; - return LAST_CHUNK; - } - // c. if we are here, we are in case 'c' - lb = loopUpperBound + 2; - ub = loopUpperBound + 1; - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb, - (long long)ub, (long long)loopUpperBound); - return FINISHED; - } - - INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast, - T *plower, T *pupper, ST *pstride) { - if (checkRuntimeUninitialized(loc)) { - // In SPMD mode no need to check parallelism level - dynamic scheduling - // may appear only in L2 parallel regions with lightweight runtime. - ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode."); - if (*plast) - return DISPATCH_FINISHED; - *plast = 1; - return DISPATCH_NOTFINISHED; - } - // ID of a thread in its own warp - - // automatically selects thread or warp ID based on selected implementation - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)), - "current thread is not needed here; error"); - // retrieve schedule - kmp_sched_t schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(tid); - - // xxx reduce to one - if (schedule == kmp_sched_static_chunk || - schedule == kmp_sched_static_nochunk) { - T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); - T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); - // finished? 
- if (myLb > ub) { - PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", - (long long)myLb, (long long)ub); - return DISPATCH_FINISHED; - } - // not finished, save current bounds - ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid); - *plower = myLb; - T myUb = myLb + chunk - 1; // Clang uses i <= ub - if (myUb > ub) - myUb = ub; - *pupper = myUb; - *plast = (int32_t)(myUb == ub); - - // increment next lower bound by the stride - ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; - PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", - (long long)*plower, (long long)*pupper); - return DISPATCH_NOTFINISHED; - } - ASSERT0(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "bad sched"); - T myLb, myUb; - int finished = DynamicNextChunk( - myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); - - if (finished == FINISHED) - return DISPATCH_FINISHED; - - // not finished (either not finished or last chunk) - *plast = (int32_t)(finished == LAST_CHUNK); - *plower = myLb; - *pupper = myUb; - *pstride = 1; - - PRINT(LD_LOOP, - "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " - "last %d\n", - (int)GetNumberOfOmpThreads(isSPMDMode()), - (int)GetNumberOfWorkersInTeam(), (long long)*plower, - (long long)*pupper, (long long)*pstride, (int)*plast); - return DISPATCH_NOTFINISHED; - } - - INLINE static void dispatch_fini() { - // nothing - } - - //////////////////////////////////////////////////////////////////////////////// - // end of template class that encapsulate all the helper functions - //////////////////////////////////////////////////////////////////////////////// -}; - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (dyn loops) -//////////////////////////////////////////////////////////////////////////////// - -// init -EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid, - int32_t schedule, int32_t lb, int32_t ub, - int32_t st, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid, - int32_t schedule, uint32_t lb, uint32_t ub, - int32_t st, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid, - int32_t schedule, int64_t lb, int64_t ub, - int64_t st, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid, - int32_t schedule, uint64_t lb, uint64_t ub, - int64_t st, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -// next -EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last, - int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, 
p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, - int32_t *p_last, uint32_t *p_lb, - uint32_t *p_ub, int32_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, - int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, - int32_t *p_last, uint64_t *p_lb, - uint64_t *p_ub, int64_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -// fini -EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (static loops) -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN -void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t 
incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, uint32_t *plower, - uint32_t *pupper, int32_t *pstride, - int32_t incr, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, uint64_t *plower, - uint64_t *pupper, int64_t *pstride, - int64_t incr, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_4_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_4u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_8_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_8u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_for_static_fini\n"); -} - -namespace { -INLINE void syncWorkersInGenericMode(uint32_t NumThreads) { - int NumWarps = ((NumThreads + WARPSIZE 
- 1) / WARPSIZE); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp (at least, all present for the kernel launch) participate in the - // barrier. This is enforced when launching the parallel region. An - // exception is when there are < WARPSIZE workers. In this case only 1 worker - // is started, so we don't need a barrier. - if (NumThreads > 1) { -#endif - __kmpc_impl_named_sync(L1_BARRIER, WARPSIZE * NumWarps); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - } -#endif -} -}; // namespace - -EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid, - int32_t varNum, void *array) { - PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Expected non-SPMD mode + initialized runtime."); - - omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); - uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc)); - uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); - for (unsigned i = 0; i < varNum; i++) { - // Reset buffer. - if (gtid == 0) - *Buffer = 0; // Reset to minimum loop iteration value. - - // Barrier. - syncWorkersInGenericMode(NumThreads); - - // Atomic max of iterations. - uint64_t *varArray = (uint64_t *)array; - uint64_t elem = varArray[i]; - (void)__kmpc_atomic_max((unsigned long long int *)Buffer, - (unsigned long long int)elem); - - // Barrier. - syncWorkersInGenericMode(NumThreads); - - // Read max value and update thread private array. - varArray[i] = *Buffer; - - // Barrier. - syncWorkersInGenericMode(NumThreads); - } -} +//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the KMPC interface +// for the loop construct plus other worksharing constructs that use the same +// interface as loops. +// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "target_impl.h" +#include "common/target_atomic.h" + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +// template class that encapsulate all the helper functions +// +// T is loop iteration type (32 | 64) (unsigned | signed) +// ST is the signed version of T +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +template class omptarget_nvptx_LoopSupport { +public: + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling with chunk + + // Generic implementation of OMP loop scheduling with static policy + /*! \brief Calculate initial bounds for static loop and stride + * @param[in] loc location in code of the call (not used here) + * @param[in] global_tid global thread id + * @param[in] schetype type of scheduling (see omptarget-nvptx.h) + * @param[in] plastiter pointer to last iteration + * @param[in,out] pointer to loop lower bound. 
it will contain value of + * lower bound of first chunk + * @param[in,out] pointer to loop upper bound. It will contain value of + * upper bound of first chunk + * @param[in,out] pointer to loop stride. It will contain value of stride + * between two successive chunks executed by the same thread + * @param[in] loop increment bump + * @param[in] chunk size + */ + + // helper function for static chunk + INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, + ST chunk, T entityId, T numberOfEntities) { + // each thread executes multiple chunks all of the same size, except + // the last one + + // distance between two successive chunks + stride = numberOfEntities * chunk; + lb = lb + entityId * chunk; + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + // Say ub' is the begining of the last chunk. Then who ever has a + // lower bound plus a multiple of the increment equal to ub' is + // the last one. + T beginingLastChunk = inputUb - (inputUb % chunk); + last = ((beginingLastChunk - lb) % stride) == 0; + } + + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling without chunk + + // helper function for static no chunk + INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, + ST &chunk, T entityId, + T numberOfEntities) { + // No chunk size specified. Each thread or warp gets at most one + // chunk; chunks are all almost of equal size + T loopSize = ub - lb + 1; + + chunk = loopSize / numberOfEntities; + T leftOver = loopSize - chunk * numberOfEntities; + + if (entityId < leftOver) { + chunk++; + lb = lb + entityId * chunk; + } else { + lb = lb + entityId * chunk + leftOver; + } + + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + last = lb <= inputUb && inputUb <= ub; + stride = loopSize; // make sure we only do 1 chunk per warp + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for Static Init + + INLINE static void for_static_init(int32_t gtid, int32_t schedtype, + int32_t *plastiter, T *plower, T *pupper, + ST *pstride, ST chunk, + bool IsSPMDExecutionMode) { + // When IsRuntimeUninitialized is true, we assume that the caller is + // in an L0 parallel region and that all worker threads participate. 
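// Illustration only, not part of the patch: a minimal host-side C++ sketch
// (all names invented) of the chunked static-schedule arithmetic implemented
// by ForStaticChunk above. Thread `id` of `num` owns chunks of size `chunk`
// spaced `num * chunk` iterations apart; like the source helper, no clipping
// against the loop's real upper bound is done at this point.
#include <cstdint>
#include <cstdio>

static void staticChunkBounds(int64_t &lb, int64_t &ub, int64_t &stride,
                              int64_t chunk, int64_t id, int64_t num) {
  stride = num * chunk; // distance between two successive chunks of a thread
  lb = lb + id * chunk; // lower bound of this thread's first chunk
  ub = lb + chunk - 1;  // inclusive upper bound (Clang emits i <= ub loops)
}

int main() {
  // 100 iterations split over 4 threads with a chunk of 8.
  for (int64_t id = 0; id < 4; ++id) {
    int64_t lb = 0, ub = 99, stride = 0;
    staticChunkBounds(lb, ub, stride, /*chunk=*/8, id, /*num=*/4);
    std::printf("thread %lld: first chunk [%lld, %lld], stride %lld\n",
                (long long)id, (long long)lb, (long long)ub,
                (long long)stride);
  }
  return 0;
}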
+ + // Assume we are in teams region or that we use a single block + // per target region + ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode); + + // All warps that are in excess of the maximum requested, do + // not execute the loop + PRINT(LD_LOOP, + "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " + "%d, num tids %d\n", + (int)gtid, (int)schedtype, (long long)chunk, (int)gtid, + (int)numberOfActiveOMPThreads); + ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, + "current thread is not needed here; error"); + + // copy + int lastiter = 0; + T lb = *plower; + T ub = *pupper; + ST stride = *pstride; + // init + switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { + case kmp_sched_static_chunk: { + if (chunk > 0) { + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_balanced_chunk: { + if (chunk > 0) { + // round up to make sure the chunk is enough to cover all iterations + T tripCount = ub - lb + 1; // +1 because ub is inclusive + T span = (tripCount + numberOfActiveOMPThreads - 1) / + numberOfActiveOMPThreads; + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + + ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); + T oldUb = ub; + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + if (ub > oldUb) + ub = oldUb; + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_nochunk: { + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + case kmp_sched_distr_static_chunk: { + if (chunk > 0) { + ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), + GetNumberOfOmpTeams()); + break; + } // note: if chunk <=0, use nochunk + } + case kmp_sched_distr_static_nochunk: { + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), + GetNumberOfOmpTeams()); + break; + } + case kmp_sched_distr_static_chunk_sched_static_chunkone: { + ForStaticChunk(lastiter, lb, ub, stride, chunk, + numberOfActiveOMPThreads * GetOmpTeamId() + gtid, + GetNumberOfOmpTeams() * numberOfActiveOMPThreads); + break; + } + default: { + ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); + PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", + (int)schedtype); + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + } + // copy back + *plastiter = lastiter; + *plower = lb; + *pupper = ub; + *pstride = stride; + PRINT(LD_LOOP, + "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last " + "%d\n", + (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), + (long long)(*plower), (long long)(*pupper), (long long)(*pstride), + (int)lastiter); + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for dispatch Init + + INLINE static int OrderedSchedule(kmp_sched_t schedule) { + return schedule >= kmp_sched_ordered_first && + schedule <= kmp_sched_ordered_last; + } + + INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId, + kmp_sched_t schedule, T lb, T ub, ST st, + ST chunk) { + if (checkRuntimeUninitialized(loc)) { + // In SPMD mode no need to check parallelism level - dynamic scheduling + // may appear only in L2 parallel regions with lightweight runtime. 
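// Illustration only, not part of the patch: the kmp_sched_static_balanced_chunk
// case above first computes the per-thread span (ceiling of tripCount over the
// thread count) and then rounds it up with `(span + chunk - 1) & ~(chunk - 1)`,
// which, as written in the source, yields a multiple of `chunk` when `chunk`
// is a power of two. A minimal host-side C++ sketch with invented values:
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t tripCount = 1000; // ub - lb + 1
  const uint64_t nThreads = 64;
  uint64_t chunk = 16;             // requested granularity (a power of two here)

  // Smallest span that covers all iterations when split evenly.
  uint64_t span = (tripCount + nThreads - 1) / nThreads;
  // Round the span up to the next multiple of the requested chunk.
  chunk = (span + chunk - 1) & ~(chunk - 1);

  std::printf("span = %llu, adjusted chunk = %llu\n",
              (unsigned long long)span, (unsigned long long)chunk);
  return 0;
}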
+ ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode."); + return; + } + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); + T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc)); + T tripCount = ub - lb + 1; // +1 because ub is inclusive + ASSERT0(LT_FUSSY, threadId < tnum, + "current thread is not needed here; error"); + + /* Currently just ignore the monotonic and non-monotonic modifiers + * (the compiler isn't producing them * yet anyway). + * When it is we'll want to look at them somewhere here and use that + * information to add to our schedule choice. We shouldn't need to pass + * them on, they merely affect which schedule we can legally choose for + * various dynamic cases. (In particular, whether or not a stealing scheme + * is legal). + */ + schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); + + // Process schedule. + if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { + if (OrderedSchedule(schedule)) + __kmpc_barrier(loc, threadId); + PRINT(LD_LOOP, + "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", + (long)tnum, (long long)tripCount, (int)schedule); + schedule = kmp_sched_static_chunk; + chunk = tripCount; // one thread gets the whole loop + } else if (schedule == kmp_sched_runtime) { + // process runtime + omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); + chunk = currTaskDescr->RuntimeChunkSize(); + switch (rtSched) { + case omp_sched_static: { + if (chunk > 0) + schedule = kmp_sched_static_chunk; + else + schedule = kmp_sched_static_nochunk; + break; + } + case omp_sched_auto: { + schedule = kmp_sched_static_chunk; + chunk = 1; + break; + } + case omp_sched_dynamic: + case omp_sched_guided: { + schedule = kmp_sched_dynamic; + break; + } + } + PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + } else if (schedule == kmp_sched_auto) { + schedule = kmp_sched_static_chunk; + chunk = 1; + PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + } else { + PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + ASSERT(LT_FUSSY, + schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + "unknown schedule %d & chunk %lld\n", (int)schedule, + (long long)chunk); + } + + // init schedules + if (schedule == kmp_sched_static_chunk) { + ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + // compute static chunk + ST stride; + int lastiter = 0; + ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); + } else if (schedule == kmp_sched_static_balanced_chunk) { + ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + // 
save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + // compute static chunk + ST stride; + int lastiter = 0; + // round up to make sure the chunk is enough to cover all iterations + T span = (tripCount + tnum - 1) / tnum; + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + + T oldUb = ub; + ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); + if (ub > oldUb) + ub = oldUb; + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); + } else if (schedule == kmp_sched_static_nochunk) { + ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + // compute static chunk + ST stride; + int lastiter = 0; + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + PRINT(LD_LOOP, + "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); + } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { + // save data + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + if (chunk < 1) + chunk = 1; + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + __kmpc_barrier(loc, threadId); + if (tid == 0) { + omptarget_nvptx_threadPrivateContext->Cnt() = 0; + __kmpc_impl_threadfence_block(); + } + __kmpc_barrier(loc, threadId); + PRINT(LD_LOOP, + "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 + ", chunk %" PRIu64 "\n", + (int)tnum, + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + omptarget_nvptx_threadPrivateContext->Chunk(tid)); + } + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for dispatch next + + INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val, + int leader) { + uint32_t lo, hi; + __kmpc_impl_unpack(val, lo, hi); + hi = __kmpc_impl_shfl_sync(active, hi, leader); + lo = __kmpc_impl_shfl_sync(active, lo, leader); + return __kmpc_impl_pack(lo, hi); + } + + INLINE static uint64_t NextIter() { + __kmpc_impl_lanemask_t active = __kmpc_impl_activemask(); + uint32_t leader = __kmpc_impl_ffs(active) - 1; + uint32_t change = __kmpc_impl_popc(active); + 
__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt(); + unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt); + uint64_t warp_res; + if (rank == 0) { + warp_res = __kmpc_atomic_add( + (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(), + (unsigned long long)change); + } + warp_res = Shuffle(active, warp_res, leader); + return warp_res + rank; + } + + INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, + T loopLowerBound, T loopUpperBound) { + T N = NextIter(); + lb = loopLowerBound + N * chunkSize; + ub = lb + chunkSize - 1; // Clang uses i <= ub + + // 3 result cases: + // a. lb and ub < loopUpperBound --> NOT_FINISHED + // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> + // NOT_FINISHED + // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED + // a. + if (lb <= loopUpperBound && ub < loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", + (long long)lb, (long long)ub, (long long)loopUpperBound); + return NOT_FINISHED; + } + // b. + if (lb <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", + (long long)lb, (long long)ub, (long long)loopUpperBound); + ub = loopUpperBound; + return LAST_CHUNK; + } + // c. if we are here, we are in case 'c' + lb = loopUpperBound + 2; + ub = loopUpperBound + 1; + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb, + (long long)ub, (long long)loopUpperBound); + return FINISHED; + } + + INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast, + T *plower, T *pupper, ST *pstride) { + if (checkRuntimeUninitialized(loc)) { + // In SPMD mode no need to check parallelism level - dynamic scheduling + // may appear only in L2 parallel regions with lightweight runtime. + ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode."); + if (*plast) + return DISPATCH_FINISHED; + *plast = 1; + return DISPATCH_NOTFINISHED; + } + // ID of a thread in its own warp + + // automatically selects thread or warp ID based on selected implementation + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)), + "current thread is not needed here; error"); + // retrieve schedule + kmp_sched_t schedule = + omptarget_nvptx_threadPrivateContext->ScheduleType(tid); + + // xxx reduce to one + if (schedule == kmp_sched_static_chunk || + schedule == kmp_sched_static_nochunk) { + T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); + T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); + // finished? 
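// Illustration only, not part of the patch: a minimal host-side C++ sketch
// (names invented) of the three-way case split performed by DynamicNextChunk
// above -- a chunk is either fully inside the loop bounds, the last (clipped)
// chunk, or entirely past the upper bound.
#include <cstdint>
#include <cstdio>

enum ChunkState { kNotFinished, kLastChunk, kFinished };

static ChunkState classifyChunk(uint64_t n, uint64_t chunkSize, uint64_t loopLb,
                                uint64_t loopUb, uint64_t &lb, uint64_t &ub) {
  lb = loopLb + n * chunkSize;
  ub = lb + chunkSize - 1; // inclusive, Clang uses i <= ub
  if (lb <= loopUb && ub < loopUb)
    return kNotFinished;
  if (lb <= loopUb) { // straddles the bound: clip and mark as last chunk
    ub = loopUb;
    return kLastChunk;
  }
  return kFinished; // empty chunk, nothing left to do
}

int main() {
  uint64_t lb, ub;
  for (uint64_t n = 0; n < 5; ++n) {
    ChunkState s =
        classifyChunk(n, /*chunkSize=*/8, /*loopLb=*/0, /*loopUb=*/25, lb, ub);
    std::printf("chunk %llu -> [%llu, %llu], state %d\n", (unsigned long long)n,
                (unsigned long long)lb, (unsigned long long)ub, (int)s);
  }
  return 0;
}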
+ if (myLb > ub) { + PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", + (long long)myLb, (long long)ub); + return DISPATCH_FINISHED; + } + // not finished, save current bounds + ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid); + *plower = myLb; + T myUb = myLb + chunk - 1; // Clang uses i <= ub + if (myUb > ub) + myUb = ub; + *pupper = myUb; + *plast = (int32_t)(myUb == ub); + + // increment next lower bound by the stride + ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; + PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", + (long long)*plower, (long long)*pupper); + return DISPATCH_NOTFINISHED; + } + ASSERT0(LT_FUSSY, + schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + "bad sched"); + T myLb, myUb; + int finished = DynamicNextChunk( + myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + + if (finished == FINISHED) + return DISPATCH_FINISHED; + + // not finished (either not finished or last chunk) + *plast = (int32_t)(finished == LAST_CHUNK); + *plower = myLb; + *pupper = myUb; + *pstride = 1; + + PRINT(LD_LOOP, + "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " + "last %d\n", + (int)GetNumberOfOmpThreads(isSPMDMode()), + (int)GetNumberOfWorkersInTeam(), (long long)*plower, + (long long)*pupper, (long long)*pstride, (int)*plast); + return DISPATCH_NOTFINISHED; + } + + INLINE static void dispatch_fini() { + // nothing + } + + //////////////////////////////////////////////////////////////////////////////// + // end of template class that encapsulate all the helper functions + //////////////////////////////////////////////////////////////////////////////// +}; + +//////////////////////////////////////////////////////////////////////////////// +// KMP interface implementation (dyn loops) +//////////////////////////////////////////////////////////////////////////////// + +// init +EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid, + int32_t schedule, int32_t lb, int32_t ub, + int32_t st, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid, + int32_t schedule, uint32_t lb, uint32_t ub, + int32_t st, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid, + int32_t schedule, int64_t lb, int64_t ub, + int64_t st, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid, + int32_t schedule, uint64_t lb, uint64_t ub, + int64_t st, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +// next +EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last, + int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + loc, tid, p_last, 
p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, + int32_t *p_last, uint32_t *p_lb, + uint32_t *p_ub, int32_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + loc, tid, p_last, p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, + int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + loc, tid, p_last, p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, + int32_t *p_last, uint64_t *p_lb, + uint64_t *p_ub, int64_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + loc, tid, p_last, p_lb, p_ub, p_st); +} + +// fini +EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP interface implementation (static loops) +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc)); +} + +EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc)); +} + +EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc)); +} + +EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc)); +} + +EXTERN +void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t 
incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true); +} + +EXTERN +void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, uint32_t *plower, + uint32_t *pupper, int32_t *pstride, + int32_t incr, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true); +} + +EXTERN +void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true); +} + +EXTERN +void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, uint64_t *plower, + uint64_t *pupper, int64_t *pstride, + int64_t incr, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true); +} + +EXTERN +void __kmpc_for_static_init_4_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false); +} + +EXTERN +void __kmpc_for_static_init_4u_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false); +} + +EXTERN +void __kmpc_for_static_init_8_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false); +} + +EXTERN +void __kmpc_for_static_init_8u_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false); +} + +EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_for_static_fini\n"); +} + +namespace { +INLINE void syncWorkersInGenericMode(uint32_t NumThreads) { + int NumWarps = ((NumThreads + WARPSIZE 
- 1) / WARPSIZE); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. An + // exception is when there are < WARPSIZE workers. In this case only 1 worker + // is started, so we don't need a barrier. + if (NumThreads > 1) { +#endif + __kmpc_impl_named_sync(L1_BARRIER, WARPSIZE * NumWarps); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + } +#endif +} +}; // namespace + +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid, + int32_t varNum, void *array) { + PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), + "Expected non-SPMD mode + initialized runtime."); + + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc)); + uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); + for (unsigned i = 0; i < varNum; i++) { + // Reset buffer. + if (gtid == 0) + *Buffer = 0; // Reset to minimum loop iteration value. + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Atomic max of iterations. + uint64_t *varArray = (uint64_t *)array; + uint64_t elem = varArray[i]; + (void)__kmpc_atomic_max((unsigned long long int *)Buffer, + (unsigned long long int)elem); + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Read max value and update thread private array. + varArray[i] = *Buffer; + + // Barrier. + syncWorkersInGenericMode(NumThreads); + } +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu index 5bef3b89a1721..f335dac5484a0 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -1,68 +1,68 @@ -//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the data objects used on the GPU device. 
-// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "common/device_environment.h" - -//////////////////////////////////////////////////////////////////////////////// -// global device environment -//////////////////////////////////////////////////////////////////////////////// - -DEVICE omptarget_device_environmentTy omptarget_device_environment; - -//////////////////////////////////////////////////////////////////////////////// -// global data holding OpenMP state information -//////////////////////////////////////////////////////////////////////////////// - -DEVICE - omptarget_nvptx_Queue - omptarget_nvptx_device_State[MAX_SM]; - -DEVICE omptarget_nvptx_SimpleMemoryManager - omptarget_nvptx_simpleMemoryManager; -DEVICE SHARED uint32_t usedMemIdx; -DEVICE SHARED uint32_t usedSlotIdx; - -DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -DEVICE SHARED uint16_t threadLimit; -DEVICE SHARED uint16_t threadsInTeam; -DEVICE SHARED uint16_t nThreads; -// Pointer to this team's OpenMP state object -DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; - -//////////////////////////////////////////////////////////////////////////////// -// The team master sets the outlined parallel function in this variable to -// communicate with the workers. Since it is in shared memory, there is one -// copy of these variables for each kernel, instance, and team. -//////////////////////////////////////////////////////////////////////////////// -volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; - -//////////////////////////////////////////////////////////////////////////////// -// OpenMP kernel execution parameters -//////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED uint32_t execution_param; - -//////////////////////////////////////////////////////////////////////////////// -// Data sharing state -//////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED DataSharingStateTy DataSharingState; - -//////////////////////////////////////////////////////////////////////////////// -// Scratchpad for teams reduction. -//////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED void *ReductionScratchpadPtr; - -//////////////////////////////////////////////////////////////////////////////// -// Data sharing related variables. -//////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; +//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the data objects used on the GPU device. 
+// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "common/device_environment.h" + +//////////////////////////////////////////////////////////////////////////////// +// global device environment +//////////////////////////////////////////////////////////////////////////////// + +DEVICE omptarget_device_environmentTy omptarget_device_environment; + +//////////////////////////////////////////////////////////////////////////////// +// global data holding OpenMP state information +//////////////////////////////////////////////////////////////////////////////// + +DEVICE + omptarget_nvptx_Queue + omptarget_nvptx_device_State[MAX_SM]; + +DEVICE omptarget_nvptx_SimpleMemoryManager + omptarget_nvptx_simpleMemoryManager; +DEVICE SHARED uint32_t usedMemIdx; +DEVICE SHARED uint32_t usedSlotIdx; + +DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +DEVICE SHARED uint16_t threadLimit; +DEVICE SHARED uint16_t threadsInTeam; +DEVICE SHARED uint16_t nThreads; +// Pointer to this team's OpenMP state object +DEVICE SHARED + omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; + +//////////////////////////////////////////////////////////////////////////////// +// The team master sets the outlined parallel function in this variable to +// communicate with the workers. Since it is in shared memory, there is one +// copy of these variables for each kernel, instance, and team. +//////////////////////////////////////////////////////////////////////////////// +volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP kernel execution parameters +//////////////////////////////////////////////////////////////////////////////// +DEVICE SHARED uint32_t execution_param; + +//////////////////////////////////////////////////////////////////////////////// +// Data sharing state +//////////////////////////////////////////////////////////////////////////////// +DEVICE SHARED DataSharingStateTy DataSharingState; + +//////////////////////////////////////////////////////////////////////////////// +// Scratchpad for teams reduction. +//////////////////////////////////////////////////////////////////////////////// +DEVICE SHARED void *ReductionScratchpadPtr; + +//////////////////////////////////////////////////////////////////////////////// +// Data sharing related variables. +//////////////////////////////////////////////////////////////////////////////// +DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu index 23fbd00cacaf9..305ff626699a1 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -1,179 +1,179 @@ -//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the initialization code for the GPU -// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -// global data tables -//////////////////////////////////////////////////////////////////////////////// - -extern DEVICE - omptarget_nvptx_Queue - omptarget_nvptx_device_State[MAX_SM]; - -//////////////////////////////////////////////////////////////////////////////// -// init entry points -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_kernel_init_params(void *Ptr) { - PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n", - OMPTARGET_NVPTX_VERSION); - - SetTeamsReductionScratchpadPtr(Ptr); -} - -EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { - PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", - OMPTARGET_NVPTX_VERSION); - ASSERT0(LT_FUSSY, RequiresOMPRuntime, - "Generic always requires initialized runtime."); - setExecutionParameters(Generic, RuntimeInitialized); - for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I) - parallelLevel[I] = 0; - - int threadIdInBlock = GetThreadIdInBlock(); - ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), - "__kmpc_kernel_init() must be called by team master warp only!"); - PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); - - // Get a state object from the queue. - int slot = __kmpc_impl_smid() % MAX_SM; - usedSlotIdx = slot; - omptarget_nvptx_threadPrivateContext = - omptarget_nvptx_device_State[slot].Dequeue(); - - // init thread private - int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false); - omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId); - - // init team context - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - currTeamDescr.InitTeamDescr(); - // this thread will start execution... has to update its task ICV - // to point to the level zero task ICV. That ICV was init in - // InitTeamDescr() - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, currTeamDescr.LevelZeroTaskDescr()); - - // set number of threads and thread limit in team to started value - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - nThreads = GetNumberOfThreadsInBlock(); - threadLimit = ThreadLimit; -} - -EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { - PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n"); - ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, - "Generic always requires initialized runtime."); - // Enqueue omp state object for use by another team. - int slot = usedSlotIdx; - omptarget_nvptx_device_State[slot].Enqueue( - omptarget_nvptx_threadPrivateContext); - // Done with work. Kill the workers. - omptarget_nvptx_workFn = 0; -} - -EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, - int16_t RequiresDataSharing) { - PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); - - setExecutionParameters(Spmd, RequiresOMPRuntime ? 
RuntimeInitialized - : RuntimeUninitialized); - int threadId = GetThreadIdInBlock(); - if (threadId == 0) { - usedSlotIdx = __kmpc_impl_smid() % MAX_SM; - parallelLevel[0] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); - } else if (GetLaneId() == 0) { - parallelLevel[GetWarpId()] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); - } - if (!RequiresOMPRuntime) { - // Runtime is not required - exit. - __kmpc_impl_syncthreads(); - return; - } - - // - // Team Context Initialization. - // - // In SPMD mode there is no master thread so use any cuda thread for team - // context initialization. - if (threadId == 0) { - // Get a state object from the queue. - omptarget_nvptx_threadPrivateContext = - omptarget_nvptx_device_State[usedSlotIdx].Dequeue(); - - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - // init team context - currTeamDescr.InitTeamDescr(); - } - __kmpc_impl_syncthreads(); - - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - - // - // Initialize task descr for each thread. - // - omptarget_nvptx_TaskDescr *newTaskDescr = - omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); - newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr()); - // install new top descriptor - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); - - // init thread private from init value - PRINT(LD_PAR, - "thread will execute parallel region with id %d in a team of " - "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)ThreadLimit); - - if (RequiresDataSharing && GetLaneId() == 0) { - // Warp master initializes data sharing environment. - unsigned WID = threadId / WARPSIZE; - __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS( - WID, WID == WARPSIZE - 1); - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - } -} - -EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() { - __kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized()); -} - -EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) { - // We're not going to pop the task descr stack of each thread since - // there are no more parallel regions in SPMD mode. - if (!RequiresOMPRuntime) - return; - - __kmpc_impl_syncthreads(); - int threadId = GetThreadIdInBlock(); - if (threadId == 0) { - // Enqueue omp state object for use by another team. - int slot = usedSlotIdx; - omptarget_nvptx_device_State[slot].Enqueue( - omptarget_nvptx_threadPrivateContext); - } -} - -// Return true if the current target region is executed in SPMD mode. -EXTERN int8_t __kmpc_is_spmd_exec_mode() { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n"); - return isSPMDMode(); -} +//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the initialization code for the GPU +// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "target_impl.h" + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +extern DEVICE + omptarget_nvptx_Queue + omptarget_nvptx_device_State[MAX_SM]; + +//////////////////////////////////////////////////////////////////////////////// +// init entry points +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_kernel_init_params(void *Ptr) { + PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n", + OMPTARGET_NVPTX_VERSION); + + SetTeamsReductionScratchpadPtr(Ptr); +} + +EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { + PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", + OMPTARGET_NVPTX_VERSION); + ASSERT0(LT_FUSSY, RequiresOMPRuntime, + "Generic always requires initialized runtime."); + setExecutionParameters(Generic, RuntimeInitialized); + for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I) + parallelLevel[I] = 0; + + int threadIdInBlock = GetThreadIdInBlock(); + ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), + "__kmpc_kernel_init() must be called by team master warp only!"); + PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); + + // Get a state object from the queue. + int slot = __kmpc_impl_smid() % MAX_SM; + usedSlotIdx = slot; + omptarget_nvptx_threadPrivateContext = + omptarget_nvptx_device_State[slot].Dequeue(); + + // init thread private + int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false); + omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId); + + // init team context + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + currTeamDescr.InitTeamDescr(); + // this thread will start execution... has to update its task ICV + // to point to the level zero task ICV. That ICV was init in + // InitTeamDescr() + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTeamDescr.LevelZeroTaskDescr()); + + // set number of threads and thread limit in team to started value + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + nThreads = GetNumberOfThreadsInBlock(); + threadLimit = ThreadLimit; +} + +EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { + PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n"); + ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, + "Generic always requires initialized runtime."); + // Enqueue omp state object for use by another team. + int slot = usedSlotIdx; + omptarget_nvptx_device_State[slot].Enqueue( + omptarget_nvptx_threadPrivateContext); + // Done with work. Kill the workers. + omptarget_nvptx_workFn = 0; +} + +EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, + int16_t RequiresDataSharing) { + PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); + + setExecutionParameters(Spmd, RequiresOMPRuntime ? 
RuntimeInitialized + : RuntimeUninitialized); + int threadId = GetThreadIdInBlock(); + if (threadId == 0) { + usedSlotIdx = __kmpc_impl_smid() % MAX_SM; + parallelLevel[0] = + 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + } else if (GetLaneId() == 0) { + parallelLevel[GetWarpId()] = + 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + } + if (!RequiresOMPRuntime) { + // Runtime is not required - exit. + __kmpc_impl_syncthreads(); + return; + } + + // + // Team Context Initialization. + // + // In SPMD mode there is no master thread so use any cuda thread for team + // context initialization. + if (threadId == 0) { + // Get a state object from the queue. + omptarget_nvptx_threadPrivateContext = + omptarget_nvptx_device_State[usedSlotIdx].Dequeue(); + + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + // init team context + currTeamDescr.InitTeamDescr(); + } + __kmpc_impl_syncthreads(); + + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + + // + // Initialize task descr for each thread. + // + omptarget_nvptx_TaskDescr *newTaskDescr = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr()); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + + // init thread private from init value + PRINT(LD_PAR, + "thread will execute parallel region with id %d in a team of " + "%d threads\n", + (int)newTaskDescr->ThreadId(), (int)ThreadLimit); + + if (RequiresDataSharing && GetLaneId() == 0) { + // Warp master initializes data sharing environment. + unsigned WID = threadId / WARPSIZE; + __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS( + WID, WID == WARPSIZE - 1); + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; + } +} + +EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() { + __kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized()); +} + +EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) { + // We're not going to pop the task descr stack of each thread since + // there are no more parallel regions in SPMD mode. + if (!RequiresOMPRuntime) + return; + + __kmpc_impl_syncthreads(); + int threadId = GetThreadIdInBlock(); + if (threadId == 0) { + // Enqueue omp state object for use by another team. + int slot = usedSlotIdx; + omptarget_nvptx_device_State[slot].Enqueue( + omptarget_nvptx_threadPrivateContext); + } +} + +// Return true if the current target region is executed in SPMD mode. +EXTERN int8_t __kmpc_is_spmd_exec_mode() { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n"); + return isSPMDMode(); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu index ab031e99e51f9..c7c41021d4bbc 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -1,470 +1,470 @@ -//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Parallel implementation in the GPU. Here is the pattern: -// -// while (not finished) { -// -// if (master) { -// sequential code, decide which par loop to do, or if finished -// __kmpc_kernel_prepare_parallel() // exec by master only -// } -// syncthreads // A -// __kmpc_kernel_parallel() // exec by all -// if (this thread is included in the parallel) { -// switch () for all parallel loops -// __kmpc_kernel_end_parallel() // exec only by threads in parallel -// } -// -// -// The reason we don't exec end_parallel for the threads not included -// in the parallel loop is that for each barrier in the parallel -// region, these non-included threads will cycle through the -// syncthread A. Thus they must preserve their current threadId that -// is larger than thread in team. -// -// To make a long story short... -// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "target_impl.h" - -typedef struct ConvergentSimdJob { - omptarget_nvptx_TaskDescr taskDescr; - omptarget_nvptx_TaskDescr *convHeadTaskDescr; - uint16_t slimForNextSimd; -} ConvergentSimdJob; - -//////////////////////////////////////////////////////////////////////////////// -// support for convergent simd (team of threads in a warp only) -//////////////////////////////////////////////////////////////////////////////// -EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, int32_t *LaneSource, - int32_t *LaneId, int32_t *NumLanes) { - PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n"); - __kmpc_impl_lanemask_t ConvergentMask = Mask; - int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); - __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); - *LaneSource += __kmpc_impl_ffs(WorkRemaining); - *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; - __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); - *LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); - - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; - - ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; - int32_t SimdLimit = - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId); - job->slimForNextSimd = SimdLimit; - - int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource); - // reset simdlimit to avoid propagating to successive #simd - if (SimdLimitSource > 0 && threadId == sourceThreadId) - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0; - - // We cannot have more than the # of convergent threads. - if (SimdLimitSource > 0) - *NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource); - else - *NumLanes = ConvergentSize; - ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads", - (int)*NumLanes); - - // Set to true for lanes participating in the simd region. - bool isActive = false; - // Initialize state for active threads. 
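// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: a host-side walk-through of the
// lane bookkeeping used by __kmpc_kernel_convergent_simd above. Starting from
// LaneSource = -1, each call advances to the next set bit of the convergent
// mask as the "source" lane; a lane's LaneId is the popcount of the mask bits
// below its own position. GCC/Clang builtins stand in for the __kmpc_impl_*
// wrappers, and the mask value is hypothetical.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Mask = 0x0000F0F0u; // hypothetical set of convergent lanes
  int32_t LaneSource = -1;
  for (int Step = 0; Step < __builtin_popcount(Mask); ++Step) {
    uint32_t WorkRemaining = Mask >> (LaneSource + 1);
    LaneSource += __builtin_ffs(WorkRemaining);
    bool IsFinal = __builtin_popcount(WorkRemaining) == 1;
    std::printf("step %d: source lane %d%s\n", Step, LaneSource,
                IsFinal ? " (final)" : "");
  }
  return 0;
}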
- if (*LaneId < *NumLanes) { - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - omptarget_nvptx_TaskDescr *sourceTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( - sourceThreadId); - job->convHeadTaskDescr = currTaskDescr; - // install top descriptor from the thread for which the lanes are working. - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - sourceTaskDescr); - isActive = true; - } - - // requires a memory fence between threads of a warp - return isActive; -} - -EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); - // pop stack - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = - job->slimForNextSimd; - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, job->convHeadTaskDescr); -} - -typedef struct ConvergentParallelJob { - omptarget_nvptx_TaskDescr taskDescr; - omptarget_nvptx_TaskDescr *convHeadTaskDescr; - uint16_t tnumForNextPar; -} ConvergentParallelJob; - -//////////////////////////////////////////////////////////////////////////////// -// support for convergent parallelism (team of threads in a warp only) -//////////////////////////////////////////////////////////////////////////////// -EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, - int32_t *LaneSource) { - PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n"); - __kmpc_impl_lanemask_t ConvergentMask = Mask; - int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); - __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); - *LaneSource += __kmpc_impl_ffs(WorkRemaining); - *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; - __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); - uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); - - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; - - ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; - int32_t NumThreadsClause = - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); - job->tnumForNextPar = NumThreadsClause; - - int32_t NumThreadsSource = - __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource); - // reset numthreads to avoid propagating to successive #parallel - if (NumThreadsSource > 0 && threadId == sourceThreadId) - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = - 0; - - // We cannot have more than the # of convergent threads. - uint16_t NumThreads; - if (NumThreadsSource > 0) - NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource); - else - NumThreads = ConvergentSize; - ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - (int)NumThreads); - - // Set to true for workers participating in the parallel region. - bool isActive = false; - // Initialize state for active threads. - if (OmpId < NumThreads) { - // init L2 task descriptor and storage for the L1 parallel task descriptor. 
- omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr; - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - omptarget_nvptx_TaskDescr *sourceTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( - sourceThreadId); - job->convHeadTaskDescr = currTaskDescr; - newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads); - // install new top descriptor - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); - isActive = true; - } - - // requires a memory fence between threads of a warp - return isActive; -} - -EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); - // pop stack - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, job->convHeadTaskDescr); - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = - job->tnumForNextPar; -} - -//////////////////////////////////////////////////////////////////////////////// -// support for parallel that goes parallel (1 static level only) -//////////////////////////////////////////////////////////////////////////////// - -INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause, - uint16_t NThreadsICV, - uint16_t ThreadLimit) { - uint16_t ThreadsRequested = NThreadsICV; - if (NumThreadsClause != 0) { - ThreadsRequested = NumThreadsClause; - } - - uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam(); - if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) { - ThreadsAvailable = ThreadLimit; - } - - uint16_t NumThreads = ThreadsAvailable; - if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) { - NumThreads = ThreadsRequested; - } - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp participate in the parallel region. Round down to a - // multiple of WARPSIZE since it is legal to do so in OpenMP. - if (NumThreads < WARPSIZE) { - NumThreads = 1; - } else { - NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1)); - } -#endif - - return NumThreads; -} - -// This routine is always called by the team master.. -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, - int16_t IsOMPRuntimeInitialized) { - PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); - ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime."); - - omptarget_nvptx_workFn = WorkFn; - - // This routine is only called by the team master. The team master is - // the first thread of the last warp. It always has the logical thread - // id of 0 (since it is a shadow for the first worker thread). 
- const int threadId = 0; - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); - ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(), - "cannot be called in a parallel region."); - if (currTaskDescr->InParallelRegion()) { - PRINT0(LD_PAR, "already in parallel: go seq\n"); - return; - } - - uint16_t &NumThreadsClause = - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); - - uint16_t NumThreads = - determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit); - - if (NumThreadsClause != 0) { - // Reset request to avoid propagating to successive #parallel - NumThreadsClause = 0; - } - - ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - (int)NumThreads); - ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), - "only team master can create parallel"); - - // Set number of threads on work descriptor. - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr); - threadsInTeam = NumThreads; -} - -// All workers call this function. Deactivate those not needed. -// Fn - the outlined work function to execute. -// returns True if this thread is active, else False. -// -// Only the worker threads call this routine. -EXTERN bool __kmpc_kernel_parallel(void **WorkFn, - int16_t IsOMPRuntimeInitialized) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); - - ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime."); - - // Work function and arguments for L1 parallel region. - *WorkFn = omptarget_nvptx_workFn; - - // If this is the termination signal from the master, quit early. - if (!*WorkFn) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n"); - return false; - } - - // Only the worker threads call this routine and the master warp - // never arrives here. Therefore, use the nvptx thread id. - int threadId = GetThreadIdInBlock(); - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - // Set to true for workers participating in the parallel region. - bool isActive = false; - // Initialize state for active threads. - if (threadId < threadsInTeam) { - // init work descriptor from workdesccr - omptarget_nvptx_TaskDescr *newTaskDescr = - omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); - newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr()); - // install new top descriptor - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); - // init private from int value - PRINT(LD_PAR, - "thread will execute parallel region with id %d in a team of " - "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)nThreads); - - isActive = true; - // Reconverge the threads at the end of the parallel region to correctly - // handle parallel levels. - // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole - // warp. If only 1 thread is active, not need to reconverge the threads. - // If we have the whole warp, reconverge all the threads in the warp before - // actually trying to change the parallel level. Otherwise, parallel level - // can be changed incorrectly because of threads divergence. - bool IsActiveParallelRegion = threadsInTeam != 1; - IncParallelLevel(IsActiveParallelRegion, - IsActiveParallelRegion ? 
__kmpc_impl_all_lanes : 1u); - } - - return isActive; -} - -EXTERN void __kmpc_kernel_end_parallel() { - // pop stack - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n"); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - // Only the worker threads call this routine and the master warp - // never arrives here. Therefore, use the nvptx thread id. - int threadId = GetThreadIdInBlock(); - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, currTaskDescr->GetPrevTaskDescr()); - - // Reconverge the threads at the end of the parallel region to correctly - // handle parallel levels. - // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole - // warp. If only 1 thread is active, not need to reconverge the threads. - // If we have the whole warp, reconverge all the threads in the warp before - // actually trying to change the parallel level. Otherwise, parallel level can - // be changed incorrectly because of threads divergence. - bool IsActiveParallelRegion = threadsInTeam != 1; - DecParallelLevel(IsActiveParallelRegion, - IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u); -} - -//////////////////////////////////////////////////////////////////////////////// -// support for parallel that goes sequential -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); - - IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); - - if (checkRuntimeUninitialized(loc)) { - ASSERT0(LT_FUSSY, checkSPMDMode(loc), - "Expected SPMD mode with uninitialized runtime."); - return; - } - - // assume this is only called for nested parallel - int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - - // unlike actual parallel, threads in the same team do not share - // the workTaskDescr in this case and num threads is fixed to 1 - - // get current task - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); - currTaskDescr->SaveLoopData(); - - // allocate new task descriptor and copy value from current one, set prev to - // it - omptarget_nvptx_TaskDescr *newTaskDescr = - (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), - "new seq parallel task"); - newTaskDescr->CopyParent(currTaskDescr); - - // tweak values for serialized parallel case: - // - each thread becomes ID 0 in its serialized parallel, and - // - there is only one thread per team - newTaskDescr->ThreadId() = 0; - - // set new task descriptor as top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); -} - -EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, - uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); - - DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); - - if (checkRuntimeUninitialized(loc)) { - ASSERT0(LT_FUSSY, checkSPMDMode(loc), - "Expected SPMD mode with uninitialized runtime."); - return; - } - - // pop stack - int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); - // set new top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, currTaskDescr->GetPrevTaskDescr()); - // free - SafeFree(currTaskDescr, "new seq parallel task"); - currTaskDescr = 
getMyTopTaskDescriptor(threadId); - currTaskDescr->RestoreLoopData(); -} - -EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); - - return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); -} - -// This kmpc call returns the thread id across all teams. It's value is -// cached by the compiler and used when calling the runtime. On nvptx -// it's cheap to recalculate this value so we never use the result -// of this call. -EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - return GetOmpThreadId(tid, checkSPMDMode(loc)); -} - -//////////////////////////////////////////////////////////////////////////////// -// push params -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid, - int32_t num_threads) { - PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = - num_threads; -} - -EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid, - int32_t simd_limit) { - PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; -} - -// Do nothing. The host guarantees we started the requested number of -// teams and we only need inspection of gridDim. - -EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, - int32_t num_teams, int32_t thread_limit) { - PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams); - ASSERT0(LT_FUSSY, 0, - "should never have anything with new teams on device"); -} - -EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, - int proc_bind) { - PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); -} +//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Parallel implementation in the GPU. Here is the pattern: +// +// while (not finished) { +// +// if (master) { +// sequential code, decide which par loop to do, or if finished +// __kmpc_kernel_prepare_parallel() // exec by master only +// } +// syncthreads // A +// __kmpc_kernel_parallel() // exec by all +// if (this thread is included in the parallel) { +// switch () for all parallel loops +// __kmpc_kernel_end_parallel() // exec only by threads in parallel +// } +// +// +// The reason we don't exec end_parallel for the threads not included +// in the parallel loop is that for each barrier in the parallel +// region, these non-included threads will cycle through the +// syncthread A. Thus they must preserve their current threadId that +// is larger than thread in team. +// +// To make a long story short... 
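// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the master/worker pattern from
// the comment above spelled out as a kernel. Names (generic_mode_example,
// parallel_region_0) are hypothetical, master selection and termination
// handling are simplified, and real codegen uses the shadow master warp plus
// named barriers rather than plain __syncthreads(); EXTERN is assumed to
// expand to extern "C" __device__.
// ---------------------------------------------------------------------------
#include <cstdint>
extern "C" __device__ void __kmpc_kernel_init(int, int16_t);
extern "C" __device__ void __kmpc_kernel_deinit(int16_t);
extern "C" __device__ void __kmpc_kernel_prepare_parallel(void *, int16_t);
extern "C" __device__ bool __kmpc_kernel_parallel(void **, int16_t);
extern "C" __device__ void __kmpc_kernel_end_parallel();

typedef void (*WorkFnTy)();
__device__ void parallel_region_0() { /* hypothetical outlined body */ }

__global__ void generic_mode_example() {
  // Master is the first thread of the last warp (see GetMasterThreadID below).
  const bool IsMaster = threadIdx.x == ((blockDim.x - 1) & ~(warpSize - 1));
  if (IsMaster)
    __kmpc_kernel_init((int)blockDim.x, /*RequiresOMPRuntime=*/1);
  __syncthreads();

  for (int Round = 0; /* until the master signals termination */; ++Round) {
    if (IsMaster) {
      if (Round == 0)
        __kmpc_kernel_prepare_parallel((void *)&parallel_region_0, 1);
      else
        __kmpc_kernel_deinit(1); // zeroes the work function: "we are finished"
    }
    __syncthreads();                       // "syncthreads // A" above
    void *WorkFn = nullptr;
    bool Active = __kmpc_kernel_parallel(&WorkFn, 1);
    if (!WorkFn)
      break;                               // termination observed by all threads
    if (Active) {                          // "this thread is included"
      ((WorkFnTy)WorkFn)();
      __kmpc_kernel_end_parallel();
    }
    __syncthreads();
  }
}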
+// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "target_impl.h" + +typedef struct ConvergentSimdJob { + omptarget_nvptx_TaskDescr taskDescr; + omptarget_nvptx_TaskDescr *convHeadTaskDescr; + uint16_t slimForNextSimd; +} ConvergentSimdJob; + +//////////////////////////////////////////////////////////////////////////////// +// support for convergent simd (team of threads in a warp only) +//////////////////////////////////////////////////////////////////////////////// +EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, + __kmpc_impl_lanemask_t Mask, + bool *IsFinal, int32_t *LaneSource, + int32_t *LaneId, int32_t *NumLanes) { + PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n"); + __kmpc_impl_lanemask_t ConvergentMask = Mask; + int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); + __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); + *LaneSource += __kmpc_impl_ffs(WorkRemaining); + *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; + __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); + *LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); + + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); + int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; + + ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; + int32_t SimdLimit = + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId); + job->slimForNextSimd = SimdLimit; + + int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource); + // reset simdlimit to avoid propagating to successive #simd + if (SimdLimitSource > 0 && threadId == sourceThreadId) + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0; + + // We cannot have more than the # of convergent threads. + if (SimdLimitSource > 0) + *NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource); + else + *NumLanes = ConvergentSize; + ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads", + (int)*NumLanes); + + // Set to true for lanes participating in the simd region. + bool isActive = false; + // Initialize state for active threads. + if (*LaneId < *NumLanes) { + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + omptarget_nvptx_TaskDescr *sourceTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + sourceThreadId); + job->convHeadTaskDescr = currTaskDescr; + // install top descriptor from the thread for which the lanes are working. 
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + sourceTaskDescr); + isActive = true; + } + + // requires a memory fence between threads of a warp + return isActive; +} + +EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); + // pop stack + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); + ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = + job->slimForNextSimd; + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, job->convHeadTaskDescr); +} + +typedef struct ConvergentParallelJob { + omptarget_nvptx_TaskDescr taskDescr; + omptarget_nvptx_TaskDescr *convHeadTaskDescr; + uint16_t tnumForNextPar; +} ConvergentParallelJob; + +//////////////////////////////////////////////////////////////////////////////// +// support for convergent parallelism (team of threads in a warp only) +//////////////////////////////////////////////////////////////////////////////// +EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, + __kmpc_impl_lanemask_t Mask, + bool *IsFinal, + int32_t *LaneSource) { + PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n"); + __kmpc_impl_lanemask_t ConvergentMask = Mask; + int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); + __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); + *LaneSource += __kmpc_impl_ffs(WorkRemaining); + *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; + __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); + uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); + + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); + int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; + + ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; + int32_t NumThreadsClause = + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); + job->tnumForNextPar = NumThreadsClause; + + int32_t NumThreadsSource = + __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource); + // reset numthreads to avoid propagating to successive #parallel + if (NumThreadsSource > 0 && threadId == sourceThreadId) + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = + 0; + + // We cannot have more than the # of convergent threads. + uint16_t NumThreads; + if (NumThreadsSource > 0) + NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource); + else + NumThreads = ConvergentSize; + ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", + (int)NumThreads); + + // Set to true for workers participating in the parallel region. + bool isActive = false; + // Initialize state for active threads. + if (OmpId < NumThreads) { + // init L2 task descriptor and storage for the L1 parallel task descriptor. 
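// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: how a caller is expected to
// drive the convergent-parallel entry points, re-issuing the call until
// IsFinal is set while LaneSource carries the scheduling state across
// iterations. The wrapper name and the outlined body are hypothetical; the
// buffer must provide at least sizeof(ConvergentParallelJob) bytes, and the
// __kmpc_impl_* helpers are the ones declared in target_impl.h above.
// ---------------------------------------------------------------------------
__device__ void convergent_parallel_example(void *JobBuffer) {
  __kmpc_impl_lanemask_t Mask = __kmpc_impl_activemask();
  bool IsFinal = false;
  int32_t LaneSource = -1;
  while (!IsFinal) {
    if (__kmpc_kernel_convergent_parallel(JobBuffer, Mask, &IsFinal,
                                          &LaneSource)) {
      // ... outlined parallel body runs on the lanes selected this round ...
      __kmpc_kernel_end_convergent_parallel(JobBuffer);
    }
  }
}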
+ omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr; + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + omptarget_nvptx_TaskDescr *sourceTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + sourceThreadId); + job->convHeadTaskDescr = currTaskDescr; + newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + isActive = true; + } + + // requires a memory fence between threads of a warp + return isActive; +} + +EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); + // pop stack + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); + ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, job->convHeadTaskDescr); + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = + job->tnumForNextPar; +} + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes parallel (1 static level only) +//////////////////////////////////////////////////////////////////////////////// + +INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause, + uint16_t NThreadsICV, + uint16_t ThreadLimit) { + uint16_t ThreadsRequested = NThreadsICV; + if (NumThreadsClause != 0) { + ThreadsRequested = NumThreadsClause; + } + + uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam(); + if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) { + ThreadsAvailable = ThreadLimit; + } + + uint16_t NumThreads = ThreadsAvailable; + if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) { + NumThreads = ThreadsRequested; + } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp participate in the parallel region. Round down to a + // multiple of WARPSIZE since it is legal to do so in OpenMP. + if (NumThreads < WARPSIZE) { + NumThreads = 1; + } else { + NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1)); + } +#endif + + return NumThreads; +} + +// This routine is always called by the team master.. +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, + int16_t IsOMPRuntimeInitialized) { + PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); + ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime."); + + omptarget_nvptx_workFn = WorkFn; + + // This routine is only called by the team master. The team master is + // the first thread of the last warp. It always has the logical thread + // id of 0 (since it is a shadow for the first worker thread). 
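// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: a host-side replica of the
// clamping done by determineNumberOfThreads() above, showing how the
// num_threads clause, the nthreads-var ICV and thread_limit interact, and how
// the sm_70+ path rounds the result down to whole warps. The value 992 stands
// in for GetNumberOfWorkersInTeam() and is hypothetical.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>

static uint16_t clampNumThreads(uint16_t NumThreadsClause, uint16_t NThreadsICV,
                                uint16_t ThreadLimit, uint16_t WorkersInTeam,
                                bool RoundToWarp) {
  uint16_t Requested = NumThreadsClause ? NumThreadsClause : NThreadsICV;
  uint16_t Available = WorkersInTeam;
  if (ThreadLimit && ThreadLimit < Available)
    Available = ThreadLimit;
  uint16_t N = Available;
  if (Requested && Requested < N)
    N = Requested;
  if (RoundToWarp)                       // sm_70+: whole warps only
    N = N < 32 ? 1 : (uint16_t)(N & ~(uint16_t)31);
  return N;
}

int main() {
  // num_threads(50), no ICV, no thread_limit, 992 workers, Volta rounding:
  std::printf("%u\n", (unsigned)clampNumThreads(50, 0, 0, 992, true));  // 32
  // no clause, no ICV, thread_limit(100): full warps within the limit:
  std::printf("%u\n", (unsigned)clampNumThreads(0, 0, 100, 992, true)); // 96
  return 0;
}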
+ const int threadId = 0; + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(), + "cannot be called in a parallel region."); + if (currTaskDescr->InParallelRegion()) { + PRINT0(LD_PAR, "already in parallel: go seq\n"); + return; + } + + uint16_t &NumThreadsClause = + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); + + uint16_t NumThreads = + determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit); + + if (NumThreadsClause != 0) { + // Reset request to avoid propagating to successive #parallel + NumThreadsClause = 0; + } + + ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", + (int)NumThreads); + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "only team master can create parallel"); + + // Set number of threads on work descriptor. + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr); + threadsInTeam = NumThreads; +} + +// All workers call this function. Deactivate those not needed. +// Fn - the outlined work function to execute. +// returns True if this thread is active, else False. +// +// Only the worker threads call this routine. +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, + int16_t IsOMPRuntimeInitialized) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); + + ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime."); + + // Work function and arguments for L1 parallel region. + *WorkFn = omptarget_nvptx_workFn; + + // If this is the termination signal from the master, quit early. + if (!*WorkFn) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n"); + return false; + } + + // Only the worker threads call this routine and the master warp + // never arrives here. Therefore, use the nvptx thread id. + int threadId = GetThreadIdInBlock(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + // Set to true for workers participating in the parallel region. + bool isActive = false; + // Initialize state for active threads. + if (threadId < threadsInTeam) { + // init work descriptor from workdesccr + omptarget_nvptx_TaskDescr *newTaskDescr = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr()); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + // init private from int value + PRINT(LD_PAR, + "thread will execute parallel region with id %d in a team of " + "%d threads\n", + (int)newTaskDescr->ThreadId(), (int)nThreads); + + isActive = true; + // Reconverge the threads at the end of the parallel region to correctly + // handle parallel levels. + // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole + // warp. If only 1 thread is active, not need to reconverge the threads. + // If we have the whole warp, reconverge all the threads in the warp before + // actually trying to change the parallel level. Otherwise, parallel level + // can be changed incorrectly because of threads divergence. + bool IsActiveParallelRegion = threadsInTeam != 1; + IncParallelLevel(IsActiveParallelRegion, + IsActiveParallelRegion ? 
__kmpc_impl_all_lanes : 1u); + } + + return isActive; +} + +EXTERN void __kmpc_kernel_end_parallel() { + // pop stack + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n"); + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); + + // Only the worker threads call this routine and the master warp + // never arrives here. Therefore, use the nvptx thread id. + int threadId = GetThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTaskDescr->GetPrevTaskDescr()); + + // Reconverge the threads at the end of the parallel region to correctly + // handle parallel levels. + // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole + // warp. If only 1 thread is active, not need to reconverge the threads. + // If we have the whole warp, reconverge all the threads in the warp before + // actually trying to change the parallel level. Otherwise, parallel level can + // be changed incorrectly because of threads divergence. + bool IsActiveParallelRegion = threadsInTeam != 1; + DecParallelLevel(IsActiveParallelRegion, + IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u); +} + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes sequential +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); + + IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); + + if (checkRuntimeUninitialized(loc)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc), + "Expected SPMD mode with uninitialized runtime."); + return; + } + + // assume this is only called for nested parallel + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + + // unlike actual parallel, threads in the same team do not share + // the workTaskDescr in this case and num threads is fixed to 1 + + // get current task + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + currTaskDescr->SaveLoopData(); + + // allocate new task descriptor and copy value from current one, set prev to + // it + omptarget_nvptx_TaskDescr *newTaskDescr = + (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), + "new seq parallel task"); + newTaskDescr->CopyParent(currTaskDescr); + + // tweak values for serialized parallel case: + // - each thread becomes ID 0 in its serialized parallel, and + // - there is only one thread per team + newTaskDescr->ThreadId() = 0; + + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); +} + +EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, + uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); + + DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); + + if (checkRuntimeUninitialized(loc)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc), + "Expected SPMD mode with uninitialized runtime."); + return; + } + + // pop stack + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + // set new top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTaskDescr->GetPrevTaskDescr()); + // free + SafeFree(currTaskDescr, "new seq parallel task"); + currTaskDescr = 
getMyTopTaskDescriptor(threadId); + currTaskDescr->RestoreLoopData(); +} + +EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); + + return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); +} + +// This kmpc call returns the thread id across all teams. It's value is +// cached by the compiler and used when calling the runtime. On nvptx +// it's cheap to recalculate this value so we never use the result +// of this call. +EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + return GetOmpThreadId(tid, checkSPMDMode(loc)); +} + +//////////////////////////////////////////////////////////////////////////////// +// push params +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid, + int32_t num_threads) { + PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = + num_threads; +} + +EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid, + int32_t simd_limit) { + PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; +} + +// Do nothing. The host guarantees we started the requested number of +// teams and we only need inspection of gridDim. + +EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, + int32_t num_teams, int32_t thread_limit) { + PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams); + ASSERT0(LT_FUSSY, 0, + "should never have anything with new teams on device"); +} + +EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, + int proc_bind) { + PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu index 427c90a7e0913..04ec735674603 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -1,531 +1,531 @@ -//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of reduction with KMPC interface. 
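// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: how the per-warp parallel level
// is encoded. The level counter lives in the low bits and "has an active
// parallel region" is a separate flag bit (OMP_ACTIVE_PARALLEL_LEVEL, assumed
// here to be a power of two such as 128), which is why __kmpc_parallel_level()
// above masks it off before returning the level.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t ActiveFlag = 128;  // stand-in for OMP_ACTIVE_PARALLEL_LEVEL
  uint8_t Level = 0;
  Level += 1 + ActiveFlag;         // enter an active parallel region
  Level += 1;                      // enter a nested, serialized one
  std::printf("raw=%u level=%u active=%d\n", (unsigned)Level,
              (unsigned)(Level & (ActiveFlag - 1)), (Level & ActiveFlag) != 0);
  // prints: raw=130 level=2 active=1
  return 0;
}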
-// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "common/target_atomic.h" -#include "target_impl.h" - -EXTERN -void __kmpc_nvptx_end_reduce(int32_t global_tid) {} - -EXTERN -void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {} - -EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { - return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size); -} - -EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { - uint32_t lo, hi; - __kmpc_impl_unpack(val, lo, hi); - hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size); - lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size); - return __kmpc_impl_pack(lo, hi); -} - -INLINE static void gpu_regular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr shflFct) { - for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) { - shflFct(reduce_data, /*LaneId - not used= */ 0, - /*Offset = */ mask, /*AlgoVersion=*/0); - } -} - -INLINE static void gpu_irregular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - uint32_t size, uint32_t tid) { - uint32_t curr_size; - uint32_t mask; - curr_size = size; - mask = curr_size / 2; - while (mask > 0) { - shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); - curr_size = (curr_size + 1) / 2; - mask = curr_size / 2; - } -} - -INLINE static uint32_t -gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { - uint32_t size, remote_id, physical_lane_id; - physical_lane_id = GetThreadIdInBlock() % WARPSIZE; - __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2; - __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt(); - do { - Liveness = __kmpc_impl_activemask(); - remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt); - size = __kmpc_impl_popc(Liveness); - logical_lane_id /= 2; - shflFct(reduce_data, /*LaneId =*/logical_lane_id, - /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); - } while (logical_lane_id % 2 == 0 && size > 1); - return (logical_lane_id == 0); -} - -EXTERN -int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars, - size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct) { - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - if (Liveness == __kmpc_impl_all_lanes) { - gpu_regular_warp_reduce(reduce_data, shflFct); - return GetThreadIdInBlock() % WARPSIZE == - 0; // Result on lane 0 of the simd warp. - } else { - return gpu_irregular_simd_reduce( - reduce_data, shflFct); // Result on the first active lane. - } -} - -INLINE -static int32_t nvptx_parallel_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - bool isSPMDExecutionMode, bool isRuntimeUninitialized) { - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode); - if (NumThreads == 1) - return 1; - /* - * This reduce function handles reduction within a team. It handles - * parallel regions in both L1 and L2 parallelism levels. It also - * supports Generic, SPMD, and NoOMP modes. - * - * 1. Reduce within a warp. - * 2. 
Warp master copies value to warp 0 via shared memory. - * 3. Warp 0 reduces to a single value. - * 4. The reduced value is available in the thread that returns 1. - */ - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; - uint32_t WarpId = BlockThreadId / WARPSIZE; - - // Volta execution model: - // For the Generic execution mode a parallel region either has 1 thread and - // beyond that, always a multiple of 32. For the SPMD execution mode we may - // have any number of threads. - if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1)) - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/NumThreads % WARPSIZE, - /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > WARPSIZE) { - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - } - return BlockThreadId == 0; -#else - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - if (Liveness == __kmpc_impl_all_lanes) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/__kmpc_impl_popc(Liveness), - /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); - else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2 - // parallel region may enter here; return - // early. - return gpu_irregular_simd_reduce(reduce_data, shflFct); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > WARPSIZE) { - uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = BlockThreadId / WARPSIZE; - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - - return BlockThreadId == 0; - } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) { - return BlockThreadId == 0; - } - - // Get the OMP thread Id. This is different from BlockThreadId in the case of - // an L2 parallel region. 
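// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the tree pattern that
// gpu_regular_warp_reduce() drives through the compiler-provided shuffle
// callback, written out for the simple case of summing one int across a full
// warp; 0xffffffff plays the role of __kmpc_impl_all_lanes. The irregular
// variant above differs only in halving an arbitrary lane count per step.
// ---------------------------------------------------------------------------
__device__ int warp_sum(int Val) {
  for (unsigned Offset = 32 / 2; Offset > 0; Offset /= 2)
    Val += __shfl_down_sync(0xffffffffu, Val, Offset);
  return Val; // lane 0 holds the warp-wide sum
}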
- return global_tid == 0; -#endif // __CUDA_ARCH__ >= 700 -} - -EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, - isSPMDMode(), isRuntimeUninitialized()); -} - -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, - void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - checkSPMDMode(loc), checkRuntimeUninitialized(loc)); -} - -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - /*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true); -} - -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - /*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true); -} - -INLINE -static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, - size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, - kmp_LoadReduceFctPtr ldFct, - bool isSPMDExecutionMode) { - uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - // In non-generic mode all workers participate in the teams reduction. - // In generic mode only the team master participates in the teams - // reduction because the workers are waiting for parallel work. - uint32_t NumThreads = - isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) - : /*Master thread only*/ 1; - uint32_t TeamId = GetBlockIdInKernel(); - uint32_t NumTeams = GetNumberOfBlocksInKernel(); - static SHARED volatile bool IsLastTeam; - - // Team masters of all teams write to the scratchpad. - if (ThreadId == 0) { - unsigned int *timestamp = GetTeamsReductionTimestamp(); - char *scratchpad = GetTeamsReductionScratchpad(); - - scratchFct(reduce_data, scratchpad, TeamId, NumTeams); - __kmpc_impl_threadfence(); - - // atomicInc increments 'timestamp' and has a range [0, NumTeams-1]. - // It resets 'timestamp' back to 0 once the last team increments - // this counter. - unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1); - IsLastTeam = val == NumTeams - 1; - } - - // We have to wait on L1 barrier because in GENERIC mode the workers - // are waiting on barrier 0 for work. - // - // If we guard this barrier as follows it leads to deadlock, probably - // because of a compiler bug: if (!IsGenericMode()) __syncthreads(); - uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE; - __kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE); - - // If this team is not the last, quit. - if (/* Volatile read by all threads */ !IsLastTeam) - return 0; - - // - // Last team processing. 
- // - - // Threads in excess of #teams do not participate in reduction of the - // scratchpad values. -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t ActiveThreads = NumThreads; - if (NumTeams < NumThreads) { - ActiveThreads = - (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1); - } - if (ThreadId >= ActiveThreads) - return 0; - - // Load from scratchpad and reduce. - char *scratchpad = GetTeamsReductionScratchpad(); - ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); - for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads) - ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); - - uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; - uint32_t WarpId = ThreadId / WARPSIZE; - - // Reduce across warps to the warp master. - if ((ActiveThreads % WARPSIZE == 0) || - (WarpId < WarpsNeeded - 1)) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (ActiveThreads > 1) // Partial warp but contiguous lanes - // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/ActiveThreads % WARPSIZE, - /*LaneId=*/ThreadId % WARPSIZE); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - if (ActiveThreads > WARPSIZE) { - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); - } -#else - if (ThreadId >= NumTeams) - return 0; - - // Load from scratchpad and reduce. - char *scratchpad = GetTeamsReductionScratchpad(); - ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); - for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads) - ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); - - // Reduce across warps to the warp master. - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - if (Liveness == __kmpc_impl_all_lanes) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else // Partial warp but contiguous lanes - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/__kmpc_impl_popc(Liveness), - /*LaneId=*/ThreadId % WARPSIZE); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads; - if (ActiveThreads > WARPSIZE) { - uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; - // Gather all the reduced values from each warp - // to the first warp. 
- cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = ThreadId / WARPSIZE; - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); - } -#endif // __CUDA_ARCH__ >= 700 - - return ThreadId == 0; -} - -EXTERN -int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, - size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, - kmp_LoadReduceFctPtr ldFct) { - return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, scratchFct, - ldFct, isSPMDMode()); -} - -EXTERN -int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { - return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, scratchFct, - ldFct, /*isSPMDExecutionMode=*/true); -} - -EXTERN -int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { - return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, scratchFct, - ldFct, /*isSPMDExecutionMode=*/false); -} - -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, - int32_t global_tid, - kmp_CriticalName *crit) { - if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0) - return 0; - // The master thread of the team actually does the reduction. - while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u)) - ; - return 1; -} - -EXTERN void -__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *crit) { - __kmpc_impl_threadfence_system(); - (void)__kmpc_atomic_exchange((uint32_t *)crit, 0u); -} - -INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) { - return checkGenericMode(loc) || IsTeamMaster(ThreadId); -} - -INLINE static uint32_t roundToWarpsize(uint32_t s) { - if (s < WARPSIZE) - return 1; - return (s & ~(unsigned)(WARPSIZE - 1)); -} - -DEVICE static volatile uint32_t IterCnt = 0; -DEVICE static volatile uint32_t Cnt = 0; -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, void *global_buffer, - int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, - kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, - kmp_ListGlobalFctPtr glredFct) { - - // Terminate all threads in non-SPMD mode except for the master thread. - if (checkGenericMode(loc) && GetThreadIdInBlock() != GetMasterThreadID()) - return 0; - - uint32_t ThreadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - - // In non-generic mode all workers participate in the teams reduction. - // In generic mode only the team master participates in the teams - // reduction because the workers are waiting for parallel work. - uint32_t NumThreads = - checkSPMDMode(loc) ? 
GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) - : /*Master thread only*/ 1; - uint32_t TeamId = GetBlockIdInKernel(); - uint32_t NumTeams = GetNumberOfBlocksInKernel(); - static SHARED unsigned Bound; - static SHARED unsigned ChunkTeamCount; - - // Block progress for teams greater than the current upper - // limit. We always only allow a number of teams less or equal - // to the number of slots in the buffer. - bool IsMaster = isMaster(loc, ThreadId); - while (IsMaster) { - // Atomic read - Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u); - if (TeamId < Bound + num_of_records) - break; - } - - if (IsMaster) { - int ModBockId = TeamId % num_of_records; - if (TeamId < num_of_records) - lgcpyFct(global_buffer, ModBockId, reduce_data); - else - lgredFct(global_buffer, ModBockId, reduce_data); - __kmpc_impl_threadfence_system(); - - // Increment team counter. - // This counter is incremented by all teams in the current - // BUFFER_SIZE chunk. - ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u); - } - // Synchronize - if (checkSPMDMode(loc)) - __kmpc_barrier(loc, global_tid); - - // reduce_data is global or shared so before being reduced within the - // warp we need to bring it in local memory: - // local_reduce_data = reduce_data[i] - // - // Example for 3 reduction variables a, b, c (of potentially different - // types): - // - // buffer layout (struct of arrays): - // a, a, ..., a, b, b, ... b, c, c, ... c - // |__________| - // num_of_records - // - // local_data_reduce layout (struct): - // a, b, c - // - // Each thread will have a local struct containing the values to be - // reduced: - // 1. do reduction within each warp. - // 2. do reduction across warps. - // 3. write the final result to the main reduction variable - // by returning 1 in the thread holding the reduction result. - - // Check if this is the very last team. - unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records)); - if (ChunkTeamCount == NumTeams - Bound - 1) { - // - // Last team processing. - // - if (ThreadId >= NumRecs) - return 0; - NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs)); - if (ThreadId >= NumThreads) - return 0; - - // Load from buffer and reduce. - glcpyFct(global_buffer, ThreadId, reduce_data); - for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) - glredFct(global_buffer, i, reduce_data); - - // Reduce across warps to the warp master. - if (NumThreads > 1) { - gpu_regular_warp_reduce(reduce_data, shflFct); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads); - if (ActiveThreads > WARPSIZE) { - uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = ThreadId / WARPSIZE; - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - ThreadId); - } - } - - if (IsMaster) { - Cnt = 0; - IterCnt = 0; - return 1; - } - return 0; - } - if (IsMaster && ChunkTeamCount == num_of_records - 1) { - // Allow SIZE number of teams to proceed writing their - // intermediate results to the global buffer. - __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records)); - } - - return 0; -} - +//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
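// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the "last team in" handshake
// used by the teams reduction above, reduced to its core. Each team's master
// publishes its partial result, fences, then bumps a global counter; the team
// that observes the final count knows every other contribution is visible and
// can perform the final reduction. Buffer chunking via IterCnt/num_of_records
// and counter reset are omitted; TeamCount and the function name are
// hypothetical.
// ---------------------------------------------------------------------------
__device__ unsigned TeamCount = 0;

__device__ bool publish_and_check_last(unsigned NumTeams) {
  __threadfence();                               // make the partial visible
  unsigned Seen = atomicAdd(&TeamCount, 1u) + 1; // this team's arrival order
  return Seen == NumTeams;                       // true only for the last team
}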
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of reduction with KMPC interface. +// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "common/target_atomic.h" +#include "target_impl.h" + +EXTERN +void __kmpc_nvptx_end_reduce(int32_t global_tid) {} + +EXTERN +void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {} + +EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { + return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size); +} + +EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { + uint32_t lo, hi; + __kmpc_impl_unpack(val, lo, hi); + hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size); + lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size); + return __kmpc_impl_pack(lo, hi); +} + +INLINE static void gpu_regular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shflFct) { + for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) { + shflFct(reduce_data, /*LaneId - not used= */ 0, + /*Offset = */ mask, /*AlgoVersion=*/0); + } +} + +INLINE static void gpu_irregular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + uint32_t size, uint32_t tid) { + uint32_t curr_size; + uint32_t mask; + curr_size = size; + mask = curr_size / 2; + while (mask > 0) { + shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); + curr_size = (curr_size + 1) / 2; + mask = curr_size / 2; + } +} + +INLINE static uint32_t +gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { + uint32_t size, remote_id, physical_lane_id; + physical_lane_id = GetThreadIdInBlock() % WARPSIZE; + __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); + uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2; + __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt(); + do { + Liveness = __kmpc_impl_activemask(); + remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt); + size = __kmpc_impl_popc(Liveness); + logical_lane_id /= 2; + shflFct(reduce_data, /*LaneId =*/logical_lane_id, + /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); + } while (logical_lane_id % 2 == 0 && size > 1); + return (logical_lane_id == 0); +} + +EXTERN +int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct) { + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); + if (Liveness == __kmpc_impl_all_lanes) { + gpu_regular_warp_reduce(reduce_data, shflFct); + return GetThreadIdInBlock() % WARPSIZE == + 0; // Result on lane 0 of the simd warp. + } else { + return gpu_irregular_simd_reduce( + reduce_data, shflFct); // Result on the first active lane. 
+ } +} + +INLINE +static int32_t nvptx_parallel_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + bool isSPMDExecutionMode, bool isRuntimeUninitialized) { + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode); + if (NumThreads == 1) + return 1; + /* + * This reduce function handles reduction within a team. It handles + * parallel regions in both L1 and L2 parallelism levels. It also + * supports Generic, SPMD, and NoOMP modes. + * + * 1. Reduce within a warp. + * 2. Warp master copies value to warp 0 via shared memory. + * 3. Warp 0 reduces to a single value. + * 4. The reduced value is available in the thread that returns 1. + */ + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; + uint32_t WarpId = BlockThreadId / WARPSIZE; + + // Volta execution model: + // For the Generic execution mode a parallel region either has 1 thread and + // beyond that, always a multiple of 32. For the SPMD execution mode we may + // have any number of threads. + if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1)) + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/NumThreads % WARPSIZE, + /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > WARPSIZE) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + } + return BlockThreadId == 0; +#else + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); + if (Liveness == __kmpc_impl_all_lanes) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/__kmpc_impl_popc(Liveness), + /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); + else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2 + // parallel region may enter here; return + // early. + return gpu_irregular_simd_reduce(reduce_data, shflFct); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > WARPSIZE) { + uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = BlockThreadId / WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + + return BlockThreadId == 0; + } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) { + return BlockThreadId == 0; + } + + // Get the OMP thread Id. This is different from BlockThreadId in the case of + // an L2 parallel region. 
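+  // Editorial note (not part of the original source): global_tid is the OMP
+  // thread id, so only the thread that is logical thread 0 of its own
+  // (possibly nested L2) parallel region reports holding the reduced value.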
+ return global_tid == 0; +#endif // __CUDA_ARCH__ >= 700 +} + +EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, + isSPMDMode(), isRuntimeUninitialized()); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, + void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true); +} + +INLINE +static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, + kmp_LoadReduceFctPtr ldFct, + bool isSPMDExecutionMode) { + uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = + isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) + : /*Master thread only*/ 1; + uint32_t TeamId = GetBlockIdInKernel(); + uint32_t NumTeams = GetNumberOfBlocksInKernel(); + static SHARED volatile bool IsLastTeam; + + // Team masters of all teams write to the scratchpad. + if (ThreadId == 0) { + unsigned int *timestamp = GetTeamsReductionTimestamp(); + char *scratchpad = GetTeamsReductionScratchpad(); + + scratchFct(reduce_data, scratchpad, TeamId, NumTeams); + __kmpc_impl_threadfence(); + + // atomicInc increments 'timestamp' and has a range [0, NumTeams-1]. + // It resets 'timestamp' back to 0 once the last team increments + // this counter. + unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1); + IsLastTeam = val == NumTeams - 1; + } + + // We have to wait on L1 barrier because in GENERIC mode the workers + // are waiting on barrier 0 for work. + // + // If we guard this barrier as follows it leads to deadlock, probably + // because of a compiler bug: if (!IsGenericMode()) __syncthreads(); + uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE; + __kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE); + + // If this team is not the last, quit. + if (/* Volatile read by all threads */ !IsLastTeam) + return 0; + + // + // Last team processing. 
+ // + + // Threads in excess of #teams do not participate in reduction of the + // scratchpad values. +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t ActiveThreads = NumThreads; + if (NumTeams < NumThreads) { + ActiveThreads = + (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1); + } + if (ThreadId >= ActiveThreads) + return 0; + + // Load from scratchpad and reduce. + char *scratchpad = GetTeamsReductionScratchpad(); + ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); + for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads) + ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); + + uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; + uint32_t WarpId = ThreadId / WARPSIZE; + + // Reduce across warps to the warp master. + if ((ActiveThreads % WARPSIZE == 0) || + (WarpId < WarpsNeeded - 1)) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (ActiveThreads > 1) // Partial warp but contiguous lanes + // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/ActiveThreads % WARPSIZE, + /*LaneId=*/ThreadId % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + if (ActiveThreads > WARPSIZE) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); + } +#else + if (ThreadId >= NumTeams) + return 0; + + // Load from scratchpad and reduce. + char *scratchpad = GetTeamsReductionScratchpad(); + ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); + for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads) + ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); + + // Reduce across warps to the warp master. + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); + if (Liveness == __kmpc_impl_all_lanes) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/__kmpc_impl_popc(Liveness), + /*LaneId=*/ThreadId % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads; + if (ActiveThreads > WARPSIZE) { + uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. 
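+    // Editorial example (not part of the original source): with 128 OMP
+    // threads and NumTeams = 80, ActiveThreads is 64 and WarpsNeeded is 2;
+    // warps 0 and 1 each hold one partial value, cpyFct stages both into
+    // warp 0, and the irregular reduce below leaves the result with
+    // ThreadId 0.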
+ cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = ThreadId / WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); + } +#endif // __CUDA_ARCH__ >= 700 + + return ThreadId == 0; +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, + kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, isSPMDMode()); +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, /*isSPMDExecutionMode=*/true); +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, /*isSPMDExecutionMode=*/false); +} + +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, + int32_t global_tid, + kmp_CriticalName *crit) { + if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0) + return 0; + // The master thread of the team actually does the reduction. + while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u)) + ; + return 1; +} + +EXTERN void +__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *crit) { + __kmpc_impl_threadfence_system(); + (void)__kmpc_atomic_exchange((uint32_t *)crit, 0u); +} + +INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) { + return checkGenericMode(loc) || IsTeamMaster(ThreadId); +} + +INLINE static uint32_t roundToWarpsize(uint32_t s) { + if (s < WARPSIZE) + return 1; + return (s & ~(unsigned)(WARPSIZE - 1)); +} + +DEVICE static volatile uint32_t IterCnt = 0; +DEVICE static volatile uint32_t Cnt = 0; +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, void *global_buffer, + int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, + kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, + kmp_ListGlobalFctPtr glredFct) { + + // Terminate all threads in non-SPMD mode except for the master thread. + if (checkGenericMode(loc) && GetThreadIdInBlock() != GetMasterThreadID()) + return 0; + + uint32_t ThreadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = + checkSPMDMode(loc) ? 
GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) + : /*Master thread only*/ 1; + uint32_t TeamId = GetBlockIdInKernel(); + uint32_t NumTeams = GetNumberOfBlocksInKernel(); + static SHARED unsigned Bound; + static SHARED unsigned ChunkTeamCount; + + // Block progress for teams greater than the current upper + // limit. We always only allow a number of teams less or equal + // to the number of slots in the buffer. + bool IsMaster = isMaster(loc, ThreadId); + while (IsMaster) { + // Atomic read + Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u); + if (TeamId < Bound + num_of_records) + break; + } + + if (IsMaster) { + int ModBockId = TeamId % num_of_records; + if (TeamId < num_of_records) + lgcpyFct(global_buffer, ModBockId, reduce_data); + else + lgredFct(global_buffer, ModBockId, reduce_data); + __kmpc_impl_threadfence_system(); + + // Increment team counter. + // This counter is incremented by all teams in the current + // BUFFER_SIZE chunk. + ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u); + } + // Synchronize + if (checkSPMDMode(loc)) + __kmpc_barrier(loc, global_tid); + + // reduce_data is global or shared so before being reduced within the + // warp we need to bring it in local memory: + // local_reduce_data = reduce_data[i] + // + // Example for 3 reduction variables a, b, c (of potentially different + // types): + // + // buffer layout (struct of arrays): + // a, a, ..., a, b, b, ... b, c, c, ... c + // |__________| + // num_of_records + // + // local_data_reduce layout (struct): + // a, b, c + // + // Each thread will have a local struct containing the values to be + // reduced: + // 1. do reduction within each warp. + // 2. do reduction across warps. + // 3. write the final result to the main reduction variable + // by returning 1 in the thread holding the reduction result. + + // Check if this is the very last team. + unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records)); + if (ChunkTeamCount == NumTeams - Bound - 1) { + // + // Last team processing. + // + if (ThreadId >= NumRecs) + return 0; + NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs)); + if (ThreadId >= NumThreads) + return 0; + + // Load from buffer and reduce. + glcpyFct(global_buffer, ThreadId, reduce_data); + for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) + glredFct(global_buffer, i, reduce_data); + + // Reduce across warps to the warp master. + if (NumThreads > 1) { + gpu_regular_warp_reduce(reduce_data, shflFct); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads); + if (ActiveThreads > WARPSIZE) { + uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = ThreadId / WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + ThreadId); + } + } + + if (IsMaster) { + Cnt = 0; + IterCnt = 0; + return 1; + } + return 0; + } + if (IsMaster && ChunkTeamCount == num_of_records - 1) { + // Allow SIZE number of teams to proceed writing their + // intermediate results to the global buffer. 
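+    // Editorial example (not part of the original source): assuming teams
+    // finish in order, with num_of_records == 2 and 5 teams the second team
+    // of each pair executes this add, raising IterCnt from 0 to 2 and then
+    // from 2 to 4, releasing teams 2-3 and finally team 4 from the spin loop
+    // above; the very last team never reaches this point, having already
+    // taken the final-reduction path.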
+ __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records)); + } + + return 0; +} + diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu index 85747511d46c1..e7dfa83bc056d 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -1,269 +1,269 @@ -//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Wrapper implementation to some functions natively supported by the GPU. -// -//===----------------------------------------------------------------------===// - -#include "common/support.h" -#include "common/debug.h" -#include "common/omptarget.h" - -//////////////////////////////////////////////////////////////////////////////// -// Execution Parameters -//////////////////////////////////////////////////////////////////////////////// - -DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { - execution_param = EMode; - execution_param |= RMode; -} - -DEVICE bool isGenericMode() { return (execution_param & ModeMask) == Generic; } - -DEVICE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; } - -DEVICE bool isRuntimeUninitialized() { - return (execution_param & RuntimeMask) == RuntimeUninitialized; -} - -DEVICE bool isRuntimeInitialized() { - return (execution_param & RuntimeMask) == RuntimeInitialized; -} - -//////////////////////////////////////////////////////////////////////////////// -// Execution Modes based on location parameter fields -//////////////////////////////////////////////////////////////////////////////// - -DEVICE bool checkSPMDMode(kmp_Ident *loc) { - if (!loc) - return isSPMDMode(); - - // If SPMD is true then we are not in the UNDEFINED state so - // we can return immediately. - if (loc->reserved_2 & KMP_IDENT_SPMD_MODE) - return true; - - // If not in SPMD mode and runtime required is a valid - // combination of flags so we can return immediately. - if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE)) - return false; - - // We are in underfined state. - return isSPMDMode(); -} - -DEVICE bool checkGenericMode(kmp_Ident *loc) { - return !checkSPMDMode(loc); -} - -DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc) { - if (!loc) - return isRuntimeUninitialized(); - - // If runtime is required then we know we can't be - // in the undefined mode. We can return immediately. - if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE)) - return false; - - // If runtime is required then we need to check is in - // SPMD mode or not. If not in SPMD mode then we end - // up in the UNDEFINED state that marks the orphaned - // functions. - if (loc->reserved_2 & KMP_IDENT_SPMD_MODE) - return true; - - // Check if we are in an UNDEFINED state. Undefined is denoted by - // non-SPMD + noRuntimeRequired which is a combination that - // cannot actually happen. Undefined states is used to mark orphaned - // functions. 
- return isRuntimeUninitialized(); -} - -DEVICE bool checkRuntimeInitialized(kmp_Ident *loc) { - return !checkRuntimeUninitialized(loc); -} - -//////////////////////////////////////////////////////////////////////////////// -// support: get info from machine -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// -// Calls to the Generic Scheme Implementation Layer (assuming 1D layout) -// -//////////////////////////////////////////////////////////////////////////////// - -// The master thread id is the first thread (lane) of the last warp. -// Thread id is 0 indexed. -// E.g: If NumThreads is 33, master id is 32. -// If NumThreads is 64, master id is 32. -// If NumThreads is 97, master id is 96. -// If NumThreads is 1024, master id is 992. -// -// Called in Generic Execution Mode only. -DEVICE int GetMasterThreadID() { return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1); } - -// The last warp is reserved for the master; other warps are workers. -// Called in Generic Execution Mode only. -DEVICE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } - -//////////////////////////////////////////////////////////////////////////////// -// get thread id in team - -// This function may be called in a parallel region by the workers -// or a serial region by the master. If the master (whose CUDA thread -// id is GetMasterThreadID()) calls this routine, we return 0 because -// it is a shadow for the first worker. -DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { - // Implemented using control flow (predication) instead of with a modulo - // operation. - int tid = GetThreadIdInBlock(); - if (!isSPMDExecutionMode && tid >= GetMasterThreadID()) - return 0; - else - return tid; -} - -//////////////////////////////////////////////////////////////////////////////// -// -// OpenMP Thread Support Layer -// -//////////////////////////////////////////////////////////////////////////////// - -DEVICE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { - // omp_thread_num - int rc; - if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { - rc = 0; - } else if (isSPMDExecutionMode) { - rc = GetThreadIdInBlock(); - } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); - rc = currTaskDescr->ThreadId(); - } - return rc; -} - -DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { - // omp_num_threads - int rc; - int Level = parallelLevel[GetWarpId()]; - if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) { - rc = 1; - } else if (isSPMDExecutionMode) { - rc = GetNumberOfThreadsInBlock(); - } else { - rc = threadsInTeam; - } - - return rc; -} - -//////////////////////////////////////////////////////////////////////////////// -// Team id linked to OpenMP - -DEVICE int GetOmpTeamId() { - // omp_team_num - return GetBlockIdInKernel(); // assume 1 block per team -} - -DEVICE int GetNumberOfOmpTeams() { - // omp_num_teams - return GetNumberOfBlocksInKernel(); // assume 1 block per team -} - -//////////////////////////////////////////////////////////////////////////////// -// Masters - -DEVICE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } - -//////////////////////////////////////////////////////////////////////////////// -// Parallel level - -DEVICE void IncParallelLevel(bool ActiveParallel, 
__kmpc_impl_lanemask_t Mask) { - __kmpc_impl_syncwarp(Mask); - __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); - unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); - if (Rank == 0) { - parallelLevel[GetWarpId()] += - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __kmpc_impl_threadfence(); - } - __kmpc_impl_syncwarp(Mask); -} - -DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { - __kmpc_impl_syncwarp(Mask); - __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); - unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); - if (Rank == 0) { - parallelLevel[GetWarpId()] -= - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __kmpc_impl_threadfence(); - } - __kmpc_impl_syncwarp(Mask); -} - -//////////////////////////////////////////////////////////////////////////////// -// get OpenMP number of procs - -// Get the number of processors in the device. -DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { - if (!isSPMDExecutionMode) - return GetNumberOfWorkersInTeam(); - return GetNumberOfThreadsInBlock(); -} - -DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { - return GetNumberOfProcsInDevice(isSPMDExecutionMode); -} - -//////////////////////////////////////////////////////////////////////////////// -// Memory -//////////////////////////////////////////////////////////////////////////////// - -DEVICE unsigned long PadBytes(unsigned long size, - unsigned long alignment) // must be a power of 2 -{ - // compute the necessary padding to satisfy alignment constraint - ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0, - "alignment %lu is not a power of 2\n", alignment); - return (~(unsigned long)size + 1) & (alignment - 1); -} - -DEVICE void *SafeMalloc(size_t size, const char *msg) // check if success -{ - void *ptr = __kmpc_impl_malloc(size); - PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n", - (unsigned long long)size, msg, (unsigned long long)ptr); - return ptr; -} - -DEVICE void *SafeFree(void *ptr, const char *msg) { - PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg); - __kmpc_impl_free(ptr); - return NULL; -} - -//////////////////////////////////////////////////////////////////////////////// -// Teams Reduction Scratchpad Helpers -//////////////////////////////////////////////////////////////////////////////// - -DEVICE unsigned int *GetTeamsReductionTimestamp() { - return static_cast(ReductionScratchpadPtr); -} - -DEVICE char *GetTeamsReductionScratchpad() { - return static_cast(ReductionScratchpadPtr) + 256; -} - -DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) { - ReductionScratchpadPtr = ScratchpadPtr; -} +//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Wrapper implementation to some functions natively supported by the GPU. 
+// +//===----------------------------------------------------------------------===// + +#include "common/support.h" +#include "common/debug.h" +#include "common/omptarget.h" + +//////////////////////////////////////////////////////////////////////////////// +// Execution Parameters +//////////////////////////////////////////////////////////////////////////////// + +DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { + execution_param = EMode; + execution_param |= RMode; +} + +DEVICE bool isGenericMode() { return (execution_param & ModeMask) == Generic; } + +DEVICE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; } + +DEVICE bool isRuntimeUninitialized() { + return (execution_param & RuntimeMask) == RuntimeUninitialized; +} + +DEVICE bool isRuntimeInitialized() { + return (execution_param & RuntimeMask) == RuntimeInitialized; +} + +//////////////////////////////////////////////////////////////////////////////// +// Execution Modes based on location parameter fields +//////////////////////////////////////////////////////////////////////////////// + +DEVICE bool checkSPMDMode(kmp_Ident *loc) { + if (!loc) + return isSPMDMode(); + + // If SPMD is true then we are not in the UNDEFINED state so + // we can return immediately. + if (loc->reserved_2 & KMP_IDENT_SPMD_MODE) + return true; + + // If not in SPMD mode and runtime required is a valid + // combination of flags so we can return immediately. + if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE)) + return false; + + // We are in underfined state. + return isSPMDMode(); +} + +DEVICE bool checkGenericMode(kmp_Ident *loc) { + return !checkSPMDMode(loc); +} + +DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc) { + if (!loc) + return isRuntimeUninitialized(); + + // If runtime is required then we know we can't be + // in the undefined mode. We can return immediately. + if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE)) + return false; + + // If runtime is required then we need to check is in + // SPMD mode or not. If not in SPMD mode then we end + // up in the UNDEFINED state that marks the orphaned + // functions. + if (loc->reserved_2 & KMP_IDENT_SPMD_MODE) + return true; + + // Check if we are in an UNDEFINED state. Undefined is denoted by + // non-SPMD + noRuntimeRequired which is a combination that + // cannot actually happen. Undefined states is used to mark orphaned + // functions. + return isRuntimeUninitialized(); +} + +DEVICE bool checkRuntimeInitialized(kmp_Ident *loc) { + return !checkRuntimeUninitialized(loc); +} + +//////////////////////////////////////////////////////////////////////////////// +// support: get info from machine +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// +// Calls to the Generic Scheme Implementation Layer (assuming 1D layout) +// +//////////////////////////////////////////////////////////////////////////////// + +// The master thread id is the first thread (lane) of the last warp. +// Thread id is 0 indexed. +// E.g: If NumThreads is 33, master id is 32. +// If NumThreads is 64, master id is 32. +// If NumThreads is 97, master id is 96. +// If NumThreads is 1024, master id is 992. +// +// Called in Generic Execution Mode only. +DEVICE int GetMasterThreadID() { return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1); } + +// The last warp is reserved for the master; other warps are workers. +// Called in Generic Execution Mode only. 
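+// Editorial example (not part of the original source): with 128 threads in
+// the block the master id is 96, so threads 0-95 are workers and this
+// function returns 96.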
+DEVICE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } + +//////////////////////////////////////////////////////////////////////////////// +// get thread id in team + +// This function may be called in a parallel region by the workers +// or a serial region by the master. If the master (whose CUDA thread +// id is GetMasterThreadID()) calls this routine, we return 0 because +// it is a shadow for the first worker. +DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { + // Implemented using control flow (predication) instead of with a modulo + // operation. + int tid = GetThreadIdInBlock(); + if (!isSPMDExecutionMode && tid >= GetMasterThreadID()) + return 0; + else + return tid; +} + +//////////////////////////////////////////////////////////////////////////////// +// +// OpenMP Thread Support Layer +// +//////////////////////////////////////////////////////////////////////////////// + +DEVICE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { + // omp_thread_num + int rc; + if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { + rc = 0; + } else if (isSPMDExecutionMode) { + rc = GetThreadIdInBlock(); + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + rc = currTaskDescr->ThreadId(); + } + return rc; +} + +DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { + // omp_num_threads + int rc; + int Level = parallelLevel[GetWarpId()]; + if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) { + rc = 1; + } else if (isSPMDExecutionMode) { + rc = GetNumberOfThreadsInBlock(); + } else { + rc = threadsInTeam; + } + + return rc; +} + +//////////////////////////////////////////////////////////////////////////////// +// Team id linked to OpenMP + +DEVICE int GetOmpTeamId() { + // omp_team_num + return GetBlockIdInKernel(); // assume 1 block per team +} + +DEVICE int GetNumberOfOmpTeams() { + // omp_num_teams + return GetNumberOfBlocksInKernel(); // assume 1 block per team +} + +//////////////////////////////////////////////////////////////////////////////// +// Masters + +DEVICE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } + +//////////////////////////////////////////////////////////////////////////////// +// Parallel level + +DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { + __kmpc_impl_syncwarp(Mask); + __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); + unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); + if (Rank == 0) { + parallelLevel[GetWarpId()] += + (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + __kmpc_impl_threadfence(); + } + __kmpc_impl_syncwarp(Mask); +} + +DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { + __kmpc_impl_syncwarp(Mask); + __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); + unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); + if (Rank == 0) { + parallelLevel[GetWarpId()] -= + (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + __kmpc_impl_threadfence(); + } + __kmpc_impl_syncwarp(Mask); +} + +//////////////////////////////////////////////////////////////////////////////// +// get OpenMP number of procs + +// Get the number of processors in the device. 
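+// Editorial note (not part of the original source): in Generic mode the
+// master warp is excluded, so a 128-thread block reports 96 procs, while in
+// SPMD mode all 128 threads are counted.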
+DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { + if (!isSPMDExecutionMode) + return GetNumberOfWorkersInTeam(); + return GetNumberOfThreadsInBlock(); +} + +DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { + return GetNumberOfProcsInDevice(isSPMDExecutionMode); +} + +//////////////////////////////////////////////////////////////////////////////// +// Memory +//////////////////////////////////////////////////////////////////////////////// + +DEVICE unsigned long PadBytes(unsigned long size, + unsigned long alignment) // must be a power of 2 +{ + // compute the necessary padding to satisfy alignment constraint + ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0, + "alignment %lu is not a power of 2\n", alignment); + return (~(unsigned long)size + 1) & (alignment - 1); +} + +DEVICE void *SafeMalloc(size_t size, const char *msg) // check if success +{ + void *ptr = __kmpc_impl_malloc(size); + PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n", + (unsigned long long)size, msg, (unsigned long long)ptr); + return ptr; +} + +DEVICE void *SafeFree(void *ptr, const char *msg) { + PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg); + __kmpc_impl_free(ptr); + return NULL; +} + +//////////////////////////////////////////////////////////////////////////////// +// Teams Reduction Scratchpad Helpers +//////////////////////////////////////////////////////////////////////////////// + +DEVICE unsigned int *GetTeamsReductionTimestamp() { + return static_cast(ReductionScratchpadPtr); +} + +DEVICE char *GetTeamsReductionScratchpad() { + return static_cast(ReductionScratchpadPtr) + 256; +} + +DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) { + ReductionScratchpadPtr = ScratchpadPtr; +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu index 2ac3e3f9c7c0a..ba6c66340a764 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu @@ -1,155 +1,155 @@ -//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Include all synchronization. -// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -// KMP Ordered calls -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_ordered\n"); -} - -EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_end_ordered\n"); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP Barriers -//////////////////////////////////////////////////////////////////////////////// - -// a team is a block: we can use CUDA native synchronization mechanism -// FIXME: what if not all threads (warps) participate to the barrier? 
-// We may need to implement it differently - -EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) { - PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); - __kmpc_barrier(loc_ref, tid); - PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); - return 0; -} - -EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) { - if (checkRuntimeUninitialized(loc_ref)) { - ASSERT0(LT_FUSSY, checkSPMDMode(loc_ref), - "Expected SPMD mode with uninitialized runtime."); - __kmpc_barrier_simple_spmd(loc_ref, tid); - } else { - tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref)); - int numberOfActiveOMPThreads = - GetNumberOfOmpThreads(checkSPMDMode(loc_ref)); - if (numberOfActiveOMPThreads > 1) { - if (checkSPMDMode(loc_ref)) { - __kmpc_barrier_simple_spmd(loc_ref, tid); - } else { - // The #threads parameter must be rounded up to the WARPSIZE. - int threads = - WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); - - PRINT(LD_SYNC, - "call kmpc_barrier with %d omp threads, sync parameter %d\n", - (int)numberOfActiveOMPThreads, (int)threads); - // Barrier #1 is for synchronization among active threads. - __kmpc_impl_named_sync(L1_BARRIER, threads); - } - } else { - // Still need to flush the memory per the standard. - __kmpc_flush(loc_ref); - } // numberOfActiveOMPThreads > 1 - PRINT0(LD_SYNC, "completed kmpc_barrier\n"); - } -} - -// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0 -// parallel region and that all worker threads participate. -EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) { - PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n"); - __kmpc_impl_syncthreads(); - PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n"); -} - -// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0 -// parallel region and that all worker threads participate. -EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) { - int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE; - // The #threads parameter must be rounded up to the WARPSIZE. - int threads = - WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); - - PRINT(LD_SYNC, - "call kmpc_barrier_simple_generic with %d omp threads, sync parameter " - "%d\n", - (int)numberOfActiveOMPThreads, (int)threads); - // Barrier #1 is for synchronization among active threads. 
- __kmpc_impl_named_sync(L1_BARRIER, threads); - PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n"); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP MASTER -//////////////////////////////////////////////////////////////////////////////// - -EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_master\n"); - return IsTeamMaster(global_tid); -} - -EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_end_master\n"); - ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here"); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP SINGLE -//////////////////////////////////////////////////////////////////////////////// - -EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_single\n"); - // decide to implement single with master; master get the single - return IsTeamMaster(global_tid); -} - -EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_end_single\n"); - // decide to implement single with master: master get the single - ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here"); - // sync barrier is explicitly called... so that is not a problem -} - -//////////////////////////////////////////////////////////////////////////////// -// Flush -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_flush(kmp_Ident *loc) { - PRINT0(LD_IO, "call kmpc_flush\n"); - __kmpc_impl_threadfence(); -} - -//////////////////////////////////////////////////////////////////////////////// -// Vote -//////////////////////////////////////////////////////////////////////////////// - -EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() { - PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n"); - return __kmpc_impl_activemask(); -} - -//////////////////////////////////////////////////////////////////////////////// -// Syncwarp -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { - PRINT0(LD_IO, "call __kmpc_syncwarp\n"); - __kmpc_impl_syncwarp(Mask); -} +//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Include all synchronization. 
+// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "target_impl.h" + +//////////////////////////////////////////////////////////////////////////////// +// KMP Ordered calls +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_ordered\n"); +} + +EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_end_ordered\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP Barriers +//////////////////////////////////////////////////////////////////////////////// + +// a team is a block: we can use CUDA native synchronization mechanism +// FIXME: what if not all threads (warps) participate to the barrier? +// We may need to implement it differently + +EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) { + PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); + __kmpc_barrier(loc_ref, tid); + PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); + return 0; +} + +EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) { + if (checkRuntimeUninitialized(loc_ref)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc_ref), + "Expected SPMD mode with uninitialized runtime."); + __kmpc_barrier_simple_spmd(loc_ref, tid); + } else { + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref)); + int numberOfActiveOMPThreads = + GetNumberOfOmpThreads(checkSPMDMode(loc_ref)); + if (numberOfActiveOMPThreads > 1) { + if (checkSPMDMode(loc_ref)) { + __kmpc_barrier_simple_spmd(loc_ref, tid); + } else { + // The #threads parameter must be rounded up to the WARPSIZE. + int threads = + WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); + + PRINT(LD_SYNC, + "call kmpc_barrier with %d omp threads, sync parameter %d\n", + (int)numberOfActiveOMPThreads, (int)threads); + // Barrier #1 is for synchronization among active threads. + __kmpc_impl_named_sync(L1_BARRIER, threads); + } + } else { + // Still need to flush the memory per the standard. + __kmpc_flush(loc_ref); + } // numberOfActiveOMPThreads > 1 + PRINT0(LD_SYNC, "completed kmpc_barrier\n"); + } +} + +// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0 +// parallel region and that all worker threads participate. +EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) { + PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n"); + __kmpc_impl_syncthreads(); + PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n"); +} + +// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0 +// parallel region and that all worker threads participate. +EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) { + int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE; + // The #threads parameter must be rounded up to the WARPSIZE. + int threads = + WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); + + PRINT(LD_SYNC, + "call kmpc_barrier_simple_generic with %d omp threads, sync parameter " + "%d\n", + (int)numberOfActiveOMPThreads, (int)threads); + // Barrier #1 is for synchronization among active threads. 
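+  // Editorial example (not part of the original source): with a 128-thread
+  // block, numberOfActiveOMPThreads is 128 - 32 = 96 and threads rounds up
+  // to 96, so the reserved master warp is not included in this named barrier.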
+ __kmpc_impl_named_sync(L1_BARRIER, threads); + PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP MASTER +//////////////////////////////////////////////////////////////////////////////// + +EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_master\n"); + return IsTeamMaster(global_tid); +} + +EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_end_master\n"); + ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP SINGLE +//////////////////////////////////////////////////////////////////////////////// + +EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_single\n"); + // decide to implement single with master; master get the single + return IsTeamMaster(global_tid); +} + +EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_end_single\n"); + // decide to implement single with master: master get the single + ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here"); + // sync barrier is explicitly called... so that is not a problem +} + +//////////////////////////////////////////////////////////////////////////////// +// Flush +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_flush(kmp_Ident *loc) { + PRINT0(LD_IO, "call kmpc_flush\n"); + __kmpc_impl_threadfence(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Vote +//////////////////////////////////////////////////////////////////////////////// + +EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() { + PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n"); + return __kmpc_impl_activemask(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Syncwarp +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { + PRINT0(LD_IO, "call __kmpc_syncwarp\n"); + __kmpc_impl_syncwarp(Mask); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/task.cu b/openmp/libomptarget/deviceRTLs/common/src/task.cu index 5e5bc350d2775..0c11d3b4f9dbc 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/task.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/task.cu @@ -1,216 +1,216 @@ -//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Task implementation support. -// -// explicit task structure uses -// omptarget_nvptx task -// kmp_task -// -// where kmp_task is -// - klegacy_TaskDescr <- task pointer -// shared -> X -// routine -// part_id -// descr -// - private (of size given by task_alloc call). Accessed by -// task+sizeof(klegacy_TaskDescr) -// * private data * -// - shared: X. 
Accessed by shared ptr in klegacy_TaskDescr -// * pointer table to shared variables * -// - end -// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" - -EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( - kmp_Ident *loc, // unused - uint32_t global_tid, // unused - int32_t flag, // unused (because in our impl, all are immediately exec - size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable, - kmp_TaskFctPtr taskSub) { - PRINT(LD_IO, - "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, " - "fct 0x%llx)\n", - (long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable, - (unsigned long long)taskSub); - // want task+priv to be a multiple of 8 bytes - size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *)); - sizeOfTaskInclPrivate += padForTaskInclPriv; - size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable; - ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0, - "need task descr of size %d to be a multiple of %d\n", - (int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *)); - size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize; - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc( - totSize, "explicit task descriptor"); - kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr; - ASSERT0(LT_FUSSY, - (uint64_t)newKmpTaskDescr == - (uint64_t)ADD_BYTES(newExplicitTaskDescr, - sizeof(omptarget_nvptx_TaskDescr)), - "bad size assumptions"); - // init kmp_TaskDescr - newKmpTaskDescr->sharedPointerTable = - (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate); - newKmpTaskDescr->sub = taskSub; - newKmpTaskDescr->destructors = NULL; - PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n", - (unsigned long long)newKmpTaskDescr, - (unsigned long long)newExplicitTaskDescr); - - return newKmpTaskDescr; -} - -EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr) { - return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, - 0); -} - -EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr, - int32_t depNum, void *depList, - int32_t noAliasDepNum, - void *noAliasDepList) { - PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n", - P64(newKmpTaskDescr)); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Runtime must be initialized."); - // 1. get explicit task descr from kmp task descr - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( - newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); - ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, - "bad assumptions"); - omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; - ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, - "bad assumptions"); - - // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); - newTaskDescr->CopyForExplicitTask(parentTaskDescr); - // set new task descriptor as top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); - - // 3. 
call sub - PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", - (unsigned long long)newKmpTaskDescr->sub, - (unsigned long long)newKmpTaskDescr); - newKmpTaskDescr->sub(0, newKmpTaskDescr); - PRINT(LD_TASK, "return from call task sub 0x%llx()\n", - (unsigned long long)newKmpTaskDescr->sub); - - // 4. pop context - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, - parentTaskDescr); - // 5. free - SafeFree(newExplicitTaskDescr, "explicit task descriptor"); - return 0; -} - -EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr) { - PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", - (unsigned long long)newKmpTaskDescr); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Runtime must be initialized."); - // 1. get explicit task descr from kmp task descr - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( - newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); - ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, - "bad assumptions"); - omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; - ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, - "bad assumptions"); - - // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); - newTaskDescr->CopyForExplicitTask(parentTaskDescr); - // set new task descriptor as top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); - // 3... noting to call... is inline - // 4 & 5 ... done in complete -} - -EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr) { - PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", - (unsigned long long)newKmpTaskDescr); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Runtime must be initialized."); - // 1. get explicit task descr from kmp task descr - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( - newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); - ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, - "bad assumptions"); - omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; - ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, - "bad assumptions"); - // 2. get parent - omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); - // 3... noting to call... is inline - // 4. pop context - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, - parentTaskDescr); - // 5. 
free - SafeFree(newExplicitTaskDescr, "explicit task descriptor"); -} - -EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, - int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList) { - PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n"); - // nothing to do as all our tasks are executed as final -} - -EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n"); - // nothing to do as all our tasks are executed as final -} - -EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n"); - // nothing to do as all our tasks are executed as final -} - -EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, - int end_part) { - PRINT0(LD_IO, "call to __kmpc_taskyield()\n"); - // do nothing: tasks are executed immediately, no yielding allowed - return 0; -} - -EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_taskwait()\n"); - // nothing to do as all our tasks are executed as final - return 0; -} - -EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr, int if_val, - uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, - int32_t sched, uint64_t grainsize, void *task_dup) { - - // skip task entirely if empty iteration space - if (*lb > *ub) - return; - - // the compiler has already stored lb and ub in the kmp_TaskDescr structure - // as we are using a single task to execute the entire loop, we can leave - // the initial task_t untouched - - __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0); -} +//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Task implementation support. +// +// explicit task structure uses +// omptarget_nvptx task +// kmp_task +// +// where kmp_task is +// - klegacy_TaskDescr <- task pointer +// shared -> X +// routine +// part_id +// descr +// - private (of size given by task_alloc call). Accessed by +// task+sizeof(klegacy_TaskDescr) +// * private data * +// - shared: X. 
Accessed by shared ptr in klegacy_TaskDescr +// * pointer table to shared variables * +// - end +// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" + +EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( + kmp_Ident *loc, // unused + uint32_t global_tid, // unused + int32_t flag, // unused (because in our impl, all are immediately exec + size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable, + kmp_TaskFctPtr taskSub) { + PRINT(LD_IO, + "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, " + "fct 0x%llx)\n", + (long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable, + (unsigned long long)taskSub); + // want task+priv to be a multiple of 8 bytes + size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *)); + sizeOfTaskInclPrivate += padForTaskInclPriv; + size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable; + ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0, + "need task descr of size %d to be a multiple of %d\n", + (int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *)); + size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize; + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc( + totSize, "explicit task descriptor"); + kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr; + ASSERT0(LT_FUSSY, + (uint64_t)newKmpTaskDescr == + (uint64_t)ADD_BYTES(newExplicitTaskDescr, + sizeof(omptarget_nvptx_TaskDescr)), + "bad size assumptions"); + // init kmp_TaskDescr + newKmpTaskDescr->sharedPointerTable = + (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate); + newKmpTaskDescr->sub = taskSub; + newKmpTaskDescr->destructors = NULL; + PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n", + (unsigned long long)newKmpTaskDescr, + (unsigned long long)newExplicitTaskDescr); + + return newKmpTaskDescr; +} + +EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, + 0); +} + +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList) { + PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n", + P64(newKmpTaskDescr)); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), + "Runtime must be initialized."); + // 1. get explicit task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); + + // 3. 
call sub + PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", + (unsigned long long)newKmpTaskDescr->sub, + (unsigned long long)newKmpTaskDescr); + newKmpTaskDescr->sub(0, newKmpTaskDescr); + PRINT(LD_TASK, "return from call task sub 0x%llx()\n", + (unsigned long long)newKmpTaskDescr->sub); + + // 4. pop context + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, + parentTaskDescr); + // 5. free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); + return 0; +} + +EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", + (unsigned long long)newKmpTaskDescr); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), + "Runtime must be initialized."); + // 1. get explicit task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); + // 3... noting to call... is inline + // 4 & 5 ... done in complete +} + +EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", + (unsigned long long)newKmpTaskDescr); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), + "Runtime must be initialized."); + // 1. get explicit task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + // 2. get parent + omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); + // 3... noting to call... is inline + // 4. pop context + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, + parentTaskDescr); + // 5. 
free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); +} + +EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, + int end_part) { + PRINT0(LD_IO, "call to __kmpc_taskyield()\n"); + // do nothing: tasks are executed immediately, no yielding allowed + return 0; +} + +EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_taskwait()\n"); + // nothing to do as all our tasks are executed as final + return 0; +} + +EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, int if_val, + uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, + int32_t sched, uint64_t grainsize, void *task_dup) { + + // skip task entirely if empty iteration space + if (*lb > *ub) + return; + + // the compiler has already stored lb and ub in the kmp_TaskDescr structure + // as we are using a single task to execute the entire loop, we can leave + // the initial task_t untouched + + __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0); +} diff --git a/openmp/libomptarget/deviceRTLs/common/state-queue.h b/openmp/libomptarget/deviceRTLs/common/state-queue.h index 8320929cfaf3a..7884d7cbd0df6 100644 --- a/openmp/libomptarget/deviceRTLs/common/state-queue.h +++ b/openmp/libomptarget/deviceRTLs/common/state-queue.h @@ -1,51 +1,51 @@ -//===--------- statequeue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains a queue to hand out OpenMP state objects to teams of -// one or more kernels. -// -// Reference: -// Thomas R.W. Scogland and Wu-chun Feng. 2015. -// Design and Evaluation of Scalable Concurrent Queues for Many-Core -// Architectures. International Conference on Performance Engineering. 
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __STATE_QUEUE_H
-#define __STATE_QUEUE_H
-
-#include
-
-#include "target_impl.h"
-
-template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
-private:
-  ElementType elements[SIZE];
-  volatile ElementType *elementQueue[SIZE];
-  volatile uint32_t head;
-  volatile uint32_t ids[SIZE];
-  volatile uint32_t tail;
-
-  static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
-  INLINE uint32_t ENQUEUE_TICKET();
-  INLINE uint32_t DEQUEUE_TICKET();
-  INLINE static uint32_t ID(uint32_t ticket);
-  INLINE bool IsServing(uint32_t slot, uint32_t id);
-  INLINE void PushElement(uint32_t slot, ElementType *element);
-  INLINE ElementType *PopElement(uint32_t slot);
-  INLINE void DoneServing(uint32_t slot, uint32_t id);
-
-public:
-  INLINE omptarget_nvptx_Queue() {}
-  INLINE void Enqueue(ElementType *element);
-  INLINE ElementType *Dequeue();
-};
-
-#include "state-queuei.h"
-
-#endif
+//===--------- statequeue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a queue to hand out OpenMP state objects to teams of
+// one or more kernels.
+//
+// Reference:
+// Thomas R.W. Scogland and Wu-chun Feng. 2015.
+// Design and Evaluation of Scalable Concurrent Queues for Many-Core
+// Architectures. International Conference on Performance Engineering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __STATE_QUEUE_H
+#define __STATE_QUEUE_H
+
+#include
+
+#include "target_impl.h"
+
+template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
+private:
+  ElementType elements[SIZE];
+  volatile ElementType *elementQueue[SIZE];
+  volatile uint32_t head;
+  volatile uint32_t ids[SIZE];
+  volatile uint32_t tail;
+
+  static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
+  INLINE uint32_t ENQUEUE_TICKET();
+  INLINE uint32_t DEQUEUE_TICKET();
+  INLINE static uint32_t ID(uint32_t ticket);
+  INLINE bool IsServing(uint32_t slot, uint32_t id);
+  INLINE void PushElement(uint32_t slot, ElementType *element);
+  INLINE ElementType *PopElement(uint32_t slot);
+  INLINE void DoneServing(uint32_t slot, uint32_t id);
+
+public:
+  INLINE omptarget_nvptx_Queue() {}
+  INLINE void Enqueue(ElementType *element);
+  INLINE ElementType *Dequeue();
+};
+
+#include "state-queuei.h"
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/state-queuei.h b/openmp/libomptarget/deviceRTLs/common/state-queuei.h
index 1bd261f2826ac..5c14f9aad2939 100644
--- a/openmp/libomptarget/deviceRTLs/common/state-queuei.h
+++ b/openmp/libomptarget/deviceRTLs/common/state-queuei.h
@@ -1,90 +1,90 @@
-//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of a queue to hand out OpenMP state
-// objects to teams of one or more kernels.
-//
-// Reference:
-// Thomas R.W. Scogland and Wu-chun Feng. 2015.
-// Design and Evaluation of Scalable Concurrent Queues for Many-Core
-// Architectures. International Conference on Performance Engineering.
-//
-//===----------------------------------------------------------------------===//
-
-#include "state-queue.h"
-#include "common/target_atomic.h"
-
-template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
-  return __kmpc_atomic_add((unsigned int *)&tail, 1u);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
-  return __kmpc_atomic_add((unsigned int *)&head, 1u);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t
-omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
-  return (ticket / SIZE) * 2;
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
-                                                                uint32_t id) {
-  return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id;
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE void
-omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
-                                                      ElementType *element) {
-  __kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot],
-                         (unsigned long long)element);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE ElementType *
-omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
-  return (ElementType *)__kmpc_atomic_add(
-      (unsigned long long *)&elementQueue[slot], (unsigned long long)0);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
-                                                                  uint32_t id) {
-  __kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE void
-omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
-  uint32_t ticket = ENQUEUE_TICKET();
-  uint32_t slot = ticket % SIZE;
-  uint32_t id = ID(ticket) + 1;
-  while (!IsServing(slot, id))
-    ;
-  PushElement(slot, element);
-  DoneServing(slot, id);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
-  uint32_t ticket = DEQUEUE_TICKET();
-  uint32_t slot = ticket % SIZE;
-  uint32_t id = ID(ticket);
-  while (!IsServing(slot, id))
-    ;
-  ElementType *element = PopElement(slot);
-  // This is to populate the queue because of the lack of GPU constructors.
-  if (element == 0)
-    element = &elements[slot];
-  DoneServing(slot, id);
-  return element;
-}
+//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of a queue to hand out OpenMP state
+// objects to teams of one or more kernels.
+//
+// Reference:
+// Thomas R.W. Scogland and Wu-chun Feng. 2015.
+// Design and Evaluation of Scalable Concurrent Queues for Many-Core
+// Architectures. International Conference on Performance Engineering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "state-queue.h"
+#include "common/target_atomic.h"
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
+  return __kmpc_atomic_add((unsigned int *)&tail, 1u);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
+  return __kmpc_atomic_add((unsigned int *)&head, 1u);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t
+omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
+  return (ticket / SIZE) * 2;
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
+                                                                uint32_t id) {
+  return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id;
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void
+omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
+                                                      ElementType *element) {
+  __kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot],
+                         (unsigned long long)element);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *
+omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
+  return (ElementType *)__kmpc_atomic_add(
+      (unsigned long long *)&elementQueue[slot], (unsigned long long)0);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
+                                                                  uint32_t id) {
+  __kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void
+omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
+  uint32_t ticket = ENQUEUE_TICKET();
+  uint32_t slot = ticket % SIZE;
+  uint32_t id = ID(ticket) + 1;
+  while (!IsServing(slot, id))
+    ;
+  PushElement(slot, element);
+  DoneServing(slot, id);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
+  uint32_t ticket = DEQUEUE_TICKET();
+  uint32_t slot = ticket % SIZE;
+  uint32_t id = ID(ticket);
+  while (!IsServing(slot, id))
+    ;
+  ElementType *element = PopElement(slot);
+  // This is to populate the queue because of the lack of GPU constructors.
+  if (element == 0)
+    element = &elements[slot];
+  DoneServing(slot, id);
+  return element;
+}
diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h
index 913c4c3c323fc..6dfb8e44c24ea 100644
--- a/openmp/libomptarget/deviceRTLs/common/support.h
+++ b/openmp/libomptarget/deviceRTLs/common/support.h
@@ -1,99 +1,99 @@
-//===--------- support.h - OpenMP GPU support functions ---------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Wrapper to some functions natively supported by the GPU.
-// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_SUPPORT_H -#define OMPTARGET_SUPPORT_H - -#include "interface.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -// Execution Parameters -//////////////////////////////////////////////////////////////////////////////// -enum ExecutionMode { - Spmd = 0x00u, - Generic = 0x01u, - ModeMask = 0x01u, -}; - -enum RuntimeMode { - RuntimeInitialized = 0x00u, - RuntimeUninitialized = 0x02u, - RuntimeMask = 0x02u, -}; - -DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); -DEVICE bool isGenericMode(); -DEVICE bool isSPMDMode(); -DEVICE bool isRuntimeUninitialized(); -DEVICE bool isRuntimeInitialized(); - -//////////////////////////////////////////////////////////////////////////////// -// Execution Modes based on location parameter fields -//////////////////////////////////////////////////////////////////////////////// - -DEVICE bool checkSPMDMode(kmp_Ident *loc); -DEVICE bool checkGenericMode(kmp_Ident *loc); -DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc); -DEVICE bool checkRuntimeInitialized(kmp_Ident *loc); - -//////////////////////////////////////////////////////////////////////////////// -// get info from machine -//////////////////////////////////////////////////////////////////////////////// - -// get global ids to locate tread/team info (constant regardless of OMP) -DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); -DEVICE int GetMasterThreadID(); -DEVICE int GetNumberOfWorkersInTeam(); - -// get OpenMP thread and team ids -DEVICE int GetOmpThreadId(int threadId, - bool isSPMDExecutionMode); // omp_thread_num -DEVICE int GetOmpTeamId(); // omp_team_num - -// get OpenMP number of threads and team -DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads -DEVICE int GetNumberOfOmpTeams(); // omp_num_teams - -// get OpenMP number of procs -DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); -DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); - -// masters -DEVICE int IsTeamMaster(int ompThreadId); - -// Parallel level -DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); -DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); - -//////////////////////////////////////////////////////////////////////////////// -// Memory -//////////////////////////////////////////////////////////////////////////////// - -// safe alloc and free -DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success -DEVICE void *SafeFree(void *ptr, const char *msg); -// pad to a alignment (power of 2 only) -DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment); -#define ADD_BYTES(_addr, _bytes) \ - ((void *)((char *)((void *)(_addr)) + (_bytes))) -#define SUB_BYTES(_addr, _bytes) \ - ((void *)((char *)((void *)(_addr)) - (_bytes))) - -//////////////////////////////////////////////////////////////////////////////// -// Teams Reduction Scratchpad Helpers -//////////////////////////////////////////////////////////////////////////////// -DEVICE unsigned int *GetTeamsReductionTimestamp(); -DEVICE char *GetTeamsReductionScratchpad(); -DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr); - -#endif +//===--------- support.h - OpenMP GPU support functions ---------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Wrapper to some functions natively supported by the GPU. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SUPPORT_H +#define OMPTARGET_SUPPORT_H + +#include "interface.h" +#include "target_impl.h" + +//////////////////////////////////////////////////////////////////////////////// +// Execution Parameters +//////////////////////////////////////////////////////////////////////////////// +enum ExecutionMode { + Spmd = 0x00u, + Generic = 0x01u, + ModeMask = 0x01u, +}; + +enum RuntimeMode { + RuntimeInitialized = 0x00u, + RuntimeUninitialized = 0x02u, + RuntimeMask = 0x02u, +}; + +DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); +DEVICE bool isGenericMode(); +DEVICE bool isSPMDMode(); +DEVICE bool isRuntimeUninitialized(); +DEVICE bool isRuntimeInitialized(); + +//////////////////////////////////////////////////////////////////////////////// +// Execution Modes based on location parameter fields +//////////////////////////////////////////////////////////////////////////////// + +DEVICE bool checkSPMDMode(kmp_Ident *loc); +DEVICE bool checkGenericMode(kmp_Ident *loc); +DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc); +DEVICE bool checkRuntimeInitialized(kmp_Ident *loc); + +//////////////////////////////////////////////////////////////////////////////// +// get info from machine +//////////////////////////////////////////////////////////////////////////////// + +// get global ids to locate tread/team info (constant regardless of OMP) +DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); +DEVICE int GetMasterThreadID(); +DEVICE int GetNumberOfWorkersInTeam(); + +// get OpenMP thread and team ids +DEVICE int GetOmpThreadId(int threadId, + bool isSPMDExecutionMode); // omp_thread_num +DEVICE int GetOmpTeamId(); // omp_team_num + +// get OpenMP number of threads and team +DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads +DEVICE int GetNumberOfOmpTeams(); // omp_num_teams + +// get OpenMP number of procs +DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); +DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); + +// masters +DEVICE int IsTeamMaster(int ompThreadId); + +// Parallel level +DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); +DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); + +//////////////////////////////////////////////////////////////////////////////// +// Memory +//////////////////////////////////////////////////////////////////////////////// + +// safe alloc and free +DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success +DEVICE void *SafeFree(void *ptr, const char *msg); +// pad to a alignment (power of 2 only) +DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment); +#define ADD_BYTES(_addr, _bytes) \ + ((void *)((char *)((void *)(_addr)) + (_bytes))) +#define SUB_BYTES(_addr, _bytes) \ + ((void *)((char *)((void *)(_addr)) - (_bytes))) + +//////////////////////////////////////////////////////////////////////////////// +// Teams Reduction Scratchpad Helpers +//////////////////////////////////////////////////////////////////////////////// +DEVICE unsigned int *GetTeamsReductionTimestamp(); +DEVICE char 
*GetTeamsReductionScratchpad();
+DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/target_atomic.h b/openmp/libomptarget/deviceRTLs/common/target_atomic.h
index 3c905d3cbbf2d..8fd96451790b6 100644
--- a/openmp/libomptarget/deviceRTLs/common/target_atomic.h
+++ b/openmp/libomptarget/deviceRTLs/common/target_atomic.h
@@ -1,38 +1,38 @@
-//===---- target_atomic.h - OpenMP GPU target atomic functions ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Declarations of atomic functions provided by each target
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_TARGET_ATOMIC_H
-#define OMPTARGET_TARGET_ATOMIC_H
-
-#include "target_impl.h"
-
-template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
-  return atomicAdd(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_inc(T *address, T val) {
-  return atomicInc(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
-  return atomicMax(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
-  return atomicExch(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
-  return atomicCAS(address, compare, val);
-}
-
-#endif
+//===---- target_atomic.h - OpenMP GPU target atomic functions ---- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Declarations of atomic functions provided by each target
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_TARGET_ATOMIC_H
+#define OMPTARGET_TARGET_ATOMIC_H
+
+#include "target_impl.h"
+
+template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
+  return atomicAdd(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_inc(T *address, T val) {
+  return atomicInc(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
+  return atomicMax(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
+  return atomicExch(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
+  return atomicCAS(address, compare, val);
+}
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
index 3c216a5e61c5e..c6d6b55f17d98 100644
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -1,542 +1,542 @@
-//===------- interface.h - OpenMP interface definitions ---------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains all the definitions that are relevant to
-// the interface. The first section contains the interface as
-// declared by OpenMP.
The second section includes the compiler -// specific interfaces. -// -//===----------------------------------------------------------------------===// - -#ifndef _INTERFACES_H_ -#define _INTERFACES_H_ - -#include -#include - -#ifdef __AMDGCN__ -#include "amdgcn/src/amdgcn_interface.h" -#endif -#ifdef __CUDACC__ -#include "nvptx/src/nvptx_interface.h" -#endif - -//////////////////////////////////////////////////////////////////////////////// -// OpenMP interface -//////////////////////////////////////////////////////////////////////////////// - -typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */ - -typedef enum omp_sched_t { - omp_sched_static = 1, /* chunkSize >0 */ - omp_sched_dynamic = 2, /* chunkSize >0 */ - omp_sched_guided = 3, /* chunkSize >0 */ - omp_sched_auto = 4, /* no chunkSize */ -} omp_sched_t; - -typedef enum omp_proc_bind_t { - omp_proc_bind_false = 0, - omp_proc_bind_true = 1, - omp_proc_bind_master = 2, - omp_proc_bind_close = 3, - omp_proc_bind_spread = 4 -} omp_proc_bind_t; - -EXTERN double omp_get_wtick(void); -EXTERN double omp_get_wtime(void); - -EXTERN void omp_set_num_threads(int num); -EXTERN int omp_get_num_threads(void); -EXTERN int omp_get_max_threads(void); -EXTERN int omp_get_thread_limit(void); -EXTERN int omp_get_thread_num(void); -EXTERN int omp_get_num_procs(void); -EXTERN int omp_in_parallel(void); -EXTERN int omp_in_final(void); -EXTERN void omp_set_dynamic(int flag); -EXTERN int omp_get_dynamic(void); -EXTERN void omp_set_nested(int flag); -EXTERN int omp_get_nested(void); -EXTERN void omp_set_max_active_levels(int level); -EXTERN int omp_get_max_active_levels(void); -EXTERN int omp_get_level(void); -EXTERN int omp_get_active_level(void); -EXTERN int omp_get_ancestor_thread_num(int level); -EXTERN int omp_get_team_size(int level); - -EXTERN void omp_init_lock(omp_lock_t *lock); -EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock); -EXTERN void omp_destroy_lock(omp_lock_t *lock); -EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock); -EXTERN void omp_set_lock(omp_lock_t *lock); -EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock); -EXTERN void omp_unset_lock(omp_lock_t *lock); -EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock); -EXTERN int omp_test_lock(omp_lock_t *lock); -EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock); - -EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier); -EXTERN void omp_set_schedule(omp_sched_t kind, int modifier); -EXTERN omp_proc_bind_t omp_get_proc_bind(void); -EXTERN int omp_get_cancellation(void); -EXTERN void omp_set_default_device(int deviceId); -EXTERN int omp_get_default_device(void); -EXTERN int omp_get_num_devices(void); -EXTERN int omp_get_num_teams(void); -EXTERN int omp_get_team_num(void); -EXTERN int omp_is_initial_device(void); -EXTERN int omp_get_initial_device(void); -EXTERN int omp_get_max_task_priority(void); - -//////////////////////////////////////////////////////////////////////////////// -// file below is swiped from kmpc host interface -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// kmp specific types -//////////////////////////////////////////////////////////////////////////////// - -typedef enum kmp_sched_t { - kmp_sched_static_chunk = 33, - kmp_sched_static_nochunk = 34, - kmp_sched_dynamic = 35, - kmp_sched_guided = 36, - kmp_sched_runtime = 37, - kmp_sched_auto = 38, - - kmp_sched_static_balanced_chunk = 45, - - 
kmp_sched_static_ordered = 65, - kmp_sched_static_nochunk_ordered = 66, - kmp_sched_dynamic_ordered = 67, - kmp_sched_guided_ordered = 68, - kmp_sched_runtime_ordered = 69, - kmp_sched_auto_ordered = 70, - - kmp_sched_distr_static_chunk = 91, - kmp_sched_distr_static_nochunk = 92, - kmp_sched_distr_static_chunk_sched_static_chunkone = 93, - - kmp_sched_default = kmp_sched_static_nochunk, - kmp_sched_unordered_first = kmp_sched_static_chunk, - kmp_sched_unordered_last = kmp_sched_auto, - kmp_sched_ordered_first = kmp_sched_static_ordered, - kmp_sched_ordered_last = kmp_sched_auto_ordered, - kmp_sched_distribute_first = kmp_sched_distr_static_chunk, - kmp_sched_distribute_last = - kmp_sched_distr_static_chunk_sched_static_chunkone, - - /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. - * Since we need to distinguish the three possible cases (no modifier, - * monotonic modifier, nonmonotonic modifier), we need separate bits for - * each modifier. The absence of monotonic does not imply nonmonotonic, - * especially since 4.5 says that the behaviour of the "no modifier" case - * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. - * - * Since we're passing a full 32 bit value, we can use a couple of high - * bits for these flags; out of paranoia we avoid the sign bit. - * - * These modifiers can be or-ed into non-static schedules by the compiler - * to pass the additional information. They will be stripped early in the - * processing in __kmp_dispatch_init when setting up schedules, so - * most of the code won't ever see schedules with these bits set. - */ - kmp_sched_modifier_monotonic = (1 << 29), - /**< Set if the monotonic schedule modifier was present */ - kmp_sched_modifier_nonmonotonic = (1 << 30), -/**< Set if the nonmonotonic schedule modifier was present */ - -#define SCHEDULE_WITHOUT_MODIFIERS(s) \ - (enum kmp_sched_t)( \ - (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) -#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0) -#define SCHEDULE_HAS_NONMONOTONIC(s) \ - (((s)&kmp_sched_modifier_nonmonotonic) != 0) -#define SCHEDULE_HAS_NO_MODIFIERS(s) \ - (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ - 0) - -} kmp_sched_t; - -/*! - * Enum for accesseing the reserved_2 field of the ident_t struct below. - */ -enum { - /*! Bit set to 1 when in SPMD mode. */ - KMP_IDENT_SPMD_MODE = 0x01, - /*! Bit set to 1 when a simplified runtime is used. */ - KMP_IDENT_SIMPLE_RT_MODE = 0x02, -}; - -/*! - * The ident structure that describes a source location. - * The struct is identical to the one in the kmp.h file. - * We maintain the same data structure for compatibility. - */ -typedef int kmp_int32; -typedef struct ident { - kmp_int32 reserved_1; /**< might be used in Fortran; see above */ - kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC - identifies this union member */ - kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */ - kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */ - char const *psource; /**< String describing the source location. - The string is composed of semi-colon separated fields - which describe the source file, the function and a pair - of line numbers that delimit the construct. 
*/ -} ident_t; - -// parallel defs -typedef ident_t kmp_Ident; -typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); -typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData); -typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num); -typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, - int16_t lane_offset, - int16_t shortCircuit); -typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad, - int32_t index, int32_t width); -typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad, - int32_t index, int32_t width, - int32_t reduce); -typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data); - -// task defs -typedef struct kmp_TaskDescr kmp_TaskDescr; -typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr); -typedef struct kmp_TaskDescr { - void *sharedPointerTable; // ptr to a table of shared var ptrs - kmp_TaskFctPtr sub; // task subroutine - int32_t partId; // unused - kmp_TaskFctPtr destructors; // destructor of c++ first private -} kmp_TaskDescr; - -// sync defs -typedef int32_t kmp_CriticalName[8]; - -//////////////////////////////////////////////////////////////////////////////// -// external interface -//////////////////////////////////////////////////////////////////////////////// - -// parallel -EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc); -EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid, - int32_t num_threads); -// simd -EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid, - int32_t simd_limit); -// aee ... not supported -// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr -// microtask, ...); -EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); -EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, - uint32_t global_tid); -EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid); - -// proc bind -EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid, - int proc_bind); -EXTERN int omp_get_num_places(void); -EXTERN int omp_get_place_num_procs(int place_num); -EXTERN void omp_get_place_proc_ids(int place_num, int *ids); -EXTERN int omp_get_place_num(void); -EXTERN int omp_get_partition_num_places(void); -EXTERN void omp_get_partition_place_nums(int *place_nums); - -// for static (no chunk or chunk) -EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter1, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, 
int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter1, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, - int32_t global_tid, int32_t sched, - int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_4u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, - int32_t global_tid, int32_t sched, - int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_8u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, - uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, - int64_t chunk); - -EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid); - -// for dynamic -EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t lower, int32_t upper, - int32_t incr, int32_t chunk); -EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, uint32_t lower, - uint32_t upper, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int64_t lower, int64_t upper, - int64_t incr, int64_t chunk); -EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, uint64_t lower, - uint64_t upper, int64_t incr, - int64_t chunk); - -EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, int32_t *plower, - int32_t *pupper, int32_t *pstride); -EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, uint32_t *plower, - uint32_t *pupper, int32_t *pstride); -EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, int64_t *plower, - int64_t *pupper, int64_t *pstride); -EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, uint64_t *plower, - uint64_t *pupper, int64_t *pstride); - -EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid); - -// Support for reducing conditional lastprivate variables -EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, - int32_t global_tid, - int32_t varNum, void *array); - -// reduction -EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); -EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); -EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr 
cpyFct); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, - void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, void *global_buffer, - int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, - kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, - kmp_ListGlobalFctPtr glredFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, - int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, - int32_t global_tid, - kmp_CriticalName *crit); -EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); -EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); - -// sync barrier -EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid); -EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid); -EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid); -EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid); - -// single -EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid); - -// sync -EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void __kmpc_flush(kmp_Ident *loc); - -// vote -EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); -// syncwarp -EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t); - -// tasks 
-EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, - uint32_t global_tid, int32_t flag, - size_t sizeOfTaskInclPrivate, - size_t sizeOfSharedTable, - kmp_TaskFctPtr sub); -EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr); -EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr, - int32_t depNum, void *depList, - int32_t noAliasDepNum, - void *noAliasDepList); -EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr); -EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr); -EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, - int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList); -EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid); -EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid); -EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, - int end_part); -EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid); -EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr, int if_val, - uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, - int32_t sched, uint64_t grainsize, void *task_dup); - -// cancel -EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, - int32_t cancelVal); -EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, - int32_t cancelVal); - -// non standard -EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr); -EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); -EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); -EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, - int16_t RequiresDataSharing); -EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit(); -EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, - int16_t IsOMPRuntimeInitialized); -EXTERN bool __kmpc_kernel_parallel(void **WorkFn, - int16_t IsOMPRuntimeInitialized); -EXTERN void __kmpc_kernel_end_parallel(); -EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, - int32_t *LaneSource); -EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer); -EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, int32_t *LaneSource, - int32_t *LaneId, int32_t *NumLanes); -EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer); - - -EXTERN void __kmpc_data_sharing_init_stack(); -EXTERN void __kmpc_data_sharing_init_stack_spmd(); -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, - int16_t UseSharedMemory); -EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); -EXTERN void __kmpc_data_sharing_pop_stack(void *a); -EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); -EXTERN void __kmpc_end_sharing_variables(); -EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); - -// The slot used for data sharing by the master and worker threads. We use a -// complete (default size version and an incomplete one so that we allow sizes -// greater than the default). 
-struct __kmpc_data_sharing_slot { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[]; -}; -EXTERN void -__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS, - size_t InitialDataSize); -EXTERN void *__kmpc_data_sharing_environment_begin( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - size_t SharingDataSize, size_t SharingDefaultDataSize, - int16_t IsOMPRuntimeInitialized); -EXTERN void __kmpc_data_sharing_environment_end( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - int32_t IsEntryPoint); - -EXTERN void * -__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, - int16_t IsOMPRuntimeInitialized); - -// SPMD execution mode interrogation function. -EXTERN int8_t __kmpc_is_spmd_exec_mode(); - -EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, - const void *buf, size_t size, - int16_t is_shared, const void **res); - -EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, - int16_t is_shared); - -#endif +//===------- interface.h - OpenMP interface definitions ---------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains all the definitions that are relevant to +// the interface. The first section contains the interface as +// declared by OpenMP. The second section includes the compiler +// specific interfaces. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _INTERFACES_H_ +#define _INTERFACES_H_ + +#include +#include + +#ifdef __AMDGCN__ +#include "amdgcn/src/amdgcn_interface.h" +#endif +#ifdef __CUDACC__ +#include "nvptx/src/nvptx_interface.h" +#endif + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP interface +//////////////////////////////////////////////////////////////////////////////// + +typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */ + +typedef enum omp_sched_t { + omp_sched_static = 1, /* chunkSize >0 */ + omp_sched_dynamic = 2, /* chunkSize >0 */ + omp_sched_guided = 3, /* chunkSize >0 */ + omp_sched_auto = 4, /* no chunkSize */ +} omp_sched_t; + +typedef enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + +EXTERN double omp_get_wtick(void); +EXTERN double omp_get_wtime(void); + +EXTERN void omp_set_num_threads(int num); +EXTERN int omp_get_num_threads(void); +EXTERN int omp_get_max_threads(void); +EXTERN int omp_get_thread_limit(void); +EXTERN int omp_get_thread_num(void); +EXTERN int omp_get_num_procs(void); +EXTERN int omp_in_parallel(void); +EXTERN int omp_in_final(void); +EXTERN void omp_set_dynamic(int flag); +EXTERN int omp_get_dynamic(void); +EXTERN void omp_set_nested(int flag); +EXTERN int omp_get_nested(void); +EXTERN void omp_set_max_active_levels(int level); +EXTERN int omp_get_max_active_levels(void); +EXTERN int omp_get_level(void); +EXTERN int omp_get_active_level(void); +EXTERN int omp_get_ancestor_thread_num(int level); +EXTERN int omp_get_team_size(int level); + +EXTERN void omp_init_lock(omp_lock_t *lock); +EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_destroy_lock(omp_lock_t *lock); +EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_set_lock(omp_lock_t *lock); +EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_unset_lock(omp_lock_t *lock); +EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock); +EXTERN int omp_test_lock(omp_lock_t *lock); +EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock); + +EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier); +EXTERN void omp_set_schedule(omp_sched_t kind, int modifier); +EXTERN omp_proc_bind_t omp_get_proc_bind(void); +EXTERN int omp_get_cancellation(void); +EXTERN void omp_set_default_device(int deviceId); +EXTERN int omp_get_default_device(void); +EXTERN int omp_get_num_devices(void); +EXTERN int omp_get_num_teams(void); +EXTERN int omp_get_team_num(void); +EXTERN int omp_is_initial_device(void); +EXTERN int omp_get_initial_device(void); +EXTERN int omp_get_max_task_priority(void); + +//////////////////////////////////////////////////////////////////////////////// +// file below is swiped from kmpc host interface +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// kmp specific types +//////////////////////////////////////////////////////////////////////////////// + +typedef enum kmp_sched_t { + kmp_sched_static_chunk = 33, + kmp_sched_static_nochunk = 34, + kmp_sched_dynamic = 35, + kmp_sched_guided = 36, + kmp_sched_runtime = 37, + kmp_sched_auto = 38, + + kmp_sched_static_balanced_chunk = 45, + + kmp_sched_static_ordered = 65, + kmp_sched_static_nochunk_ordered = 66, + 
+////////////////////////////////////////////////////////////////////////////////
+// file below is swiped from kmpc host interface
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// kmp specific types
+////////////////////////////////////////////////////////////////////////////////
+
+typedef enum kmp_sched_t {
+  kmp_sched_static_chunk = 33,
+  kmp_sched_static_nochunk = 34,
+  kmp_sched_dynamic = 35,
+  kmp_sched_guided = 36,
+  kmp_sched_runtime = 37,
+  kmp_sched_auto = 38,
+
+  kmp_sched_static_balanced_chunk = 45,
+
+  kmp_sched_static_ordered = 65,
+  kmp_sched_static_nochunk_ordered = 66,
+  kmp_sched_dynamic_ordered = 67,
+  kmp_sched_guided_ordered = 68,
+  kmp_sched_runtime_ordered = 69,
+  kmp_sched_auto_ordered = 70,
+
+  kmp_sched_distr_static_chunk = 91,
+  kmp_sched_distr_static_nochunk = 92,
+  kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
+
+  kmp_sched_default = kmp_sched_static_nochunk,
+  kmp_sched_unordered_first = kmp_sched_static_chunk,
+  kmp_sched_unordered_last = kmp_sched_auto,
+  kmp_sched_ordered_first = kmp_sched_static_ordered,
+  kmp_sched_ordered_last = kmp_sched_auto_ordered,
+  kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
+  kmp_sched_distribute_last =
+      kmp_sched_distr_static_chunk_sched_static_chunkone,
+
+  /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
+   * Since we need to distinguish the three possible cases (no modifier,
+   * monotonic modifier, nonmonotonic modifier), we need separate bits for
+   * each modifier. The absence of monotonic does not imply nonmonotonic,
+   * especially since 4.5 says that the behaviour of the "no modifier" case
+   * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
+   *
+   * Since we're passing a full 32 bit value, we can use a couple of high
+   * bits for these flags; out of paranoia we avoid the sign bit.
+   *
+   * These modifiers can be or-ed into non-static schedules by the compiler
+   * to pass the additional information. They will be stripped early in the
+   * processing in __kmp_dispatch_init when setting up schedules, so
+   * most of the code won't ever see schedules with these bits set.
+   */
+  kmp_sched_modifier_monotonic = (1 << 29),
+  /**< Set if the monotonic schedule modifier was present */
+  kmp_sched_modifier_nonmonotonic = (1 << 30),
+  /**< Set if the nonmonotonic schedule modifier was present */
+
+#define SCHEDULE_WITHOUT_MODIFIERS(s)                                          \
+  (enum kmp_sched_t)(                                                          \
+      (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
+#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0)
+#define SCHEDULE_HAS_NONMONOTONIC(s)                                           \
+  (((s)&kmp_sched_modifier_nonmonotonic) != 0)
+#define SCHEDULE_HAS_NO_MODIFIERS(s)                                           \
+  (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
+   0)
+
+} kmp_sched_t;
+
+/*!
+ * Enum for accessing the reserved_2 field of the ident_t struct below.
+ */
+enum {
+  /*! Bit set to 1 when in SPMD mode. */
+  KMP_IDENT_SPMD_MODE = 0x01,
+  /*! Bit set to 1 when a simplified runtime is used. */
+  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
+};
+
+/*!
+ * The ident structure that describes a source location.
+ * The struct is identical to the one in the kmp.h file.
+ * We maintain the same data structure for compatibility.
+ */
+typedef int kmp_int32;
+typedef struct ident {
+  kmp_int32 reserved_1; /**< might be used in Fortran; see above */
+  kmp_int32 flags;      /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
+                             identifies this union member */
+  kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */
+  kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */
+  char const *psource;  /**< String describing the source location.
+                             The string is composed of semi-colon separated fields
+                             which describe the source file, the function and a pair
+                             of line numbers that delimit the construct.
*/ +} ident_t; + +// parallel defs +typedef ident_t kmp_Ident; +typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); +typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData); +typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num); +typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, + int16_t lane_offset, + int16_t shortCircuit); +typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad, + int32_t index, int32_t width); +typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad, + int32_t index, int32_t width, + int32_t reduce); +typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data); + +// task defs +typedef struct kmp_TaskDescr kmp_TaskDescr; +typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr); +typedef struct kmp_TaskDescr { + void *sharedPointerTable; // ptr to a table of shared var ptrs + kmp_TaskFctPtr sub; // task subroutine + int32_t partId; // unused + kmp_TaskFctPtr destructors; // destructor of c++ first private +} kmp_TaskDescr; + +// sync defs +typedef int32_t kmp_CriticalName[8]; + +//////////////////////////////////////////////////////////////////////////////// +// external interface +//////////////////////////////////////////////////////////////////////////////// + +// parallel +EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc); +EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid, + int32_t num_threads); +// simd +EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid, + int32_t simd_limit); +// aee ... not supported +// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr +// microtask, ...); +EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); +EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, + uint32_t global_tid); +EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid); + +// proc bind +EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid, + int proc_bind); +EXTERN int omp_get_num_places(void); +EXTERN int omp_get_place_num_procs(int place_num); +EXTERN void omp_get_place_proc_ids(int place_num, int *ids); +EXTERN int omp_get_place_num(void); +EXTERN int omp_get_partition_num_places(void); +EXTERN void omp_get_partition_place_nums(int *place_nums); + +// for static (no chunk or chunk) +EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, 
int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, + int32_t global_tid, int32_t sched, + int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_4u_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, + int32_t global_tid, int32_t sched, + int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_8u_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk); + +EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid); + +// for dynamic +EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t lower, int32_t upper, + int32_t incr, int32_t chunk); +EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, uint32_t lower, + uint32_t upper, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int64_t lower, int64_t upper, + int64_t incr, int64_t chunk); +EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, uint64_t lower, + uint64_t upper, int64_t incr, + int64_t chunk); + +EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, int32_t *plower, + int32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, uint32_t *plower, + uint32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, int64_t *plower, + int64_t *pupper, int64_t *pstride); +EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, uint64_t *plower, + uint64_t *pupper, int64_t *pstride); + +EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid); + +// Support for reducing conditional lastprivate variables +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, + int32_t global_tid, + int32_t varNum, void *array); + +// reduction +EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); +EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); +EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr 
cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, + void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, void *global_buffer, + int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, + kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, + kmp_ListGlobalFctPtr glredFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, + int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, + int32_t global_tid, + kmp_CriticalName *crit); +EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); +EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); + +// sync barrier +EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid); +EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid); +EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid); +EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid); + +// single +EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid); + +// sync +EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_flush(kmp_Ident *loc); + +// vote +EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); +// syncwarp +EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t); + +// tasks 
+EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, + uint32_t global_tid, int32_t flag, + size_t sizeOfTaskInclPrivate, + size_t sizeOfSharedTable, + kmp_TaskFctPtr sub); +EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); +EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); +EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid); +EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid); +EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, + int end_part); +EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid); +EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, int if_val, + uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, + int32_t sched, uint64_t grainsize, void *task_dup); + +// cancel +EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, + int32_t cancelVal); +EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, + int32_t cancelVal); + +// non standard +EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr); +EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); +EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, + int16_t RequiresDataSharing); +EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit(); +EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, + int16_t IsOMPRuntimeInitialized); +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, + int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_kernel_end_parallel(); +EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, + __kmpc_impl_lanemask_t Mask, + bool *IsFinal, + int32_t *LaneSource); +EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer); +EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, + __kmpc_impl_lanemask_t Mask, + bool *IsFinal, int32_t *LaneSource, + int32_t *LaneId, int32_t *NumLanes); +EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer); + + +EXTERN void __kmpc_data_sharing_init_stack(); +EXTERN void __kmpc_data_sharing_init_stack_spmd(); +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, + int16_t UseSharedMemory); +EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); +EXTERN void __kmpc_data_sharing_pop_stack(void *a); +EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); +EXTERN void __kmpc_end_sharing_variables(); +EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); + +// The slot used for data sharing by the master and worker threads. We use a +// complete (default size version and an incomplete one so that we allow sizes +// greater than the default). 
+struct __kmpc_data_sharing_slot { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[]; +}; +EXTERN void +__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS, + size_t InitialDataSize); +EXTERN void *__kmpc_data_sharing_environment_begin( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, + size_t SharingDataSize, size_t SharingDefaultDataSize, + int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_data_sharing_environment_end( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, + int32_t IsEntryPoint); + +EXTERN void * +__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, + int16_t IsOMPRuntimeInitialized); + +// SPMD execution mode interrogation function. +EXTERN int8_t __kmpc_is_spmd_exec_mode(); + +EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + const void *buf, size_t size, + int16_t is_shared, const void **res); + +EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + int16_t is_shared); + +#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt index 84b52f55b73d9..2cbddd17baecc 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -1,199 +1,199 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available -# -##===----------------------------------------------------------------------===## - -set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING - "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.") - -if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER) - find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER}) - if(NOT ALTERNATE_CUDA_HOST_COMPILER) - libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.") - endif() - set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE) -endif() - -# We can't use clang as nvcc host preprocessor, so we attempt to replace it with -# gcc. 
-if(CUDA_HOST_COMPILER MATCHES clang) - - find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc) - - if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER) - libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.") - libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of of valid compiler.") - return() - endif() - set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE) -endif() - -get_filename_component(devicertl_base_directory - ${CMAKE_CURRENT_SOURCE_DIR} - DIRECTORY) -set(devicertl_common_directory - ${devicertl_base_directory}/common) -set(devicertl_nvptx_directory - ${devicertl_base_directory}/nvptx) - -if(LIBOMPTARGET_DEP_CUDA_FOUND) - libomptarget_say("Building CUDA offloading device RTL.") - - # We really don't have any host code, so we don't need to care about - # propagating host flags. - set(CUDA_PROPAGATE_HOST_FLAGS OFF) - - set(cuda_src_files - ${devicertl_common_directory}/src/cancel.cu - ${devicertl_common_directory}/src/critical.cu - ${devicertl_common_directory}/src/data_sharing.cu - ${devicertl_common_directory}/src/libcall.cu - ${devicertl_common_directory}/src/loop.cu - ${devicertl_common_directory}/src/omptarget.cu - ${devicertl_common_directory}/src/parallel.cu - ${devicertl_common_directory}/src/reduction.cu - ${devicertl_common_directory}/src/support.cu - ${devicertl_common_directory}/src/sync.cu - ${devicertl_common_directory}/src/task.cu - src/target_impl.cu - ) - - set(omp_data_objects ${devicertl_common_directory}/src/omp_data.cu) - - # Get the compute capability the user requested or use SM_35 by default. - # SM_35 is what clang uses by default. - set(default_capabilities 35) - if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY) - set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) - libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES") - endif() - set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING - "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.") - string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}) - - foreach(sm ${nvptx_sm_list}) - set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm}) - endforeach() - - # Activate RTL message dumps if requested by the user. - set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL - "Activate NVPTX device RTL debug messages.") - if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v) - endif() - - # NVPTX runtime library has to be statically linked. Dynamic linking is not - # yet supported by the CUDA toolchain on the device. - set(BUILD_SHARED_LIBS OFF) - set(CUDA_SEPARABLE_COMPILATION ON) - list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory} - -I${devicertl_nvptx_directory}/src) - cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects} - OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) - - # Install device RTL under the lib destination folder. - install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") - - target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) - - - # Check if we can create an LLVM bitcode implementation of the runtime library - # that could be inlined in the user application. 
For that we need to find - # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and - # an LLVM linker. - set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING - "Location of a CUDA compiler capable of emitting LLVM bitcode.") - set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING - "Location of a linker capable of linking LLVM bitcode objects.") - - include(LibomptargetNVPTXBitcodeLibrary) - - set(bclib_default FALSE) - if (${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) - set(bclib_default TRUE) - endif() - set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB ${bclib_default} CACHE BOOL - "Enable CUDA LLVM bitcode offloading device RTL.") - if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB}) - if (NOT ${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) - libomptarget_error_say("Cannot build CUDA LLVM bitcode offloading device RTL!") - endif() - libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") - - # Set flags for LLVM Bitcode compilation. - set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} - -I${devicertl_base_directory} - -I${devicertl_nvptx_directory}/src) - - if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) - else() - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) - endif() - - # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared - # to handle. Therefore, we use 'weak' instead. We are compiling only for the - # device, so it should be equivalent. - if(CUDA_VERSION_MAJOR GREATER 8) - set(bc_flags ${bc_flags} -Dnv_weak=weak) - endif() - - # Create target to build all Bitcode libraries. - add_custom_target(omptarget-nvptx-bc) - - # Generate a Bitcode library for all the compute capabilities the user requested. - foreach(sm ${nvptx_sm_list}) - set(cuda_arch --cuda-gpu-arch=sm_${sm}) - - # Compile CUDA files to bitcode. - set(bc_files "") - foreach(src ${cuda_src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - - add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} - -c ${infile} -o ${outfile}-sm_${sm}.bc - DEPENDS ${infile} - IMPLICIT_DEPENDS CXX ${infile} - COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" - VERBATIM - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) - - list(APPEND bc_files ${outfile}-sm_${sm}.bc) - endforeach() - - # Link to a bitcode library. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} - -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} - DEPENDS ${bc_files} - COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) - - add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) - add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc) - - # Copy library to destination. - add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - $) - - # Install bitcode library under the lib destination folder. 
- install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") - endforeach() - endif() - - add_subdirectory(test) -else() - libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.") -endif() +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available +# +##===----------------------------------------------------------------------===## + +set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING + "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.") + +if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER) + find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER}) + if(NOT ALTERNATE_CUDA_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.") + endif() + set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE) +endif() + +# We can't use clang as nvcc host preprocessor, so we attempt to replace it with +# gcc. +if(CUDA_HOST_COMPILER MATCHES clang) + + find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc) + + if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.") + libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of of valid compiler.") + return() + endif() + set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE) +endif() + +get_filename_component(devicertl_base_directory + ${CMAKE_CURRENT_SOURCE_DIR} + DIRECTORY) +set(devicertl_common_directory + ${devicertl_base_directory}/common) +set(devicertl_nvptx_directory + ${devicertl_base_directory}/nvptx) + +if(LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_say("Building CUDA offloading device RTL.") + + # We really don't have any host code, so we don't need to care about + # propagating host flags. + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + + set(cuda_src_files + ${devicertl_common_directory}/src/cancel.cu + ${devicertl_common_directory}/src/critical.cu + ${devicertl_common_directory}/src/data_sharing.cu + ${devicertl_common_directory}/src/libcall.cu + ${devicertl_common_directory}/src/loop.cu + ${devicertl_common_directory}/src/omptarget.cu + ${devicertl_common_directory}/src/parallel.cu + ${devicertl_common_directory}/src/reduction.cu + ${devicertl_common_directory}/src/support.cu + ${devicertl_common_directory}/src/sync.cu + ${devicertl_common_directory}/src/task.cu + src/target_impl.cu + ) + + set(omp_data_objects ${devicertl_common_directory}/src/omp_data.cu) + + # Get the compute capability the user requested or use SM_35 by default. + # SM_35 is what clang uses by default. 
+ set(default_capabilities 35) + if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY) + set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) + libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES") + endif() + set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING + "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.") + string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}) + + foreach(sm ${nvptx_sm_list}) + set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm}) + endforeach() + + # Activate RTL message dumps if requested by the user. + set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL + "Activate NVPTX device RTL debug messages.") + if(${LIBOMPTARGET_NVPTX_DEBUG}) + set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v) + endif() + + # NVPTX runtime library has to be statically linked. Dynamic linking is not + # yet supported by the CUDA toolchain on the device. + set(BUILD_SHARED_LIBS OFF) + set(CUDA_SEPARABLE_COMPILATION ON) + list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory} + -I${devicertl_nvptx_directory}/src) + cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects} + OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) + + # Install device RTL under the lib destination folder. + install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") + + target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) + + + # Check if we can create an LLVM bitcode implementation of the runtime library + # that could be inlined in the user application. For that we need to find + # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and + # an LLVM linker. + set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") + set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + + include(LibomptargetNVPTXBitcodeLibrary) + + set(bclib_default FALSE) + if (${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + set(bclib_default TRUE) + endif() + set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB ${bclib_default} CACHE BOOL + "Enable CUDA LLVM bitcode offloading device RTL.") + if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB}) + if (NOT ${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + libomptarget_error_say("Cannot build CUDA LLVM bitcode offloading device RTL!") + endif() + libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") + + # Set flags for LLVM Bitcode compilation. + set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} + -I${devicertl_base_directory} + -I${devicertl_nvptx_directory}/src) + + if(${LIBOMPTARGET_NVPTX_DEBUG}) + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) + else() + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) + endif() + + # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared + # to handle. Therefore, we use 'weak' instead. We are compiling only for the + # device, so it should be equivalent. + if(CUDA_VERSION_MAJOR GREATER 8) + set(bc_flags ${bc_flags} -Dnv_weak=weak) + endif() + + # Create target to build all Bitcode libraries. + add_custom_target(omptarget-nvptx-bc) + + # Generate a Bitcode library for all the compute capabilities the user requested. + foreach(sm ${nvptx_sm_list}) + set(cuda_arch --cuda-gpu-arch=sm_${sm}) + + # Compile CUDA files to bitcode. 
+ set(bc_files "") + foreach(src ${cuda_src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + + add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} + -c ${infile} -o ${outfile}-sm_${sm}.bc + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) + + list(APPEND bc_files ${outfile}-sm_${sm}.bc) + endforeach() + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) + + add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) + add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc) + + # Copy library to destination. + add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc + $) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") + endforeach() + endif() + + add_subdirectory(test) +else() + libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.") +endif() diff --git a/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt b/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt index 4149dfacb62ad..45c3208577401 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt @@ -1,523 +1,523 @@ - -**Design document for OpenMP reductions on the GPU** - -//Abstract: //In this document we summarize the new design for an OpenMP -implementation of reductions on NVIDIA GPUs. This document comprises -* a succinct background review, -* an introduction to the decoupling of reduction algorithm and - data-structure-specific processing routines, -* detailed illustrations of reduction algorithms used and -* a brief overview of steps we have made beyond the last implementation. - -**Problem Review** - -Consider a typical OpenMP program with reduction pragma. - -``` - double foo, bar; - #pragma omp parallel for reduction(+:foo, bar) - for (int i = 0; i < N; i++) { - foo+=A[i]; bar+=B[i]; - } -``` -where 'foo' and 'bar' are reduced across all threads in the parallel region. -Our primary goal is to efficiently aggregate the values of foo and bar in -such manner that -* makes the compiler logically concise. -* efficiently reduces within warps, threads, blocks and the device. - -**Introduction to Decoupling** -In this section we address the problem of making the compiler -//logically concise// by partitioning the task of reduction into two broad -categories: data-structure specific routines and algorithmic routines. - -The previous reduction implementation was highly coupled with -the specificity of the reduction element data structures (e.g., sizes, data -types) and operators of the reduction (e.g., addition, multiplication). 
In -our implementation we strive to decouple them. In our final implementations, -we could remove all template functions in our runtime system. - -The (simplified) pseudo code generated by LLVM is as follows: - -``` - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn, - interWarpCpyFn) - where: - struct ReduceData { - double *foo; - double *bar; - } reduceData - reduceData.foo = &foo_p - reduceData.bar = &bar_p - - shuffleReduceFn and interWarpCpyFn are two auxiliary functions - generated to aid the runtime performing algorithmic steps - while being data-structure agnostic about ReduceData. - - In particular, shuffleReduceFn is a function that takes the following - inputs: - a. local copy of ReduceData - b. its lane_id - c. the offset of the lane_id which hosts a remote ReduceData - relative to the current one - d. an algorithm version parameter determining which reduction - algorithm to use. - This shuffleReduceFn retrieves the remote ReduceData through shuffle - intrinsics and reduces, using the algorithm specified by the 4th - parameter, the local ReduceData and with the remote ReduceData element - wise, and places the resultant values into the local ReduceData. - - Different reduction algorithms are implemented with different runtime - functions, but they all make calls to this same shuffleReduceFn to - perform the essential reduction step. Therefore, based on the 4th - parameter, this shuffleReduceFn will behave slightly differently to - cooperate with the runtime function to ensure correctness under - different circumstances. - - InterWarpCpyFn, as the name suggests, is a function that copies data - across warps. Its function is to tunnel all the thread private - ReduceData that is already reduced within a warp to a lane in the first - warp with minimal shared memory footprint. This is an essential step to - prepare for the last step of a block reduction. - - (Warp, block, device level reduction routines that utilize these - auxiliary functions will be discussed in the next section.) - - 4. if ret == 1: - The master thread stores the reduced result in the globals. - foo += reduceData.foo; bar += reduceData.bar -``` - -**Reduction Algorithms** - -On the warp level, we have three versions of the algorithms: - -1. Full Warp Reduction - -``` -gpu_regular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr ShuffleReduceFn) { - for (int offset = WARPSIZE/2; offset > 0; offset /= 2) - ShuffleReduceFn(reduce_data, 0, offset, 0); -} -``` -ShuffleReduceFn is used here with lane_id set to 0 because it is not used -therefore we save instructions by not retrieving lane_id from the corresponding -special registers. The 4th parameters, which represents the version of the -algorithm being used here, is set to 0 to signify full warp reduction. - -In this version specified (=0), the ShuffleReduceFn behaves, per element, as -follows: - -``` -//reduce_elem refers to an element in the local ReduceData -//remote_elem is retrieved from a remote lane -remote_elem = shuffle_down(reduce_elem, offset, 32); -reduce_elem = reduce_elem @ remote_elem; - -``` - -An illustration of this algorithm operating on a hypothetical 8-lane full-warp -would be: -{F74} -The coloring invariant follows that elements with the same color will be -combined and reduced in the next reduction step. 
As can be observed, no overhead -is present, exactly log(2, N) steps are needed. - -2. Contiguous Full Warp Reduction -``` -gpu_irregular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr ShuffleReduceFn, int size, - int lane_id) { - int curr_size; - int offset; - curr_size = size; - mask = curr_size/2; - while (offset>0) { - ShuffleReduceFn(reduce_data, lane_id, offset, 1); - curr_size = (curr_size+1)/2; - offset = curr_size/2; - } -} -``` - -In this version specified (=1), the ShuffleReduceFn behaves, per element, as -follows: -``` -//reduce_elem refers to an element in the local ReduceData -//remote_elem is retrieved from a remote lane -remote_elem = shuffle_down(reduce_elem, offset, 32); -if (lane_id < offset) { - reduce_elem = reduce_elem @ remote_elem -} else { - reduce_elem = remote_elem -} -``` - -An important invariant (also a restriction on the starting state of the -reduction) is that this algorithm assumes that all unused ReduceData are -located in a contiguous subset of threads in a warp starting from lane 0. - -With the presence of a trailing active lane with an odd-numbered lane -id, its value will not be aggregated with any other lane. Therefore, -in order to preserve the invariant, such ReduceData is copied to the first lane -whose thread-local ReduceData has already being used in a previous reduction -and would therefore be useless otherwise. - -An illustration of this algorithm operating on a hypothetical 8-lane partial -warp woud be: -{F75} - -As illustrated, this version of the algorithm introduces overhead whenever -we have odd number of participating lanes in any reduction step to -copy data between lanes. - -3. Dispersed Partial Warp Reduction -``` -gpu_irregular_simt_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr ShuffleReduceFn) { - int size, remote_id; - int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2; - do { - remote_id = find_the_next_active_lane_id_right_after_me(); - // the above function returns 0 of no active lane - // is present right after the current thread. - size = get_number_of_active_lanes_in_this_warp(); - logical_lane_id /= 2; - ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2); - } while (logical_lane_id % 2 == 0 && size > 1); -``` - -There is no assumption made about the initial state of the reduction. -Any number of lanes (>=1) could be active at any position. The reduction -result is kept in the first active lane. - -In this version specified (=2), the ShuffleReduceFn behaves, per element, as -follows: -``` -//reduce_elem refers to an element in the local ReduceData -//remote_elem is retrieved from a remote lane -remote_elem = shuffle_down(reduce_elem, offset, 32); -if (LaneId % 2 == 0 && Offset > 0) { - reduce_elem = reduce_elem @ remote_elem -} else { - reduce_elem = remote_elem -} -``` -We will proceed with a brief explanation for some arguments passed in, -it is important to notice that, in this section, we will introduce the -concept of logical_lane_id, and it is important to distinguish it -from physical lane_id as defined by nvidia. -1. //logical_lane_id//: as the name suggests, it refers to the calculated - lane_id (instead of the physical one defined by nvidia) that would make - our algorithm logically concise. A thread with logical_lane_id k means - there are (k-1) threads before it. -2. //remote_id-1-threadIdx.x//: remote_id is indeed the nvidia-defined lane - id of the remote lane from which we will retrieve the ReduceData. 
We - subtract (threadIdx+1) from it because we would like to maintain only one - underlying shuffle intrinsic (which is used to communicate among lanes in a - warp). This particular version of shuffle intrinsic we take accepts only - offsets, instead of absolute lane_id. Therefore the subtraction is performed - on the absolute lane_id we calculated to obtain the offset. - -This algorithm is slightly different in 2 ways and it is not, conceptually, a -generalization of the above algorithms. -1. It reduces elements close to each other. For instance, values in the 0th lane - is to be combined with that of the 1st lane; values in the 2nd lane is to be - combined with that of the 3rd lane. We did not use the previous algorithm - where the first half of the (partial) warp is reduced with the second half - of the (partial) warp. This is because, the mapping - f(x): logical_lane_id -> physical_lane_id; - can be easily calculated whereas its inverse - f^-1(x): physical_lane_id -> logical_lane_id - cannot and performing such reduction requires the inverse to be known. -2. Because this algorithm is agnostic about the positions of the lanes that are - active, we do not need to perform the coping step as in the second - algorithm. -An illustrative run would look like -{F76} -As observed, overhead is high because in each and every step of reduction, -logical_lane_id is recalculated; so is the remote_id. - -On a block level, we have implemented the following block reduce algorithm: - -``` -gpu_irregular_block_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr shuflReduceFn, - kmp_InterWarpCopyFctPtr interWarpCpyFn, - int size) { - - int wid = threadIdx.x/WARPSIZE; - int lane_id = threadIdx.x%WARPSIZE; - - int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division - - unsigned tnum = __ballot(1); - int thread_num = __popc(tnum); - - //full warp reduction - if (thread_num == WARPSIZE) { - gpu_regular_warp_reduce(reduce_data, shuflReduceFn); - } - //partial warp reduction - if (thread_num < WARPSIZE) { - gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num, - lane_id); - } - //Gather all the reduced values from each warp - //to the first warp - //named_barrier inside this function to ensure - //correctness. It is effectively a sync_thread - //that won't deadlock. - interWarpCpyFn(reduce_data, warp_needed); - - //This is to reduce data gathered from each "warp master". - if (wid==0) { - gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed, - lane_id); - } - - return; -} -``` -In this function, no ShuffleReduceFn is directly called as it makes calls -to various versions of the warp-reduction functions. It first reduces -ReduceData warp by warp; in the end, we end up with the number of -ReduceData equal to the number of warps present in this thread -block. We then proceed to gather all such ReduceData to the first warp. - -As observed, in this algorithm we make use of the function InterWarpCpyFn, -which copies data from each of the "warp master" (0th lane of each warp, where -a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a -mathematical sense) the problem of reduction across warp masters in a block to -the problem of warp reduction which we already have solutions to. - -We can thus completely avoid the use of atomics to reduce in a threadblock. - -**Efficient Cross Block Reduce** - -The next challenge is to reduce values across threadblocks. We aim to do this -without atomics or critical sections. - -Let a kernel be started with TB threadblocks. 
-Let the GPU have S SMs. -There can be at most N active threadblocks per SM at any time. - -Consider a threadblock tb (tb < TB) running on SM s (s < SM). 'tb' is one of -at most 'N' active threadblocks on SM s. Let each threadblock active on an SM -be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id) -uniquely identifies an active threadblock on the GPU. - -To efficiently implement cross block reduce, we first allocate an array for -each value to be reduced of size S*N (which is the maximum number of active -threadblocks at any time on the device). - -Each threadblock reduces its value to slot [s][id]. This can be done without -locking since no other threadblock can write to the same slot concurrently. - -As a final stage, we reduce the values in the array as follows: - -``` -// Compiler generated wrapper function for each target region with a reduction -clause. -target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1 - thread. - // Use dynamic parallelism to launch M teams, N threads as requested by the - user to execute the target region. - - target_function<>(map_args) - - Reduce values in reduction_array - -``` - -**Comparison with Last Version** - - -The (simplified) pseudo code generated by LLVM on the host is as follows: - - -``` - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) - where: - struct ReduceData { - double *foo; - double *bar; - } reduceData - reduceData.foo = &foo_p - reduceData.bar = &bar_p - - reduceFn is a pointer to a function that takes in two inputs - of type ReduceData, "reduces" them element wise, and places the - result in the first input: - reduceFn(ReduceData *a, ReduceData *b) - a = a @ b - - Every thread in the parallel region calls kmpc_reduce_nowait with - its private copy of reduceData. The runtime reduces across the - threads (using tree reduction on the operator 'reduceFn?) and stores - the final result in the master thread if successful. - 4. if ret == 1: - The master thread stores the reduced result in the globals. - foo += reduceData.foo; bar += reduceData.bar - 5. else if ret == 2: - In this case kmpc_reduce_nowait() could not use tree reduction, - so use atomics instead: - each thread atomically writes to foo - each thread atomically writes to bar -``` - -On a GPU, a similar reduction may need to be performed across SIMT threads, -warps, and threadblocks. The challenge is to do so efficiently in a fashion -that is compatible with the LLVM OpenMP implementation. - -In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs, -the salient steps of the code generated are as follows: - - -``` - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) - status = can_block_reduce() - if status == 1: - reduce efficiently to thread 0 using shuffles and shared memory. - return 1 - else - cannot use efficient block reduction, fallback to atomics - return 2 - 4. if ret == 1: - The master thread stores the reduced result in the globals. - foo += reduceData.foo; bar += reduceData.bar - 5. 
else if ret == 2: - In this case kmpc_reduce_nowait() could not use tree reduction, - so use atomics instead: - each thread atomically writes to foo - each thread atomically writes to bar -``` - -The function can_block_reduce() is defined as follows: - - -``` -int32_t can_block_reduce() { - int tid = GetThreadIdInTeam(); - int nt = GetNumberOfOmpThreads(tid); - if (nt != blockDim.x) - return 0; - unsigned tnum = __ballot(1); - if (tnum != (~0x0)) { - return 0; - } - return 1; -} -``` - -This function permits the use of the efficient block reduction algorithm -using shuffles and shared memory (return 1) only if (a) all SIMT threads in -a warp are active (i.e., number of threads in the parallel region is a -multiple of 32) and (b) the number of threads in the parallel region -(set by the num_threads clause) equals blockDim.x. - -If either of these preconditions is not true, each thread in the threadblock -updates the global value using atomics. - -Atomics and compare-and-swap operations are expensive on many threaded -architectures such as GPUs and we must avoid them completely. - - -**Appendix: Implementation Details** - - -``` -// Compiler generated function. -reduceFn(ReduceData *a, ReduceData *b) - a->foo = a->foo + b->foo - a->bar = a->bar + b->bar - -// Compiler generated function. -swapAndReduceFn(ReduceData *thread_private, int lane) - ReduceData *remote = new ReduceData() - remote->foo = shuffle_double(thread_private->foo, lane) - remote->bar = shuffle_double(thread_private->bar, lane) - reduceFn(thread_private, remote) - -// OMP runtime function. -warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn): - offset = 16 - while (offset > 0) - swapAndReduceFn(thread_private, offset) - offset /= 2 - -// OMP runtime function. -warpReduce_irregular(): - ... - -// OMP runtime function. -kmpc_reduce_warp(reduceData, swapAndReduceFn) - if all_lanes_active: - warpReduce_regular(reduceData, swapAndReduceFn) - else: - warpReduce_irregular(reduceData, swapAndReduceFn) - if in_simd_region: - // all done, reduce to global in simd lane 0 - return 1 - else if in_parallel_region: - // done reducing to one value per warp, now reduce across warps - return 3 - -// OMP runtime function; one for each basic type. -kmpc_reduce_block_double(double *a) - if lane == 0: - shared[wid] = *a - named_barrier(1, num_threads) - if wid == 0 - block_reduce(shared) - if lane == 0 - *a = shared[0] - named_barrier(1, num_threads) - if wid == 0 and lane == 0 - return 1 // write back reduced result - else - return 0 // don't do anything - -``` - - - -``` -// Compiler generated code. - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn) - 4. if ret == 1: - The master thread stores the reduced result in the globals. - foo += reduceData.foo; bar += reduceData.bar - 5. else if ret == 3: - ret = block_reduce_double(reduceData.foo) - if ret == 1: - foo += reduceData.foo - ret = block_reduce_double(reduceData.bar) - if ret == 1: - bar += reduceData.bar -``` - -**Notes** - - 1. This scheme requires that the CUDA OMP runtime can call llvm generated - functions. This functionality now works. - 2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery - (including calls through function pointers) are optimized away. - 3. 
If we are reducing multiple to multiple variables in a parallel region, - the reduce operations are all performed in warpReduce_[ir]regular(). This - results in more instructions in the loop and should result in fewer - stalls due to data dependencies. Unfortunately we cannot do the same in - kmpc_reduce_block_double() without increasing shared memory usage. + +**Design document for OpenMP reductions on the GPU** + +//Abstract: //In this document we summarize the new design for an OpenMP +implementation of reductions on NVIDIA GPUs. This document comprises +* a succinct background review, +* an introduction to the decoupling of reduction algorithm and + data-structure-specific processing routines, +* detailed illustrations of reduction algorithms used and +* a brief overview of steps we have made beyond the last implementation. + +**Problem Review** + +Consider a typical OpenMP program with reduction pragma. + +``` + double foo, bar; + #pragma omp parallel for reduction(+:foo, bar) + for (int i = 0; i < N; i++) { + foo+=A[i]; bar+=B[i]; + } +``` +where 'foo' and 'bar' are reduced across all threads in the parallel region. +Our primary goal is to efficiently aggregate the values of foo and bar in +such manner that +* makes the compiler logically concise. +* efficiently reduces within warps, threads, blocks and the device. + +**Introduction to Decoupling** +In this section we address the problem of making the compiler +//logically concise// by partitioning the task of reduction into two broad +categories: data-structure specific routines and algorithmic routines. + +The previous reduction implementation was highly coupled with +the specificity of the reduction element data structures (e.g., sizes, data +types) and operators of the reduction (e.g., addition, multiplication). In +our implementation we strive to decouple them. In our final implementations, +we could remove all template functions in our runtime system. + +The (simplified) pseudo code generated by LLVM is as follows: + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn, + interWarpCpyFn) + where: + struct ReduceData { + double *foo; + double *bar; + } reduceData + reduceData.foo = &foo_p + reduceData.bar = &bar_p + + shuffleReduceFn and interWarpCpyFn are two auxiliary functions + generated to aid the runtime performing algorithmic steps + while being data-structure agnostic about ReduceData. + + In particular, shuffleReduceFn is a function that takes the following + inputs: + a. local copy of ReduceData + b. its lane_id + c. the offset of the lane_id which hosts a remote ReduceData + relative to the current one + d. an algorithm version parameter determining which reduction + algorithm to use. + This shuffleReduceFn retrieves the remote ReduceData through shuffle + intrinsics and reduces, using the algorithm specified by the 4th + parameter, the local ReduceData and with the remote ReduceData element + wise, and places the resultant values into the local ReduceData. + + Different reduction algorithms are implemented with different runtime + functions, but they all make calls to this same shuffleReduceFn to + perform the essential reduction step. Therefore, based on the 4th + parameter, this shuffleReduceFn will behave slightly differently to + cooperate with the runtime function to ensure correctness under + different circumstances. 
+
+    InterWarpCpyFn, as the name suggests, is a function that copies data
+    across warps. Its purpose is to tunnel all the thread-private
+    ReduceData that has already been reduced within a warp to a lane in
+    the first warp with a minimal shared memory footprint. This is an
+    essential step to prepare for the last step of a block reduction.
+
+    (Warp, block and device level reduction routines that utilize these
+    auxiliary functions will be discussed in the next section.)
+
+ 4. if ret == 1:
+     The master thread stores the reduced result in the globals.
+     foo += reduceData.foo; bar += reduceData.bar
+```
+
+**Reduction Algorithms**
+
+On the warp level, we have three versions of the algorithm:
+
+1. Full Warp Reduction
+
+```
+gpu_regular_warp_reduce(void *reduce_data,
+                        kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
+    ShuffleReduceFn(reduce_data, 0, offset, 0);
+}
+```
+ShuffleReduceFn is called here with lane_id set to 0 because the lane_id is
+not used by this version; we therefore save instructions by not retrieving it
+from the corresponding special registers. The 4th parameter, which represents
+the version of the algorithm being used, is set to 0 to signify full warp
+reduction.
+
+In this version (=0), the ShuffleReduceFn behaves, per element, as follows:
+
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+reduce_elem = reduce_elem @ remote_elem;
+```
+
+An illustration of this algorithm operating on a hypothetical 8-lane full
+warp would be:
+{F74}
+The coloring invariant is that elements with the same color will be combined
+and reduced in the next reduction step. As can be observed, there is no
+overhead: exactly log(2, N) steps are needed.
+
+2. Contiguous Partial Warp Reduction
+```
+gpu_irregular_warp_reduce(void *reduce_data,
+                          kmp_ShuffleReductFctPtr ShuffleReduceFn, int size,
+                          int lane_id) {
+  int curr_size;
+  int offset;
+  curr_size = size;
+  offset = curr_size/2;
+  while (offset > 0) {
+    ShuffleReduceFn(reduce_data, lane_id, offset, 1);
+    curr_size = (curr_size+1)/2;
+    offset = curr_size/2;
+  }
+}
+```
+
+In this version (=1), the ShuffleReduceFn behaves, per element, as follows:
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+if (lane_id < offset) {
+    reduce_elem = reduce_elem @ remote_elem
+} else {
+    reduce_elem = remote_elem
+}
+```
+
+An important invariant (also a restriction on the starting state of the
+reduction) is that this algorithm assumes that all the ReduceData still to be
+reduced are located in a contiguous subset of lanes in a warp starting from
+lane 0.
+
+If there is a trailing active lane with an odd-numbered lane id, its value
+will not be aggregated with that of any other lane. Therefore, in order to
+preserve the invariant, such ReduceData is copied to the first lane whose
+thread-local ReduceData has already been used in a previous reduction step
+and would therefore be useless otherwise.
+
+An illustration of this algorithm operating on a hypothetical 8-lane partial
+warp would be:
+{F75}
+
+As illustrated, this version of the algorithm introduces overhead whenever
+there is an odd number of participating lanes in any reduction step, because
+data must be copied between lanes.
+
+3. 
Dispersed Partial Warp Reduction
+```
+gpu_irregular_simt_reduce(void *reduce_data,
+                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  int size, remote_id;
+  int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2;
+  do {
+    remote_id = find_the_next_active_lane_id_right_after_me();
+    // The above function returns 0 if no active lane
+    // is present right after the current thread.
+    size = get_number_of_active_lanes_in_this_warp();
+    logical_lane_id /= 2;
+    ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2);
+  } while (logical_lane_id % 2 == 0 && size > 1);
+}
+```
+
+There is no assumption made about the initial state of the reduction.
+Any number of lanes (>=1) could be active at any position. The reduction
+result is kept in the first active lane.
+
+In this version (=2), the ShuffleReduceFn behaves, per element, as follows:
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+if (LaneId % 2 == 0 && Offset > 0) {
+    reduce_elem = reduce_elem @ remote_elem
+} else {
+    reduce_elem = remote_elem
+}
+```
+We now give a brief explanation of some of the arguments passed in. Note that
+this section introduces the concept of a logical_lane_id, which must be
+distinguished from the physical lane_id as defined by nvidia.
+1. //logical_lane_id//: as the name suggests, it refers to a calculated
+   lane_id (instead of the physical one defined by nvidia) that makes our
+   algorithm logically concise. A thread with logical_lane_id k means
+   there are (k-1) threads before it.
+2. //remote_id-1-threadIdx.x//: remote_id is the nvidia-defined lane id of
+   the remote lane from which we will retrieve the ReduceData. We subtract
+   (threadIdx.x+1) from it because we would like to maintain only one
+   underlying shuffle intrinsic (which is used to communicate among the lanes
+   in a warp). The particular shuffle intrinsic we use accepts only offsets,
+   not absolute lane_ids, so the subtraction converts the absolute lane_id we
+   calculated into an offset.
+
+This algorithm differs from the above algorithms in two ways and is not,
+conceptually, a generalization of them.
+1. It reduces elements that are close to each other. For instance, the value
+   in the 0th lane is combined with that of the 1st lane; the value in the
+   2nd lane is combined with that of the 3rd lane. We did not use the
+   previous scheme, where the first half of the (partial) warp is reduced
+   with the second half, because the mapping
+   f(x): logical_lane_id -> physical_lane_id
+   can be easily calculated whereas its inverse
+   f^-1(x): physical_lane_id -> logical_lane_id
+   cannot, and performing such a reduction requires the inverse to be known.
+2. Because this algorithm is agnostic about the positions of the active
+   lanes, we do not need to perform the copying step used in the second
+   algorithm.
+An illustrative run would look like:
+{F76}
+As observed, the overhead is high because in each and every reduction step
+the logical_lane_id is recalculated; so is the remote_id.
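+
+To make the warp-level algorithms above concrete, the following is a small,
+self-contained host-side C simulation of the contiguous (version 1) reduction
+loop for a 6-lane partial warp, assuming '+' as the reduction operator. The
+lane array and the helper names are illustrative stand-ins for the warp
+shuffle machinery; they are not part of the runtime.
+
+```
+#include <stdio.h>
+
+#define WARPSIZE 32
+
+/* Per-lane partial results; lanes [0, size) hold valid data. */
+static double lane_val[WARPSIZE];
+
+/* Version 1 ShuffleReduceFn behaviour for a single element. */
+static void shuffle_reduce_v1(int lane_id, int offset) {
+  double remote = lane_val[lane_id + offset]; /* stands in for shuffle_down */
+  if (lane_id < offset)
+    lane_val[lane_id] += remote; /* reduce */
+  else
+    lane_val[lane_id] = remote;  /* copy, to keep valid data contiguous */
+}
+
+int main(void) {
+  int size = 6; /* a partial warp with 6 contiguous active lanes */
+  for (int i = 0; i < size; ++i)
+    lane_val[i] = i + 1.0; /* 1 + 2 + ... + 6 = 21 */
+
+  /* The loop mirrors gpu_irregular_warp_reduce; in a real warp every
+     active lane makes the call in lockstep, while here we only model
+     the lanes that have a remote partner. */
+  int curr_size = size;
+  int offset = curr_size / 2;
+  while (offset > 0) {
+    for (int lane_id = 0; lane_id + offset < curr_size; ++lane_id)
+      shuffle_reduce_v1(lane_id, offset);
+    curr_size = (curr_size + 1) / 2;
+    offset = curr_size / 2;
+  }
+
+  printf("reduced value in lane 0: %f\n", lane_val[0]); /* expect 21 */
+  return 0;
+}
+```
+
+Running this prints 21, the sum of the six per-lane values, after three
+passes; the copy branch is taken exactly when a trailing odd lane must hand
+its value down, as described above.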
+ +On a block level, we have implemented the following block reduce algorithm: + +``` +gpu_irregular_block_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shuflReduceFn, + kmp_InterWarpCopyFctPtr interWarpCpyFn, + int size) { + + int wid = threadIdx.x/WARPSIZE; + int lane_id = threadIdx.x%WARPSIZE; + + int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division + + unsigned tnum = __ballot(1); + int thread_num = __popc(tnum); + + //full warp reduction + if (thread_num == WARPSIZE) { + gpu_regular_warp_reduce(reduce_data, shuflReduceFn); + } + //partial warp reduction + if (thread_num < WARPSIZE) { + gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num, + lane_id); + } + //Gather all the reduced values from each warp + //to the first warp + //named_barrier inside this function to ensure + //correctness. It is effectively a sync_thread + //that won't deadlock. + interWarpCpyFn(reduce_data, warp_needed); + + //This is to reduce data gathered from each "warp master". + if (wid==0) { + gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed, + lane_id); + } + + return; +} +``` +In this function, no ShuffleReduceFn is directly called as it makes calls +to various versions of the warp-reduction functions. It first reduces +ReduceData warp by warp; in the end, we end up with the number of +ReduceData equal to the number of warps present in this thread +block. We then proceed to gather all such ReduceData to the first warp. + +As observed, in this algorithm we make use of the function InterWarpCpyFn, +which copies data from each of the "warp master" (0th lane of each warp, where +a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a +mathematical sense) the problem of reduction across warp masters in a block to +the problem of warp reduction which we already have solutions to. + +We can thus completely avoid the use of atomics to reduce in a threadblock. + +**Efficient Cross Block Reduce** + +The next challenge is to reduce values across threadblocks. We aim to do this +without atomics or critical sections. + +Let a kernel be started with TB threadblocks. +Let the GPU have S SMs. +There can be at most N active threadblocks per SM at any time. + +Consider a threadblock tb (tb < TB) running on SM s (s < SM). 'tb' is one of +at most 'N' active threadblocks on SM s. Let each threadblock active on an SM +be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id) +uniquely identifies an active threadblock on the GPU. + +To efficiently implement cross block reduce, we first allocate an array for +each value to be reduced of size S*N (which is the maximum number of active +threadblocks at any time on the device). + +Each threadblock reduces its value to slot [s][id]. This can be done without +locking since no other threadblock can write to the same slot concurrently. + +As a final stage, we reduce the values in the array as follows: + +``` +// Compiler generated wrapper function for each target region with a reduction +clause. +target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1 + thread. + // Use dynamic parallelism to launch M teams, N threads as requested by the + user to execute the target region. + + target_function<>(map_args) + + Reduce values in reduction_array + +``` + +**Comparison with Last Version** + + +The (simplified) pseudo code generated by LLVM on the host is as follows: + + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. 
Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) + where: + struct ReduceData { + double *foo; + double *bar; + } reduceData + reduceData.foo = &foo_p + reduceData.bar = &bar_p + + reduceFn is a pointer to a function that takes in two inputs + of type ReduceData, "reduces" them element wise, and places the + result in the first input: + reduceFn(ReduceData *a, ReduceData *b) + a = a @ b + + Every thread in the parallel region calls kmpc_reduce_nowait with + its private copy of reduceData. The runtime reduces across the + threads (using tree reduction on the operator 'reduceFn?) and stores + the final result in the master thread if successful. + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 2: + In this case kmpc_reduce_nowait() could not use tree reduction, + so use atomics instead: + each thread atomically writes to foo + each thread atomically writes to bar +``` + +On a GPU, a similar reduction may need to be performed across SIMT threads, +warps, and threadblocks. The challenge is to do so efficiently in a fashion +that is compatible with the LLVM OpenMP implementation. + +In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs, +the salient steps of the code generated are as follows: + + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) + status = can_block_reduce() + if status == 1: + reduce efficiently to thread 0 using shuffles and shared memory. + return 1 + else + cannot use efficient block reduction, fallback to atomics + return 2 + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 2: + In this case kmpc_reduce_nowait() could not use tree reduction, + so use atomics instead: + each thread atomically writes to foo + each thread atomically writes to bar +``` + +The function can_block_reduce() is defined as follows: + + +``` +int32_t can_block_reduce() { + int tid = GetThreadIdInTeam(); + int nt = GetNumberOfOmpThreads(tid); + if (nt != blockDim.x) + return 0; + unsigned tnum = __ballot(1); + if (tnum != (~0x0)) { + return 0; + } + return 1; +} +``` + +This function permits the use of the efficient block reduction algorithm +using shuffles and shared memory (return 1) only if (a) all SIMT threads in +a warp are active (i.e., number of threads in the parallel region is a +multiple of 32) and (b) the number of threads in the parallel region +(set by the num_threads clause) equals blockDim.x. + +If either of these preconditions is not true, each thread in the threadblock +updates the global value using atomics. + +Atomics and compare-and-swap operations are expensive on many threaded +architectures such as GPUs and we must avoid them completely. + + +**Appendix: Implementation Details** + + +``` +// Compiler generated function. +reduceFn(ReduceData *a, ReduceData *b) + a->foo = a->foo + b->foo + a->bar = a->bar + b->bar + +// Compiler generated function. 
+swapAndReduceFn(ReduceData *thread_private, int lane) + ReduceData *remote = new ReduceData() + remote->foo = shuffle_double(thread_private->foo, lane) + remote->bar = shuffle_double(thread_private->bar, lane) + reduceFn(thread_private, remote) + +// OMP runtime function. +warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn): + offset = 16 + while (offset > 0) + swapAndReduceFn(thread_private, offset) + offset /= 2 + +// OMP runtime function. +warpReduce_irregular(): + ... + +// OMP runtime function. +kmpc_reduce_warp(reduceData, swapAndReduceFn) + if all_lanes_active: + warpReduce_regular(reduceData, swapAndReduceFn) + else: + warpReduce_irregular(reduceData, swapAndReduceFn) + if in_simd_region: + // all done, reduce to global in simd lane 0 + return 1 + else if in_parallel_region: + // done reducing to one value per warp, now reduce across warps + return 3 + +// OMP runtime function; one for each basic type. +kmpc_reduce_block_double(double *a) + if lane == 0: + shared[wid] = *a + named_barrier(1, num_threads) + if wid == 0 + block_reduce(shared) + if lane == 0 + *a = shared[0] + named_barrier(1, num_threads) + if wid == 0 and lane == 0 + return 1 // write back reduced result + else + return 0 // don't do anything + +``` + + + +``` +// Compiler generated code. + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn) + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 3: + ret = block_reduce_double(reduceData.foo) + if ret == 1: + foo += reduceData.foo + ret = block_reduce_double(reduceData.bar) + if ret == 1: + bar += reduceData.bar +``` + +**Notes** + + 1. This scheme requires that the CUDA OMP runtime can call llvm generated + functions. This functionality now works. + 2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery + (including calls through function pointers) are optimized away. + 3. If we are reducing multiple to multiple variables in a parallel region, + the reduce operations are all performed in warpReduce_[ir]regular(). This + results in more instructions in the loop and should result in fewer + stalls due to data dependencies. Unfortunately we cannot do the same in + kmpc_reduce_block_double() without increasing shared memory usage. diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h index c5e91c5bf5270..fa232a6ed8d06 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h @@ -1,18 +1,18 @@ -//===--- nvptx_interface.h - OpenMP interface definitions -------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _NVPTX_INTERFACE_H_ -#define _NVPTX_INTERFACE_H_ - -#include - -#define EXTERN extern "C" __device__ -typedef uint32_t __kmpc_impl_lanemask_t; -typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ - -#endif +//===--- nvptx_interface.h - OpenMP interface definitions -------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _NVPTX_INTERFACE_H_ +#define _NVPTX_INTERFACE_H_ + +#include + +#define EXTERN extern "C" __device__ +typedef uint32_t __kmpc_impl_lanemask_t; +typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ + +#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu index 50867bc4010af..320d7a56434fb 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -1,50 +1,50 @@ -//===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Definitions of target specific functions -// -//===----------------------------------------------------------------------===// - -#include "target_impl.h" -#include "common/debug.h" -#include "common/target_atomic.h" - -#define __OMP_SPIN 1000 -#define UNSET 0u -#define SET 1u - -EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) { - __kmpc_impl_unset_lock(lock); -} - -EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) { - __kmpc_impl_unset_lock(lock); -} - -EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) { - // TODO: not sure spinning is a good idea here.. - while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) { - clock_t start = clock(); - clock_t now; - for (;;) { - now = clock(); - clock_t cycles = now > start ? now - start : now + (0xffffffff - start); - if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) { - break; - } - } - } // wait for 0 to be the read value -} - -EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) { - (void)__kmpc_atomic_exchange(lock, UNSET); -} - -EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) { - return __kmpc_atomic_add(lock, 0u); -} +//===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definitions of target specific functions +// +//===----------------------------------------------------------------------===// + +#include "target_impl.h" +#include "common/debug.h" +#include "common/target_atomic.h" + +#define __OMP_SPIN 1000 +#define UNSET 0u +#define SET 1u + +EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) { + __kmpc_impl_unset_lock(lock); +} + +EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) { + __kmpc_impl_unset_lock(lock); +} + +EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) { + // TODO: not sure spinning is a good idea here.. + while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) { + clock_t start = clock(); + clock_t now; + for (;;) { + now = clock(); + clock_t cycles = now > start ? now - start : now + (0xffffffff - start); + if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) { + break; + } + } + } // wait for 0 to be the read value +} + +EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) { + (void)__kmpc_atomic_exchange(lock, UNSET); +} + +EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) { + return __kmpc_atomic_add(lock, 0u); +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h index 1b966510ec7ef..032943fe4e063 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -1,218 +1,218 @@ -//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Definitions of target specific functions -// -//===----------------------------------------------------------------------===// -#ifndef _TARGET_IMPL_H_ -#define _TARGET_IMPL_H_ - -#include -#include -#include -#include -#include - -#include "nvptx_interface.h" - -#define DEVICE __device__ -#define INLINE __forceinline__ DEVICE -#define NOINLINE __noinline__ DEVICE -#define SHARED __shared__ -#define ALIGN(N) __align__(N) - -//////////////////////////////////////////////////////////////////////////////// -// Kernel options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// The following def must match the absolute limit hardwired in the host RTL -// max number of threads per team -#define MAX_THREADS_PER_TEAM 1024 - -#define WARPSIZE 32 - -// The named barrier for active parallel threads of a team in an L1 parallel -// region to synchronize with each other. -#define L1_BARRIER (1) - -// Maximum number of preallocated arguments to an outlined parallel/simd function. -// Anything more requires dynamic memory allocation. -#define MAX_SHARED_ARGS 20 - -// Maximum number of omp state objects per SM allocated statically in global -// memory. -#if __CUDA_ARCH__ >= 700 -#define OMP_STATE_COUNT 32 -#define MAX_SM 84 -#elif __CUDA_ARCH__ >= 600 -#define OMP_STATE_COUNT 32 -#define MAX_SM 56 -#else -#define OMP_STATE_COUNT 16 -#define MAX_SM 16 -#endif - -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - -// Data sharing related quantities, need to match what is used in the compiler. 
-enum DATA_SHARING_SIZES { - // The maximum number of workers in a kernel. - DS_Max_Worker_Threads = 992, - // The size reserved for data in a shared memory slot. - DS_Slot_Size = 256, - // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, - // The maximum number of warps in use - DS_Max_Warp_Number = 32, - // The size of the preallocated shared memory buffer per team - DS_Shared_Memory_Size = 128, -}; - -INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { - asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); -} - -INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { - uint64_t val; - asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); - return val; -} - -static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = - UINT32_C(0xffffffff); - -INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { - __kmpc_impl_lanemask_t res; - asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res)); - return res; -} - -INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { - __kmpc_impl_lanemask_t res; - asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res)); - return res; -} - -INLINE uint32_t __kmpc_impl_smid() { - uint32_t id; - asm("mov.u32 %0, %%smid;" : "=r"(id)); - return id; -} - -INLINE double __kmpc_impl_get_wtick() { - // Timer precision is 1ns - return ((double)1E-9); -} - -INLINE double __kmpc_impl_get_wtime() { - unsigned long long nsecs; - asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); - return (double)nsecs * __kmpc_impl_get_wtick(); -} - -INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); } - -INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); } - -template INLINE T __kmpc_impl_min(T x, T y) { - return min(x, y); -} - -#ifndef CUDA_VERSION -#error CUDA_VERSION macro is undefined, something wrong with cuda. -#endif - -// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). - -INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { -#if CUDA_VERSION >= 9000 - return __activemask(); -#else - return __ballot(1); -#endif -} - -// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. - -INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, - int32_t SrcLane) { -#if CUDA_VERSION >= 9000 - return __shfl_sync(Mask, Var, SrcLane); -#else - return __shfl(Var, SrcLane); -#endif // CUDA_VERSION -} - -INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, - int32_t Var, uint32_t Delta, - int32_t Width) { -#if CUDA_VERSION >= 9000 - return __shfl_down_sync(Mask, Var, Delta, Width); -#else - return __shfl_down(Var, Delta, Width); -#endif // CUDA_VERSION -} - -INLINE void __kmpc_impl_syncthreads() { - // Use original __syncthreads if compiled by nvcc or clang >= 9.0. -#if !defined(__clang__) || __clang_major__ >= 9 - __syncthreads(); -#else - asm volatile("bar.sync %0;" : : "r"(0) : "memory"); -#endif // __clang__ -} - -INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { -#if CUDA_VERSION >= 9000 - __syncwarp(Mask); -#else - // In Cuda < 9.0 no need to sync threads in warps. 
-#endif // CUDA_VERSION -} - -INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) { - asm volatile("bar.sync %0, %1;" - : - : "r"(barrier), "r"(num_threads) - : "memory"); -} - -INLINE void __kmpc_impl_threadfence(void) { __threadfence(); } -INLINE void __kmpc_impl_threadfence_block(void) { __threadfence_block(); } -INLINE void __kmpc_impl_threadfence_system(void) { __threadfence_system(); } - -// Calls to the NVPTX layer (assuming 1D layout) -INLINE int GetThreadIdInBlock() { return threadIdx.x; } -INLINE int GetBlockIdInKernel() { return blockIdx.x; } -INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; } -INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } -INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } -INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } - -// Return true if this is the first active thread in the warp. -INLINE bool __kmpc_impl_is_first_active_thread() { - unsigned long long Mask = __kmpc_impl_activemask(); - unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); - unsigned long long Sh = Mask << ShNum; - // Truncate Sh to the 32 lower bits - return (unsigned)Sh == 0; -} - -// Locks -EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock); -EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock); -EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock); -EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock); -EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock); - -// Memory -INLINE void *__kmpc_impl_malloc(size_t x) { return malloc(x); } -INLINE void __kmpc_impl_free(void *x) { free(x); } - -#endif +//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definitions of target specific functions +// +//===----------------------------------------------------------------------===// +#ifndef _TARGET_IMPL_H_ +#define _TARGET_IMPL_H_ + +#include +#include +#include +#include +#include + +#include "nvptx_interface.h" + +#define DEVICE __device__ +#define INLINE __forceinline__ DEVICE +#define NOINLINE __noinline__ DEVICE +#define SHARED __shared__ +#define ALIGN(N) __align__(N) + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// The following def must match the absolute limit hardwired in the host RTL +// max number of threads per team +#define MAX_THREADS_PER_TEAM 1024 + +#define WARPSIZE 32 + +// The named barrier for active parallel threads of a team in an L1 parallel +// region to synchronize with each other. +#define L1_BARRIER (1) + +// Maximum number of preallocated arguments to an outlined parallel/simd function. +// Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + +// Maximum number of omp state objects per SM allocated statically in global +// memory. 
+#if __CUDA_ARCH__ >= 700 +#define OMP_STATE_COUNT 32 +#define MAX_SM 84 +#elif __CUDA_ARCH__ >= 600 +#define OMP_STATE_COUNT 32 +#define MAX_SM 56 +#else +#define OMP_STATE_COUNT 16 +#define MAX_SM 16 +#endif + +#define OMP_ACTIVE_PARALLEL_LEVEL 128 + +// Data sharing related quantities, need to match what is used in the compiler. +enum DATA_SHARING_SIZES { + // The maximum number of workers in a kernel. + DS_Max_Worker_Threads = 992, + // The size reserved for data in a shared memory slot. + DS_Slot_Size = 256, + // The slot size that should be reserved for a working warp. + DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + // The maximum number of warps in use + DS_Max_Warp_Number = 32, + // The size of the preallocated shared memory buffer per team + DS_Shared_Memory_Size = 128, +}; + +INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); +} + +INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { + uint64_t val; + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); + return val; +} + +static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = + UINT32_C(0xffffffff); + +INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { + __kmpc_impl_lanemask_t res; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res)); + return res; +} + +INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { + __kmpc_impl_lanemask_t res; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res)); + return res; +} + +INLINE uint32_t __kmpc_impl_smid() { + uint32_t id; + asm("mov.u32 %0, %%smid;" : "=r"(id)); + return id; +} + +INLINE double __kmpc_impl_get_wtick() { + // Timer precision is 1ns + return ((double)1E-9); +} + +INLINE double __kmpc_impl_get_wtime() { + unsigned long long nsecs; + asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); + return (double)nsecs * __kmpc_impl_get_wtick(); +} + +INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); } + +INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); } + +template INLINE T __kmpc_impl_min(T x, T y) { + return min(x, y); +} + +#ifndef CUDA_VERSION +#error CUDA_VERSION macro is undefined, something wrong with cuda. +#endif + +// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). + +INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { +#if CUDA_VERSION >= 9000 + return __activemask(); +#else + return __ballot(1); +#endif +} + +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. + +INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, + int32_t SrcLane) { +#if CUDA_VERSION >= 9000 + return __shfl_sync(Mask, Var, SrcLane); +#else + return __shfl(Var, SrcLane); +#endif // CUDA_VERSION +} + +INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, + int32_t Var, uint32_t Delta, + int32_t Width) { +#if CUDA_VERSION >= 9000 + return __shfl_down_sync(Mask, Var, Delta, Width); +#else + return __shfl_down(Var, Delta, Width); +#endif // CUDA_VERSION +} + +INLINE void __kmpc_impl_syncthreads() { + // Use original __syncthreads if compiled by nvcc or clang >= 9.0. +#if !defined(__clang__) || __clang_major__ >= 9 + __syncthreads(); +#else + asm volatile("bar.sync %0;" : : "r"(0) : "memory"); +#endif // __clang__ +} + +INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { +#if CUDA_VERSION >= 9000 + __syncwarp(Mask); +#else + // In Cuda < 9.0 no need to sync threads in warps. 
+#endif // CUDA_VERSION +} + +INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) { + asm volatile("bar.sync %0, %1;" + : + : "r"(barrier), "r"(num_threads) + : "memory"); +} + +INLINE void __kmpc_impl_threadfence(void) { __threadfence(); } +INLINE void __kmpc_impl_threadfence_block(void) { __threadfence_block(); } +INLINE void __kmpc_impl_threadfence_system(void) { __threadfence_system(); } + +// Calls to the NVPTX layer (assuming 1D layout) +INLINE int GetThreadIdInBlock() { return threadIdx.x; } +INLINE int GetBlockIdInKernel() { return blockIdx.x; } +INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; } +INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } +INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } + +// Return true if this is the first active thread in the warp. +INLINE bool __kmpc_impl_is_first_active_thread() { + unsigned long long Mask = __kmpc_impl_activemask(); + unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); + unsigned long long Sh = Mask << ShNum; + // Truncate Sh to the 32 lower bits + return (unsigned)Sh == 0; +} + +// Locks +EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock); +EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock); +EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock); +EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock); +EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock); + +// Memory +INLINE void *__kmpc_impl_malloc(size_t x) { return malloc(x); } +INLINE void __kmpc_impl_free(void *x) { free(x); } + +#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt index 1eabeb25ff98b..40cb35e6cc028 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt @@ -1,25 +1,25 @@ -if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang") - # Silently return, no need to annoy the user. - return() -endif() - -set(deps omptarget-nvptx omptarget omp) -if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB) - set(deps ${deps} omptarget-nvptx-bc) -endif() - -# Run with only one thread to only launch one application to the GPU at a time. -add_openmp_testsuite(check-libomptarget-nvptx - "Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR} - EXCLUDE_FROM_CHECK_ALL - DEPENDS ${deps} ARGS -j1) - -set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING - "Extra compiler flags to send to the test compiler.") -set(LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS - "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" CACHE STRING - "OpenMP compiler flags to use for testing libomptarget-nvptx.") - -# Configure the lit.site.cfg.in file -set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget-nvptx configuration.\n# Do not edit!") -configure_file(lit.site.cfg.in lit.site.cfg @ONLY) +if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang") + # Silently return, no need to annoy the user. + return() +endif() + +set(deps omptarget-nvptx omptarget omp) +if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB) + set(deps ${deps} omptarget-nvptx-bc) +endif() + +# Run with only one thread to only launch one application to the GPU at a time. 
+add_openmp_testsuite(check-libomptarget-nvptx + "Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR} + EXCLUDE_FROM_CHECK_ALL + DEPENDS ${deps} ARGS -j1) + +set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING + "Extra compiler flags to send to the test compiler.") +set(LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS + "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" CACHE STRING + "OpenMP compiler flags to use for testing libomptarget-nvptx.") + +# Configure the lit.site.cfg.in file +set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget-nvptx configuration.\n# Do not edit!") +configure_file(lit.site.cfg.in lit.site.cfg @ONLY) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c index 60254bc7ed2e2..58a16b8e82daf 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c @@ -1,22 +1,22 @@ -// RUN: %compile-run-and-check -#include -#include - -int main(){ - int max_threads = -1; - int num_threads = -1; - - #pragma omp target map(tofrom: max_threads) - max_threads = omp_get_max_threads(); - - #pragma omp target parallel map(tofrom: num_threads) - { - #pragma omp master - num_threads = omp_get_num_threads(); - } - - // CHECK: Max Threads: 128, Num Threads: 128 - printf("Max Threads: %d, Num Threads: %d\n", max_threads, num_threads); - - return 0; -} +// RUN: %compile-run-and-check +#include +#include + +int main(){ + int max_threads = -1; + int num_threads = -1; + + #pragma omp target map(tofrom: max_threads) + max_threads = omp_get_max_threads(); + + #pragma omp target parallel map(tofrom: num_threads) + { + #pragma omp master + num_threads = omp_get_num_threads(); + } + + // CHECK: Max Threads: 128, Num Threads: 128 + printf("Max Threads: %d, Num Threads: %d\n", max_threads, num_threads); + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c index 1fa9ae024f6f5..657aad915bea6 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c @@ -1,38 +1,38 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int MaxThreads = 1024; - -int main(int argc, char *argv[]) { - int cancellation = -1, dynamic = -1, nested = -1, maxActiveLevels = -1; - - #pragma omp target map(cancellation, dynamic, nested, maxActiveLevels) - { - // libomptarget-nvptx doesn't support cancellation. - cancellation = omp_get_cancellation(); - - // No support for dynamic adjustment of the number of threads. - omp_set_dynamic(1); - dynamic = omp_get_dynamic(); - - // libomptarget-nvptx doesn't support nested parallelism. - omp_set_nested(1); - nested = omp_get_nested(); - - omp_set_max_active_levels(42); - maxActiveLevels = omp_get_max_active_levels(); - } - - // CHECK: cancellation = 0 - printf("cancellation = %d\n", cancellation); - // CHECK: dynamic = 0 - printf("dynamic = %d\n", dynamic); - // CHECK: nested = 0 - printf("nested = %d\n", nested); - // CHECK: maxActiveLevels = 1 - printf("maxActiveLevels = %d\n", maxActiveLevels); - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +const int MaxThreads = 1024; + +int main(int argc, char *argv[]) { + int cancellation = -1, dynamic = -1, nested = -1, maxActiveLevels = -1; + + #pragma omp target map(cancellation, dynamic, nested, maxActiveLevels) + { + // libomptarget-nvptx doesn't support cancellation. 
+ cancellation = omp_get_cancellation(); + + // No support for dynamic adjustment of the number of threads. + omp_set_dynamic(1); + dynamic = omp_get_dynamic(); + + // libomptarget-nvptx doesn't support nested parallelism. + omp_set_nested(1); + nested = omp_get_nested(); + + omp_set_max_active_levels(42); + maxActiveLevels = omp_get_max_active_levels(); + } + + // CHECK: cancellation = 0 + printf("cancellation = %d\n", cancellation); + // CHECK: dynamic = 0 + printf("dynamic = %d\n", dynamic); + // CHECK: nested = 0 + printf("nested = %d\n", nested); + // CHECK: maxActiveLevels = 1 + printf("maxActiveLevels = %d\n", maxActiveLevels); + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c index efb418fef9a0b..d9fd0b86f0e35 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c @@ -1,53 +1,53 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int MaxThreadsL1 = -1, MaxThreadsL2 = -1; - -#pragma omp declare reduction(unique:int \ - : omp_out = (omp_in == 1 ? omp_in : omp_out)) \ - initializer(omp_priv = -1) - - // Non-SPMD mode. -#pragma omp target teams map(MaxThreadsL1, MaxThreadsL2) thread_limit(32) \ - num_teams(1) - { - MaxThreadsL1 = omp_get_max_threads(); -#pragma omp parallel reduction(unique : MaxThreadsL2) - { MaxThreadsL2 = omp_get_max_threads(); } - } - - //FIXME: This Non-SPMD kernel will have 32 active threads due to - // thread_limit. However, Non-SPMD MaxThreadsL1 is the total number of - // threads in block (64 in this case), which translates to worker - // threads + WARP_SIZE for Non-SPMD kernels and worker threads for SPMD - // kernels. According to the spec, omp_get_max_threads must return the - // max active threads possible between the two kernel types. - - // CHECK: Non-SPMD MaxThreadsL1 = 64 - printf("Non-SPMD MaxThreadsL1 = %d\n", MaxThreadsL1); - // CHECK: Non-SPMD MaxThreadsL2 = 1 - printf("Non-SPMD MaxThreadsL2 = %d\n", MaxThreadsL2); - - // SPMD mode with full runtime - MaxThreadsL2 = -1; -#pragma omp target parallel reduction(unique : MaxThreadsL2) - { MaxThreadsL2 = omp_get_max_threads(); } - - // CHECK: SPMD with full runtime MaxThreadsL2 = 1 - printf("SPMD with full runtime MaxThreadsL2 = %d\n", MaxThreadsL2); - - // SPMD mode without runtime - MaxThreadsL2 = -1; -#pragma omp target parallel for reduction(unique : MaxThreadsL2) - for (int I = 0; I < 2; ++I) { - MaxThreadsL2 = omp_get_max_threads(); - } - - // CHECK: SPMD without runtime MaxThreadsL2 = 1 - printf("SPMD without runtime MaxThreadsL2 = %d\n", MaxThreadsL2); - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +int main(int argc, char *argv[]) { + int MaxThreadsL1 = -1, MaxThreadsL2 = -1; + +#pragma omp declare reduction(unique:int \ + : omp_out = (omp_in == 1 ? omp_in : omp_out)) \ + initializer(omp_priv = -1) + + // Non-SPMD mode. +#pragma omp target teams map(MaxThreadsL1, MaxThreadsL2) thread_limit(32) \ + num_teams(1) + { + MaxThreadsL1 = omp_get_max_threads(); +#pragma omp parallel reduction(unique : MaxThreadsL2) + { MaxThreadsL2 = omp_get_max_threads(); } + } + + //FIXME: This Non-SPMD kernel will have 32 active threads due to + // thread_limit. 
However, Non-SPMD MaxThreadsL1 is the total number of + // threads in block (64 in this case), which translates to worker + // threads + WARP_SIZE for Non-SPMD kernels and worker threads for SPMD + // kernels. According to the spec, omp_get_max_threads must return the + // max active threads possible between the two kernel types. + + // CHECK: Non-SPMD MaxThreadsL1 = 64 + printf("Non-SPMD MaxThreadsL1 = %d\n", MaxThreadsL1); + // CHECK: Non-SPMD MaxThreadsL2 = 1 + printf("Non-SPMD MaxThreadsL2 = %d\n", MaxThreadsL2); + + // SPMD mode with full runtime + MaxThreadsL2 = -1; +#pragma omp target parallel reduction(unique : MaxThreadsL2) + { MaxThreadsL2 = omp_get_max_threads(); } + + // CHECK: SPMD with full runtime MaxThreadsL2 = 1 + printf("SPMD with full runtime MaxThreadsL2 = %d\n", MaxThreadsL2); + + // SPMD mode without runtime + MaxThreadsL2 = -1; +#pragma omp target parallel for reduction(unique : MaxThreadsL2) + for (int I = 0; I < 2; ++I) { + MaxThreadsL2 = omp_get_max_threads(); + } + + // CHECK: SPMD without runtime MaxThreadsL2 = 1 + printf("SPMD without runtime MaxThreadsL2 = %d\n", MaxThreadsL2); + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c index 626d620dc4f3a..33ed6d5735d62 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c @@ -1,72 +1,72 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int ThreadLimitL0 = -1, ThreadLimitL1 = -1, ThreadLimitL2 = -1; - -#pragma omp declare reduction(unique64:int \ - : omp_out = (omp_in == 64 ? omp_in : omp_out)) \ - initializer(omp_priv = -1) -#pragma omp declare reduction(unique32:int \ - : omp_out = (omp_in == 32 ? omp_in : omp_out)) \ - initializer(omp_priv = -1) - - // Non-SPMD mode. 
-#pragma omp target teams map(ThreadLimitL0, ThreadLimitL1, ThreadLimitL2) \ - thread_limit(64) num_teams(1) - { - ThreadLimitL0 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique64 \ - : ThreadLimitL1, ThreadLimitL2) num_threads(32) - { - ThreadLimitL1 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique64 : ThreadLimitL2) - { ThreadLimitL2 = omp_get_thread_limit(); } - } - } - - // CHECK: Non-SPMD ThreadLimitL0 = 64 - printf("Non-SPMD ThreadLimitL0 = %d\n", ThreadLimitL0); - // CHECK: Non-SPMD ThreadLimitL1 = 64 - printf("Non-SPMD ThreadLimitL1 = %d\n", ThreadLimitL1); - // CHECK: Non-SPMD ThreadLimitL2 = 64 - printf("Non-SPMD ThreadLimitL2 = %d\n", ThreadLimitL2); - - // SPMD mode with full runtime - ThreadLimitL1 = -1; - ThreadLimitL2 = -1; -#pragma omp target parallel reduction(unique32 \ - : ThreadLimitL1, ThreadLimitL2) \ - num_threads(32) - { - ThreadLimitL1 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique32 : ThreadLimitL2) - { ThreadLimitL2 = omp_get_thread_limit(); } - } - - // CHECK: SPMD with full runtime ThreadLimitL1 = 32 - printf("SPMD with full runtime ThreadLimitL1 = %d\n", ThreadLimitL1); - // CHECK: SPMD with full runtime ThreadLimitL2 = 32 - printf("SPMD with full runtime ThreadLimitL2 = %d\n", ThreadLimitL2); - - // SPMD mode without runtime - ThreadLimitL1 = -1; - ThreadLimitL2 = -1; -#pragma omp target parallel for reduction(unique32 \ - : ThreadLimitL1, ThreadLimitL2) \ - num_threads(32) - for (int I = 0; I < 2; ++I) { - ThreadLimitL1 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique32 : ThreadLimitL2) - { ThreadLimitL2 = omp_get_thread_limit(); } - } - - // CHECK: SPMD without runtime ThreadLimitL1 = 32 - printf("SPMD without runtime ThreadLimitL1 = %d\n", ThreadLimitL1); - // CHECK: SPMD without runtime ThreadLimitL2 = 32 - printf("SPMD without runtime ThreadLimitL2 = %d\n", ThreadLimitL2); - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +int main(int argc, char *argv[]) { + int ThreadLimitL0 = -1, ThreadLimitL1 = -1, ThreadLimitL2 = -1; + +#pragma omp declare reduction(unique64:int \ + : omp_out = (omp_in == 64 ? omp_in : omp_out)) \ + initializer(omp_priv = -1) +#pragma omp declare reduction(unique32:int \ + : omp_out = (omp_in == 32 ? omp_in : omp_out)) \ + initializer(omp_priv = -1) + + // Non-SPMD mode. 
+#pragma omp target teams map(ThreadLimitL0, ThreadLimitL1, ThreadLimitL2) \ + thread_limit(64) num_teams(1) + { + ThreadLimitL0 = omp_get_thread_limit(); +#pragma omp parallel reduction(unique64 \ + : ThreadLimitL1, ThreadLimitL2) num_threads(32) + { + ThreadLimitL1 = omp_get_thread_limit(); +#pragma omp parallel reduction(unique64 : ThreadLimitL2) + { ThreadLimitL2 = omp_get_thread_limit(); } + } + } + + // CHECK: Non-SPMD ThreadLimitL0 = 64 + printf("Non-SPMD ThreadLimitL0 = %d\n", ThreadLimitL0); + // CHECK: Non-SPMD ThreadLimitL1 = 64 + printf("Non-SPMD ThreadLimitL1 = %d\n", ThreadLimitL1); + // CHECK: Non-SPMD ThreadLimitL2 = 64 + printf("Non-SPMD ThreadLimitL2 = %d\n", ThreadLimitL2); + + // SPMD mode with full runtime + ThreadLimitL1 = -1; + ThreadLimitL2 = -1; +#pragma omp target parallel reduction(unique32 \ + : ThreadLimitL1, ThreadLimitL2) \ + num_threads(32) + { + ThreadLimitL1 = omp_get_thread_limit(); +#pragma omp parallel reduction(unique32 : ThreadLimitL2) + { ThreadLimitL2 = omp_get_thread_limit(); } + } + + // CHECK: SPMD with full runtime ThreadLimitL1 = 32 + printf("SPMD with full runtime ThreadLimitL1 = %d\n", ThreadLimitL1); + // CHECK: SPMD with full runtime ThreadLimitL2 = 32 + printf("SPMD with full runtime ThreadLimitL2 = %d\n", ThreadLimitL2); + + // SPMD mode without runtime + ThreadLimitL1 = -1; + ThreadLimitL2 = -1; +#pragma omp target parallel for reduction(unique32 \ + : ThreadLimitL1, ThreadLimitL2) \ + num_threads(32) + for (int I = 0; I < 2; ++I) { + ThreadLimitL1 = omp_get_thread_limit(); +#pragma omp parallel reduction(unique32 : ThreadLimitL2) + { ThreadLimitL2 = omp_get_thread_limit(); } + } + + // CHECK: SPMD without runtime ThreadLimitL1 = 32 + printf("SPMD without runtime ThreadLimitL1 = %d\n", ThreadLimitL1); + // CHECK: SPMD without runtime ThreadLimitL2 = 32 + printf("SPMD without runtime ThreadLimitL2 = %d\n", ThreadLimitL2); + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c b/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c index dd17ae7c6a76c..d675087ed4319 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c @@ -1,55 +1,55 @@ -// RUN: %compile-run-and-check - -#include -#include - -#pragma omp declare target -static void putValueInParallel(int *ptr, int value) { - #pragma omp parallel - { - *ptr = value; - } -} - -static int getId() { - int id; - putValueInParallel(&id, omp_get_thread_num()); - return id; -} -#pragma omp end declare target - -const int MaxThreads = 1024; -const int Threads = 64; - -int main(int argc, char *argv[]) { - int master; - int check[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check[i] = 0; - } - - #pragma omp target map(master, check[:]) - { - master = getId(); - - #pragma omp parallel num_threads(Threads) - { - check[omp_get_thread_num()] = getId(); - } - } - - // CHECK: master = 0. 
- printf("master = %d.\n", master); - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - if (i < Threads) { - if (check[i] != i) { - printf("invalid: check[%d] should be %d, is %d\n", i, i, check[i]); - } - } else if (check[i] != 0) { - printf("invalid: check[%d] should be 0, is %d\n", i, check[i]); - } - } - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +#pragma omp declare target +static void putValueInParallel(int *ptr, int value) { + #pragma omp parallel + { + *ptr = value; + } +} + +static int getId() { + int id; + putValueInParallel(&id, omp_get_thread_num()); + return id; +} +#pragma omp end declare target + +const int MaxThreads = 1024; +const int Threads = 64; + +int main(int argc, char *argv[]) { + int master; + int check[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check[i] = 0; + } + + #pragma omp target map(master, check[:]) + { + master = getId(); + + #pragma omp parallel num_threads(Threads) + { + check[omp_get_thread_num()] = getId(); + } + } + + // CHECK: master = 0. + printf("master = %d.\n", master); + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + if (i < Threads) { + if (check[i] != i) { + printf("invalid: check[%d] should be %d, is %d\n", i, i, check[i]); + } + } else if (check[i] != 0) { + printf("invalid: check[%d] should be 0, is %d\n", i, check[i]); + } + } + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg index 0774c25af20c2..5d89ac74ac59b 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg @@ -1,69 +1,69 @@ -# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: -# Configuration file for the 'lit' test runner. - -import os -import lit.formats - -# Tell pylint that we know config and lit_config exist somewhere. -if 'PYLINT_IMPORT' in os.environ: - config = object() - lit_config = object() - -def prepend_library_path(name, value, sep): - if name in config.environment: - config.environment[name] = value + sep + config.environment[name] - else: - config.environment[name] = value - -# name: The name of this test suite. -config.name = 'libomptarget-nvptx' - -# suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.cc'] - -# test_source_root: The root path where tests are located. -config.test_source_root = os.path.dirname(__file__) - -# test_exec_root: The root object directory where output is placed -config.test_exec_root = config.binary_dir - -# test format -config.test_format = lit.formats.ShTest() - -# compiler flags -config.test_flags = " -I " + config.omp_header_directory + \ - " -L " + config.library_dir + \ - " --libomptarget-nvptx-path=" + config.library_dir; - -if config.omp_host_rtl_directory: - config.test_flags = config.test_flags + \ - " -L " + config.omp_host_rtl_directory - -config.test_flags = config.test_flags + " " + config.test_extra_flags - -# Setup environment to find dynamic library at runtime. -prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":") -prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":") - -# Forbid fallback to host. 
-config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY" - -# substitutions -config.substitutions.append(("%compilexx-run-and-check", - "%compilexx-and-run | " + config.libomptarget_filecheck + " %s")) -config.substitutions.append(("%compile-run-and-check", - "%compile-and-run | " + config.libomptarget_filecheck + " %s")) -config.substitutions.append(("%compilexx-and-run", "%compilexx && %run")) -config.substitutions.append(("%compile-and-run", "%compile && %run")) - -config.substitutions.append(("%compilexx", - "%clangxx %openmp_flags %flags %s -o %t")) -config.substitutions.append(("%compile", - "%clang %openmp_flags %flags %s -o %t")) - -config.substitutions.append(("%clangxx", config.test_cxx_compiler)) -config.substitutions.append(("%clang", config.test_c_compiler)) -config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) -config.substitutions.append(("%flags", config.test_flags)) - -config.substitutions.append(("%run", "%t")) +# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: +# Configuration file for the 'lit' test runner. + +import os +import lit.formats + +# Tell pylint that we know config and lit_config exist somewhere. +if 'PYLINT_IMPORT' in os.environ: + config = object() + lit_config = object() + +def prepend_library_path(name, value, sep): + if name in config.environment: + config.environment[name] = value + sep + config.environment[name] + else: + config.environment[name] = value + +# name: The name of this test suite. +config.name = 'libomptarget-nvptx' + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.c', '.cpp', '.cc'] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root object directory where output is placed +config.test_exec_root = config.binary_dir + +# test format +config.test_format = lit.formats.ShTest() + +# compiler flags +config.test_flags = " -I " + config.omp_header_directory + \ + " -L " + config.library_dir + \ + " --libomptarget-nvptx-path=" + config.library_dir; + +if config.omp_host_rtl_directory: + config.test_flags = config.test_flags + \ + " -L " + config.omp_host_rtl_directory + +config.test_flags = config.test_flags + " " + config.test_extra_flags + +# Setup environment to find dynamic library at runtime. +prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":") +prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":") + +# Forbid fallback to host. 
+config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY" + +# substitutions +config.substitutions.append(("%compilexx-run-and-check", + "%compilexx-and-run | " + config.libomptarget_filecheck + " %s")) +config.substitutions.append(("%compile-run-and-check", + "%compile-and-run | " + config.libomptarget_filecheck + " %s")) +config.substitutions.append(("%compilexx-and-run", "%compilexx && %run")) +config.substitutions.append(("%compile-and-run", "%compile && %run")) + +config.substitutions.append(("%compilexx", + "%clangxx %openmp_flags %flags %s -o %t")) +config.substitutions.append(("%compile", + "%clang %openmp_flags %flags %s -o %t")) + +config.substitutions.append(("%clangxx", config.test_cxx_compiler)) +config.substitutions.append(("%clang", config.test_c_compiler)) +config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) +config.substitutions.append(("%flags", config.test_flags)) + +config.substitutions.append(("%run", "%t")) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in index d9c14cbc53262..709ef1ce844c6 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in @@ -1,14 +1,14 @@ -@AUTO_GEN_COMMENT@ - -config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" -config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" -config.test_openmp_flags = "@LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS@" -config.test_extra_flags = "@LIBOMPTARGET_NVPTX_TEST_FLAGS@" -config.binary_dir = "@CMAKE_CURRENT_BINARY_DIR@" -config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" -config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" -config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" -config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" - -# Let the main config do the real work. -lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") +@AUTO_GEN_COMMENT@ + +config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" +config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" +config.test_openmp_flags = "@LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS@" +config.test_extra_flags = "@LIBOMPTARGET_NVPTX_TEST_FLAGS@" +config.binary_dir = "@CMAKE_CURRENT_BINARY_DIR@" +config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" +config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" +config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" +config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" + +# Let the main config do the real work. 
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c index 7c707718e13bd..3a2149f858b99 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c @@ -1,37 +1,37 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int data, out, flag = 0; -#pragma omp target teams num_teams(2) map(tofrom \ - : out) map(to \ - : data, flag) \ - thread_limit(1) -#pragma omp parallel num_threads(1) - { - if (omp_get_team_num() == 0) { - /* Write to the data buffer that will be read by thread in team 1 */ - data = 42; -/* Flush data to thread in team 1 */ -#pragma omp barrier - /* Set flag to release thread in team 1 */ -#pragma omp atomic write - flag = 1; - } else if (omp_get_team_num() == 1) { - /* Loop until we see the update to the flag */ - int val; - do { -#pragma omp atomic read - val = flag; - } while (val < 1); - out = data; -#pragma omp barrier - } - } - // CHECK: out=42. - /* Value of out will be 42 */ - printf("out=%d.\n", out); - return !(out == 42); -} +// RUN: %compile-run-and-check + +#include +#include + +int main(int argc, char *argv[]) { + int data, out, flag = 0; +#pragma omp target teams num_teams(2) map(tofrom \ + : out) map(to \ + : data, flag) \ + thread_limit(1) +#pragma omp parallel num_threads(1) + { + if (omp_get_team_num() == 0) { + /* Write to the data buffer that will be read by thread in team 1 */ + data = 42; +/* Flush data to thread in team 1 */ +#pragma omp barrier + /* Set flag to release thread in team 1 */ +#pragma omp atomic write + flag = 1; + } else if (omp_get_team_num() == 1) { + /* Loop until we see the update to the flag */ + int val; + do { +#pragma omp atomic read + val = flag; + } while (val < 1); + out = data; +#pragma omp barrier + } + } + // CHECK: out=42. + /* Value of out will be 42 */ + printf("out=%d.\n", out); + return !(out == 42); +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c index 412538b6dd156..b5fc059828f81 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c @@ -1,35 +1,35 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int data, out, flag = 0; -#pragma omp target parallel num_threads(64) map(tofrom \ - : out, flag) map(to \ - : data) - { - if (omp_get_thread_num() == 0) { - /* Write to the data buffer that will be read by thread */ - data = 42; -/* Flush data to thread 32 */ -#pragma omp flush(data) - /* Set flag to release thread 32 */ -#pragma omp atomic write - flag = 1; - } else if (omp_get_thread_num() == 32) { - /* Loop until we see the update to the flag */ - int val; - do { -#pragma omp atomic read - val = flag; - } while (val < 1); - out = data; -#pragma omp flush(out) - } - } - // CHECK: out=42. 
- /* Value of out will be 42 */ - printf("out=%d.\n", out); - return !(out == 42); -} +// RUN: %compile-run-and-check + +#include +#include + +int main(int argc, char *argv[]) { + int data, out, flag = 0; +#pragma omp target parallel num_threads(64) map(tofrom \ + : out, flag) map(to \ + : data) + { + if (omp_get_thread_num() == 0) { + /* Write to the data buffer that will be read by thread */ + data = 42; +/* Flush data to thread 32 */ +#pragma omp flush(data) + /* Set flag to release thread 32 */ +#pragma omp atomic write + flag = 1; + } else if (omp_get_thread_num() == 32) { + /* Loop until we see the update to the flag */ + int val; + do { +#pragma omp atomic read + val = flag; + } while (val < 1); + out = data; +#pragma omp flush(out) + } + } + // CHECK: out=42. + /* Value of out will be 42 */ + printf("out=%d.\n", out); + return !(out == 42); +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c index 0a137530cef74..7b28c5f302082 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c @@ -1,151 +1,151 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int MaxThreads = 1024; -const int NumThreads = 64; - -int main(int argc, char *argv[]) { - int level = -1, activeLevel = -1; - // The expected value is -1, initialize to different value. - int ancestorTNumNeg = 1, teamSizeNeg = 1; - int ancestorTNum0 = -1, teamSize0 = -1; - // The expected value is -1, initialize to different value. - int ancestorTNum1 = 1, teamSize1 = 1; - int check1[MaxThreads]; - int check2[MaxThreads]; - int check3[MaxThreads]; - int check4[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = check3[i] = check4[i] = 0; - } - - #pragma omp target map(level, activeLevel, ancestorTNumNeg, teamSizeNeg) \ - map(ancestorTNum0, teamSize0, ancestorTNum1, teamSize1) \ - map(check1[:], check2[:], check3[:], check4[:]) - { - level = omp_get_level(); - activeLevel = omp_get_active_level(); - - // Expected to return -1. - ancestorTNumNeg = omp_get_ancestor_thread_num(-1); - teamSizeNeg = omp_get_team_size(-1); - - // Expected to return 0 and 1. - ancestorTNum0 = omp_get_ancestor_thread_num(0); - teamSize0 = omp_get_team_size(0); - - // Expected to return -1 because the requested level is larger than - // the nest level. - ancestorTNum1 = omp_get_ancestor_thread_num(1); - teamSize1 = omp_get_team_size(1); - - // Expecting active parallel region. - #pragma omp parallel num_threads(NumThreads) - { - int id = omp_get_thread_num(); - // Multiply return value of omp_get_level by 5 to avoid that this test - // passes if both API calls return wrong values. - check1[id] += omp_get_level() * 5 + omp_get_active_level(); - - // Expected to return 0 and 1. - check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); - // Expected to return the current thread num. - check2[id] += (omp_get_ancestor_thread_num(1) - id); - // Expected to return the current number of threads. - check2[id] += 3 * omp_get_team_size(1); - // Expected to return -1, see above. - check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2); - - // Expecting serialized parallel region. - #pragma omp parallel - { - #pragma omp atomic - check3[id] += omp_get_level() * 5 + omp_get_active_level(); - - // Expected to return 0 and 1. 
- int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); - // Expected to return the parent thread num. - check4Inc += (omp_get_ancestor_thread_num(1) - id); - // Expected to return the number of threads in the active parallel region. - check4Inc += 3 * omp_get_team_size(1); - // Expected to return 0 and 1. - check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2); - // Expected to return -1, see above. - check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3); - - #pragma omp atomic - check4[id] += check4Inc; - } - } - } - - // CHECK: target: level = 0, activeLevel = 0 - printf("target: level = %d, activeLevel = %d\n", level, activeLevel); - // CHECK: level = -1: ancestorTNum = -1, teamSize = -1 - printf("level = -1: ancestorTNum = %d, teamSize = %d\n", ancestorTNumNeg, teamSizeNeg); - // CHECK: level = 0: ancestorTNum = 0, teamSize = 1 - printf("level = 0: ancestorTNum = %d, teamSize = %d\n", ancestorTNum0, teamSize0); - // CHECK: level = 1: ancestorTNum = -1, teamSize = -1 - printf("level = 1: ancestorTNum = %d, teamSize = %d\n", ancestorTNum1, teamSize1); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - // Check active parallel region: - // omp_get_level() = 1, omp_get_active_level() = 1 - const int Expected1 = 6; - if (i < NumThreads) { - if (check1[i] != Expected1) { - printf("invalid: check1[%d] should be %d, is %d\n", i, Expected1, check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - // 5 * 1 + 3 * 64 - 1 - 1 (see above) - const int Expected2 = 195; - if (i < NumThreads) { - if (check2[i] != Expected2) { - printf("invalid: check2[%d] should be %d, is %d\n", i, Expected2, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - - // Check serialized parallel region: - // omp_get_level() = 2, omp_get_active_level() = 1 - const int Expected3 = 11; - if (i < NumThreads) { - if (check3[i] != Expected3) { - printf("invalid: check3[%d] should be %d, is %d\n", i, Expected3, check3[i]); - } - } else if (check3[i] != 0) { - printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); - } - - // 5 * 1 + 3 * 64 + 3 * 1 - 1 - 1 (see above) - const int Expected4 = 198; - if (i < NumThreads) { - if (check4[i] != Expected4) { - printf("invalid: check4[%d] should be %d, is %d\n", i, Expected4, check4[i]); - } - } else if (check4[i] != 0) { - printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); - } - } - - // Check for paraller level in non-SPMD kernels. - level = 0; - #pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level) - for (int i=0; i<5032; i+=32) { - int ub = (i+32 > 5032) ? 5032 : i+32; - #pragma omp parallel for schedule(dynamic) - for (int j=i ; j < ub; j++) ; - level += omp_get_level(); - } - // CHECK: Integral level = 0. - printf("Integral level = %d.\n", level); - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +const int MaxThreads = 1024; +const int NumThreads = 64; + +int main(int argc, char *argv[]) { + int level = -1, activeLevel = -1; + // The expected value is -1, initialize to different value. + int ancestorTNumNeg = 1, teamSizeNeg = 1; + int ancestorTNum0 = -1, teamSize0 = -1; + // The expected value is -1, initialize to different value. 
+ int ancestorTNum1 = 1, teamSize1 = 1; + int check1[MaxThreads]; + int check2[MaxThreads]; + int check3[MaxThreads]; + int check4[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = check3[i] = check4[i] = 0; + } + + #pragma omp target map(level, activeLevel, ancestorTNumNeg, teamSizeNeg) \ + map(ancestorTNum0, teamSize0, ancestorTNum1, teamSize1) \ + map(check1[:], check2[:], check3[:], check4[:]) + { + level = omp_get_level(); + activeLevel = omp_get_active_level(); + + // Expected to return -1. + ancestorTNumNeg = omp_get_ancestor_thread_num(-1); + teamSizeNeg = omp_get_team_size(-1); + + // Expected to return 0 and 1. + ancestorTNum0 = omp_get_ancestor_thread_num(0); + teamSize0 = omp_get_team_size(0); + + // Expected to return -1 because the requested level is larger than + // the nest level. + ancestorTNum1 = omp_get_ancestor_thread_num(1); + teamSize1 = omp_get_team_size(1); + + // Expecting active parallel region. + #pragma omp parallel num_threads(NumThreads) + { + int id = omp_get_thread_num(); + // Multiply return value of omp_get_level by 5 to avoid that this test + // passes if both API calls return wrong values. + check1[id] += omp_get_level() * 5 + omp_get_active_level(); + + // Expected to return 0 and 1. + check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); + // Expected to return the current thread num. + check2[id] += (omp_get_ancestor_thread_num(1) - id); + // Expected to return the current number of threads. + check2[id] += 3 * omp_get_team_size(1); + // Expected to return -1, see above. + check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2); + + // Expecting serialized parallel region. + #pragma omp parallel + { + #pragma omp atomic + check3[id] += omp_get_level() * 5 + omp_get_active_level(); + + // Expected to return 0 and 1. + int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); + // Expected to return the parent thread num. + check4Inc += (omp_get_ancestor_thread_num(1) - id); + // Expected to return the number of threads in the active parallel region. + check4Inc += 3 * omp_get_team_size(1); + // Expected to return 0 and 1. + check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2); + // Expected to return -1, see above. 
+ check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3); + + #pragma omp atomic + check4[id] += check4Inc; + } + } + } + + // CHECK: target: level = 0, activeLevel = 0 + printf("target: level = %d, activeLevel = %d\n", level, activeLevel); + // CHECK: level = -1: ancestorTNum = -1, teamSize = -1 + printf("level = -1: ancestorTNum = %d, teamSize = %d\n", ancestorTNumNeg, teamSizeNeg); + // CHECK: level = 0: ancestorTNum = 0, teamSize = 1 + printf("level = 0: ancestorTNum = %d, teamSize = %d\n", ancestorTNum0, teamSize0); + // CHECK: level = 1: ancestorTNum = -1, teamSize = -1 + printf("level = 1: ancestorTNum = %d, teamSize = %d\n", ancestorTNum1, teamSize1); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + // Check active parallel region: + // omp_get_level() = 1, omp_get_active_level() = 1 + const int Expected1 = 6; + if (i < NumThreads) { + if (check1[i] != Expected1) { + printf("invalid: check1[%d] should be %d, is %d\n", i, Expected1, check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + // 5 * 1 + 3 * 64 - 1 - 1 (see above) + const int Expected2 = 195; + if (i < NumThreads) { + if (check2[i] != Expected2) { + printf("invalid: check2[%d] should be %d, is %d\n", i, Expected2, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + + // Check serialized parallel region: + // omp_get_level() = 2, omp_get_active_level() = 1 + const int Expected3 = 11; + if (i < NumThreads) { + if (check3[i] != Expected3) { + printf("invalid: check3[%d] should be %d, is %d\n", i, Expected3, check3[i]); + } + } else if (check3[i] != 0) { + printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); + } + + // 5 * 1 + 3 * 64 + 3 * 1 - 1 - 1 (see above) + const int Expected4 = 198; + if (i < NumThreads) { + if (check4[i] != Expected4) { + printf("invalid: check4[%d] should be %d, is %d\n", i, Expected4, check4[i]); + } + } else if (check4[i] != 0) { + printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); + } + } + + // Check for paraller level in non-SPMD kernels. + level = 0; + #pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level) + for (int i=0; i<5032; i+=32) { + int ub = (i+32 > 5032) ? 5032 : i+32; + #pragma omp parallel for schedule(dynamic) + for (int j=i ; j < ub; j++) ; + level += omp_get_level(); + } + // CHECK: Integral level = 0. + printf("Integral level = %d.\n", level); + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c index 70ebb1da9592e..747054c80fe62 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c @@ -1,136 +1,136 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int MaxThreads = 1024; -const int NumThreads = 64; -const int NumThreads1 = 1; - -int main(int argc, char *argv[]) { - int inParallel = -1, numThreads = -1, threadNum = -1; - int check1[MaxThreads]; - int check2[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = 0; - } - -#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) - { - inParallel = omp_in_parallel(); - numThreads = omp_get_num_threads(); - threadNum = omp_get_thread_num(); - -// Expecting active parallel region. 
-#pragma omp parallel num_threads(NumThreads) - { - int id = omp_get_thread_num(); - check1[id] += omp_get_num_threads() + omp_in_parallel(); - -// Expecting serialized parallel region. -#pragma omp parallel - { - // Expected to be 1. - int nestedInParallel = omp_in_parallel(); - // Expected to be 1. - int nestedNumThreads = omp_get_num_threads(); - // Expected to be 0. - int nestedThreadNum = omp_get_thread_num(); -#pragma omp atomic - check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; - } - } - } - - // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 - printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", - inParallel, numThreads, threadNum); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - // Check that all threads reported - // omp_get_num_threads() = 64, omp_in_parallel() = 1. - int Expected = NumThreads + 1; - if (i < NumThreads) { - if (check1[i] != Expected) { - printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, - check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - // Check serialized parallel region. - if (i < NumThreads) { - if (check2[i] != 2) { - printf("invalid: check2[%d] should be 2, is %d\n", i, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - } - - inParallel = -1; - numThreads = -1; - threadNum = -1; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = 0; - } - -#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) - { - inParallel = omp_in_parallel(); - numThreads = omp_get_num_threads(); - threadNum = omp_get_thread_num(); - -// Expecting active parallel region. -#pragma omp parallel num_threads(NumThreads1) - { - int id = omp_get_thread_num(); - check1[id] += omp_get_num_threads() + omp_in_parallel(); - -// Expecting serialized parallel region. -#pragma omp parallel - { - // Expected to be 0. - int nestedInParallel = omp_in_parallel(); - // Expected to be 1. - int nestedNumThreads = omp_get_num_threads(); - // Expected to be 0. - int nestedThreadNum = omp_get_thread_num(); -#pragma omp atomic - check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; - } - } - } - - // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 - printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", - inParallel, numThreads, threadNum); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - // Check that all threads reported - // omp_get_num_threads() = 1, omp_in_parallel() = 0. - int Expected = 1; - if (i < NumThreads1) { - if (check1[i] != Expected) { - printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, - check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - // Check serialized parallel region. 
- if (i < NumThreads1) { - if (check2[i] != 1) { - printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - } - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +const int MaxThreads = 1024; +const int NumThreads = 64; +const int NumThreads1 = 1; + +int main(int argc, char *argv[]) { + int inParallel = -1, numThreads = -1, threadNum = -1; + int check1[MaxThreads]; + int check2[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = 0; + } + +#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) + { + inParallel = omp_in_parallel(); + numThreads = omp_get_num_threads(); + threadNum = omp_get_thread_num(); + +// Expecting active parallel region. +#pragma omp parallel num_threads(NumThreads) + { + int id = omp_get_thread_num(); + check1[id] += omp_get_num_threads() + omp_in_parallel(); + +// Expecting serialized parallel region. +#pragma omp parallel + { + // Expected to be 1. + int nestedInParallel = omp_in_parallel(); + // Expected to be 1. + int nestedNumThreads = omp_get_num_threads(); + // Expected to be 0. + int nestedThreadNum = omp_get_thread_num(); +#pragma omp atomic + check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; + } + } + } + + // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 + printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", + inParallel, numThreads, threadNum); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + // Check that all threads reported + // omp_get_num_threads() = 64, omp_in_parallel() = 1. + int Expected = NumThreads + 1; + if (i < NumThreads) { + if (check1[i] != Expected) { + printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, + check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + // Check serialized parallel region. + if (i < NumThreads) { + if (check2[i] != 2) { + printf("invalid: check2[%d] should be 2, is %d\n", i, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + } + + inParallel = -1; + numThreads = -1; + threadNum = -1; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = 0; + } + +#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) + { + inParallel = omp_in_parallel(); + numThreads = omp_get_num_threads(); + threadNum = omp_get_thread_num(); + +// Expecting active parallel region. +#pragma omp parallel num_threads(NumThreads1) + { + int id = omp_get_thread_num(); + check1[id] += omp_get_num_threads() + omp_in_parallel(); + +// Expecting serialized parallel region. +#pragma omp parallel + { + // Expected to be 0. + int nestedInParallel = omp_in_parallel(); + // Expected to be 1. + int nestedNumThreads = omp_get_num_threads(); + // Expected to be 0. + int nestedThreadNum = omp_get_thread_num(); +#pragma omp atomic + check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; + } + } + } + + // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 + printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", + inParallel, numThreads, threadNum); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + // Check that all threads reported + // omp_get_num_threads() = 1, omp_in_parallel() = 0. 
+ int Expected = 1; + if (i < NumThreads1) { + if (check1[i] != Expected) { + printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, + check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + // Check serialized parallel region. + if (i < NumThreads1) { + if (check2[i] != 1) { + printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + } + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c index 4a2f73fee827a..ea16056b1ce3c 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c @@ -1,102 +1,102 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int WarpSize = 32; -const int NumThreads1 = 1 * WarpSize; -const int NumThreads2 = 2 * WarpSize; -const int NumThreads3 = 3 * WarpSize; -const int MaxThreads = 1024; - -int main(int argc, char *argv[]) { - int check1[MaxThreads]; - int check2[MaxThreads]; - int check3[MaxThreads]; - int check4[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = check3[i] = check4[i] = 0; - } - - int maxThreads1 = -1; - int maxThreads2 = -1; - int maxThreads3 = -1; - - #pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \ - map(maxThreads1, maxThreads2, maxThreads3) - { - #pragma omp parallel num_threads(NumThreads1) - { - check1[omp_get_thread_num()] += omp_get_num_threads(); - } - - // API method to set number of threads in parallel regions without - // num_threads() clause. - omp_set_num_threads(NumThreads2); - maxThreads1 = omp_get_max_threads(); - #pragma omp parallel - { - check2[omp_get_thread_num()] += omp_get_num_threads(); - } - - maxThreads2 = omp_get_max_threads(); - - // num_threads() clause should override nthreads-var ICV. - #pragma omp parallel num_threads(NumThreads3) - { - check3[omp_get_thread_num()] += omp_get_num_threads(); - } - - maxThreads3 = omp_get_max_threads(); - - // Effect from omp_set_num_threads() should still be visible. 
- #pragma omp parallel - { - check4[omp_get_thread_num()] += omp_get_num_threads(); - } - } - - // CHECK: maxThreads1 = 64 - printf("maxThreads1 = %d\n", maxThreads1); - // CHECK: maxThreads2 = 64 - printf("maxThreads2 = %d\n", maxThreads2); - // CHECK: maxThreads3 = 64 - printf("maxThreads3 = %d\n", maxThreads3); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - if (i < NumThreads1) { - if (check1[i] != NumThreads1) { - printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - if (i < NumThreads2) { - if (check2[i] != NumThreads2) { - printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - - if (i < NumThreads3) { - if (check3[i] != NumThreads3) { - printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]); - } - } else if (check3[i] != 0) { - printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); - } - - if (i < NumThreads2) { - if (check4[i] != NumThreads2) { - printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]); - } - } else if (check4[i] != 0) { - printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); - } - } - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +const int WarpSize = 32; +const int NumThreads1 = 1 * WarpSize; +const int NumThreads2 = 2 * WarpSize; +const int NumThreads3 = 3 * WarpSize; +const int MaxThreads = 1024; + +int main(int argc, char *argv[]) { + int check1[MaxThreads]; + int check2[MaxThreads]; + int check3[MaxThreads]; + int check4[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = check3[i] = check4[i] = 0; + } + + int maxThreads1 = -1; + int maxThreads2 = -1; + int maxThreads3 = -1; + + #pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \ + map(maxThreads1, maxThreads2, maxThreads3) + { + #pragma omp parallel num_threads(NumThreads1) + { + check1[omp_get_thread_num()] += omp_get_num_threads(); + } + + // API method to set number of threads in parallel regions without + // num_threads() clause. + omp_set_num_threads(NumThreads2); + maxThreads1 = omp_get_max_threads(); + #pragma omp parallel + { + check2[omp_get_thread_num()] += omp_get_num_threads(); + } + + maxThreads2 = omp_get_max_threads(); + + // num_threads() clause should override nthreads-var ICV. + #pragma omp parallel num_threads(NumThreads3) + { + check3[omp_get_thread_num()] += omp_get_num_threads(); + } + + maxThreads3 = omp_get_max_threads(); + + // Effect from omp_set_num_threads() should still be visible. 
+ #pragma omp parallel + { + check4[omp_get_thread_num()] += omp_get_num_threads(); + } + } + + // CHECK: maxThreads1 = 64 + printf("maxThreads1 = %d\n", maxThreads1); + // CHECK: maxThreads2 = 64 + printf("maxThreads2 = %d\n", maxThreads2); + // CHECK: maxThreads3 = 64 + printf("maxThreads3 = %d\n", maxThreads3); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + if (i < NumThreads1) { + if (check1[i] != NumThreads1) { + printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + if (i < NumThreads2) { + if (check2[i] != NumThreads2) { + printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + + if (i < NumThreads3) { + if (check3[i] != NumThreads3) { + printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]); + } + } else if (check3[i] != 0) { + printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); + } + + if (i < NumThreads2) { + if (check4[i] != NumThreads2) { + printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]); + } + } else if (check4[i] != 0) { + printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); + } + } + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp index 517db59f64ae3..2339c4a589c14 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp @@ -1,51 +1,51 @@ -// RUN: %compilexx-run-and-check - -#include -#include - -int main(void) { - int isHost = -1; - int ParallelLevel1 = -1, ParallelLevel2 = -1; - int Count = 0; - -#pragma omp target parallel for map(tofrom \ - : isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1) - for (int J = 0; J < 10; ++J) { -#pragma omp critical - { - isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost; - ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1) - ? omp_get_level() - : ParallelLevel1; - } - if (omp_get_thread_num() > 5) { - int L2; -#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count) - for (int I = 0; I < 10; ++I) { - L2 = omp_get_level(); - Count += omp_get_level(); // (10-6)*10*2 = 80 - } -#pragma omp critical - ParallelLevel2 = - (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2; - } else { - Count += omp_get_level(); // 6 * 1 = 6 - } - } - - if (isHost < 0) { - printf("Runtime error, isHost=%d\n", isHost); - } - - // CHECK: Target region executed on the device - printf("Target region executed on the %s\n", isHost ? 
"host" : "device"); - // CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2 - printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1, - ParallelLevel2); - // Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par - // level) + 6(num of iterations) * 1(par level) - // CHECK: Expected count = 86 - printf("Expected count = %d\n", Count); - - return isHost; -} +// RUN: %compilexx-run-and-check + +#include +#include + +int main(void) { + int isHost = -1; + int ParallelLevel1 = -1, ParallelLevel2 = -1; + int Count = 0; + +#pragma omp target parallel for map(tofrom \ + : isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1) + for (int J = 0; J < 10; ++J) { +#pragma omp critical + { + isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost; + ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1) + ? omp_get_level() + : ParallelLevel1; + } + if (omp_get_thread_num() > 5) { + int L2; +#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count) + for (int I = 0; I < 10; ++I) { + L2 = omp_get_level(); + Count += omp_get_level(); // (10-6)*10*2 = 80 + } +#pragma omp critical + ParallelLevel2 = + (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2; + } else { + Count += omp_get_level(); // 6 * 1 = 6 + } + } + + if (isHost < 0) { + printf("Runtime error, isHost=%d\n", isHost); + } + + // CHECK: Target region executed on the device + printf("Target region executed on the %s\n", isHost ? "host" : "device"); + // CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2 + printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1, + ParallelLevel2); + // Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par + // level) + 6(num of iterations) * 1(par level) + // CHECK: Expected count = 86 + printf("Expected count = %d\n", Count); + + return isHost; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c index 5e40bb564aa0f..858edd1cc8625 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c @@ -1,77 +1,77 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int WarpSize = 32; -const int ThreadLimit = 1 * WarpSize; -const int NumThreads2 = 2 * WarpSize; -const int NumThreads3 = 3 * WarpSize; -const int MaxThreads = 1024; - -int main(int argc, char *argv[]) { - int check1[MaxThreads]; - int check2[MaxThreads]; - int check3[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = check3[i] = 0; - } - - int threadLimit = -1; - - #pragma omp target teams num_teams(1) thread_limit(ThreadLimit) \ - map(check1[:], check2[:], check3[:], threadLimit) - { - threadLimit = omp_get_thread_limit(); - - // All parallel regions should get as many threads as specified by the - // thread_limit() clause. 
- #pragma omp parallel - { - check1[omp_get_thread_num()] += omp_get_num_threads(); - } - - omp_set_num_threads(NumThreads2); - #pragma omp parallel - { - check2[omp_get_thread_num()] += omp_get_num_threads(); - } - - #pragma omp parallel num_threads(NumThreads3) - { - check3[omp_get_thread_num()] += omp_get_num_threads(); - } - } - - // CHECK: threadLimit = 32 - printf("threadLimit = %d\n", threadLimit); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - if (i < ThreadLimit) { - if (check1[i] != ThreadLimit) { - printf("invalid: check1[%d] should be %d, is %d\n", i, ThreadLimit, check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - if (i < ThreadLimit) { - if (check2[i] != ThreadLimit) { - printf("invalid: check2[%d] should be %d, is %d\n", i, ThreadLimit, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - - if (i < ThreadLimit) { - if (check3[i] != ThreadLimit) { - printf("invalid: check3[%d] should be %d, is %d\n", i, ThreadLimit, check3[i]); - } - } else if (check3[i] != 0) { - printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); - } - } - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +const int WarpSize = 32; +const int ThreadLimit = 1 * WarpSize; +const int NumThreads2 = 2 * WarpSize; +const int NumThreads3 = 3 * WarpSize; +const int MaxThreads = 1024; + +int main(int argc, char *argv[]) { + int check1[MaxThreads]; + int check2[MaxThreads]; + int check3[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = check3[i] = 0; + } + + int threadLimit = -1; + + #pragma omp target teams num_teams(1) thread_limit(ThreadLimit) \ + map(check1[:], check2[:], check3[:], threadLimit) + { + threadLimit = omp_get_thread_limit(); + + // All parallel regions should get as many threads as specified by the + // thread_limit() clause. 
+ #pragma omp parallel + { + check1[omp_get_thread_num()] += omp_get_num_threads(); + } + + omp_set_num_threads(NumThreads2); + #pragma omp parallel + { + check2[omp_get_thread_num()] += omp_get_num_threads(); + } + + #pragma omp parallel num_threads(NumThreads3) + { + check3[omp_get_thread_num()] += omp_get_num_threads(); + } + } + + // CHECK: threadLimit = 32 + printf("threadLimit = %d\n", threadLimit); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + if (i < ThreadLimit) { + if (check1[i] != ThreadLimit) { + printf("invalid: check1[%d] should be %d, is %d\n", i, ThreadLimit, check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + if (i < ThreadLimit) { + if (check2[i] != ThreadLimit) { + printf("invalid: check2[%d] should be %d, is %d\n", i, ThreadLimit, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + + if (i < ThreadLimit) { + if (check3[i] != ThreadLimit) { + printf("invalid: check3[%d] should be %d, is %d\n", i, ThreadLimit, check3[i]); + } + } else if (check3[i] != 0) { + printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); + } + } + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c index b3f8768564080..ef0958070c857 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c @@ -1,22 +1,22 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main() { - int res = 0; - -#pragma omp parallel num_threads(2) reduction(+:res) - { - int tid = omp_get_thread_num(); -#pragma omp target teams distribute reduction(+:res) - for (int i = tid; i < 2; i++) - ++res; - } - // The first thread makes 2 iterations, the second - 1. Expected result of the - // reduction res is 3. - - // CHECK: res = 3. - printf("res = %d.\n", res); - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +int main() { + int res = 0; + +#pragma omp parallel num_threads(2) reduction(+:res) + { + int tid = omp_get_thread_num(); +#pragma omp target teams distribute reduction(+:res) + for (int i = tid; i < 2; i++) + ++res; + } + // The first thread makes 2 iterations, the second - 1. Expected result of the + // reduction res is 3. + + // CHECK: res = 3. + printf("res = %d.\n", res); + return 0; +} diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index de3afc36c7f28..6cdd73c4c9fb5 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -1,261 +1,261 @@ -//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Interface to be used by Clang during the codegen of a -// target region. -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_H_ -#define _OMPTARGET_H_ - -#include -#include - -#define OFFLOAD_SUCCESS (0) -#define OFFLOAD_FAIL (~0) - -#define OFFLOAD_DEVICE_DEFAULT -1 -#define HOST_DEVICE -10 - -/// Data attributes for each data reference used in an OpenMP target region. 
-enum tgt_map_type { - // No flags - OMP_TGT_MAPTYPE_NONE = 0x000, - // copy data from host to device - OMP_TGT_MAPTYPE_TO = 0x001, - // copy data from device to host - OMP_TGT_MAPTYPE_FROM = 0x002, - // copy regardless of the reference count - OMP_TGT_MAPTYPE_ALWAYS = 0x004, - // force unmapping of data - OMP_TGT_MAPTYPE_DELETE = 0x008, - // map the pointer as well as the pointee - OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010, - // pass device base address to kernel - OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020, - // return base device address of mapped data - OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040, - // private variable - not mapped - OMP_TGT_MAPTYPE_PRIVATE = 0x080, - // copy by value - not mapped - OMP_TGT_MAPTYPE_LITERAL = 0x100, - // mapping is implicit - OMP_TGT_MAPTYPE_IMPLICIT = 0x200, - // copy data to device - OMP_TGT_MAPTYPE_CLOSE = 0x400, - // member of struct, member given by [16 MSBs] - 1 - OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 -}; - -enum OpenMPOffloadingDeclareTargetFlags { - /// Mark the entry as having a 'link' attribute. - OMP_DECLARE_TARGET_LINK = 0x01, - /// Mark the entry as being a global constructor. - OMP_DECLARE_TARGET_CTOR = 0x02, - /// Mark the entry as being a global destructor. - OMP_DECLARE_TARGET_DTOR = 0x04 -}; - -enum OpenMPOffloadingRequiresDirFlags { - /// flag undefined. - OMP_REQ_UNDEFINED = 0x000, - /// no requires directive present. - OMP_REQ_NONE = 0x001, - /// reverse_offload clause. - OMP_REQ_REVERSE_OFFLOAD = 0x002, - /// unified_address clause. - OMP_REQ_UNIFIED_ADDRESS = 0x004, - /// unified_shared_memory clause. - OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008, - /// dynamic_allocators clause. - OMP_REQ_DYNAMIC_ALLOCATORS = 0x010 -}; - -/// This struct is a record of an entry point or global. For a function -/// entry point the size is expected to be zero -struct __tgt_offload_entry { - void *addr; // Pointer to the offload entry info (function or global) - char *name; // Name of the function or global - size_t size; // Size of the entry info (0 if it is a function) - int32_t flags; // Flags associated with the entry, e.g. 'link'. - int32_t reserved; // Reserved, to be used by the runtime library. -}; - -/// This struct is a record of the device image information -struct __tgt_device_image { - void *ImageStart; // Pointer to the target code start - void *ImageEnd; // Pointer to the target code end - __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries - __tgt_offload_entry *EntriesEnd; // End of table (non inclusive) -}; - -/// This struct is a record of all the host code that may be offloaded to a -/// target. -struct __tgt_bin_desc { - int32_t NumDeviceImages; // Number of device types supported - __tgt_device_image *DeviceImages; // Array of device images (1 per dev. type) - __tgt_offload_entry *HostEntriesBegin; // Begin of table with all host entries - __tgt_offload_entry *HostEntriesEnd; // End of table (non inclusive) -}; - -/// This struct contains the offload entries identified by the target runtime -struct __tgt_target_table { - __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries - __tgt_offload_entry - *EntriesEnd; // End of the table with all the entries (non inclusive) -}; - -/// This struct contains information exchanged between different asynchronous -/// operations for device-dependent optimization and potential synchronization -struct __tgt_async_info { - // A pointer to a queue-like structure where offloading operations are issued. - // We assume to use this structure to do synchronization. 
In CUDA backend, it - // is CUstream. - void *Queue = nullptr; -}; - -#ifdef __cplusplus -extern "C" { -#endif - -int omp_get_num_devices(void); -int omp_get_initial_device(void); -void *omp_target_alloc(size_t size, int device_num); -void omp_target_free(void *device_ptr, int device_num); -int omp_target_is_present(void *ptr, int device_num); -int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset, - size_t src_offset, int dst_device, int src_device); -int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, - int num_dims, const size_t *volume, const size_t *dst_offsets, - const size_t *src_offsets, const size_t *dst_dimensions, - const size_t *src_dimensions, int dst_device, int src_device); -int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size, - size_t device_offset, int device_num); -int omp_target_disassociate_ptr(void *host_ptr, int device_num); - -/// add the clauses of the requires directives in a given file -void __tgt_register_requires(int64_t flags); - -/// adds a target shared library to the target execution image -void __tgt_register_lib(__tgt_bin_desc *desc); - -/// removes a target shared library from the target execution image -void __tgt_unregister_lib(__tgt_bin_desc *desc); - -// creates the host to target data mapping, stores it in the -// libomptarget.so internal structure (an entry in a stack of data maps) and -// passes the data to the device; -void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types); -void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, - void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, - int32_t depNum, void *depList, - int32_t noAliasDepNum, - void *noAliasDepList); - -// passes data from the target, release target memory and destroys the -// host-target mapping (top entry from the stack of data maps) created by -// the last __tgt_target_data_begin -void __tgt_target_data_end(int64_t device_id, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types); -void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num, - void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, - int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList); - -/// passes data to/from the target -void __tgt_target_data_update(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types); -void __tgt_target_data_update_nowait(int64_t device_id, int32_t arg_num, - void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, - int32_t depNum, void *depList, - int32_t noAliasDepNum, - void *noAliasDepList); - -// Performs the same actions as data_begin in case arg_num is non-zero -// and initiates run of offloaded region on target platform; if arg_num -// is non-zero after the region execution is done it also performs the -// same action as data_end above. The following types are used; this -// function returns 0 if it was able to transfer the execution to a -// target and an int different from zero otherwise. 
-int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types); -int __tgt_target_nowait(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList); - -int __tgt_target_teams(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, int32_t num_teams, - int32_t thread_limit); -int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr, - int32_t arg_num, void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, - int32_t num_teams, int32_t thread_limit, - int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList); -void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount); - -#ifdef __cplusplus -} -#endif - -#ifdef OMPTARGET_DEBUG -#include -#define DEBUGP(prefix, ...) \ - { \ - fprintf(stderr, "%s --> ", prefix); \ - fprintf(stderr, __VA_ARGS__); \ - } - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include -#define DPxMOD "0x%0*" PRIxPTR -#define DPxPTR(ptr) ((int)(2*sizeof(uintptr_t))), ((uintptr_t) (ptr)) - -/* - * To printf a pointer in hex with a fixed width of 16 digits and a leading 0x, - * use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr)); - * - * DPxMOD expands to: - * "0x%0*" PRIxPTR - * where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a - * specific platform, e.g. "lu" if uintptr_t is typedef'd as unsigned long: - * "0x%0*lu" - * - * Ultimately, the whole statement expands to: - * printf("ptr=0x%0*lu...\n", // the 0* modifier expects an extra argument - * // specifying the width of the output - * (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width - * // 8 digits for 32bit systems - * // 16 digits for 64bit - * (uintptr_t) ptr); - */ -#else -#define DEBUGP(prefix, ...) \ - {} -#endif - -#ifdef __cplusplus -#define EXTERN extern "C" -#else -#define EXTERN extern -#endif - -#endif // _OMPTARGET_H_ +//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_H_ +#define _OMPTARGET_H_ + +#include +#include + +#define OFFLOAD_SUCCESS (0) +#define OFFLOAD_FAIL (~0) + +#define OFFLOAD_DEVICE_DEFAULT -1 +#define HOST_DEVICE -10 + +/// Data attributes for each data reference used in an OpenMP target region. 
+enum tgt_map_type { + // No flags + OMP_TGT_MAPTYPE_NONE = 0x000, + // copy data from host to device + OMP_TGT_MAPTYPE_TO = 0x001, + // copy data from device to host + OMP_TGT_MAPTYPE_FROM = 0x002, + // copy regardless of the reference count + OMP_TGT_MAPTYPE_ALWAYS = 0x004, + // force unmapping of data + OMP_TGT_MAPTYPE_DELETE = 0x008, + // map the pointer as well as the pointee + OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010, + // pass device base address to kernel + OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020, + // return base device address of mapped data + OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040, + // private variable - not mapped + OMP_TGT_MAPTYPE_PRIVATE = 0x080, + // copy by value - not mapped + OMP_TGT_MAPTYPE_LITERAL = 0x100, + // mapping is implicit + OMP_TGT_MAPTYPE_IMPLICIT = 0x200, + // copy data to device + OMP_TGT_MAPTYPE_CLOSE = 0x400, + // member of struct, member given by [16 MSBs] - 1 + OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 +}; + +enum OpenMPOffloadingDeclareTargetFlags { + /// Mark the entry as having a 'link' attribute. + OMP_DECLARE_TARGET_LINK = 0x01, + /// Mark the entry as being a global constructor. + OMP_DECLARE_TARGET_CTOR = 0x02, + /// Mark the entry as being a global destructor. + OMP_DECLARE_TARGET_DTOR = 0x04 +}; + +enum OpenMPOffloadingRequiresDirFlags { + /// flag undefined. + OMP_REQ_UNDEFINED = 0x000, + /// no requires directive present. + OMP_REQ_NONE = 0x001, + /// reverse_offload clause. + OMP_REQ_REVERSE_OFFLOAD = 0x002, + /// unified_address clause. + OMP_REQ_UNIFIED_ADDRESS = 0x004, + /// unified_shared_memory clause. + OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008, + /// dynamic_allocators clause. + OMP_REQ_DYNAMIC_ALLOCATORS = 0x010 +}; + +/// This struct is a record of an entry point or global. For a function +/// entry point the size is expected to be zero +struct __tgt_offload_entry { + void *addr; // Pointer to the offload entry info (function or global) + char *name; // Name of the function or global + size_t size; // Size of the entry info (0 if it is a function) + int32_t flags; // Flags associated with the entry, e.g. 'link'. + int32_t reserved; // Reserved, to be used by the runtime library. +}; + +/// This struct is a record of the device image information +struct __tgt_device_image { + void *ImageStart; // Pointer to the target code start + void *ImageEnd; // Pointer to the target code end + __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries + __tgt_offload_entry *EntriesEnd; // End of table (non inclusive) +}; + +/// This struct is a record of all the host code that may be offloaded to a +/// target. +struct __tgt_bin_desc { + int32_t NumDeviceImages; // Number of device types supported + __tgt_device_image *DeviceImages; // Array of device images (1 per dev. type) + __tgt_offload_entry *HostEntriesBegin; // Begin of table with all host entries + __tgt_offload_entry *HostEntriesEnd; // End of table (non inclusive) +}; + +/// This struct contains the offload entries identified by the target runtime +struct __tgt_target_table { + __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries + __tgt_offload_entry + *EntriesEnd; // End of the table with all the entries (non inclusive) +}; + +/// This struct contains information exchanged between different asynchronous +/// operations for device-dependent optimization and potential synchronization +struct __tgt_async_info { + // A pointer to a queue-like structure where offloading operations are issued. + // We assume to use this structure to do synchronization. 
In CUDA backend, it + // is CUstream. + void *Queue = nullptr; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +int omp_get_num_devices(void); +int omp_get_initial_device(void); +void *omp_target_alloc(size_t size, int device_num); +void omp_target_free(void *device_ptr, int device_num); +int omp_target_is_present(void *ptr, int device_num); +int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset, + size_t src_offset, int dst_device, int src_device); +int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, + int num_dims, const size_t *volume, const size_t *dst_offsets, + const size_t *src_offsets, const size_t *dst_dimensions, + const size_t *src_dimensions, int dst_device, int src_device); +int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size, + size_t device_offset, int device_num); +int omp_target_disassociate_ptr(void *host_ptr, int device_num); + +/// add the clauses of the requires directives in a given file +void __tgt_register_requires(int64_t flags); + +/// adds a target shared library to the target execution image +void __tgt_register_lib(__tgt_bin_desc *desc); + +/// removes a target shared library from the target execution image +void __tgt_unregister_lib(__tgt_bin_desc *desc); + +// creates the host to target data mapping, stores it in the +// libomptarget.so internal structure (an entry in a stack of data maps) and +// passes the data to the device; +void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); +void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); + +// passes data from the target, release target memory and destroys the +// host-target mapping (top entry from the stack of data maps) created by +// the last __tgt_target_data_begin +void __tgt_target_data_end(int64_t device_id, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types); +void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); + +/// passes data to/from the target +void __tgt_target_data_update(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); +void __tgt_target_data_update_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); + +// Performs the same actions as data_begin in case arg_num is non-zero +// and initiates run of offloaded region on target platform; if arg_num +// is non-zero after the region execution is done it also performs the +// same action as data_end above. The following types are used; this +// function returns 0 if it was able to transfer the execution to a +// target and an int different from zero otherwise. 
+int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); +int __tgt_target_nowait(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); + +int __tgt_target_teams(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t num_teams, + int32_t thread_limit); +int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t num_teams, int32_t thread_limit, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); +void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount); + +#ifdef __cplusplus +} +#endif + +#ifdef OMPTARGET_DEBUG +#include +#define DEBUGP(prefix, ...) \ + { \ + fprintf(stderr, "%s --> ", prefix); \ + fprintf(stderr, __VA_ARGS__); \ + } + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#define DPxMOD "0x%0*" PRIxPTR +#define DPxPTR(ptr) ((int)(2*sizeof(uintptr_t))), ((uintptr_t) (ptr)) + +/* + * To printf a pointer in hex with a fixed width of 16 digits and a leading 0x, + * use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr)); + * + * DPxMOD expands to: + * "0x%0*" PRIxPTR + * where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a + * specific platform, e.g. "lu" if uintptr_t is typedef'd as unsigned long: + * "0x%0*lu" + * + * Ultimately, the whole statement expands to: + * printf("ptr=0x%0*lu...\n", // the 0* modifier expects an extra argument + * // specifying the width of the output + * (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width + * // 8 digits for 32bit systems + * // 16 digits for 64bit + * (uintptr_t) ptr); + */ +#else +#define DEBUGP(prefix, ...) \ + {} +#endif + +#ifdef __cplusplus +#define EXTERN extern "C" +#else +#define EXTERN extern +#endif + +#endif // _OMPTARGET_H_ diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h index 083e422aac163..366ad0161c99e 100644 --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -1,123 +1,133 @@ -//===-- omptargetplugin.h - Target dependent OpenMP Plugin API --*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines an interface between target independent OpenMP offload -// runtime library libomptarget and target dependent plugin. -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGETPLUGIN_H_ -#define _OMPTARGETPLUGIN_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// Return the number of available devices of the type supported by the -// target RTL. -int32_t __tgt_rtl_number_of_devices(void); - -// Return an integer different from zero if the provided device image can be -// supported by the runtime. The functionality is similar to comparing the -// result of __tgt__rtl__load__binary to NULL. 
However, this is meant to be a -// lightweight query to determine if the RTL is suitable for an image without -// having to load the library, which can be expensive. -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image); - -// Initialize the requires flags for the device. -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags); - -// Initialize the specified device. In case of success return 0; otherwise -// return an error code. -int32_t __tgt_rtl_init_device(int32_t ID); - -// Pass an executable image section described by image to the specified -// device and prepare an address table of target entities. In case of error, -// return NULL. Otherwise, return a pointer to the built address table. -// Individual entries in the table may also be NULL, when the corresponding -// offload region is not supported on the target device. -__tgt_target_table *__tgt_rtl_load_binary(int32_t ID, - __tgt_device_image *Image); - -// Allocate data on the particular target device, of the specified size. -// HostPtr is a address of the host data the allocated target data -// will be associated with (HostPtr may be NULL if it is not known at -// allocation time, like for example it would be for target data that -// is allocated by omp_target_alloc() API). Return address of the -// allocated data on the target that will be used by libomptarget.so to -// initialize the target data mapping structures. These addresses are -// used to generate a table of target variables to pass to -// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in -// case an error occurred on the target device. -void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr); - -// Pass the data content to the target device using the target address. In case -// of success, return zero. Otherwise, return an error code. -int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size); - -int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size, - __tgt_async_info *AsyncInfoPtr); - -// Retrieve the data content from the target device using its address. In case -// of success, return zero. Otherwise, return an error code. -int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, - int64_t Size); - -// Asynchronous version of __tgt_rtl_data_retrieve -int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr, - void *TargetPtr, int64_t Size, - __tgt_async_info *AsyncInfoPtr); - -// De-allocate the data referenced by target ptr on the device. In case of -// success, return zero. Otherwise, return an error code. -int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); - -// Transfer control to the offloaded entry Entry on the target device. -// Args and Offsets are arrays of NumArgs size of target addresses and -// offsets. An offset should be added to the target address before passing it -// to the outlined function on device side. If AsyncInfoPtr is nullptr, it is -// synchronous; otherwise it is asynchronous. However, AsyncInfoPtr may be -// ignored on some platforms, like x86_64. In that case, it is synchronous. In -// case of success, return zero. Otherwise, return an error code. 
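Taken together, the declarations above define the contract every plugin implements. The following is a rough sketch, not the actual libomptarget implementation, of the order in which the target-independent runtime might drive one plugin for a single synchronous map, run, and copy-back; bookkeeping such as mapping tables and reference counting is omitted, and the include path for omptargetplugin.h is assumed.

// Sketch of one synchronous offload through the plugin interface; error
// handling is reduced to early returns and not all resources are reclaimed.
#include <cstddef>
#include <cstdint>
#include "omptargetplugin.h"   // assumed to be on the include path

int offload_once(int32_t Dev, __tgt_device_image *Image,
                 void *HostBuf, int64_t Size) {
  if (!__tgt_rtl_is_valid_binary(Image))      // cheap check before loading
    return 1;
  if (__tgt_rtl_init_device(Dev) != 0)
    return 1;

  __tgt_target_table *Table = __tgt_rtl_load_binary(Dev, Image);
  if (!Table || Table->EntriesBegin == Table->EntriesEnd)
    return 1;

  void *TgtBuf = __tgt_rtl_data_alloc(Dev, Size, HostBuf);
  if (!TgtBuf)
    return 1;
  if (__tgt_rtl_data_submit(Dev, TgtBuf, HostBuf, Size) != 0)
    return 1;

  // Launch the first entry in the table with a single argument, no offset.
  void *Args[] = {TgtBuf};
  ptrdiff_t Offsets[] = {0};
  int RC = __tgt_rtl_run_target_region(Dev, Table->EntriesBegin->addr,
                                       Args, Offsets, /*NumArgs=*/1);
  if (RC == 0)
    RC = __tgt_rtl_data_retrieve(Dev, HostBuf, TgtBuf, Size);

  __tgt_rtl_data_delete(Dev, TgtBuf);
  return RC;
}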
-int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, - ptrdiff_t *Offsets, int32_t NumArgs); - -// Asynchronous version of __tgt_rtl_run_target_region -int32_t __tgt_rtl_run_target_region_async(int32_t ID, void *Entry, void **Args, - ptrdiff_t *Offsets, int32_t NumArgs, - __tgt_async_info *AsyncInfoPtr); - -// Similar to __tgt_rtl_run_target_region, but additionally specify the -// number of teams to be created and a number of threads in each team. If -// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. -// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that -// case, it is synchronous. -int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, - ptrdiff_t *Offsets, int32_t NumArgs, - int32_t NumTeams, int32_t ThreadLimit, - uint64_t loop_tripcount); - -// Asynchronous version of __tgt_rtl_run_target_team_region -int32_t __tgt_rtl_run_target_team_region_async( - int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs, - int32_t NumTeams, int32_t ThreadLimit, uint64_t loop_tripcount, - __tgt_async_info *AsyncInfoPtr); - -// Device synchronization. In case of success, return zero. Otherwise, return an -// error code. -int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfoPtr); - -#ifdef __cplusplus -} -#endif - -#endif // _OMPTARGETPLUGIN_H_ +//===-- omptargetplugin.h - Target dependent OpenMP Plugin API --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines an interface between target independent OpenMP offload +// runtime library libomptarget and target dependent plugin. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGETPLUGIN_H_ +#define _OMPTARGETPLUGIN_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Return the number of available devices of the type supported by the +// target RTL. +int32_t __tgt_rtl_number_of_devices(void); + +// Return an integer different from zero if the provided device image can be +// supported by the runtime. The functionality is similar to comparing the +// result of __tgt__rtl__load__binary to NULL. However, this is meant to be a +// lightweight query to determine if the RTL is suitable for an image without +// having to load the library, which can be expensive. +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image); + +// Initialize the requires flags for the device. +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags); + +// Initialize the specified device. In case of success return 0; otherwise +// return an error code. +int32_t __tgt_rtl_init_device(int32_t ID); + +// Pass an executable image section described by image to the specified +// device and prepare an address table of target entities. In case of error, +// return NULL. Otherwise, return a pointer to the built address table. +// Individual entries in the table may also be NULL, when the corresponding +// offload region is not supported on the target device. +__tgt_target_table *__tgt_rtl_load_binary(int32_t ID, + __tgt_device_image *Image); + +// Allocate data on the particular target device, of the specified size. 
+// HostPtr is a address of the host data the allocated target data +// will be associated with (HostPtr may be NULL if it is not known at +// allocation time, like for example it would be for target data that +// is allocated by omp_target_alloc() API). Return address of the +// allocated data on the target that will be used by libomptarget.so to +// initialize the target data mapping structures. These addresses are +// used to generate a table of target variables to pass to +// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in +// case an error occurred on the target device. +void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr); + +// Pass the data content to the target device using the target address. In case +// of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size); + +int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size, + __tgt_async_info *AsyncInfoPtr); + +// Retrieve the data content from the target device using its address. In case +// of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, + int64_t Size); + +// Asynchronous version of __tgt_rtl_data_retrieve +int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr, + void *TargetPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + +// Transfer the data content from one device to the other using address. In case +// of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_transfer(int32_t ID, void *DstPtr, void *SrcPtr, + int64_t Size); + +// Asynchronous version of __tgt_rtl_data_transfer +int32_t __tgt_rtl_data_transfer_async(int32_t ID, void *DstPtr, + void *SrcPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + +// De-allocate the data referenced by target ptr on the device. In case of +// success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); + +// Transfer control to the offloaded entry Entry on the target device. +// Args and Offsets are arrays of NumArgs size of target addresses and +// offsets. An offset should be added to the target address before passing it +// to the outlined function on device side. If AsyncInfoPtr is nullptr, it is +// synchronous; otherwise it is asynchronous. However, AsyncInfoPtr may be +// ignored on some platforms, like x86_64. In that case, it is synchronous. In +// case of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs); + +// Asynchronous version of __tgt_rtl_run_target_region +int32_t __tgt_rtl_run_target_region_async(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs, + __tgt_async_info *AsyncInfoPtr); + +// Similar to __tgt_rtl_run_target_region, but additionally specify the +// number of teams to be created and a number of threads in each team. If +// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. +// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that +// case, it is synchronous. 
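The asynchronous variants all share one __tgt_async_info per nowait region: the plugin lazily attaches its queue (a CUstream in the CUDA plugin) on first use, and a single __tgt_rtl_synchronize completes everything enqueued on it. A small sketch under that assumption, with the device, image, and target buffer assumed to be already set up:

// Sketch: two async operations on the same plugin-managed queue, completed by
// one synchronize at the end.
#include <cstddef>
#include <cstdint>
#include "omptargetplugin.h"   // assumed include; brings in __tgt_async_info

int submit_and_run_async(int32_t Dev, void *Entry, void *TgtBuf,
                         void *HostBuf, int64_t Size) {
  __tgt_async_info AsyncInfo;    // Queue starts as nullptr; the plugin fills it
  if (__tgt_rtl_data_submit_async(Dev, TgtBuf, HostBuf, Size, &AsyncInfo) != 0)
    return 1;

  void *Args[] = {TgtBuf};
  ptrdiff_t Offsets[] = {0};
  if (__tgt_rtl_run_target_region_async(Dev, Entry, Args, Offsets,
                                        /*NumArgs=*/1, &AsyncInfo) != 0)
    return 1;

  // Both operations were enqueued on AsyncInfo.Queue, so one synchronize is
  // enough; on some plugins (e.g. x86_64) the calls were synchronous anyway.
  return __tgt_rtl_synchronize(Dev, &AsyncInfo);
}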
+int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t loop_tripcount); + +// Asynchronous version of __tgt_rtl_run_target_team_region +int32_t __tgt_rtl_run_target_team_region_async( + int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, uint64_t loop_tripcount, + __tgt_async_info *AsyncInfoPtr); + +// Device synchronization. In case of success, return zero. Otherwise, return an +// error code. +int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfoPtr); + +#ifdef __cplusplus +} +#endif + +#endif // _OMPTARGETPLUGIN_H_ diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt index bb3f9c908087a..33c69e3f14b4c 100644 --- a/openmp/libomptarget/plugins/CMakeLists.txt +++ b/openmp/libomptarget/plugins/CMakeLists.txt @@ -1,77 +1,77 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build plugins for the user system if available. -# -##===----------------------------------------------------------------------===## - -# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id); -# - build a plugin for an ELF based generic 64-bit target based on libffi. -# - tmachine: name of the machine processor as used in the cmake build system. -# - tmachine_name: name of the machine to be printed with the debug messages. -# - tmachine_libname: machine name to be appended to the plugin library name. -macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") - if(LIBOMPTARGET_DEP_LIBELF_FOUND) - if(LIBOMPTARGET_DEP_LIBFFI_FOUND) - - libomptarget_say("Building ${tmachine_name} offloading plugin.") - - include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) - include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) - - # Define macro to be used as prefix of the runtime messages for this target. - add_definitions("-DTARGET_NAME=${tmachine_name}") - - # Define macro with the ELF ID for this target. - add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") - - add_library("omptarget.rtl.${tmachine_libname}" SHARED - ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp) - - # Install plugin under the lib destination folder. - install(TARGETS "omptarget.rtl.${tmachine_libname}" - LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") - - target_link_libraries( - "omptarget.rtl.${tmachine_libname}" - ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} - ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} - dl - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") - - list(APPEND LIBOMPTARGET_TESTED_PLUGINS - "omptarget.rtl.${tmachine_libname}") - - # Report to the parent scope that we are building a plugin. 
- set(LIBOMPTARGET_SYSTEM_TARGETS - "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) - set(LIBOMPTARGET_TESTED_PLUGINS - "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) - - else(LIBOMPTARGET_DEP_LIBFFI_FOUND) - libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") - endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) - else(LIBOMPTARGET_DEP_LIBELF_FOUND) - libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.") - endif(LIBOMPTARGET_DEP_LIBELF_FOUND) -else() - libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.") -endif() -endmacro() - -add_subdirectory(aarch64) -add_subdirectory(cuda) -add_subdirectory(ppc64) -add_subdirectory(ppc64le) -add_subdirectory(x86_64) - -# Make sure the parent scope can see the plugins that will be created. -set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) -set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) - +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build plugins for the user system if available. +# +##===----------------------------------------------------------------------===## + +# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id); +# - build a plugin for an ELF based generic 64-bit target based on libffi. +# - tmachine: name of the machine processor as used in the cmake build system. +# - tmachine_name: name of the machine to be printed with the debug messages. +# - tmachine_libname: machine name to be appended to the plugin library name. +macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + if(LIBOMPTARGET_DEP_LIBELF_FOUND) + if(LIBOMPTARGET_DEP_LIBFFI_FOUND) + + libomptarget_say("Building ${tmachine_name} offloading plugin.") + + include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) + include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) + + # Define macro to be used as prefix of the runtime messages for this target. + add_definitions("-DTARGET_NAME=${tmachine_name}") + + # Define macro with the ELF ID for this target. + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_library("omptarget.rtl.${tmachine_libname}" SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp) + + # Install plugin under the lib destination folder. + install(TARGETS "omptarget.rtl.${tmachine_libname}" + LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") + + target_link_libraries( + "omptarget.rtl.${tmachine_libname}" + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + dl + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + + list(APPEND LIBOMPTARGET_TESTED_PLUGINS + "omptarget.rtl.${tmachine_libname}") + + # Report to the parent scope that we are building a plugin. 
+ set(LIBOMPTARGET_SYSTEM_TARGETS + "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) + set(LIBOMPTARGET_TESTED_PLUGINS + "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + + else(LIBOMPTARGET_DEP_LIBFFI_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") + endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) + else(LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.") + endif(LIBOMPTARGET_DEP_LIBELF_FOUND) +else() + libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.") +endif() +endmacro() + +add_subdirectory(aarch64) +add_subdirectory(cuda) +add_subdirectory(ppc64) +add_subdirectory(ppc64le) +add_subdirectory(x86_64) + +# Make sure the parent scope can see the plugins that will be created. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) +set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + diff --git a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins/aarch64/CMakeLists.txt index 350a56cb9a493..84d9be9dbc2e2 100644 --- a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt +++ b/openmp/libomptarget/plugins/aarch64/CMakeLists.txt @@ -1,17 +1,17 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for an aarch64 machine if available. -# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") -else() - libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.") -endif() +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for an aarch64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") +else() + libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.") +endif() diff --git a/openmp/libomptarget/plugins/common/elf_common.c b/openmp/libomptarget/plugins/common/elf_common.c index b0efd1abc1489..b912bbf11a27e 100644 --- a/openmp/libomptarget/plugins/common/elf_common.c +++ b/openmp/libomptarget/plugins/common/elf_common.c @@ -1,73 +1,73 @@ -//===-- elf_common.c - Common ELF functionality -------------------*- C -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Common ELF functionality for target plugins. 
-// Must be included in the plugin source file AFTER omptarget.h has been -// included and macro DP(...) has been defined. -// . -// -//===----------------------------------------------------------------------===// - -#if !(defined(_OMPTARGET_H_) && defined(DP)) -#error Include elf_common.c in the plugin source AFTER omptarget.h has been\ - included and macro DP(...) has been defined. -#endif - -#include -#include - -// Check whether an image is valid for execution on target_id -static inline int32_t elf_check_machine(__tgt_device_image *image, - uint16_t target_id) { - - // Is the library version incompatible with the header file? - if (elf_version(EV_CURRENT) == EV_NONE) { - DP("Incompatible ELF library!\n"); - return 0; - } - - char *img_begin = (char *)image->ImageStart; - char *img_end = (char *)image->ImageEnd; - size_t img_size = img_end - img_begin; - - // Obtain elf handler - Elf *e = elf_memory(img_begin, img_size); - if (!e) { - DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); - return 0; - } - - // Check if ELF is the right kind. - if (elf_kind(e) != ELF_K_ELF) { - DP("Unexpected ELF type!\n"); - elf_end(e); - return 0; - } - Elf64_Ehdr *eh64 = elf64_getehdr(e); - Elf32_Ehdr *eh32 = elf32_getehdr(e); - - if (!eh64 && !eh32) { - DP("Unable to get machine ID from ELF file!\n"); - elf_end(e); - return 0; - } - - uint16_t MachineID; - if (eh64 && !eh32) - MachineID = eh64->e_machine; - else if (eh32 && !eh64) - MachineID = eh32->e_machine; - else { - DP("Ambiguous ELF header!\n"); - elf_end(e); - return 0; - } - - elf_end(e); - return MachineID == target_id; -} +//===-- elf_common.c - Common ELF functionality -------------------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Common ELF functionality for target plugins. +// Must be included in the plugin source file AFTER omptarget.h has been +// included and macro DP(...) has been defined. +// . +// +//===----------------------------------------------------------------------===// + +#if !(defined(_OMPTARGET_H_) && defined(DP)) +#error Include elf_common.c in the plugin source AFTER omptarget.h has been\ + included and macro DP(...) has been defined. +#endif + +#include +#include + +// Check whether an image is valid for execution on target_id +static inline int32_t elf_check_machine(__tgt_device_image *image, + uint16_t target_id) { + + // Is the library version incompatible with the header file? + if (elf_version(EV_CURRENT) == EV_NONE) { + DP("Incompatible ELF library!\n"); + return 0; + } + + char *img_begin = (char *)image->ImageStart; + char *img_end = (char *)image->ImageEnd; + size_t img_size = img_end - img_begin; + + // Obtain elf handler + Elf *e = elf_memory(img_begin, img_size); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return 0; + } + + // Check if ELF is the right kind. 
+ if (elf_kind(e) != ELF_K_ELF) { + DP("Unexpected ELF type!\n"); + elf_end(e); + return 0; + } + Elf64_Ehdr *eh64 = elf64_getehdr(e); + Elf32_Ehdr *eh32 = elf32_getehdr(e); + + if (!eh64 && !eh32) { + DP("Unable to get machine ID from ELF file!\n"); + elf_end(e); + return 0; + } + + uint16_t MachineID; + if (eh64 && !eh32) + MachineID = eh64->e_machine; + else if (eh32 && !eh64) + MachineID = eh32->e_machine; + else { + DP("Ambiguous ELF header!\n"); + elf_end(e); + return 0; + } + + elf_end(e); + return MachineID == target_id; +} diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt index 54bcdf26e9e6b..8fee1c72767a1 100644 --- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt +++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt @@ -1,45 +1,45 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a CUDA machine if available. -# -##===----------------------------------------------------------------------===## -if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) - libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.") - return() -elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND) - libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.") - return() -elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND) - libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.") - return() -elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND) - libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.") - return() -endif() - -libomptarget_say("Building CUDA offloading plugin.") - -# Define the suffix for the runtime messaging dumps. -add_definitions(-DTARGET_NAME=CUDA) - -include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) -include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) - -add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) - -# Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") - -target_link_libraries(omptarget.rtl.cuda - ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} - ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") - -# Report to the parent scope that we are building a plugin for CUDA. -set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE) +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a CUDA machine if available. 
+# +##===----------------------------------------------------------------------===## +if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) + libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.") + return() +elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.") + return() +elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.") + return() +elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND) + libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.") + return() +endif() + +libomptarget_say("Building CUDA offloading plugin.") + +# Define the suffix for the runtime messaging dumps. +add_definitions(-DTARGET_NAME=CUDA) + +include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) +include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) + +add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) + +# Install plugin under the lib destination folder. +install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") + +target_link_libraries(omptarget.rtl.cuda + ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + +# Report to the parent scope that we are building a plugin for CUDA. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE) diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp index 4ad58e290252d..9e3f1e0b35bea 100644 --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -1,1042 +1,1088 @@ -//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RTL for CUDA machine -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "omptargetplugin.h" - -#ifndef TARGET_NAME -#define TARGET_NAME CUDA -#endif - -#ifdef OMPTARGET_DEBUG -static int DebugLevel = 0; - -#define GETNAME2(name) #name -#define GETNAME(name) GETNAME2(name) -#define DP(...) \ - do { \ - if (DebugLevel > 0) { \ - DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ - } \ - } while (false) - -// Utility for retrieving and printing CUDA error string. -#define CUDA_ERR_STRING(err) \ - do { \ - if (DebugLevel > 0) { \ - const char *errStr; \ - cuGetErrorString(err, &errStr); \ - DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", errStr); \ - } \ - } while (false) -#else // OMPTARGET_DEBUG -#define DP(...) {} -#define CUDA_ERR_STRING(err) {} -#endif // OMPTARGET_DEBUG - -#include "../../common/elf_common.c" - -/// Keep entries table per device. 
-struct FuncOrGblEntryTy { - __tgt_target_table Table; - std::vector<__tgt_offload_entry> Entries; -}; - -enum ExecutionModeType { - SPMD, // constructors, destructors, - // combined constructs (`teams distribute parallel for [simd]`) - GENERIC, // everything else - NONE -}; - -/// Use a single entity to encode a kernel and a set of flags. -struct KernelTy { - CUfunction Func; - - // execution mode of kernel - // 0 - SPMD mode (without master warp) - // 1 - Generic mode (with master warp) - int8_t ExecutionMode; - - KernelTy(CUfunction _Func, int8_t _ExecutionMode) - : Func(_Func), ExecutionMode(_ExecutionMode) {} -}; - -/// Device environment data -/// Manually sync with the deviceRTL side for now, move to a dedicated header -/// file later. -struct omptarget_device_environmentTy { - int32_t debug_level; -}; - -/// List that contains all the kernels. -/// FIXME: we may need this to be per device and per library. -std::list KernelsList; - -namespace { -bool checkResult(CUresult Err, const char *ErrMsg) { - if (Err == CUDA_SUCCESS) - return true; - - DP(ErrMsg); - CUDA_ERR_STRING(Err); - return false; -} - -// Structure contains per-device data -struct DeviceDataTy { - std::list FuncGblEntries; - CUcontext Context = nullptr; - // Device properties - int ThreadsPerBlock = 0; - int BlocksPerGrid = 0; - int WarpSize = 0; - // OpenMP properties - int NumTeams = 0; - int NumThreads = 0; -}; - -class StreamManagerTy { - int NumberOfDevices; - // The initial size of stream pool - int EnvNumInitialStreams; - // Per-device stream mutex - std::vector> StreamMtx; - // Per-device stream Id indicates the next available stream in the pool - std::vector NextStreamId; - // Per-device stream pool - std::vector> StreamPool; - // Reference to per-device data - std::vector &DeviceData; - - // If there is no CUstream left in the pool, we will resize the pool to - // allocate more CUstream. This function should be called with device mutex, - // and we do not resize to smaller one. - void resizeStreamPool(const int DeviceId, const size_t NewSize) { - std::vector &Pool = StreamPool[DeviceId]; - const size_t CurrentSize = Pool.size(); - assert(NewSize > CurrentSize && "new size is not larger than current size"); - - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) { - // We will return if cannot switch to the right context in case of - // creating bunch of streams that are not corresponding to the right - // device. The offloading will fail later because selected CUstream is - // nullptr. 
- return; - } - - Pool.resize(NewSize, nullptr); - - for (size_t I = CurrentSize; I < NewSize; ++I) { - checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING), - "Error returned from cuStreamCreate\n"); - } - } - -public: - StreamManagerTy(const int NumberOfDevices, - std::vector &DeviceData) - : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32), - DeviceData(DeviceData) { - StreamPool.resize(NumberOfDevices); - NextStreamId.resize(NumberOfDevices); - StreamMtx.resize(NumberOfDevices); - - if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) - EnvNumInitialStreams = std::stoi(EnvStr); - - // Initialize the next stream id - std::fill(NextStreamId.begin(), NextStreamId.end(), 0); - - // Initialize stream mutex - for (std::unique_ptr &Ptr : StreamMtx) - Ptr = std::make_unique(); - } - - ~StreamManagerTy() { - // Destroy streams - for (int I = 0; I < NumberOfDevices; ++I) { - checkResult(cuCtxSetCurrent(DeviceData[I].Context), - "Error returned from cuCtxSetCurrent\n"); - - for (CUstream &S : StreamPool[I]) { - if (S) - checkResult(cuStreamDestroy(S), - "Error returned from cuStreamDestroy\n"); - } - } - } - - // Get a CUstream from pool. Per-device next stream id always points to the - // next available CUstream. That means, CUstreams [0, id-1] have been - // assigned, and [id,] are still available. If there is no CUstream left, we - // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned, - // the id will increase one. - // xxxxxs+++++++++ - // ^ - // id - // After assignment, the pool becomes the following and s is assigned. - // xxxxxs+++++++++ - // ^ - // id - CUstream getStream(const int DeviceId) { - const std::lock_guard Lock(*StreamMtx[DeviceId]); - int &Id = NextStreamId[DeviceId]; - // No CUstream left in the pool, we need to request from CUDA RT - if (Id == StreamPool[DeviceId].size()) { - // By default we double the stream pool every time - resizeStreamPool(DeviceId, Id * 2); - } - return StreamPool[DeviceId][Id++]; - } - - // Return a CUstream back to pool. As mentioned above, per-device next - // stream is always points to the next available CUstream, so when we return - // a CUstream, we need to first decrease the id, and then copy the CUstream - // back. - // It is worth noting that, the order of streams return might be different - // from that they're assigned, that saying, at some point, there might be - // two identical CUstreams. - // xxax+a+++++ - // ^ - // id - // However, it doesn't matter, because they're always on the two sides of - // id. The left one will in the end be overwritten by another CUstream. - // Therefore, after several execution, the order of pool might be different - // from its initial state. 
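The pool policy described above (hand out from the front, grow by doubling on exhaustion, return by decrementing the cursor) is independent of CUDA. Here is a self-contained sketch of the same idea with the stream type replaced by a plain integer handle, so it compiles without the driver API; HandlePool and NextFresh are illustrative names, not part of the plugin.

// Grow-only handle pool mirroring the StreamManagerTy policy: [0, NextId)
// have been handed out, [NextId, size()) are available, and it never shrinks.
#include <cassert>
#include <cstddef>
#include <mutex>
#include <vector>

class HandlePool {
  std::vector<int> Pool;
  size_t NextId = 0;
  std::mutex Mtx;
  int NextFresh = 0;                   // stands in for cuStreamCreate

  void grow(size_t NewSize) {
    assert(NewSize > Pool.size() && "pool never shrinks");
    while (Pool.size() < NewSize)
      Pool.push_back(NextFresh++);
  }

public:
  explicit HandlePool(size_t Initial = 32) { grow(Initial ? Initial : 1); }

  int get() {
    std::lock_guard<std::mutex> L(Mtx);
    if (NextId == Pool.size())
      grow(Pool.size() * 2);           // double on exhaustion, as the plugin does
    return Pool[NextId++];
  }

  void put(int Handle) {
    std::lock_guard<std::mutex> L(Mtx);
    assert(NextId > 0 && "returning to a full pool");
    Pool[--NextId] = Handle;           // return order may differ from handout order
  }
};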
- void returnStream(const int DeviceId, CUstream Stream) { - const std::lock_guard Lock(*StreamMtx[DeviceId]); - int &Id = NextStreamId[DeviceId]; - assert(Id > 0 && "Wrong stream ID"); - StreamPool[DeviceId][--Id] = Stream; - } - - bool initializeDeviceStreamPool(const int DeviceId) { - assert(StreamPool[DeviceId].empty() && "stream pool has been initialized"); - - resizeStreamPool(DeviceId, EnvNumInitialStreams); - - // Check the size of stream pool - if (StreamPool[DeviceId].size() != EnvNumInitialStreams) - return false; - - // Check whether each stream is valid - for (CUstream &S : StreamPool[DeviceId]) - if (!S) - return false; - - return true; - } -}; - -class DeviceRTLTy { - int NumberOfDevices; - // OpenMP environment properties - int EnvNumTeams; - int EnvTeamLimit; - // OpenMP requires flags - int64_t RequiresFlags; - - static constexpr const int HardTeamLimit = 1U << 16U; // 64k - static constexpr const int HardThreadLimit = 1024; - static constexpr const int DefaultNumTeams = 128; - static constexpr const int DefaultNumThreads = 128; - - std::unique_ptr StreamManager; - std::vector DeviceData; - std::vector Modules; - - // Record entry point associated with device - void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - E.Entries.push_back(entry); - } - - // Return true if the entry is associated with device - bool findOffloadEntry(const int DeviceId, const void *Addr) const { - for (const __tgt_offload_entry &Itr : - DeviceData[DeviceId].FuncGblEntries.back().Entries) - if (Itr.addr == Addr) - return true; - - return false; - } - - // Return the pointer to the target entries table - __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - - if (E.Entries.empty()) - return nullptr; - - // Update table info according to the entries and return the pointer - E.Table.EntriesBegin = E.Entries.data(); - E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); - - return &E.Table; - } - - // Clear entries table for a device - void clearOffloadEntriesTable(const int DeviceId) { - DeviceData[DeviceId].FuncGblEntries.emplace_back(); - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - E.Entries.clear(); - E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; - } - - CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { - assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); - - if (!AsyncInfoPtr->Queue) - AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId); - - return reinterpret_cast(AsyncInfoPtr->Queue); - } - -public: - // This class should not be copied - DeviceRTLTy(const DeviceRTLTy &) = delete; - DeviceRTLTy(DeviceRTLTy &&) = delete; - - DeviceRTLTy() - : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), - RequiresFlags(OMP_REQ_UNDEFINED) { -#ifdef OMPTARGET_DEBUG - if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG")) - DebugLevel = std::stoi(EnvStr); -#endif // OMPTARGET_DEBUG - - DP("Start initializing CUDA\n"); - - CUresult Err = cuInit(0); - if (!checkResult(Err, "Error returned from cuInit\n")) { - return; - } - - Err = cuDeviceGetCount(&NumberOfDevices); - if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) - return; - - if (NumberOfDevices == 0) { - DP("There are no devices supporting CUDA.\n"); - return; - } - - DeviceData.resize(NumberOfDevices); - - // Get environment variables regarding teams - if (const char *EnvStr = 
getenv("OMP_TEAM_LIMIT")) { - // OMP_TEAM_LIMIT has been set - EnvTeamLimit = std::stoi(EnvStr); - DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); - } - if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { - // OMP_NUM_TEAMS has been set - EnvNumTeams = std::stoi(EnvStr); - DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); - } - - StreamManager = - std::make_unique(NumberOfDevices, DeviceData); - } - - ~DeviceRTLTy() { - // First destruct stream manager in case of Contexts is destructed before it - StreamManager = nullptr; - - for (CUmodule &M : Modules) - // Close module - if (M) - checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); - - for (DeviceDataTy &D : DeviceData) { - // Destroy context - if (D.Context) - checkResult(cuCtxDestroy(D.Context), - "Error returned from cuCtxDestroy\n"); - } - } - - // Check whether a given DeviceId is valid - bool isValidDeviceId(const int DeviceId) const { - return DeviceId >= 0 && DeviceId < NumberOfDevices; - } - - bool getNumOfDevices() const { return NumberOfDevices; } - - void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } - - int initDevice(const int DeviceId) { - CUdevice Device; - - DP("Getting device %d\n", DeviceId); - CUresult Err = cuDeviceGet(&Device, DeviceId); - if (!checkResult(Err, "Error returned from cuDeviceGet\n")) - return OFFLOAD_FAIL; - - // Create the context and save it to use whenever this device is selected. - Err = cuCtxCreate(&DeviceData[DeviceId].Context, CU_CTX_SCHED_BLOCKING_SYNC, - Device); - if (!checkResult(Err, "Error returned from cuCtxCreate\n")) - return OFFLOAD_FAIL; - - Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - // Initialize stream pool - if (!StreamManager->initializeDeviceStreamPool(DeviceId)) - return OFFLOAD_FAIL; - - // Query attributes to determine number of threads/block and blocks/grid. - int MaxGridDimX; - Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting max grid dimension, use default value %d\n", - DeviceRTLTy::DefaultNumTeams); - DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; - } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { - DP("Using %d CUDA blocks per grid\n", MaxGridDimX); - DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; - } else { - DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " - "at the hard limit\n", - MaxGridDimX, DeviceRTLTy::HardTeamLimit); - DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; - } - - // We are only exploiting threads along the x axis. 
- int MaxBlockDimX; - Err = cuDeviceGetAttribute(&MaxBlockDimX, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting max block dimension, use default value %d\n", - DeviceRTLTy::DefaultNumThreads); - DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; - } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) { - DP("Using %d CUDA threads per block\n", MaxBlockDimX); - DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; - } else { - DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " - "capping at the hard limit\n", - MaxBlockDimX, DeviceRTLTy::HardThreadLimit); - DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; - } - - // Get and set warp size - int WarpSize; - Err = - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting warp size, assume default value 32\n"); - DeviceData[DeviceId].WarpSize = 32; - } else { - DP("Using warp size %d\n", WarpSize); - DeviceData[DeviceId].WarpSize = WarpSize; - } - - // Adjust teams to the env variables - if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { - DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", - EnvTeamLimit); - DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; - } - - DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", - DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock, - DeviceData[DeviceId].WarpSize); - - // Set default number of teams - if (EnvNumTeams > 0) { - DP("Default number of teams set according to environment %d\n", - EnvNumTeams); - DeviceData[DeviceId].NumTeams = EnvNumTeams; - } else { - DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; - DP("Default number of teams set according to library's default %d\n", - DeviceRTLTy::DefaultNumTeams); - } - - if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { - DP("Default number of teams exceeds device limit, capping at %d\n", - DeviceData[DeviceId].BlocksPerGrid); - DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; - } - - // Set default number of threads - DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; - DP("Default number of threads set according to library's default %d\n", - DeviceRTLTy::DefaultNumThreads); - if (DeviceData[DeviceId].NumThreads > - DeviceData[DeviceId].ThreadsPerBlock) { - DP("Default number of threads exceeds device limit, capping at %d\n", - DeviceData[DeviceId].ThreadsPerBlock); - DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; - } - - return OFFLOAD_SUCCESS; - } - - __tgt_target_table *loadBinary(const int DeviceId, - const __tgt_device_image *Image) { - // Set the context we are using - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return nullptr; - - // Clear the offload table as we are going to create a new one. - clearOffloadEntriesTable(DeviceId); - - // Create the module and extract the function pointers. - CUmodule Module; - DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); - Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); - if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) - return nullptr; - - DP("CUDA module successfully loaded!\n"); - - Modules.push_back(Module); - - // Find the symbols in the module by name. 
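The loop that follows resolves each host entry to either a device global or a CUfunction, and reads a companion "<name>_exec_mode" byte to decide between SPMD and generic launch. A condensed sketch of that lookup using the same driver-API calls; readExecMode is an illustrative helper name, and error handling is reduced to returning the generic default.

// Condensed sketch of the "<kernel>_exec_mode" lookup performed while loading
// a binary: a missing symbol or a size mismatch falls back to generic mode (1).
#include <cstdint>
#include <string>
#include <cuda.h>

static int8_t readExecMode(CUmodule Module, const char *KernelName) {
  std::string GlobalName = std::string(KernelName) + "_exec_mode";
  CUdeviceptr Ptr;
  size_t Bytes;
  if (cuModuleGetGlobal(&Ptr, &Bytes, Module, GlobalName.c_str()) !=
          CUDA_SUCCESS ||
      Bytes != sizeof(int8_t))
    return 1;                           // generic mode is the documented default

  int8_t Mode = 1;
  if (cuMemcpyDtoH(&Mode, Ptr, sizeof(int8_t)) != CUDA_SUCCESS)
    return 1;
  return Mode;                          // 0 = SPMD, 1 = generic
}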
- const __tgt_offload_entry *HostBegin = Image->EntriesBegin; - const __tgt_offload_entry *HostEnd = Image->EntriesEnd; - - for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { - if (!E->addr) { - // We return nullptr when something like this happens, the host should - // have always something in the address to uniquely identify the target - // region. - DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); - return nullptr; - } - - if (E->size) { - __tgt_offload_entry Entry = *E; - CUdeviceptr CUPtr; - size_t CUSize; - Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - DP("Loading global '%s' (Failed)\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - if (CUSize != E->size) { - DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, - CUSize, E->size); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); - - Entry.addr = (void *)(CUPtr); - - // Note: In the current implementation declare target variables - // can either be link or to. This means that once unified - // memory is activated via the requires directive, the variable - // can be used directly from the host in both cases. - // TODO: when variables types other than to or link are added, - // the below condition should be changed to explicitly - // check for to and link variables types: - // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & - // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) - if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { - // If unified memory is present any target link or to variables - // can access host addresses directly. There is no longer a - // need for device copies. - cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); - DP("Copy linked variable host address (" DPxMOD - ") to device address (" DPxMOD ")\n", - DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); - } - - addOffloadEntry(DeviceId, Entry); - - continue; - } - - CUfunction Func; - Err = cuModuleGetFunction(&Func, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - DP("Loading '%s' (Failed)\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); - - // default value GENERIC (in case symbol is missing from cubin file) - int8_t ExecModeVal = ExecutionModeType::GENERIC; - std::string ExecModeNameStr(E->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - CUdeviceptr ExecModePtr; - size_t CUSize; - Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); - if (Err == CUDA_SUCCESS) { - if (CUSize != sizeof(int8_t)) { - DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", - ExecModeName, CUSize, sizeof(int8_t)); - return nullptr; - } - - Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from device to host. 
Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", - DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); - CUDA_ERR_STRING(Err); - return nullptr; - } - - if (ExecModeVal < 0 || ExecModeVal > 1) { - DP("Error wrong exec_mode value specified in cubin file: %d\n", - ExecModeVal); - return nullptr; - } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value GENERIC (1)\n", - ExecModeName); - CUDA_ERR_STRING(Err); - } - - KernelsList.emplace_back(Func, ExecModeVal); - - __tgt_offload_entry Entry = *E; - Entry.addr = &KernelsList.back(); - addOffloadEntry(DeviceId, Entry); - } - - // send device environment data to the device - { - omptarget_device_environmentTy DeviceEnv{0}; - -#ifdef OMPTARGET_DEBUG - if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) - DeviceEnv.debug_level = std::stoi(EnvStr); -#endif - - const char *DeviceEnvName = "omptarget_device_environment"; - CUdeviceptr DeviceEnvPtr; - size_t CUSize; - - Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); - if (Err == CUDA_SUCCESS) { - if (CUSize != sizeof(DeviceEnv)) { - DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", - DeviceEnvName, CUSize, sizeof(int32_t)); - CUDA_ERR_STRING(Err); - return nullptr; - } - - Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from host to device. Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", - DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); - CUDA_ERR_STRING(Err); - return nullptr; - } - - DP("Sending global device environment data %zu bytes\n", CUSize); - } else { - DP("Finding global device environment '%s' - symbol missing.\n", - DeviceEnvName); - DP("Continue, considering this is a device RTL which does not accept " - "environment setting.\n"); - } - } - - return getOffloadEntriesTable(DeviceId); - } - - void *dataAlloc(const int DeviceId, const int64_t Size) const { - if (Size == 0) - return nullptr; - - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return nullptr; - - CUdeviceptr DevicePtr; - Err = cuMemAlloc(&DevicePtr, Size); - if (!checkResult(Err, "Error returned from cuMemAlloc\n")) - return nullptr; - - return (void *)DevicePtr; - } - - int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, - const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { - assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); - - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - CUstream Stream = getStream(DeviceId, AsyncInfoPtr); - - Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from host to device. 
Pointers: host = " DPxMOD - ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; - } - - int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, - const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { - assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); - - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - CUstream Stream = getStream(DeviceId, AsyncInfoPtr); - - Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from device to host. Pointers: host = " DPxMOD - ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; - } - - int dataDelete(const int DeviceId, void *TgtPtr) const { - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - Err = cuMemFree((CUdeviceptr)TgtPtr); - if (!checkResult(Err, "Error returned from cuMemFree\n")) - return OFFLOAD_FAIL; - - return OFFLOAD_SUCCESS; - } - - int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - const int ArgNum, const int TeamNum, - const int ThreadLimit, - const unsigned int LoopTripCount, - __tgt_async_info *AsyncInfo) const { - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - // All args are references. - std::vector Args(ArgNum); - std::vector Ptrs(ArgNum); - - for (int I = 0; I < ArgNum; ++I) { - Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); - Args[I] = &Ptrs[I]; - } - - const KernelTy *KernelInfo = - reinterpret_cast(TgtEntryPtr); - - unsigned int CudaThreadsPerBlock; - if (ThreadLimit > 0) { - DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); - CudaThreadsPerBlock = ThreadLimit; - // Add master warp if necessary - if (KernelInfo->ExecutionMode == GENERIC) { - DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); - CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; - } - } else { - DP("Setting CUDA threads per block to default %d\n", - DeviceData[DeviceId].NumThreads); - CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; - } - - if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { - DP("Threads per block capped at device limit %d\n", - DeviceData[DeviceId].ThreadsPerBlock); - CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; - } - - int KernelLimit; - Err = cuFuncGetAttribute(&KernelLimit, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - KernelInfo->Func); - if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) { - DP("Threads per block capped at kernel limit %d\n", KernelLimit); - CudaThreadsPerBlock = KernelLimit; - } - - unsigned int CudaBlocksPerGrid; - if (TeamNum <= 0) { - if (LoopTripCount > 0 && EnvNumTeams < 0) { - if (KernelInfo->ExecutionMode == SPMD) { - // We have a combined construct, i.e. `target teams distribute - // parallel for [simd]`. We launch so many teams so that each thread - // will execute one iteration of the loop. 
round up to the nearest - // integer - CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; - } else { - // If we reach this point, then we have a non-combined construct, i.e. - // `teams distribute` with a nested `parallel for` and each team is - // assigned one iteration of the `distribute` loop. E.g.: - // - // #pragma omp target teams distribute - // for(...loop_tripcount...) { - // #pragma omp parallel for - // for(...) {} - // } - // - // Threads within a team will execute the iterations of the `parallel` - // loop. - CudaBlocksPerGrid = LoopTripCount; - } - DP("Using %d teams due to loop trip count %" PRIu64 - " and number of threads per block %d\n", - CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); - } else { - DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); - CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; - } - } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { - DP("Capping number of teams to team limit %d\n", - DeviceData[DeviceId].BlocksPerGrid); - CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; - } else { - DP("Using requested number of teams %d\n", TeamNum); - CudaBlocksPerGrid = TeamNum; - } - - // Run on the device. - DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid, - CudaThreadsPerBlock); - - CUstream Stream = getStream(DeviceId, AsyncInfo); - Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, - /* gridDimZ */ 1, CudaThreadsPerBlock, - /* blockDimY */ 1, /* blockDimZ */ 1, - /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); - if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) - return OFFLOAD_FAIL; - - DP("Launch of entry point at " DPxMOD " successful!\n", - DPxPTR(TgtEntryPtr)); - - return OFFLOAD_SUCCESS; - } - - int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { - CUstream Stream = reinterpret_cast(AsyncInfoPtr->Queue); - CUresult Err = cuStreamSynchronize(Stream); - if (Err != CUDA_SUCCESS) { - DP("Error when synchronizing stream. stream = " DPxMOD - ", async info ptr = " DPxMOD "\n", - DPxPTR(Stream), DPxPTR(AsyncInfoPtr)); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - // Once the stream is synchronized, return it to stream pool and reset - // async_info. This is to make sure the synchronization only works for its - // own tasks. 
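Returning briefly to the team-count selection above: for an SPMD (combined-construct) kernel with a known trip count, the grid size is the trip count divided by the threads per block, rounded up so every iteration gets a thread. A tiny worked helper, with teamsForTripCount as an illustrative name:

// Ceiling division used to size the grid for combined constructs: one thread
// per loop iteration, rounded up to whole blocks.
#include <cassert>
#include <cstdint>

static unsigned teamsForTripCount(uint64_t TripCount, unsigned ThreadsPerBlock) {
  assert(TripCount > 0 && ThreadsPerBlock > 0);
  return static_cast<unsigned>((TripCount - 1) / ThreadsPerBlock) + 1;
}

// Example: 1000 iterations at 128 threads per block -> 8 blocks
// (7 full blocks cover 896 iterations, the eighth covers the last 104).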
- StreamManager->returnStream( - DeviceId, reinterpret_cast(AsyncInfoPtr->Queue)); - AsyncInfoPtr->Queue = nullptr; - - return OFFLOAD_SUCCESS; - } -}; - -DeviceRTLTy DeviceRTL; -} // namespace - -// Exposed library API function -#ifdef __cplusplus -extern "C" { -#endif - -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { - return elf_check_machine(image, /* EM_CUDA */ 190); -} - -int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } - -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - DP("Init requires flags to %ld\n", RequiresFlags); - DeviceRTL.setRequiresFlag(RequiresFlags); - return RequiresFlags; -} - -int32_t __tgt_rtl_init_device(int32_t device_id) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return DeviceRTL.initDevice(device_id); -} - -__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, - __tgt_device_image *image) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return DeviceRTL.loadBinary(device_id, image); -} - -void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return DeviceRTL.dataAlloc(device_id, size); -} - -int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - __tgt_async_info async_info; - const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, - size, &async_info); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(device_id, &async_info); -} - -int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, - void *hst_ptr, int64_t size, - __tgt_async_info *async_info_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - assert(async_info_ptr && "async_info_ptr is nullptr"); - - return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, - async_info_ptr); -} - -int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - __tgt_async_info async_info; - const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, - size, &async_info); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(device_id, &async_info); -} - -int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, - void *tgt_ptr, int64_t size, - __tgt_async_info *async_info_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - assert(async_info_ptr && "async_info_ptr is nullptr"); - - return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, - async_info_ptr); -} - -int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return DeviceRTL.dataDelete(device_id, tgt_ptr); -} - -int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, - ptrdiff_t *tgt_offsets, - int32_t arg_num, int32_t team_num, - int32_t thread_limit, - uint64_t loop_tripcount) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - __tgt_async_info async_info; - const int32_t rc = __tgt_rtl_run_target_team_region_async( - device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, - thread_limit, loop_tripcount, &async_info); - if (rc != 
OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(device_id, &async_info); -} - -int32_t __tgt_rtl_run_target_team_region_async( - int32_t device_id, void *tgt_entry_ptr, void **tgt_args, - ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, - int32_t thread_limit, uint64_t loop_tripcount, - __tgt_async_info *async_info_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return DeviceRTL.runTargetTeamRegion( - device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, - thread_limit, loop_tripcount, async_info_ptr); -} - -int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, - int32_t arg_num) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - __tgt_async_info async_info; - const int32_t rc = __tgt_rtl_run_target_region_async( - device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(device_id, &async_info); -} - -int32_t __tgt_rtl_run_target_region_async(int32_t device_id, - void *tgt_entry_ptr, void **tgt_args, - ptrdiff_t *tgt_offsets, - int32_t arg_num, - __tgt_async_info *async_info_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return __tgt_rtl_run_target_team_region_async( - device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, - /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, - async_info_ptr); -} - -int32_t __tgt_rtl_synchronize(int32_t device_id, - __tgt_async_info *async_info_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - assert(async_info_ptr && "async_info_ptr is nullptr"); - assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); - - return DeviceRTL.synchronize(device_id, async_info_ptr); -} - -#ifdef __cplusplus -} -#endif +//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL for CUDA machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "omptargetplugin.h" + +#ifndef TARGET_NAME +#define TARGET_NAME CUDA +#endif + +#ifdef OMPTARGET_DEBUG +static int DebugLevel = 0; + +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) +#define DP(...) \ + do { \ + if (DebugLevel > 0) { \ + DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ + } \ + } while (false) + +// Utility for retrieving and printing CUDA error string. +#define CUDA_ERR_STRING(err) \ + do { \ + if (DebugLevel > 0) { \ + const char *errStr; \ + cuGetErrorString(err, &errStr); \ + DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", errStr); \ + } \ + } while (false) +#else // OMPTARGET_DEBUG +#define DP(...) {} +#define CUDA_ERR_STRING(err) {} +#endif // OMPTARGET_DEBUG + +#include "../../common/elf_common.c" + +/// Keep entries table per device. 
+struct FuncOrGblEntryTy { + __tgt_target_table Table; + std::vector<__tgt_offload_entry> Entries; +}; + +enum ExecutionModeType { + SPMD, // constructors, destructors, + // combined constructs (`teams distribute parallel for [simd]`) + GENERIC, // everything else + NONE +}; + +/// Use a single entity to encode a kernel and a set of flags. +struct KernelTy { + CUfunction Func; + + // execution mode of kernel + // 0 - SPMD mode (without master warp) + // 1 - Generic mode (with master warp) + int8_t ExecutionMode; + + KernelTy(CUfunction _Func, int8_t _ExecutionMode) + : Func(_Func), ExecutionMode(_ExecutionMode) {} +}; + +/// Device environment data +/// Manually sync with the deviceRTL side for now, move to a dedicated header +/// file later. +struct omptarget_device_environmentTy { + int32_t debug_level; +}; + +/// List that contains all the kernels. +/// FIXME: we may need this to be per device and per library. +std::list KernelsList; + +namespace { +bool checkResult(CUresult Err, const char *ErrMsg) { + if (Err == CUDA_SUCCESS) + return true; + + DP(ErrMsg); + CUDA_ERR_STRING(Err); + return false; +} + +// Structure contains per-device data +struct DeviceDataTy { + std::list FuncGblEntries; + CUcontext Context = nullptr; + // Device properties + int ThreadsPerBlock = 0; + int BlocksPerGrid = 0; + int WarpSize = 0; + // OpenMP properties + int NumTeams = 0; + int NumThreads = 0; +}; + +class StreamManagerTy { + int NumberOfDevices; + // The initial size of stream pool + int EnvNumInitialStreams; + // Per-device stream mutex + std::vector> StreamMtx; + // Per-device stream Id indicates the next available stream in the pool + std::vector NextStreamId; + // Per-device stream pool + std::vector> StreamPool; + // Reference to per-device data + std::vector &DeviceData; + + // If there is no CUstream left in the pool, we will resize the pool to + // allocate more CUstream. This function should be called with device mutex, + // and we do not resize to smaller one. + void resizeStreamPool(const int DeviceId, const size_t NewSize) { + std::vector &Pool = StreamPool[DeviceId]; + const size_t CurrentSize = Pool.size(); + assert(NewSize > CurrentSize && "new size is not larger than current size"); + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) { + // We will return if cannot switch to the right context in case of + // creating bunch of streams that are not corresponding to the right + // device. The offloading will fail later because selected CUstream is + // nullptr. 
+ return; + } + + Pool.resize(NewSize, nullptr); + + for (size_t I = CurrentSize; I < NewSize; ++I) { + checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING), + "Error returned from cuStreamCreate\n"); + } + } + +public: + StreamManagerTy(const int NumberOfDevices, + std::vector &DeviceData) + : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32), + DeviceData(DeviceData) { + StreamPool.resize(NumberOfDevices); + NextStreamId.resize(NumberOfDevices); + StreamMtx.resize(NumberOfDevices); + + if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) + EnvNumInitialStreams = std::stoi(EnvStr); + + // Initialize the next stream id + std::fill(NextStreamId.begin(), NextStreamId.end(), 0); + + // Initialize stream mutex + for (std::unique_ptr &Ptr : StreamMtx) + Ptr = std::make_unique(); + } + + ~StreamManagerTy() { + // Destroy streams + for (int I = 0; I < NumberOfDevices; ++I) { + checkResult(cuCtxSetCurrent(DeviceData[I].Context), + "Error returned from cuCtxSetCurrent\n"); + + for (CUstream &S : StreamPool[I]) { + if (S) + checkResult(cuStreamDestroy(S), + "Error returned from cuStreamDestroy\n"); + } + } + } + + // Get a CUstream from pool. Per-device next stream id always points to the + // next available CUstream. That means, CUstreams [0, id-1] have been + // assigned, and [id,] are still available. If there is no CUstream left, we + // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned, + // the id will increase one. + // xxxxxs+++++++++ + // ^ + // id + // After assignment, the pool becomes the following and s is assigned. + // xxxxxs+++++++++ + // ^ + // id + CUstream getStream(const int DeviceId) { + const std::lock_guard Lock(*StreamMtx[DeviceId]); + int &Id = NextStreamId[DeviceId]; + // No CUstream left in the pool, we need to request from CUDA RT + if (Id == StreamPool[DeviceId].size()) { + // By default we double the stream pool every time + resizeStreamPool(DeviceId, Id * 2); + } + return StreamPool[DeviceId][Id++]; + } + + // Return a CUstream back to pool. As mentioned above, per-device next + // stream is always points to the next available CUstream, so when we return + // a CUstream, we need to first decrease the id, and then copy the CUstream + // back. + // It is worth noting that, the order of streams return might be different + // from that they're assigned, that saying, at some point, there might be + // two identical CUstreams. + // xxax+a+++++ + // ^ + // id + // However, it doesn't matter, because they're always on the two sides of + // id. The left one will in the end be overwritten by another CUstream. + // Therefore, after several execution, the order of pool might be different + // from its initial state. 
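+  // Illustrative walk-through of the bookkeeping above: with a pool
+  // {a, b, c, d} and id == 2, two getStream() calls hand out c and d and
+  // move id to 4; returnStream(d) then moves id back to 3 and writes d into
+  // slot 3. Slots below id may later hold different streams than they
+  // started with, which is harmless for the reasons described above.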
+ void returnStream(const int DeviceId, CUstream Stream) { + const std::lock_guard Lock(*StreamMtx[DeviceId]); + int &Id = NextStreamId[DeviceId]; + assert(Id > 0 && "Wrong stream ID"); + StreamPool[DeviceId][--Id] = Stream; + } + + bool initializeDeviceStreamPool(const int DeviceId) { + assert(StreamPool[DeviceId].empty() && "stream pool has been initialized"); + + resizeStreamPool(DeviceId, EnvNumInitialStreams); + + // Check the size of stream pool + if (StreamPool[DeviceId].size() != EnvNumInitialStreams) + return false; + + // Check whether each stream is valid + for (CUstream &S : StreamPool[DeviceId]) + if (!S) + return false; + + return true; + } +}; + +class DeviceRTLTy { + int NumberOfDevices; + // OpenMP environment properties + int EnvNumTeams; + int EnvTeamLimit; + // OpenMP requires flags + int64_t RequiresFlags; + + static constexpr const int HardTeamLimit = 1U << 16U; // 64k + static constexpr const int HardThreadLimit = 1024; + static constexpr const int DefaultNumTeams = 128; + static constexpr const int DefaultNumThreads = 128; + + std::unique_ptr StreamManager; + std::vector DeviceData; + std::vector Modules; + + // Record entry point associated with device + void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { + FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); + E.Entries.push_back(entry); + } + + // Return true if the entry is associated with device + bool findOffloadEntry(const int DeviceId, const void *Addr) const { + for (const __tgt_offload_entry &Itr : + DeviceData[DeviceId].FuncGblEntries.back().Entries) + if (Itr.addr == Addr) + return true; + + return false; + } + + // Return the pointer to the target entries table + __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { + FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); + + if (E.Entries.empty()) + return nullptr; + + // Update table info according to the entries and return the pointer + E.Table.EntriesBegin = E.Entries.data(); + E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); + + return &E.Table; + } + + // Clear entries table for a device + void clearOffloadEntriesTable(const int DeviceId) { + DeviceData[DeviceId].FuncGblEntries.emplace_back(); + FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); + E.Entries.clear(); + E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; + } + + CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + if (!AsyncInfoPtr->Queue) + AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId); + + return reinterpret_cast(AsyncInfoPtr->Queue); + } + +public: + // This class should not be copied + DeviceRTLTy(const DeviceRTLTy &) = delete; + DeviceRTLTy(DeviceRTLTy &&) = delete; + + DeviceRTLTy() + : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), + RequiresFlags(OMP_REQ_UNDEFINED) { +#ifdef OMPTARGET_DEBUG + if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG")) + DebugLevel = std::stoi(EnvStr); +#endif // OMPTARGET_DEBUG + + DP("Start initializing CUDA\n"); + + CUresult Err = cuInit(0); + if (!checkResult(Err, "Error returned from cuInit\n")) { + return; + } + + Err = cuDeviceGetCount(&NumberOfDevices); + if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) + return; + + if (NumberOfDevices == 0) { + DP("There are no devices supporting CUDA.\n"); + return; + } + + DeviceData.resize(NumberOfDevices); + + // Get environment variables regarding teams + if (const char *EnvStr = 
getenv("OMP_TEAM_LIMIT")) { + // OMP_TEAM_LIMIT has been set + EnvTeamLimit = std::stoi(EnvStr); + DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); + } + if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { + // OMP_NUM_TEAMS has been set + EnvNumTeams = std::stoi(EnvStr); + DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); + } + + StreamManager = + std::make_unique(NumberOfDevices, DeviceData); + } + + ~DeviceRTLTy() { + // First destruct stream manager in case of Contexts is destructed before it + StreamManager = nullptr; + + for (CUmodule &M : Modules) + // Close module + if (M) + checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); + + for (DeviceDataTy &D : DeviceData) { + // Destroy context + if (D.Context) + checkResult(cuCtxDestroy(D.Context), + "Error returned from cuCtxDestroy\n"); + } + } + + // Check whether a given DeviceId is valid + bool isValidDeviceId(const int DeviceId) const { + return DeviceId >= 0 && DeviceId < NumberOfDevices; + } + + bool getNumOfDevices() const { return NumberOfDevices; } + + void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } + + int initDevice(const int DeviceId) { + CUdevice Device; + + DP("Getting device %d\n", DeviceId); + CUresult Err = cuDeviceGet(&Device, DeviceId); + if (!checkResult(Err, "Error returned from cuDeviceGet\n")) + return OFFLOAD_FAIL; + + // Create the context and save it to use whenever this device is selected. + Err = cuCtxCreate(&DeviceData[DeviceId].Context, CU_CTX_SCHED_BLOCKING_SYNC, + Device); + if (!checkResult(Err, "Error returned from cuCtxCreate\n")) + return OFFLOAD_FAIL; + + Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + // Initialize stream pool + if (!StreamManager->initializeDeviceStreamPool(DeviceId)) + return OFFLOAD_FAIL; + + // Query attributes to determine number of threads/block and blocks/grid. + int MaxGridDimX; + Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + Device); + if (Err != CUDA_SUCCESS) { + DP("Error getting max grid dimension, use default value %d\n", + DeviceRTLTy::DefaultNumTeams); + DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; + } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { + DP("Using %d CUDA blocks per grid\n", MaxGridDimX); + DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; + } else { + DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " + "at the hard limit\n", + MaxGridDimX, DeviceRTLTy::HardTeamLimit); + DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; + } + + // We are only exploiting threads along the x axis. 
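+    // The block dimension below is clamped with the same scheme as the grid
+    // dimension above: e.g. a reported maximum of 1024 threads per block is
+    // used as-is, while 2048 would be capped to the 1024-thread hard limit.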
+    int MaxBlockDimX;
+    Err = cuDeviceGetAttribute(&MaxBlockDimX,
+                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error getting max block dimension, use default value %d\n",
+         DeviceRTLTy::DefaultNumThreads);
+      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
+    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
+      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
+      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
+    } else {
+      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
+         "capping at the hard limit\n",
+         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
+      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
+    }
+
+    // Get and set warp size
+    int WarpSize;
+    Err =
+        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error getting warp size, assume default value 32\n");
+      DeviceData[DeviceId].WarpSize = 32;
+    } else {
+      DP("Using warp size %d\n", WarpSize);
+      DeviceData[DeviceId].WarpSize = WarpSize;
+    }
+
+    // Adjust teams to the env variables
+    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
+      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
+         EnvTeamLimit);
+      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
+    }
+
+    DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
+       DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock,
+       DeviceData[DeviceId].WarpSize);
+
+    // Set default number of teams
+    if (EnvNumTeams > 0) {
+      DP("Default number of teams set according to environment %d\n",
+         EnvNumTeams);
+      DeviceData[DeviceId].NumTeams = EnvNumTeams;
+    } else {
+      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
+      DP("Default number of teams set according to library's default %d\n",
+         DeviceRTLTy::DefaultNumTeams);
+    }
+
+    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
+      DP("Default number of teams exceeds device limit, capping at %d\n",
+         DeviceData[DeviceId].BlocksPerGrid);
+      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
+    }
+
+    // Set default number of threads
+    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
+    DP("Default number of threads set according to library's default %d\n",
+       DeviceRTLTy::DefaultNumThreads);
+    if (DeviceData[DeviceId].NumThreads >
+        DeviceData[DeviceId].ThreadsPerBlock) {
+      DP("Default number of threads exceeds device limit, capping at %d\n",
+         DeviceData[DeviceId].ThreadsPerBlock);
+      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
+    }
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  __tgt_target_table *loadBinary(const int DeviceId,
+                                 const __tgt_device_image *Image) {
+    // Set the context we are using
+    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+      return nullptr;
+
+    // Clear the offload table as we are going to create a new one.
+    clearOffloadEntriesTable(DeviceId);
+
+    // Create the module and extract the function pointers.
+    CUmodule Module;
+    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
+    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
+    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
+      return nullptr;
+
+    DP("CUDA module successfully loaded!\n");
+
+    Modules.push_back(Module);
+
+    // Find the symbols in the module by name.
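+    // Entries with a non-zero size are global variables and are resolved via
+    // cuModuleGetGlobal below; zero-sized entries are kernels and go through
+    // cuModuleGetFunction instead.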
+ const __tgt_offload_entry *HostBegin = Image->EntriesBegin; + const __tgt_offload_entry *HostEnd = Image->EntriesEnd; + + for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { + if (!E->addr) { + // We return nullptr when something like this happens, the host should + // have always something in the address to uniquely identify the target + // region. + DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); + return nullptr; + } + + if (E->size) { + __tgt_offload_entry Entry = *E; + CUdeviceptr CUPtr; + size_t CUSize; + Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); + // We keep this style here because we need the name + if (Err != CUDA_SUCCESS) { + DP("Loading global '%s' (Failed)\n", E->name); + CUDA_ERR_STRING(Err); + return nullptr; + } + + if (CUSize != E->size) { + DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, + CUSize, E->size); + return nullptr; + } + + DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", + DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); + + Entry.addr = (void *)(CUPtr); + + // Note: In the current implementation declare target variables + // can either be link or to. This means that once unified + // memory is activated via the requires directive, the variable + // can be used directly from the host in both cases. + // TODO: when variables types other than to or link are added, + // the below condition should be changed to explicitly + // check for to and link variables types: + // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & + // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) + if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { + // If unified memory is present any target link or to variables + // can access host addresses directly. There is no longer a + // need for device copies. + cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); + DP("Copy linked variable host address (" DPxMOD + ") to device address (" DPxMOD ")\n", + DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); + } + + addOffloadEntry(DeviceId, Entry); + + continue; + } + + CUfunction Func; + Err = cuModuleGetFunction(&Func, Module, E->name); + // We keep this style here because we need the name + if (Err != CUDA_SUCCESS) { + DP("Loading '%s' (Failed)\n", E->name); + CUDA_ERR_STRING(Err); + return nullptr; + } + + DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", + DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); + + // default value GENERIC (in case symbol is missing from cubin file) + int8_t ExecModeVal = ExecutionModeType::GENERIC; + std::string ExecModeNameStr(E->name); + ExecModeNameStr += "_exec_mode"; + const char *ExecModeName = ExecModeNameStr.c_str(); + + CUdeviceptr ExecModePtr; + size_t CUSize; + Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); + if (Err == CUDA_SUCCESS) { + if (CUSize != sizeof(int8_t)) { + DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", + ExecModeName, CUSize, sizeof(int8_t)); + return nullptr; + } + + Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from device to host. 
Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", + DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); + CUDA_ERR_STRING(Err); + return nullptr; + } + + if (ExecModeVal < 0 || ExecModeVal > 1) { + DP("Error wrong exec_mode value specified in cubin file: %d\n", + ExecModeVal); + return nullptr; + } + } else { + DP("Loading global exec_mode '%s' - symbol missing, using default " + "value GENERIC (1)\n", + ExecModeName); + CUDA_ERR_STRING(Err); + } + + KernelsList.emplace_back(Func, ExecModeVal); + + __tgt_offload_entry Entry = *E; + Entry.addr = &KernelsList.back(); + addOffloadEntry(DeviceId, Entry); + } + + // send device environment data to the device + { + omptarget_device_environmentTy DeviceEnv{0}; + +#ifdef OMPTARGET_DEBUG + if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) + DeviceEnv.debug_level = std::stoi(EnvStr); +#endif + + const char *DeviceEnvName = "omptarget_device_environment"; + CUdeviceptr DeviceEnvPtr; + size_t CUSize; + + Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); + if (Err == CUDA_SUCCESS) { + if (CUSize != sizeof(DeviceEnv)) { + DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", + DeviceEnvName, CUSize, sizeof(int32_t)); + CUDA_ERR_STRING(Err); + return nullptr; + } + + Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from host to device. Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", + DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); + CUDA_ERR_STRING(Err); + return nullptr; + } + + DP("Sending global device environment data %zu bytes\n", CUSize); + } else { + DP("Finding global device environment '%s' - symbol missing.\n", + DeviceEnvName); + DP("Continue, considering this is a device RTL which does not accept " + "environment setting.\n"); + } + } + + return getOffloadEntriesTable(DeviceId); + } + + void *dataAlloc(const int DeviceId, const int64_t Size) const { + if (Size == 0) + return nullptr; + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return nullptr; + + CUdeviceptr DevicePtr; + Err = cuMemAlloc(&DevicePtr, Size); + if (!checkResult(Err, "Error returned from cuMemAlloc\n")) + return nullptr; + + return (void *)DevicePtr; + } + + int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, + const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + CUstream Stream = getStream(DeviceId, AsyncInfoPtr); + + Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from host to device. 
Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; + } + + int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, + const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + CUstream Stream = getStream(DeviceId, AsyncInfoPtr); + + Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from device to host. Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; + } + + int dataTransfer(const int DeviceId, void *DstPtr, const void *SrcPtr, + const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + CUstream Stream = getStream(DeviceId, AsyncInfoPtr); + + Err = cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from device to device. Pointers: dst = " DPxMOD + ", src = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(DstPtr), DPxPTR(SrcPtr), Size); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; + } + + + int dataDelete(const int DeviceId, void *TgtPtr) const { + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + Err = cuMemFree((CUdeviceptr)TgtPtr); + if (!checkResult(Err, "Error returned from cuMemFree\n")) + return OFFLOAD_FAIL; + + return OFFLOAD_SUCCESS; + } + + int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr, + void **TgtArgs, ptrdiff_t *TgtOffsets, + const int ArgNum, const int TeamNum, + const int ThreadLimit, + const unsigned int LoopTripCount, + __tgt_async_info *AsyncInfo) const { + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + // All args are references. 
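+    // For example, with TgtArgs = {p0, p1} and TgtOffsets = {0, 8}, Ptrs
+    // becomes {p0, p1 + 8} and Args holds the addresses of those Ptrs slots,
+    // which is the kernelParams form cuLaunchKernel expects further down.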
+    std::vector<void *> Args(ArgNum);
+    std::vector<void *> Ptrs(ArgNum);
+
+    for (int I = 0; I < ArgNum; ++I) {
+      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
+      Args[I] = &Ptrs[I];
+    }
+
+    const KernelTy *KernelInfo =
+        reinterpret_cast<const KernelTy *>(TgtEntryPtr);
+
+    unsigned int CudaThreadsPerBlock;
+    if (ThreadLimit > 0) {
+      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
+      CudaThreadsPerBlock = ThreadLimit;
+      // Add master warp if necessary
+      if (KernelInfo->ExecutionMode == GENERIC) {
+        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
+        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+      }
+    } else {
+      DP("Setting CUDA threads per block to default %d\n",
+         DeviceData[DeviceId].NumThreads);
+      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
+    }
+
+    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
+      DP("Threads per block capped at device limit %d\n",
+         DeviceData[DeviceId].ThreadsPerBlock);
+      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
+    }
+
+    int KernelLimit;
+    Err = cuFuncGetAttribute(&KernelLimit,
+                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                             KernelInfo->Func);
+    if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {
+      DP("Threads per block capped at kernel limit %d\n", KernelLimit);
+      CudaThreadsPerBlock = KernelLimit;
+    }
+
+    unsigned int CudaBlocksPerGrid;
+    if (TeamNum <= 0) {
+      if (LoopTripCount > 0 && EnvNumTeams < 0) {
+        if (KernelInfo->ExecutionMode == SPMD) {
+          // We have a combined construct, i.e. `target teams distribute
+          // parallel for [simd]`. We launch enough teams so that each thread
+          // executes one iteration of the loop; round up to the nearest
+          // integer.
+          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
+        } else {
+          // If we reach this point, then we have a non-combined construct,
+          // i.e. `teams distribute` with a nested `parallel for`, and each
+          // team is assigned one iteration of the `distribute` loop. E.g.:
+          //
+          // #pragma omp target teams distribute
+          // for(...loop_tripcount...) {
+          //   #pragma omp parallel for
+          //   for(...) {}
+          // }
+          //
+          // Threads within a team will execute the iterations of the
+          // `parallel` loop.
+          CudaBlocksPerGrid = LoopTripCount;
+        }
+        DP("Using %d teams due to loop trip count %" PRIu64
+           " and number of threads per block %d\n",
+           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
+      } else {
+        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
+        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
+      }
+    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
+      DP("Capping number of teams to team limit %d\n",
+         DeviceData[DeviceId].BlocksPerGrid);
+      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
+    } else {
+      DP("Using requested number of teams %d\n", TeamNum);
+      CudaBlocksPerGrid = TeamNum;
+    }
+
+    // Run on the device.
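+    // Worked example for the SPMD path above: LoopTripCount = 10000 with
+    // CudaThreadsPerBlock = 128 yields ((10000 - 1) / 128) + 1 = 79 blocks,
+    // i.e. 79 * 128 = 10112 threads, enough for one iteration per thread.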
+ DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid, + CudaThreadsPerBlock); + + CUstream Stream = getStream(DeviceId, AsyncInfo); + Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, + /* gridDimZ */ 1, CudaThreadsPerBlock, + /* blockDimY */ 1, /* blockDimZ */ 1, + /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); + if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) + return OFFLOAD_FAIL; + + DP("Launch of entry point at " DPxMOD " successful!\n", + DPxPTR(TgtEntryPtr)); + + return OFFLOAD_SUCCESS; + } + + int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { + CUstream Stream = reinterpret_cast(AsyncInfoPtr->Queue); + CUresult Err = cuStreamSynchronize(Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when synchronizing stream. stream = " DPxMOD + ", async info ptr = " DPxMOD "\n", + DPxPTR(Stream), DPxPTR(AsyncInfoPtr)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + // Once the stream is synchronized, return it to stream pool and reset + // async_info. This is to make sure the synchronization only works for its + // own tasks. + StreamManager->returnStream( + DeviceId, reinterpret_cast(AsyncInfoPtr->Queue)); + AsyncInfoPtr->Queue = nullptr; + + return OFFLOAD_SUCCESS; + } +}; + +DeviceRTLTy DeviceRTL; +} // namespace + +// Exposed library API function +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + return elf_check_machine(image, /* EM_CUDA */ 190); +} + +int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } + +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + DP("Init requires flags to %ld\n", RequiresFlags); + DeviceRTL.setRequiresFlag(RequiresFlags); + return RequiresFlags; +} + +int32_t __tgt_rtl_init_device(int32_t device_id) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + return DeviceRTL.initDevice(device_id); +} + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + return DeviceRTL.loadBinary(device_id, image); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + return DeviceRTL.dataAlloc(device_id, size); +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + __tgt_async_info async_info; + const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, + size, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &async_info); +} + +int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, + void *hst_ptr, int64_t size, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); + + return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, + async_info_ptr); +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + __tgt_async_info async_info; + const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, + size, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return 
__tgt_rtl_synchronize(device_id, &async_info);
+}
+
+int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
+                                      void *tgt_ptr, int64_t size,
+                                      __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+
+  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
+                                async_info_ptr);
+}
+
+int32_t __tgt_rtl_data_transfer(int32_t device_id, void *dst_ptr, void *src_ptr,
+                                int64_t size) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  __tgt_async_info async_info;
+  const int32_t rc = __tgt_rtl_data_transfer_async(device_id, dst_ptr, src_ptr,
+                                                   size, &async_info);
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  return __tgt_rtl_synchronize(device_id, &async_info);
+}
+
+int32_t __tgt_rtl_data_transfer_async(int32_t device_id, void *dst_ptr,
+                                      void *src_ptr, int64_t size,
+                                      __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+
+  return DeviceRTL.dataTransfer(device_id, dst_ptr, src_ptr, size,
+                                async_info_ptr);
+}
+
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  return DeviceRTL.dataDelete(device_id, tgt_ptr);
+}
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+                                         void **tgt_args,
+                                         ptrdiff_t *tgt_offsets,
+                                         int32_t arg_num, int32_t team_num,
+                                         int32_t thread_limit,
+                                         uint64_t loop_tripcount) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  __tgt_async_info async_info;
+  const int32_t rc = __tgt_rtl_run_target_team_region_async(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
+      thread_limit, loop_tripcount, &async_info);
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  return __tgt_rtl_synchronize(device_id, &async_info);
+}
+
+int32_t __tgt_rtl_run_target_team_region_async(
+    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
+    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount,
+    __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  return DeviceRTL.runTargetTeamRegion(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
+      thread_limit, loop_tripcount, async_info_ptr);
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+                                    void **tgt_args, ptrdiff_t *tgt_offsets,
+                                    int32_t arg_num) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  __tgt_async_info async_info;
+  const int32_t rc = __tgt_rtl_run_target_region_async(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  return __tgt_rtl_synchronize(device_id, &async_info);
+}
+
+int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
+                                          void *tgt_entry_ptr, void **tgt_args,
+                                          ptrdiff_t *tgt_offsets,
+                                          int32_t arg_num,
+                                          __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  return __tgt_rtl_run_target_team_region_async(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
+      /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
+      async_info_ptr);
+}
+
+int32_t __tgt_rtl_synchronize(int32_t device_id,
+                              __tgt_async_info
*async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); + assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); + + return DeviceRTL.synchronize(device_id, async_info_ptr); +} + +#ifdef __cplusplus +} +#endif diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports index a4e1a3186daa5..67d689e2f3285 100644 --- a/openmp/libomptarget/plugins/exports +++ b/openmp/libomptarget/plugins/exports @@ -1,21 +1,23 @@ -VERS1.0 { - global: - __tgt_rtl_is_valid_binary; - __tgt_rtl_number_of_devices; - __tgt_rtl_init_requires; - __tgt_rtl_init_device; - __tgt_rtl_load_binary; - __tgt_rtl_data_alloc; - __tgt_rtl_data_submit; - __tgt_rtl_data_submit_async; - __tgt_rtl_data_retrieve; - __tgt_rtl_data_retrieve_async; - __tgt_rtl_data_delete; - __tgt_rtl_run_target_team_region; - __tgt_rtl_run_target_team_region_async; - __tgt_rtl_run_target_region; - __tgt_rtl_run_target_region_async; - __tgt_rtl_synchronize; - local: - *; -}; +VERS1.0 { + global: + __tgt_rtl_is_valid_binary; + __tgt_rtl_number_of_devices; + __tgt_rtl_init_requires; + __tgt_rtl_init_device; + __tgt_rtl_load_binary; + __tgt_rtl_data_alloc; + __tgt_rtl_data_submit; + __tgt_rtl_data_submit_async; + __tgt_rtl_data_retrieve; + __tgt_rtl_data_retrieve_async; + __tgt_rtl_data_transfer; + __tgt_rtl_data_transfer_async; + __tgt_rtl_data_delete; + __tgt_rtl_run_target_team_region; + __tgt_rtl_run_target_team_region_async; + __tgt_rtl_run_target_region; + __tgt_rtl_run_target_region_async; + __tgt_rtl_synchronize; + local: + *; +}; diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp index 8a6e085d3f75c..a2d0de38c74fe 100644 --- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -1,343 +1,349 @@ -//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RTL for generic 64-bit machine -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "omptargetplugin.h" - -#ifndef TARGET_NAME -#define TARGET_NAME Generic ELF - 64bit -#endif - -#ifndef TARGET_ELF_ID -#define TARGET_ELF_ID 0 -#endif - -#ifdef OMPTARGET_DEBUG -static int DebugLevel = 0; - -#define GETNAME2(name) #name -#define GETNAME(name) GETNAME2(name) -#define DP(...) \ - do { \ - if (DebugLevel > 0) { \ - DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ - } \ - } while (false) -#else // OMPTARGET_DEBUG -#define DP(...) {} -#endif // OMPTARGET_DEBUG - -#include "../../common/elf_common.c" - -#define NUMBER_OF_DEVICES 4 -#define OFFLOADSECTIONNAME "omp_offloading_entries" - -/// Array of Dynamic libraries loaded for this target. -struct DynLibTy { - char *FileName; - void *Handle; -}; - -/// Keep entries table per device. -struct FuncOrGblEntryTy { - __tgt_target_table Table; -}; - -/// Class containing all the device information. 
-class RTLDeviceInfoTy { - std::vector> FuncGblEntries; - -public: - std::list DynLibs; - - // Record entry point associated with device. - void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin, - __tgt_offload_entry *end) { - assert(device_id < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncGblEntries[device_id].emplace_back(); - FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); - - E.Table.EntriesBegin = begin; - E.Table.EntriesEnd = end; - } - - // Return true if the entry is associated with device. - bool findOffloadEntry(int32_t device_id, void *addr) { - assert(device_id < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); - - for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd; - i < e; ++i) { - if (i->addr == addr) - return true; - } - - return false; - } - - // Return the pointer to the target entries table. - __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { - assert(device_id < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); - - return &E.Table; - } - - RTLDeviceInfoTy(int32_t num_devices) { -#ifdef OMPTARGET_DEBUG - if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { - DebugLevel = std::stoi(envStr); - } -#endif // OMPTARGET_DEBUG - - FuncGblEntries.resize(num_devices); - } - - ~RTLDeviceInfoTy() { - // Close dynamic libraries - for (auto &lib : DynLibs) { - if (lib.Handle) { - dlclose(lib.Handle); - remove(lib.FileName); - } - } - } -}; - -static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES); - -#ifdef __cplusplus -extern "C" { -#endif - -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { -// If we don't have a valid ELF ID we can just fail. -#if TARGET_ELF_ID < 1 - return 0; -#else - return elf_check_machine(image, TARGET_ELF_ID); -#endif -} - -int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; } - -int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; } - -__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, - __tgt_device_image *image) { - - DP("Dev %d: load binary from " DPxMOD " image\n", device_id, - DPxPTR(image->ImageStart)); - - assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id"); - - size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart; - size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin); - DP("Expecting to have %zd entries defined.\n", NumEntries); - - // Is the library version incompatible with the header file? 
- if (elf_version(EV_CURRENT) == EV_NONE) { - DP("Incompatible ELF library!\n"); - return NULL; - } - - // Obtain elf handler - Elf *e = elf_memory((char *)image->ImageStart, ImageSize); - if (!e) { - DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); - return NULL; - } - - if (elf_kind(e) != ELF_K_ELF) { - DP("Invalid Elf kind!\n"); - elf_end(e); - return NULL; - } - - // Find the entries section offset - Elf_Scn *section = 0; - Elf64_Off entries_offset = 0; - - size_t shstrndx; - - if (elf_getshdrstrndx(e, &shstrndx)) { - DP("Unable to get ELF strings index!\n"); - elf_end(e); - return NULL; - } - - while ((section = elf_nextscn(e, section))) { - GElf_Shdr hdr; - gelf_getshdr(section, &hdr); - - if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) { - entries_offset = hdr.sh_addr; - break; - } - } - - if (!entries_offset) { - DP("Entries Section Offset Not Found\n"); - elf_end(e); - return NULL; - } - - DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset)); - - // load dynamic library and get the entry points. We use the dl library - // to do the loading of the library, but we could do it directly to avoid the - // dump to the temporary file. - // - // 1) Create tmp file with the library contents. - // 2) Use dlopen to load the file and dlsym to retrieve the symbols. - char tmp_name[] = "/tmp/tmpfile_XXXXXX"; - int tmp_fd = mkstemp(tmp_name); - - if (tmp_fd == -1) { - elf_end(e); - return NULL; - } - - FILE *ftmp = fdopen(tmp_fd, "wb"); - - if (!ftmp) { - elf_end(e); - return NULL; - } - - fwrite(image->ImageStart, ImageSize, 1, ftmp); - fclose(ftmp); - - DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_LAZY)}; - - if (!Lib.Handle) { - DP("Target library loading error: %s\n", dlerror()); - elf_end(e); - return NULL; - } - - DeviceInfo.DynLibs.push_back(Lib); - - struct link_map *libInfo = (struct link_map *)Lib.Handle; - - // The place where the entries info is loaded is the library base address - // plus the offset determined from the ELF file. - Elf64_Addr entries_addr = libInfo->l_addr + entries_offset; - - DP("Pointer to first entry to be loaded is (" DPxMOD ").\n", - DPxPTR(entries_addr)); - - // Table of pointers to all the entries in the target. 
- __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr; - - __tgt_offload_entry *entries_begin = &entries_table[0]; - __tgt_offload_entry *entries_end = entries_begin + NumEntries; - - if (!entries_begin) { - DP("Can't obtain entries begin\n"); - elf_end(e); - return NULL; - } - - DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n", - DPxPTR(entries_begin), DPxPTR(entries_end)); - DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end); - - elf_end(e); - - return DeviceInfo.getOffloadEntriesTable(device_id); -} - -void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { - void *ptr = malloc(size); - return ptr; -} - -int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { - memcpy(tgt_ptr, hst_ptr, size); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { - memcpy(hst_ptr, tgt_ptr, size); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { - free(tgt_ptr); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, - ptrdiff_t *tgt_offsets, - int32_t arg_num, int32_t team_num, - int32_t thread_limit, - uint64_t loop_tripcount /*not used*/) { - // ignore team num and thread limit. - - // Use libffi to launch execution. - ffi_cif cif; - - // All args are references. - std::vector args_types(arg_num, &ffi_type_pointer); - std::vector args(arg_num); - std::vector ptrs(arg_num); - - for (int32_t i = 0; i < arg_num; ++i) { - ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); - args[i] = &ptrs[i]; - } - - ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num, - &ffi_type_void, &args_types[0]); - - assert(status == FFI_OK && "Unable to prepare target launch!"); - - if (status != FFI_OK) - return OFFLOAD_FAIL; - - DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr)); - - void (*entry)(void); - *((void**) &entry) = tgt_entry_ptr; - ffi_call(&cif, entry, NULL, &args[0]); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, - int32_t arg_num) { - // use one team and one thread. - return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, 1, 1, 0); -} - -#ifdef __cplusplus -} -#endif +//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL for generic 64-bit machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "omptargetplugin.h" + +#ifndef TARGET_NAME +#define TARGET_NAME Generic ELF - 64bit +#endif + +#ifndef TARGET_ELF_ID +#define TARGET_ELF_ID 0 +#endif + +#ifdef OMPTARGET_DEBUG +static int DebugLevel = 0; + +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) +#define DP(...) 
\ + do { \ + if (DebugLevel > 0) { \ + DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ + } \ + } while (false) +#else // OMPTARGET_DEBUG +#define DP(...) {} +#endif // OMPTARGET_DEBUG + +#include "../../common/elf_common.c" + +#define NUMBER_OF_DEVICES 4 +#define OFFLOADSECTIONNAME "omp_offloading_entries" + +/// Array of Dynamic libraries loaded for this target. +struct DynLibTy { + char *FileName; + void *Handle; +}; + +/// Keep entries table per device. +struct FuncOrGblEntryTy { + __tgt_target_table Table; +}; + +/// Class containing all the device information. +class RTLDeviceInfoTy { + std::vector> FuncGblEntries; + +public: + std::list DynLibs; + + // Record entry point associated with device. + void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin, + __tgt_offload_entry *end) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncGblEntries[device_id].emplace_back(); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + E.Table.EntriesBegin = begin; + E.Table.EntriesEnd = end; + } + + // Return true if the entry is associated with device. + bool findOffloadEntry(int32_t device_id, void *addr) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd; + i < e; ++i) { + if (i->addr == addr) + return true; + } + + return false; + } + + // Return the pointer to the target entries table. + __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + return &E.Table; + } + + RTLDeviceInfoTy(int32_t num_devices) { +#ifdef OMPTARGET_DEBUG + if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { + DebugLevel = std::stoi(envStr); + } +#endif // OMPTARGET_DEBUG + + FuncGblEntries.resize(num_devices); + } + + ~RTLDeviceInfoTy() { + // Close dynamic libraries + for (auto &lib : DynLibs) { + if (lib.Handle) { + dlclose(lib.Handle); + remove(lib.FileName); + } + } + } +}; + +static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES); + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { +// If we don't have a valid ELF ID we can just fail. +#if TARGET_ELF_ID < 1 + return 0; +#else + return elf_check_machine(image, TARGET_ELF_ID); +#endif +} + +int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; } + +int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; } + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + + DP("Dev %d: load binary from " DPxMOD " image\n", device_id, + DPxPTR(image->ImageStart)); + + assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id"); + + size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart; + size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin); + DP("Expecting to have %zd entries defined.\n", NumEntries); + + // Is the library version incompatible with the header file? 
+ if (elf_version(EV_CURRENT) == EV_NONE) { + DP("Incompatible ELF library!\n"); + return NULL; + } + + // Obtain elf handler + Elf *e = elf_memory((char *)image->ImageStart, ImageSize); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return NULL; + } + + if (elf_kind(e) != ELF_K_ELF) { + DP("Invalid Elf kind!\n"); + elf_end(e); + return NULL; + } + + // Find the entries section offset + Elf_Scn *section = 0; + Elf64_Off entries_offset = 0; + + size_t shstrndx; + + if (elf_getshdrstrndx(e, &shstrndx)) { + DP("Unable to get ELF strings index!\n"); + elf_end(e); + return NULL; + } + + while ((section = elf_nextscn(e, section))) { + GElf_Shdr hdr; + gelf_getshdr(section, &hdr); + + if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) { + entries_offset = hdr.sh_addr; + break; + } + } + + if (!entries_offset) { + DP("Entries Section Offset Not Found\n"); + elf_end(e); + return NULL; + } + + DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset)); + + // load dynamic library and get the entry points. We use the dl library + // to do the loading of the library, but we could do it directly to avoid the + // dump to the temporary file. + // + // 1) Create tmp file with the library contents. + // 2) Use dlopen to load the file and dlsym to retrieve the symbols. + char tmp_name[] = "/tmp/tmpfile_XXXXXX"; + int tmp_fd = mkstemp(tmp_name); + + if (tmp_fd == -1) { + elf_end(e); + return NULL; + } + + FILE *ftmp = fdopen(tmp_fd, "wb"); + + if (!ftmp) { + elf_end(e); + return NULL; + } + + fwrite(image->ImageStart, ImageSize, 1, ftmp); + fclose(ftmp); + + DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_LAZY)}; + + if (!Lib.Handle) { + DP("Target library loading error: %s\n", dlerror()); + elf_end(e); + return NULL; + } + + DeviceInfo.DynLibs.push_back(Lib); + + struct link_map *libInfo = (struct link_map *)Lib.Handle; + + // The place where the entries info is loaded is the library base address + // plus the offset determined from the ELF file. + Elf64_Addr entries_addr = libInfo->l_addr + entries_offset; + + DP("Pointer to first entry to be loaded is (" DPxMOD ").\n", + DPxPTR(entries_addr)); + + // Table of pointers to all the entries in the target. 
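+  // That is, entries_addr = l_addr of the dlopen'ed library plus the
+  // section's sh_addr, so the NumEntries-long table can be read directly in
+  // the host process.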
+ __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr; + + __tgt_offload_entry *entries_begin = &entries_table[0]; + __tgt_offload_entry *entries_end = entries_begin + NumEntries; + + if (!entries_begin) { + DP("Can't obtain entries begin\n"); + elf_end(e); + return NULL; + } + + DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n", + DPxPTR(entries_begin), DPxPTR(entries_end)); + DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end); + + elf_end(e); + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { + void *ptr = malloc(size); + return ptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + memcpy(tgt_ptr, hst_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + memcpy(hst_ptr, tgt_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_transfer(int32_t device_id, void *dst_ptr, void *src_ptr, + int64_t size) { + memcpy(dst_ptr, src_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { + free(tgt_ptr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t team_num, + int32_t thread_limit, + uint64_t loop_tripcount /*not used*/) { + // ignore team num and thread limit. + + // Use libffi to launch execution. + ffi_cif cif; + + // All args are references. + std::vector args_types(arg_num, &ffi_type_pointer); + std::vector args(arg_num); + std::vector ptrs(arg_num); + + for (int32_t i = 0; i < arg_num; ++i) { + ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); + args[i] = &ptrs[i]; + } + + ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num, + &ffi_type_void, &args_types[0]); + + assert(status == FFI_OK && "Unable to prepare target launch!"); + + if (status != FFI_OK) + return OFFLOAD_FAIL; + + DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr)); + + void (*entry)(void); + *((void**) &entry) = tgt_entry_ptr; + ffi_call(&cif, entry, NULL, &args[0]); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num) { + // use one team and one thread. + return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, + tgt_offsets, arg_num, 1, 1, 0); +} + +#ifdef __cplusplus +} +#endif diff --git a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64/CMakeLists.txt index 3915196453e0a..ffa684732ba28 100644 --- a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt +++ b/openmp/libomptarget/plugins/ppc64/CMakeLists.txt @@ -1,17 +1,17 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a ppc64 machine if available. 
-# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21") -else() - libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.") +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.") endif() \ No newline at end of file diff --git a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt index 0cfe7c0051fa4..e5d8cffe4aad7 100644 --- a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt +++ b/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt @@ -1,17 +1,17 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a ppc64le machine if available. -# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21") -else() - libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.") +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64le machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.") endif() \ No newline at end of file diff --git a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins/x86_64/CMakeLists.txt index f61e1e856c80d..33e9bb373cb01 100644 --- a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt +++ b/openmp/libomptarget/plugins/x86_64/CMakeLists.txt @@ -1,17 +1,17 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a x86_64 machine if available. -# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62") -else() - libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.") +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a x86_64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62") +else() + libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.") endif() \ No newline at end of file diff --git a/openmp/libomptarget/src/CMakeLists.txt b/openmp/libomptarget/src/CMakeLists.txt index f30087ed43423..e534619ff5100 100644 --- a/openmp/libomptarget/src/CMakeLists.txt +++ b/openmp/libomptarget/src/CMakeLists.txt @@ -1,31 +1,31 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build offloading library libomptarget.so. -# -##===----------------------------------------------------------------------===## - -libomptarget_say("Building offloading runtime library libomptarget.") - -set(src_files - api.cpp - device.cpp - interface.cpp - rtl.cpp - omptarget.cpp -) - -# Build libomptarget library with libdl dependency. -add_library(omptarget SHARED ${src_files}) -target_link_libraries(omptarget - ${CMAKE_DL_LIBS} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports") - -# Install libomptarget under the lib destination folder. -install(TARGETS omptarget LIBRARY COMPONENT omptarget - DESTINATION "${OPENMP_INSTALL_LIBDIR}") +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build offloading library libomptarget.so. +# +##===----------------------------------------------------------------------===## + +libomptarget_say("Building offloading runtime library libomptarget.") + +set(src_files + api.cpp + device.cpp + interface.cpp + rtl.cpp + omptarget.cpp +) + +# Build libomptarget library with libdl dependency. +add_library(omptarget SHARED ${src_files}) +target_link_libraries(omptarget + ${CMAKE_DL_LIBS} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports") + +# Install libomptarget under the lib destination folder. 
+install(TARGETS omptarget LIBRARY COMPONENT omptarget + DESTINATION "${OPENMP_INSTALL_LIBDIR}") diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index 3c7b709fb894e..4d6b1d185b147 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -1,291 +1,295 @@ -//===----------- api.cpp - Target independent OpenMP target RTL -----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implementation of OpenMP API interface functions. -// -//===----------------------------------------------------------------------===// - -#include - -#include "device.h" -#include "private.h" -#include "rtl.h" - -#include -#include -#include - -EXTERN int omp_get_num_devices(void) { - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); - - DP("Call to omp_get_num_devices returning %zd\n", Devices_size); - - return Devices_size; -} - -EXTERN int omp_get_initial_device(void) { - DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE); - return HOST_DEVICE; -} - -EXTERN void *omp_target_alloc(size_t size, int device_num) { - DP("Call to omp_target_alloc for device %d requesting %zu bytes\n", - device_num, size); - - if (size <= 0) { - DP("Call to omp_target_alloc with non-positive length\n"); - return NULL; - } - - void *rc = NULL; - - if (device_num == omp_get_initial_device()) { - rc = malloc(size); - DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc)); - return rc; - } - - if (!device_is_ready(device_num)) { - DP("omp_target_alloc returns NULL ptr\n"); - return NULL; - } - - DeviceTy &Device = Devices[device_num]; - rc = Device.RTL->data_alloc(Device.RTLDeviceID, size, NULL); - DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc)); - return rc; -} - -EXTERN void omp_target_free(void *device_ptr, int device_num) { - DP("Call to omp_target_free for device %d and address " DPxMOD "\n", - device_num, DPxPTR(device_ptr)); - - if (!device_ptr) { - DP("Call to omp_target_free with NULL ptr\n"); - return; - } - - if (device_num == omp_get_initial_device()) { - free(device_ptr); - DP("omp_target_free deallocated host ptr\n"); - return; - } - - if (!device_is_ready(device_num)) { - DP("omp_target_free returns, nothing to do\n"); - return; - } - - DeviceTy &Device = Devices[device_num]; - Device.RTL->data_delete(Device.RTLDeviceID, (void *)device_ptr); - DP("omp_target_free deallocated device ptr\n"); -} - -EXTERN int omp_target_is_present(void *ptr, int device_num) { - DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", - device_num, DPxPTR(ptr)); - - if (!ptr) { - DP("Call to omp_target_is_present with NULL ptr, returning false\n"); - return false; - } - - if (device_num == omp_get_initial_device()) { - DP("Call to omp_target_is_present on host, returning true\n"); - return true; - } - - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); - if (Devices_size <= (size_t)device_num) { - DP("Call to omp_target_is_present with invalid device ID, returning " - "false\n"); - return false; - } - - DeviceTy& Device = Devices[device_num]; - bool IsLast; // not used - bool IsHostPtr; - void *TgtPtr = Device.getTgtPtrBegin(ptr, 0, IsLast, false, IsHostPtr); - int rc = (TgtPtr != NULL); - // Under unified memory the 
host pointer can be returned by the - // getTgtPtrBegin() function which means that there is no device - // corresponding point for ptr. This function should return false - // in that situation. - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) - rc = !IsHostPtr; - DP("Call to omp_target_is_present returns %d\n", rc); - return rc; -} - -EXTERN int omp_target_memcpy(void *dst, void *src, size_t length, - size_t dst_offset, size_t src_offset, int dst_device, int src_device) { - DP("Call to omp_target_memcpy, dst device %d, src device %d, " - "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " - "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst), - DPxPTR(src), dst_offset, src_offset, length); - - if (!dst || !src || length <= 0) { - DP("Call to omp_target_memcpy with invalid arguments\n"); - return OFFLOAD_FAIL; - } - - if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) { - DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); - return OFFLOAD_FAIL; - } - - if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) { - DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); - return OFFLOAD_FAIL; - } - - int rc = OFFLOAD_SUCCESS; - void *srcAddr = (char *)src + src_offset; - void *dstAddr = (char *)dst + dst_offset; - - if (src_device == omp_get_initial_device() && - dst_device == omp_get_initial_device()) { - DP("copy from host to host\n"); - const void *p = memcpy(dstAddr, srcAddr, length); - if (p == NULL) - rc = OFFLOAD_FAIL; - } else if (src_device == omp_get_initial_device()) { - DP("copy from host to device\n"); - DeviceTy& DstDev = Devices[dst_device]; - rc = DstDev.data_submit(dstAddr, srcAddr, length, nullptr); - } else if (dst_device == omp_get_initial_device()) { - DP("copy from device to host\n"); - DeviceTy& SrcDev = Devices[src_device]; - rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr); - } else { - DP("copy from device to device\n"); - void *buffer = malloc(length); - DeviceTy& SrcDev = Devices[src_device]; - DeviceTy& DstDev = Devices[dst_device]; - rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr); - if (rc == OFFLOAD_SUCCESS) - rc = DstDev.data_submit(dstAddr, buffer, length, nullptr); - free(buffer); - } - - DP("omp_target_memcpy returns %d\n", rc); - return rc; -} - -EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, - int num_dims, const size_t *volume, const size_t *dst_offsets, - const size_t *src_offsets, const size_t *dst_dimensions, - const size_t *src_dimensions, int dst_device, int src_device) { - DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, " - "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " - "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " - "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device, - src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets), - DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions), - DPxPTR(volume), element_size, num_dims); - - if (!(dst || src)) { - DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n", - INT_MAX); - return INT_MAX; - } - - if (!dst || !src || element_size < 1 || num_dims < 1 || !volume || - !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) { - DP("Call to omp_target_memcpy_rect with invalid arguments\n"); - return OFFLOAD_FAIL; - } - - int rc; - if (num_dims == 1) { - rc = omp_target_memcpy(dst, src, element_size * volume[0], - element_size * dst_offsets[0], element_size * 
src_offsets[0], - dst_device, src_device); - } else { - size_t dst_slice_size = element_size; - size_t src_slice_size = element_size; - for (int i=1; i + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include +#include +#include + +EXTERN int omp_get_num_devices(void) { + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + + DP("Call to omp_get_num_devices returning %zd\n", Devices_size); + + return Devices_size; +} + +EXTERN int omp_get_initial_device(void) { + DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE); + return HOST_DEVICE; +} + +EXTERN void *omp_target_alloc(size_t size, int device_num) { + DP("Call to omp_target_alloc for device %d requesting %zu bytes\n", + device_num, size); + + if (size <= 0) { + DP("Call to omp_target_alloc with non-positive length\n"); + return NULL; + } + + void *rc = NULL; + + if (device_num == omp_get_initial_device()) { + rc = malloc(size); + DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc)); + return rc; + } + + if (!device_is_ready(device_num)) { + DP("omp_target_alloc returns NULL ptr\n"); + return NULL; + } + + DeviceTy &Device = Devices[device_num]; + rc = Device.RTL->data_alloc(Device.RTLDeviceID, size, NULL); + DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc)); + return rc; +} + +EXTERN void omp_target_free(void *device_ptr, int device_num) { + DP("Call to omp_target_free for device %d and address " DPxMOD "\n", + device_num, DPxPTR(device_ptr)); + + if (!device_ptr) { + DP("Call to omp_target_free with NULL ptr\n"); + return; + } + + if (device_num == omp_get_initial_device()) { + free(device_ptr); + DP("omp_target_free deallocated host ptr\n"); + return; + } + + if (!device_is_ready(device_num)) { + DP("omp_target_free returns, nothing to do\n"); + return; + } + + DeviceTy &Device = Devices[device_num]; + Device.RTL->data_delete(Device.RTLDeviceID, (void *)device_ptr); + DP("omp_target_free deallocated device ptr\n"); +} + +EXTERN int omp_target_is_present(void *ptr, int device_num) { + DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", + device_num, DPxPTR(ptr)); + + if (!ptr) { + DP("Call to omp_target_is_present with NULL ptr, returning false\n"); + return false; + } + + if (device_num == omp_get_initial_device()) { + DP("Call to omp_target_is_present on host, returning true\n"); + return true; + } + + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + if (Devices_size <= (size_t)device_num) { + DP("Call to omp_target_is_present with invalid device ID, returning " + "false\n"); + return false; + } + + DeviceTy& Device = Devices[device_num]; + bool IsLast; // not used + bool IsHostPtr; + void *TgtPtr = Device.getTgtPtrBegin(ptr, 0, IsLast, false, IsHostPtr); + int rc = (TgtPtr != NULL); + // Under unified memory the host pointer can be returned by the + // getTgtPtrBegin() function which means that there is no device + // corresponding point for ptr. This function should return false + // in that situation. 
+ if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) + rc = !IsHostPtr; + DP("Call to omp_target_is_present returns %d\n", rc); + return rc; +} + +EXTERN int omp_target_memcpy(void *dst, void *src, size_t length, + size_t dst_offset, size_t src_offset, int dst_device, int src_device) { + DP("Call to omp_target_memcpy, dst device %d, src device %d, " + "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " + "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst), + DPxPTR(src), dst_offset, src_offset, length); + + if (!dst || !src || length <= 0) { + DP("Call to omp_target_memcpy with invalid arguments\n"); + return OFFLOAD_FAIL; + } + + if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) { + DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; + } + + if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) { + DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; + } + + int rc = OFFLOAD_SUCCESS; + void *srcAddr = (char *)src + src_offset; + void *dstAddr = (char *)dst + dst_offset; + + if (src_device == omp_get_initial_device() && + dst_device == omp_get_initial_device()) { + DP("copy from host to host\n"); + const void *p = memcpy(dstAddr, srcAddr, length); + if (p == NULL) + rc = OFFLOAD_FAIL; + } else if (src_device == omp_get_initial_device()) { + DP("copy from host to device\n"); + DeviceTy& DstDev = Devices[dst_device]; + rc = DstDev.data_submit(dstAddr, srcAddr, length, nullptr); + } else if (dst_device == omp_get_initial_device()) { + DP("copy from device to host\n"); + DeviceTy& SrcDev = Devices[src_device]; + rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr); + } else { + DP("copy from device to device\n"); + DeviceTy& SrcDev = Devices[src_device]; + DeviceTy& DstDev = Devices[dst_device]; + if (SrcDev.RTL->RTLName != DstDev.RTL->RTLName) { + void *buffer = malloc(length); + rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr); + if (rc == OFFLOAD_SUCCESS) + rc = DstDev.data_submit(dstAddr, buffer, length, nullptr); + free(buffer); + } else { + rc = SrcDev.data_transfer(dstAddr, srcAddr, length, nullptr); + } + } + + DP("omp_target_memcpy returns %d\n", rc); + return rc; +} + +EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, + int num_dims, const size_t *volume, const size_t *dst_offsets, + const size_t *src_offsets, const size_t *dst_dimensions, + const size_t *src_dimensions, int dst_device, int src_device) { + DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, " + "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " + "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " + "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device, + src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets), + DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions), + DPxPTR(volume), element_size, num_dims); + + if (!(dst || src)) { + DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n", + INT_MAX); + return INT_MAX; + } + + if (!dst || !src || element_size < 1 || num_dims < 1 || !volume || + !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) { + DP("Call to omp_target_memcpy_rect with invalid arguments\n"); + return OFFLOAD_FAIL; + } + + int rc; + if (num_dims == 1) { + rc = omp_target_memcpy(dst, src, element_size * volume[0], + element_size * dst_offsets[0], element_size * src_offsets[0], + dst_device, src_device); + } else { + size_t 
dst_slice_size = element_size; + size_t src_slice_size = element_size; + for (int i=1; i -#include -#include - -/// Map between Device ID (i.e. openmp device id) and its DeviceTy. -DevicesTy Devices; - -int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) { - DataMapMtx.lock(); - - // Check if entry exists - for (auto &HT : HostDataToTargetMap) { - if ((uintptr_t)HstPtrBegin == HT.HstPtrBegin) { - // Mapping already exists - bool isValid = HT.HstPtrBegin == (uintptr_t) HstPtrBegin && - HT.HstPtrEnd == (uintptr_t) HstPtrBegin + Size && - HT.TgtPtrBegin == (uintptr_t) TgtPtrBegin; - DataMapMtx.unlock(); - if (isValid) { - DP("Attempt to re-associate the same device ptr+offset with the same " - "host ptr, nothing to do\n"); - return OFFLOAD_SUCCESS; - } else { - DP("Not allowed to re-associate a different device ptr+offset with the " - "same host ptr\n"); - return OFFLOAD_FAIL; - } - } - } - - // Mapping does not exist, allocate it with refCount=INF - HostDataToTargetTy newEntry((uintptr_t) HstPtrBegin /*HstPtrBase*/, - (uintptr_t) HstPtrBegin /*HstPtrBegin*/, - (uintptr_t) HstPtrBegin + Size /*HstPtrEnd*/, - (uintptr_t) TgtPtrBegin /*TgtPtrBegin*/, - true /*IsRefCountINF*/); - - DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", HstEnd=" - DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(newEntry.HstPtrBase), - DPxPTR(newEntry.HstPtrBegin), DPxPTR(newEntry.HstPtrEnd), - DPxPTR(newEntry.TgtPtrBegin)); - HostDataToTargetMap.push_front(newEntry); - - DataMapMtx.unlock(); - - return OFFLOAD_SUCCESS; -} - -int DeviceTy::disassociatePtr(void *HstPtrBegin) { - DataMapMtx.lock(); - - // Check if entry exists - for (HostDataToTargetListTy::iterator ii = HostDataToTargetMap.begin(); - ii != HostDataToTargetMap.end(); ++ii) { - if ((uintptr_t)HstPtrBegin == ii->HstPtrBegin) { - // Mapping exists - if (ii->isRefCountInf()) { - DP("Association found, removing it\n"); - HostDataToTargetMap.erase(ii); - DataMapMtx.unlock(); - return OFFLOAD_SUCCESS; - } else { - DP("Trying to disassociate a pointer which was not mapped via " - "omp_target_associate_ptr\n"); - break; - } - } - } - - // Mapping not found - DataMapMtx.unlock(); - DP("Association not found\n"); - return OFFLOAD_FAIL; -} - -// Get ref count of map entry containing HstPtrBegin -uint64_t DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) { - uintptr_t hp = (uintptr_t)HstPtrBegin; - uint64_t RefCnt = 0; - - DataMapMtx.lock(); - for (auto &HT : HostDataToTargetMap) { - if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd) { - DP("DeviceTy::getMapEntry: requested entry found\n"); - RefCnt = HT.getRefCount(); - break; - } - } - DataMapMtx.unlock(); - - if (RefCnt == 0) { - DP("DeviceTy::getMapEntry: requested entry not found\n"); - } - - return RefCnt; -} - -LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) { - uintptr_t hp = (uintptr_t)HstPtrBegin; - LookupResult lr; - - DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp), - Size); - for (lr.Entry = HostDataToTargetMap.begin(); - lr.Entry != HostDataToTargetMap.end(); ++lr.Entry) { - auto &HT = *lr.Entry; - // Is it contained? - lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd && - (hp+Size) <= HT.HstPtrEnd; - // Does it extend into an already mapped region? - lr.Flags.ExtendsBefore = hp < HT.HstPtrBegin && (hp+Size) > HT.HstPtrBegin; - // Does it extend beyond the mapped region? 
- lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp+Size) > HT.HstPtrEnd; - - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || - lr.Flags.ExtendsAfter) { - break; - } - } - - if (lr.Flags.ExtendsBefore) { - DP("WARNING: Pointer is not mapped but section extends into already " - "mapped data\n"); - } - if (lr.Flags.ExtendsAfter) { - DP("WARNING: Pointer is already mapped but section extends beyond mapped " - "region\n"); - } - - return lr; -} - -// Used by target_data_begin -// Return the target pointer begin (where the data will be moved). -// Allocate memory if this is the first occurrence of this mapping. -// Increment the reference counter. -// If NULL is returned, then either data allocation failed or the user tried -// to do an illegal mapping. -void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, - int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit, - bool UpdateRefCount, bool HasCloseModifier) { - void *rc = NULL; - IsHostPtr = false; - DataMapMtx.lock(); - LookupResult lr = lookupMapping(HstPtrBegin, Size); - - // Check if the pointer is contained. - // If a variable is mapped to the device manually by the user - which would - // lead to the IsContained flag to be true - then we must ensure that the - // device address is returned even under unified memory conditions. - if (lr.Flags.IsContained || - ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) { - auto &HT = *lr.Entry; - IsNew = false; - - if (UpdateRefCount) - HT.incRefCount(); - - uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); - DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " - "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""), - DPxPTR(HstPtrBegin), DPxPTR(tp), Size, - (UpdateRefCount ? " updated" : ""), - HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); - rc = (void *)tp; - } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) { - // Explicit extension of mapped data - not allowed. - DP("Explicit extension of mapping is not allowed.\n"); - } else if (Size) { - // If unified shared memory is active, implicitly mapped variables that are not - // privatized use host address. Any explicitly mapped variables also use - // host address where correctness is not impeded. In all other cases - // maps are respected. - // In addition to the mapping rules above, the close map - // modifier forces the mapping of the variable to the device. - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - !HasCloseModifier) { - DP("Return HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", - DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : "")); - IsHostPtr = true; - rc = HstPtrBegin; - } else { - // If it is not contained and Size > 0 we should create a new entry for it. - IsNew = true; - uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin); - DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " - "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), - DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); - HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase, - (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp)); - rc = (void *)tp; - } - } - - DataMapMtx.unlock(); - return rc; -} - -// Used by target_data_begin, target_data_end, target_data_update and target. -// Return the target pointer begin (where the data will be moved). 
-// Decrement the reference counter if called from target_data_end. -void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, - bool UpdateRefCount, bool &IsHostPtr) { - void *rc = NULL; - IsHostPtr = false; - IsLast = false; - DataMapMtx.lock(); - LookupResult lr = lookupMapping(HstPtrBegin, Size); - - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { - auto &HT = *lr.Entry; - IsLast = HT.getRefCount() == 1; - - if (!IsLast && UpdateRefCount) - HT.decRefCount(); - - uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); - DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " - "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size, - (UpdateRefCount ? " updated" : ""), - HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); - rc = (void *)tp; - } else if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { - // If the value isn't found in the mapping and unified shared memory - // is on then it means we have stumbled upon a value which we need to - // use directly from the host. - DP("Get HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", - DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : "")); - IsHostPtr = true; - rc = HstPtrBegin; - } - - DataMapMtx.unlock(); - return rc; -} - -// Return the target pointer begin (where the data will be moved). -// Lock-free version called when loading global symbols from the fat binary. -void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) { - uintptr_t hp = (uintptr_t)HstPtrBegin; - LookupResult lr = lookupMapping(HstPtrBegin, Size); - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { - auto &HT = *lr.Entry; - uintptr_t tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin); - return (void *)tp; - } - - return NULL; -} - -int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete, - bool HasCloseModifier) { - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) - return OFFLOAD_SUCCESS; - // Check if the pointer is contained in any sub-nodes. - int rc; - DataMapMtx.lock(); - LookupResult lr = lookupMapping(HstPtrBegin, Size); - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { - auto &HT = *lr.Entry; - if (ForceDelete) - HT.resetRefCount(); - if (HT.decRefCount() == 0) { - DP("Deleting tgt data " DPxMOD " of size %ld\n", - DPxPTR(HT.TgtPtrBegin), Size); - RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin); - DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD - ", Size=%ld\n", (ForceDelete ? " (forced)" : ""), - DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size); - HostDataToTargetMap.erase(lr.Entry); - } - rc = OFFLOAD_SUCCESS; - } else { - DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated" - " memory\n", DPxPTR(HstPtrBegin)); - rc = OFFLOAD_FAIL; - } - - DataMapMtx.unlock(); - return rc; -} - -/// Init device, should not be called directly. -void DeviceTy::init() { - // Make call to init_requires if it exists for this plugin. - if (RTL->init_requires) - RTL->init_requires(RTLs->RequiresFlags); - int32_t rc = RTL->init_device(RTLDeviceID); - if (rc == OFFLOAD_SUCCESS) { - IsInit = true; - } -} - -/// Thread-safe method to initialize the device only once. 
-int32_t DeviceTy::initOnce() { - std::call_once(InitFlag, &DeviceTy::init, this); - - // At this point, if IsInit is true, then either this thread or some other - // thread in the past successfully initialized the device, so we can return - // OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it - // failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means - // that some other thread already attempted to execute init() and if IsInit - // is still false, return OFFLOAD_FAIL. - if (IsInit) - return OFFLOAD_SUCCESS; - else - return OFFLOAD_FAIL; -} - -// Load binary to device. -__tgt_target_table *DeviceTy::load_binary(void *Img) { - RTL->Mtx.lock(); - __tgt_target_table *rc = RTL->load_binary(RTLDeviceID, Img); - RTL->Mtx.unlock(); - return rc; -} - -// Submit data to device -int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin, - int64_t Size, __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize) - return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); - else - return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, - AsyncInfoPtr); -} - -// Retrieve data from device -int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, - int64_t Size, __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize) - return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); - else - return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, - AsyncInfoPtr); -} - -// Run region on device -int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize) - return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize); - else - return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, - TgtOffsets, TgtVarsSize, AsyncInfoPtr); -} - -// Run team region on device. -int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - int32_t NumTeams, int32_t ThreadLimit, - uint64_t LoopTripCount, - __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize) - return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, - TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit, - LoopTripCount); - else - return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, - TgtOffsets, TgtVarsSize, NumTeams, - ThreadLimit, LoopTripCount, AsyncInfoPtr); -} - -/// Check whether a device has an associated RTL and initialize it if it's not -/// already initialized. -bool device_is_ready(int device_num) { - DP("Checking whether device %d is ready.\n", device_num); - // Devices.size() can only change while registering a new - // library, so try to acquire the lock of RTLs' mutex. - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); - if (Devices_size <= (size_t)device_num) { - DP("Device ID %d does not have a matching RTL\n", device_num); - return false; - } - - // Get device info - DeviceTy &Device = Devices[device_num]; - - DP("Is the device %d (local ID %d) initialized? 
%d\n", device_num, - Device.RTLDeviceID, Device.IsInit); - - // Init the device if not done before - if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) { - DP("Failed to init device %d\n", device_num); - return false; - } - - DP("Device %d is ready to use.\n", device_num); - - return true; -} +//===--------- device.cpp - Target independent OpenMP target RTL ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functionality for managing devices that are handled by RTL plugins. +// +//===----------------------------------------------------------------------===// + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include +#include +#include + +/// Map between Device ID (i.e. openmp device id) and its DeviceTy. +DevicesTy Devices; + +int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) { + DataMapMtx.lock(); + + // Check if entry exists + for (auto &HT : HostDataToTargetMap) { + if ((uintptr_t)HstPtrBegin == HT.HstPtrBegin) { + // Mapping already exists + bool isValid = HT.HstPtrBegin == (uintptr_t) HstPtrBegin && + HT.HstPtrEnd == (uintptr_t) HstPtrBegin + Size && + HT.TgtPtrBegin == (uintptr_t) TgtPtrBegin; + DataMapMtx.unlock(); + if (isValid) { + DP("Attempt to re-associate the same device ptr+offset with the same " + "host ptr, nothing to do\n"); + return OFFLOAD_SUCCESS; + } else { + DP("Not allowed to re-associate a different device ptr+offset with the " + "same host ptr\n"); + return OFFLOAD_FAIL; + } + } + } + + // Mapping does not exist, allocate it with refCount=INF + HostDataToTargetTy newEntry((uintptr_t) HstPtrBegin /*HstPtrBase*/, + (uintptr_t) HstPtrBegin /*HstPtrBegin*/, + (uintptr_t) HstPtrBegin + Size /*HstPtrEnd*/, + (uintptr_t) TgtPtrBegin /*TgtPtrBegin*/, + true /*IsRefCountINF*/); + + DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", HstEnd=" + DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(newEntry.HstPtrBase), + DPxPTR(newEntry.HstPtrBegin), DPxPTR(newEntry.HstPtrEnd), + DPxPTR(newEntry.TgtPtrBegin)); + HostDataToTargetMap.push_front(newEntry); + + DataMapMtx.unlock(); + + return OFFLOAD_SUCCESS; +} + +int DeviceTy::disassociatePtr(void *HstPtrBegin) { + DataMapMtx.lock(); + + // Check if entry exists + for (HostDataToTargetListTy::iterator ii = HostDataToTargetMap.begin(); + ii != HostDataToTargetMap.end(); ++ii) { + if ((uintptr_t)HstPtrBegin == ii->HstPtrBegin) { + // Mapping exists + if (ii->isRefCountInf()) { + DP("Association found, removing it\n"); + HostDataToTargetMap.erase(ii); + DataMapMtx.unlock(); + return OFFLOAD_SUCCESS; + } else { + DP("Trying to disassociate a pointer which was not mapped via " + "omp_target_associate_ptr\n"); + break; + } + } + } + + // Mapping not found + DataMapMtx.unlock(); + DP("Association not found\n"); + return OFFLOAD_FAIL; +} + +// Get ref count of map entry containing HstPtrBegin +uint64_t DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) { + uintptr_t hp = (uintptr_t)HstPtrBegin; + uint64_t RefCnt = 0; + + DataMapMtx.lock(); + for (auto &HT : HostDataToTargetMap) { + if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd) { + DP("DeviceTy::getMapEntry: requested entry found\n"); + RefCnt = HT.getRefCount(); + break; + } + } + DataMapMtx.unlock(); + + if (RefCnt == 0) { + DP("DeviceTy::getMapEntry: requested entry 
not found\n"); + } + + return RefCnt; +} + +LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) { + uintptr_t hp = (uintptr_t)HstPtrBegin; + LookupResult lr; + + DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp), + Size); + for (lr.Entry = HostDataToTargetMap.begin(); + lr.Entry != HostDataToTargetMap.end(); ++lr.Entry) { + auto &HT = *lr.Entry; + // Is it contained? + lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd && + (hp+Size) <= HT.HstPtrEnd; + // Does it extend into an already mapped region? + lr.Flags.ExtendsBefore = hp < HT.HstPtrBegin && (hp+Size) > HT.HstPtrBegin; + // Does it extend beyond the mapped region? + lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp+Size) > HT.HstPtrEnd; + + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || + lr.Flags.ExtendsAfter) { + break; + } + } + + if (lr.Flags.ExtendsBefore) { + DP("WARNING: Pointer is not mapped but section extends into already " + "mapped data\n"); + } + if (lr.Flags.ExtendsAfter) { + DP("WARNING: Pointer is already mapped but section extends beyond mapped " + "region\n"); + } + + return lr; +} + +// Used by target_data_begin +// Return the target pointer begin (where the data will be moved). +// Allocate memory if this is the first occurrence of this mapping. +// Increment the reference counter. +// If NULL is returned, then either data allocation failed or the user tried +// to do an illegal mapping. +void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, + int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit, + bool UpdateRefCount, bool HasCloseModifier) { + void *rc = NULL; + IsHostPtr = false; + DataMapMtx.lock(); + LookupResult lr = lookupMapping(HstPtrBegin, Size); + + // Check if the pointer is contained. + // If a variable is mapped to the device manually by the user - which would + // lead to the IsContained flag to be true - then we must ensure that the + // device address is returned even under unified memory conditions. + if (lr.Flags.IsContained || + ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) { + auto &HT = *lr.Entry; + IsNew = false; + + if (UpdateRefCount) + HT.incRefCount(); + + uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " + "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""), + DPxPTR(HstPtrBegin), DPxPTR(tp), Size, + (UpdateRefCount ? " updated" : ""), + HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); + rc = (void *)tp; + } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) { + // Explicit extension of mapped data - not allowed. + DP("Explicit extension of mapping is not allowed.\n"); + } else if (Size) { + // If unified shared memory is active, implicitly mapped variables that are not + // privatized use host address. Any explicitly mapped variables also use + // host address where correctness is not impeded. In all other cases + // maps are respected. + // In addition to the mapping rules above, the close map + // modifier forces the mapping of the variable to the device. + if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + !HasCloseModifier) { + DP("Return HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", + DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : "")); + IsHostPtr = true; + rc = HstPtrBegin; + } else { + // If it is not contained and Size > 0 we should create a new entry for it. 
+ IsNew = true; + uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin); + DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " + "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), + DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); + HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase, + (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp)); + rc = (void *)tp; + } + } + + DataMapMtx.unlock(); + return rc; +} + +// Used by target_data_begin, target_data_end, target_data_update and target. +// Return the target pointer begin (where the data will be moved). +// Decrement the reference counter if called from target_data_end. +void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, + bool UpdateRefCount, bool &IsHostPtr) { + void *rc = NULL; + IsHostPtr = false; + IsLast = false; + DataMapMtx.lock(); + LookupResult lr = lookupMapping(HstPtrBegin, Size); + + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + auto &HT = *lr.Entry; + IsLast = HT.getRefCount() == 1; + + if (!IsLast && UpdateRefCount) + HT.decRefCount(); + + uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " + "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size, + (UpdateRefCount ? " updated" : ""), + HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); + rc = (void *)tp; + } else if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { + // If the value isn't found in the mapping and unified shared memory + // is on then it means we have stumbled upon a value which we need to + // use directly from the host. + DP("Get HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", + DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : "")); + IsHostPtr = true; + rc = HstPtrBegin; + } + + DataMapMtx.unlock(); + return rc; +} + +// Return the target pointer begin (where the data will be moved). +// Lock-free version called when loading global symbols from the fat binary. +void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) { + uintptr_t hp = (uintptr_t)HstPtrBegin; + LookupResult lr = lookupMapping(HstPtrBegin, Size); + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + auto &HT = *lr.Entry; + uintptr_t tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin); + return (void *)tp; + } + + return NULL; +} + +int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete, + bool HasCloseModifier) { + if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) + return OFFLOAD_SUCCESS; + // Check if the pointer is contained in any sub-nodes. + int rc; + DataMapMtx.lock(); + LookupResult lr = lookupMapping(HstPtrBegin, Size); + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + auto &HT = *lr.Entry; + if (ForceDelete) + HT.resetRefCount(); + if (HT.decRefCount() == 0) { + DP("Deleting tgt data " DPxMOD " of size %ld\n", + DPxPTR(HT.TgtPtrBegin), Size); + RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin); + DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD + ", Size=%ld\n", (ForceDelete ? 
" (forced)" : ""), + DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size); + HostDataToTargetMap.erase(lr.Entry); + } + rc = OFFLOAD_SUCCESS; + } else { + DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated" + " memory\n", DPxPTR(HstPtrBegin)); + rc = OFFLOAD_FAIL; + } + + DataMapMtx.unlock(); + return rc; +} + +/// Init device, should not be called directly. +void DeviceTy::init() { + // Make call to init_requires if it exists for this plugin. + if (RTL->init_requires) + RTL->init_requires(RTLs->RequiresFlags); + int32_t rc = RTL->init_device(RTLDeviceID); + if (rc == OFFLOAD_SUCCESS) { + IsInit = true; + } +} + +/// Thread-safe method to initialize the device only once. +int32_t DeviceTy::initOnce() { + std::call_once(InitFlag, &DeviceTy::init, this); + + // At this point, if IsInit is true, then either this thread or some other + // thread in the past successfully initialized the device, so we can return + // OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it + // failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means + // that some other thread already attempted to execute init() and if IsInit + // is still false, return OFFLOAD_FAIL. + if (IsInit) + return OFFLOAD_SUCCESS; + else + return OFFLOAD_FAIL; +} + +// Load binary to device. +__tgt_target_table *DeviceTy::load_binary(void *Img) { + RTL->Mtx.lock(); + __tgt_target_table *rc = RTL->load_binary(RTLDeviceID, Img); + RTL->Mtx.unlock(); + return rc; +} + +// Submit data to device +int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin, + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize) + return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); + else + return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, + AsyncInfoPtr); +} + +// Retrieve data from device +int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize) + return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); + else + return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, + AsyncInfoPtr); +} + +// Transfer data between device from same vendor +int32_t DeviceTy::data_transfer(void *DstPtrBegin, void *SrcPtrBegin, + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize) + return RTL->data_transfer(RTLDeviceID, DstPtrBegin, SrcPtrBegin, Size); + else + return RTL->data_transfer_async(RTLDeviceID, DstPtrBegin, SrcPtrBegin, Size, + AsyncInfoPtr); +} + +// Run region on device +int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize) + return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, + TgtVarsSize); + else + return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, AsyncInfoPtr); +} + +// Run team region on device. 
+int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize) + return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit, + LoopTripCount); + else + return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, NumTeams, + ThreadLimit, LoopTripCount, AsyncInfoPtr); +} + +/// Check whether a device has an associated RTL and initialize it if it's not +/// already initialized. +bool device_is_ready(int device_num) { + DP("Checking whether device %d is ready.\n", device_num); + // Devices.size() can only change while registering a new + // library, so try to acquire the lock of RTLs' mutex. + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + if (Devices_size <= (size_t)device_num) { + DP("Device ID %d does not have a matching RTL\n", device_num); + return false; + } + + // Get device info + DeviceTy &Device = Devices[device_num]; + + DP("Is the device %d (local ID %d) initialized? %d\n", device_num, + Device.RTLDeviceID, Device.IsInit); + + // Init the device if not done before + if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) { + DP("Failed to init device %d\n", device_num); + return false; + } + + DP("Device %d is ready to use.\n", device_num); + + return true; +} diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h index a3a5767f81ff5..2526c7d5268ce 100644 --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -1,204 +1,206 @@ -//===----------- device.h - Target independent OpenMP target RTL ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Declarations for managing devices that are handled by RTL plugins. -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_DEVICE_H -#define _OMPTARGET_DEVICE_H - -#include -#include -#include -#include -#include -#include - -// Forward declarations. -struct RTLInfoTy; -struct __tgt_bin_desc; -struct __tgt_target_table; -struct __tgt_async_info; - -/// Map between host data and target data. -struct HostDataToTargetTy { - uintptr_t HstPtrBase; // host info. - uintptr_t HstPtrBegin; - uintptr_t HstPtrEnd; // non-inclusive. - - uintptr_t TgtPtrBegin; // target info. - -private: - uint64_t RefCount; - static const uint64_t INFRefCount = ~(uint64_t)0; - -public: - HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB, - bool IsINF = false) - : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), - TgtPtrBegin(TB), RefCount(IsINF ? 
INFRefCount : 1) {} - - uint64_t getRefCount() const { - return RefCount; - } - - uint64_t resetRefCount() { - if (RefCount != INFRefCount) - RefCount = 1; - - return RefCount; - } - - uint64_t incRefCount() { - if (RefCount != INFRefCount) { - ++RefCount; - assert(RefCount < INFRefCount && "refcount overflow"); - } - - return RefCount; - } - - uint64_t decRefCount() { - if (RefCount != INFRefCount) { - assert(RefCount > 0 && "refcount underflow"); - --RefCount; - } - - return RefCount; - } - - bool isRefCountInf() const { - return RefCount == INFRefCount; - } -}; - -typedef std::list<HostDataToTargetTy> HostDataToTargetListTy; - -struct LookupResult { - struct { - unsigned IsContained : 1; - unsigned ExtendsBefore : 1; - unsigned ExtendsAfter : 1; - } Flags; - - HostDataToTargetListTy::iterator Entry; - - LookupResult() : Flags({0,0,0}), Entry() {} -}; - -/// Map for shadow pointers -struct ShadowPtrValTy { - void *HstPtrVal; - void *TgtPtrAddr; - void *TgtPtrVal; -}; -typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy; - -/// -struct PendingCtorDtorListsTy { - std::list<void *> PendingCtors; - std::list<void *> PendingDtors; -}; -typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> - PendingCtorsDtorsPerLibrary; - -struct DeviceTy { - int32_t DeviceID; - RTLInfoTy *RTL; - int32_t RTLDeviceID; - - bool IsInit; - std::once_flag InitFlag; - bool HasPendingGlobals; - - HostDataToTargetListTy HostDataToTargetMap; - PendingCtorsDtorsPerLibrary PendingCtorsDtors; - - ShadowPtrListTy ShadowPtrMap; - - std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx; - - // NOTE: Once libomp gains full target-task support, this state should be - // moved into the target task in libomp. - std::map LoopTripCnt; - - DeviceTy(RTLInfoTy *RTL) - : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), - HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), - ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {} - - // The existence of mutexes makes DeviceTy non-copyable. We need to - // provide a copy constructor and an assignment operator explicitly. 
- DeviceTy(const DeviceTy &d) - : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID), - IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals), - HostDataToTargetMap(d.HostDataToTargetMap), - PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap), - DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), - LoopTripCnt(d.LoopTripCnt) {} - - DeviceTy& operator=(const DeviceTy &d) { - DeviceID = d.DeviceID; - RTL = d.RTL; - RTLDeviceID = d.RTLDeviceID; - IsInit = d.IsInit; - HasPendingGlobals = d.HasPendingGlobals; - HostDataToTargetMap = d.HostDataToTargetMap; - PendingCtorsDtors = d.PendingCtorsDtors; - ShadowPtrMap = d.ShadowPtrMap; - LoopTripCnt = d.LoopTripCnt; - - return *this; - } - - uint64_t getMapEntryRefCnt(void *HstPtrBegin); - LookupResult lookupMapping(void *HstPtrBegin, int64_t Size); - void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, - bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true, - bool HasCloseModifier = false); - void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size); - void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, - bool UpdateRefCount, bool &IsHostPtr); - int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete, - bool HasCloseModifier = false); - int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); - int disassociatePtr(void *HstPtrBegin); - - // calls to RTL - int32_t initOnce(); - __tgt_target_table *load_binary(void *Img); - - // Data transfer. When AsyncInfoPtr is nullptr, the transfer will be - // synchronous. - int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, - __tgt_async_info *AsyncInfoPtr); - int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, - __tgt_async_info *AsyncInfoPtr); - - int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - __tgt_async_info *AsyncInfoPtr); - int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - int32_t NumTeams, int32_t ThreadLimit, - uint64_t LoopTripCount, - __tgt_async_info *AsyncInfoPtr); - -private: - // Call to RTL - void init(); // To be called only via DeviceTy::initOnce() -}; - -/// Map between Device ID (i.e. openmp device id) and its DeviceTy. -typedef std::vector DevicesTy; -extern DevicesTy Devices; - -extern bool device_is_ready(int device_num); - -#endif +//===----------- device.h - Target independent OpenMP target RTL ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declarations for managing devices that are handled by RTL plugins. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_DEVICE_H +#define _OMPTARGET_DEVICE_H + +#include +#include +#include +#include +#include +#include + +// Forward declarations. +struct RTLInfoTy; +struct __tgt_bin_desc; +struct __tgt_target_table; +struct __tgt_async_info; + +/// Map between host data and target data. +struct HostDataToTargetTy { + uintptr_t HstPtrBase; // host info. + uintptr_t HstPtrBegin; + uintptr_t HstPtrEnd; // non-inclusive. + + uintptr_t TgtPtrBegin; // target info. 
+ +private: + uint64_t RefCount; + static const uint64_t INFRefCount = ~(uint64_t)0; + +public: + HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB, + bool IsINF = false) + : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), + TgtPtrBegin(TB), RefCount(IsINF ? INFRefCount : 1) {} + + uint64_t getRefCount() const { + return RefCount; + } + + uint64_t resetRefCount() { + if (RefCount != INFRefCount) + RefCount = 1; + + return RefCount; + } + + uint64_t incRefCount() { + if (RefCount != INFRefCount) { + ++RefCount; + assert(RefCount < INFRefCount && "refcount overflow"); + } + + return RefCount; + } + + uint64_t decRefCount() { + if (RefCount != INFRefCount) { + assert(RefCount > 0 && "refcount underflow"); + --RefCount; + } + + return RefCount; + } + + bool isRefCountInf() const { + return RefCount == INFRefCount; + } +}; + +typedef std::list<HostDataToTargetTy> HostDataToTargetListTy; + +struct LookupResult { + struct { + unsigned IsContained : 1; + unsigned ExtendsBefore : 1; + unsigned ExtendsAfter : 1; + } Flags; + + HostDataToTargetListTy::iterator Entry; + + LookupResult() : Flags({0,0,0}), Entry() {} +}; + +/// Map for shadow pointers +struct ShadowPtrValTy { + void *HstPtrVal; + void *TgtPtrAddr; + void *TgtPtrVal; +}; +typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy; + +/// +struct PendingCtorDtorListsTy { + std::list<void *> PendingCtors; + std::list<void *> PendingDtors; +}; +typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> + PendingCtorsDtorsPerLibrary; + +struct DeviceTy { + int32_t DeviceID; + RTLInfoTy *RTL; + int32_t RTLDeviceID; + + bool IsInit; + std::once_flag InitFlag; + bool HasPendingGlobals; + + HostDataToTargetListTy HostDataToTargetMap; + PendingCtorsDtorsPerLibrary PendingCtorsDtors; + + ShadowPtrListTy ShadowPtrMap; + + std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx; + + // NOTE: Once libomp gains full target-task support, this state should be + // moved into the target task in libomp. + std::map LoopTripCnt; + + DeviceTy(RTLInfoTy *RTL) + : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), + HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), + ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {} + + // The existence of mutexes makes DeviceTy non-copyable. We need to + // provide a copy constructor and an assignment operator explicitly. 
+ DeviceTy(const DeviceTy &d) + : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID), + IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals), + HostDataToTargetMap(d.HostDataToTargetMap), + PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap), + DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), + LoopTripCnt(d.LoopTripCnt) {} + + DeviceTy& operator=(const DeviceTy &d) { + DeviceID = d.DeviceID; + RTL = d.RTL; + RTLDeviceID = d.RTLDeviceID; + IsInit = d.IsInit; + HasPendingGlobals = d.HasPendingGlobals; + HostDataToTargetMap = d.HostDataToTargetMap; + PendingCtorsDtors = d.PendingCtorsDtors; + ShadowPtrMap = d.ShadowPtrMap; + LoopTripCnt = d.LoopTripCnt; + + return *this; + } + + uint64_t getMapEntryRefCnt(void *HstPtrBegin); + LookupResult lookupMapping(void *HstPtrBegin, int64_t Size); + void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, + bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true, + bool HasCloseModifier = false); + void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size); + void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, + bool UpdateRefCount, bool &IsHostPtr); + int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete, + bool HasCloseModifier = false); + int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); + int disassociatePtr(void *HstPtrBegin); + + // calls to RTL + int32_t initOnce(); + __tgt_target_table *load_binary(void *Img); + + // Data transfer. When AsyncInfoPtr is nullptr, the transfer will be + // synchronous. + int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + int32_t data_transfer(void *DstPtrBegin, void *SrcPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + + int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + __tgt_async_info *AsyncInfoPtr); + int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfoPtr); + +private: + // Call to RTL + void init(); // To be called only via DeviceTy::initOnce() +}; + +/// Map between Device ID (i.e. openmp device id) and its DeviceTy. 
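For illustration only (not part of this patch): a minimal sketch, under assumed names, of how the DeviceTy wrappers declared above compose into one synchronous offload, using the Devices table and device_is_ready() declared just below. The real driver (target() in omptarget.cpp, later in this patch) additionally resolves the entry point through the translation tables, batches arguments, and handles shadow pointers and async info; the hypothetical helper below skips all of that.

int offloadOneArraySketch(int device_id, void *TgtEntryPtr, void *HstBuf,
                          int64_t Size) {
  if (!device_is_ready(device_id))
    return OFFLOAD_FAIL;
  DeviceTy &Device = Devices[device_id];

  // Reserve (or look up) device memory for the host buffer.
  bool IsNew, IsHostPtr;
  void *TgtBuf = Device.getOrAllocTgtPtr(HstBuf, HstBuf, Size, IsNew,
                                         IsHostPtr, /*IsImplicit=*/false);
  if (!TgtBuf)
    return OFFLOAD_FAIL;

  // Copy in, launch synchronously (null async info), copy out.
  ptrdiff_t Offset = 0;
  if (Device.data_submit(TgtBuf, HstBuf, Size, nullptr) != OFFLOAD_SUCCESS ||
      Device.run_region(TgtEntryPtr, &TgtBuf, &Offset, /*TgtVarsSize=*/1,
                        nullptr) != OFFLOAD_SUCCESS ||
      Device.data_retrieve(HstBuf, TgtBuf, Size, nullptr) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Drop the reference taken by getOrAllocTgtPtr.
  return Device.deallocTgtPtr(HstBuf, Size, /*ForceDelete=*/false);
}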
+typedef std::vector DevicesTy; +extern DevicesTy Devices; + +extern bool device_is_ready(int device_num); + +#endif diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports index e1fee4bbefcec..e8f35e531db2a 100644 --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -1,31 +1,31 @@ -VERS1.0 { - global: - __tgt_register_requires; - __tgt_register_lib; - __tgt_unregister_lib; - __tgt_target_data_begin; - __tgt_target_data_end; - __tgt_target_data_update; - __tgt_target; - __tgt_target_teams; - __tgt_target_data_begin_nowait; - __tgt_target_data_end_nowait; - __tgt_target_data_update_nowait; - __tgt_target_nowait; - __tgt_target_teams_nowait; - __tgt_mapper_num_components; - __tgt_push_mapper_component; - omp_get_num_devices; - omp_get_initial_device; - omp_target_alloc; - omp_target_free; - omp_target_is_present; - omp_target_memcpy; - omp_target_memcpy_rect; - omp_target_associate_ptr; - omp_target_disassociate_ptr; - __kmpc_push_target_tripcount; - local: - *; -}; - +VERS1.0 { + global: + __tgt_register_requires; + __tgt_register_lib; + __tgt_unregister_lib; + __tgt_target_data_begin; + __tgt_target_data_end; + __tgt_target_data_update; + __tgt_target; + __tgt_target_teams; + __tgt_target_data_begin_nowait; + __tgt_target_data_end_nowait; + __tgt_target_data_update_nowait; + __tgt_target_nowait; + __tgt_target_teams_nowait; + __tgt_mapper_num_components; + __tgt_push_mapper_component; + omp_get_num_devices; + omp_get_initial_device; + omp_target_alloc; + omp_target_free; + omp_target_is_present; + omp_target_memcpy; + omp_target_memcpy_rect; + omp_target_associate_ptr; + omp_target_disassociate_ptr; + __kmpc_push_target_tripcount; + local: + *; +}; + diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index 924bc490b1107..5e2aff6c82dbc 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -1,350 +1,350 @@ -//===-------- interface.cpp - Target independent OpenMP target RTL --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implementation of the interface to be used by Clang during the codegen of a -// target region. 
-// -//===----------------------------------------------------------------------===// - -#include - -#include "device.h" -#include "private.h" -#include "rtl.h" - -#include -#include -#include - -// Store target policy (disabled, mandatory, default) -kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default; -std::mutex TargetOffloadMtx; - -//////////////////////////////////////////////////////////////////////////////// -/// manage the success or failure of a target construct - -static void HandleDefaultTargetOffload() { - TargetOffloadMtx.lock(); - if (TargetOffloadPolicy == tgt_default) { - if (omp_get_num_devices() > 0) { - DP("Default TARGET OFFLOAD policy is now mandatory " - "(devices were found)\n"); - TargetOffloadPolicy = tgt_mandatory; - } else { - DP("Default TARGET OFFLOAD policy is now disabled " - "(no devices were found)\n"); - TargetOffloadPolicy = tgt_disabled; - } - } - TargetOffloadMtx.unlock(); -} - -static int IsOffloadDisabled() { - if (TargetOffloadPolicy == tgt_default) HandleDefaultTargetOffload(); - return TargetOffloadPolicy == tgt_disabled; -} - -static void HandleTargetOutcome(bool success) { - switch (TargetOffloadPolicy) { - case tgt_disabled: - if (success) { - FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); - } - break; - case tgt_default: - FATAL_MESSAGE0(1, "default offloading policy must be switched to " - "mandatory or disabled"); - break; - case tgt_mandatory: - if (!success) { - FATAL_MESSAGE0(1, "failure of target construct while offloading is mandatory"); - } - break; - } -} - -//////////////////////////////////////////////////////////////////////////////// -/// adds requires flags -EXTERN void __tgt_register_requires(int64_t flags) { - RTLs->RegisterRequires(flags); -} - -//////////////////////////////////////////////////////////////////////////////// -/// adds a target shared library to the target execution image -EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) { - RTLs->RegisterLib(desc); -} - -//////////////////////////////////////////////////////////////////////////////// -/// unloads a target shared library -EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) { - RTLs->UnregisterLib(desc); -} - -/// creates host-to-target data mapping, stores it in the -/// libomptarget.so internal structure (an entry in a stack of data maps) -/// and passes the data to the device. -EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - if (IsOffloadDisabled()) return; - - DP("Entering data begin region for device %" PRId64 " with %d mappings\n", - device_id, arg_num); - - // No devices available? 
- if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - DP("Use default device id %" PRId64 "\n", device_id); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return; - } - - DeviceTy &Device = Devices[device_id]; - -#ifdef OMPTARGET_DEBUG - for (int i = 0; i < arg_num; ++i) { - DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 - ", Type=0x%" PRIx64 "\n", - i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]); - } -#endif - - int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, - arg_types, nullptr); - HandleTargetOutcome(rc == OFFLOAD_SUCCESS); -} - -EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t depNum, void *depList, int32_t noAliasDepNum, - void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, - arg_types); -} - -/// passes data from the target, releases target memory and destroys -/// the host-target mapping (top entry from the stack of data maps) -/// created by the last __tgt_target_data_begin. -EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - if (IsOffloadDisabled()) return; - DP("Entering data end region with %d mappings\n", arg_num); - - // No devices available? - if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); - if (Devices_size <= (size_t)device_id) { - DP("Device ID %" PRId64 " does not have a matching RTL.\n", device_id); - HandleTargetOutcome(false); - return; - } - - DeviceTy &Device = Devices[device_id]; - if (!Device.IsInit) { - DP("Uninit device: ignore"); - HandleTargetOutcome(false); - return; - } - -#ifdef OMPTARGET_DEBUG - for (int i=0; i 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, - arg_types); -} - -EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - if (IsOffloadDisabled()) return; - DP("Entering data update with %d mappings\n", arg_num); - - // No devices available? 
- if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return; - } - - DeviceTy& Device = Devices[device_id]; - int rc = target_data_update(Device, arg_num, args_base, - args, arg_sizes, arg_types); - HandleTargetOutcome(rc == OFFLOAD_SUCCESS); -} - -EXTERN void __tgt_target_data_update_nowait( - int64_t device_id, int32_t arg_num, void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes, - arg_types); -} - -EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - if (IsOffloadDisabled()) return OFFLOAD_FAIL; - DP("Entering target region with entry point " DPxMOD " and device Id %" - PRId64 "\n", DPxPTR(host_ptr), device_id); - - if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return OFFLOAD_FAIL; - } - -#ifdef OMPTARGET_DEBUG - for (int i=0; i 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, - arg_types); -} - -EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr, - int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, int32_t team_num, int32_t thread_limit) { - if (IsOffloadDisabled()) return OFFLOAD_FAIL; - DP("Entering target region with entry point " DPxMOD " and device Id %" - PRId64 "\n", DPxPTR(host_ptr), device_id); - - if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return OFFLOAD_FAIL; - } - -#ifdef OMPTARGET_DEBUG - for (int i=0; i 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args, - arg_sizes, arg_types, team_num, thread_limit); -} - -// Get the current number of components for a user-defined mapper. -EXTERN int64_t __tgt_mapper_num_components(void *rt_mapper_handle) { - auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; - int64_t size = MapperComponentsPtr->Components.size(); - DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", - DPxPTR(rt_mapper_handle), size); - return size; -} - -// Push back one component for a user-defined mapper. 
-EXTERN void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, - void *begin, int64_t size, - int64_t type) { - DP("__tgt_push_mapper_component(Handle=" DPxMOD - ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 - ", Type=0x%" PRIx64 ").\n", - DPxPTR(rt_mapper_handle), DPxPTR(base), DPxPTR(begin), size, type); - auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; - MapperComponentsPtr->Components.push_back( - MapComponentInfoTy(base, begin, size, type)); -} - -EXTERN void __kmpc_push_target_tripcount(int64_t device_id, - uint64_t loop_tripcount) { - if (IsOffloadDisabled()) - return; - - if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return; - } - - DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id, - loop_tripcount); - TblMapMtx->lock(); - Devices[device_id].LoopTripCnt.emplace(__kmpc_global_thread_num(NULL), - loop_tripcount); - TblMapMtx->unlock(); -} +//===-------- interface.cpp - Target independent OpenMP target RTL --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of the interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#include + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include +#include +#include + +// Store target policy (disabled, mandatory, default) +kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default; +std::mutex TargetOffloadMtx; + +//////////////////////////////////////////////////////////////////////////////// +/// manage the success or failure of a target construct + +static void HandleDefaultTargetOffload() { + TargetOffloadMtx.lock(); + if (TargetOffloadPolicy == tgt_default) { + if (omp_get_num_devices() > 0) { + DP("Default TARGET OFFLOAD policy is now mandatory " + "(devices were found)\n"); + TargetOffloadPolicy = tgt_mandatory; + } else { + DP("Default TARGET OFFLOAD policy is now disabled " + "(no devices were found)\n"); + TargetOffloadPolicy = tgt_disabled; + } + } + TargetOffloadMtx.unlock(); +} + +static int IsOffloadDisabled() { + if (TargetOffloadPolicy == tgt_default) HandleDefaultTargetOffload(); + return TargetOffloadPolicy == tgt_disabled; +} + +static void HandleTargetOutcome(bool success) { + switch (TargetOffloadPolicy) { + case tgt_disabled: + if (success) { + FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); + } + break; + case tgt_default: + FATAL_MESSAGE0(1, "default offloading policy must be switched to " + "mandatory or disabled"); + break; + case tgt_mandatory: + if (!success) { + FATAL_MESSAGE0(1, "failure of target construct while offloading is mandatory"); + } + break; + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// adds requires flags +EXTERN void __tgt_register_requires(int64_t flags) { + RTLs->RegisterRequires(flags); +} + +//////////////////////////////////////////////////////////////////////////////// +/// adds a target shared library to the target execution image +EXTERN void 
__tgt_register_lib(__tgt_bin_desc *desc) { + RTLs->RegisterLib(desc); +} + +//////////////////////////////////////////////////////////////////////////////// +/// unloads a target shared library +EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) { + RTLs->UnregisterLib(desc); +} + +/// creates host-to-target data mapping, stores it in the +/// libomptarget.so internal structure (an entry in a stack of data maps) +/// and passes the data to the device. +EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return; + + DP("Entering data begin region for device %" PRId64 " with %d mappings\n", + device_id, arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + DP("Use default device id %" PRId64 "\n", device_id); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy &Device = Devices[device_id]; + +#ifdef OMPTARGET_DEBUG + for (int i = 0; i < arg_num; ++i) { + DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 "\n", + i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]); + } +#endif + + int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, + arg_types, nullptr); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); +} + +EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// passes data from the target, releases target memory and destroys +/// the host-target mapping (top entry from the stack of data maps) +/// created by the last __tgt_target_data_begin. +EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return; + DP("Entering data end region with %d mappings\n", arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + if (Devices_size <= (size_t)device_id) { + DP("Device ID %" PRId64 " does not have a matching RTL.\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy &Device = Devices[device_id]; + if (!Device.IsInit) { + DP("Uninit device: ignore"); + HandleTargetOutcome(false); + return; + } + +#ifdef OMPTARGET_DEBUG + for (int i=0; i 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return; + DP("Entering data update with %d mappings\n", arg_num); + + // No devices available? 
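For illustration only (not part of this patch): roughly the call sequence compiler-generated host code makes against the entry points above for a simple #pragma omp target data map(tofrom: a[0:N]). The wrapper name, the flag combination and the use of the default device are illustrative assumptions; the real encoding is emitted by Clang.

void host_side_sketch(double *a, int64_t N) {
  void *args_base[] = {a};
  void *args[] = {a};
  int64_t arg_sizes[] = {N * (int64_t)sizeof(double)};
  // "tofrom": copy to the device on entry, back to the host on exit.
  int64_t arg_types[] = {OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM};

  __tgt_target_data_begin(/*device_id=*/OFFLOAD_DEVICE_DEFAULT, /*arg_num=*/1,
                          args_base, args, arg_sizes, arg_types);
  // ... host code executed while the data region is active ...
  __tgt_target_data_end(OFFLOAD_DEVICE_DEFAULT, /*arg_num=*/1, args_base, args,
                        arg_sizes, arg_types);
}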
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy& Device = Devices[device_id]; + int rc = target_data_update(Device, arg_num, args_base, + args, arg_sizes, arg_types); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); +} + +EXTERN void __tgt_target_data_update_nowait( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return OFFLOAD_FAIL; + DP("Entering target region with entry point " DPxMOD " and device Id %" + PRId64 "\n", DPxPTR(host_ptr), device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return OFFLOAD_FAIL; + } + +#ifdef OMPTARGET_DEBUG + for (int i=0; i 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t team_num, int32_t thread_limit) { + if (IsOffloadDisabled()) return OFFLOAD_FAIL; + DP("Entering target region with entry point " DPxMOD " and device Id %" + PRId64 "\n", DPxPTR(host_ptr), device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return OFFLOAD_FAIL; + } + +#ifdef OMPTARGET_DEBUG + for (int i=0; i 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, team_num, thread_limit); +} + +// Get the current number of components for a user-defined mapper. +EXTERN int64_t __tgt_mapper_num_components(void *rt_mapper_handle) { + auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; + int64_t size = MapperComponentsPtr->Components.size(); + DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", + DPxPTR(rt_mapper_handle), size); + return size; +} + +// Push back one component for a user-defined mapper. 
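For illustration only (not part of this patch): a sketch of how a compiler-generated mapper function might drive __tgt_mapper_num_components above and the __tgt_push_mapper_component entry point that follows. The mapper name and its exact contract with Clang are assumptions, not taken from this patch.

void omp_mapper_sketch(void *rt_mapper_handle, void *base, void *begin,
                       int64_t size, int64_t type) {
  // Components already collected for this handle (useful when mappers nest
  // and later entries need to know their position in the list).
  int64_t previous = __tgt_mapper_num_components(rt_mapper_handle);
  (void)previous;
  // Record one mapping component for the element being mapped.
  __tgt_push_mapper_component(rt_mapper_handle, base, begin, size, type);
}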
+EXTERN void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, + void *begin, int64_t size, + int64_t type) { + DP("__tgt_push_mapper_component(Handle=" DPxMOD + ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 ").\n", + DPxPTR(rt_mapper_handle), DPxPTR(base), DPxPTR(begin), size, type); + auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; + MapperComponentsPtr->Components.push_back( + MapComponentInfoTy(base, begin, size, type)); +} + +EXTERN void __kmpc_push_target_tripcount(int64_t device_id, + uint64_t loop_tripcount) { + if (IsOffloadDisabled()) + return; + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id, + loop_tripcount); + TblMapMtx->lock(); + Devices[device_id].LoopTripCnt.emplace(__kmpc_global_thread_num(NULL), + loop_tripcount); + TblMapMtx->unlock(); +} diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 3113bdc2a9d39..ea6ca336d1c0b 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -1,823 +1,823 @@ -//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implementation of the interface to be used by Clang during the codegen of a -// target region. -// -//===----------------------------------------------------------------------===// - -#include - -#include "device.h" -#include "private.h" -#include "rtl.h" - -#include -#include - -#ifdef OMPTARGET_DEBUG -int DebugLevel = 0; -#endif // OMPTARGET_DEBUG - - - -/* All begin addresses for partially mapped structs must be 8-aligned in order - * to ensure proper alignment of members. E.g. - * - * struct S { - * int a; // 4-aligned - * int b; // 4-aligned - * int *p; // 8-aligned - * } s1; - * ... - * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) - * { - * s1.b = 5; - * for (int i...) s1.p[i] = ...; - * } - * - * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and - * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100, - * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment - * requirements for its type. Now, when we allocate memory on the device, in - * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned. - * This means that the chunk of the struct on the device will start at a - * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and - * address of p will be a misaligned 0x204 (on the host there was no need to add - * padding between b and p, so p comes exactly 4 bytes after b). If the device - * kernel tries to access s1.p, a misaligned address error occurs (as reported - * by the CUDA plugin). 
By padding the begin address down to a multiple of 8 and - * extending the size of the allocated chuck accordingly, the chuck on the - * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and - * &s1.p=0x208, as they should be to satisfy the alignment requirements. - */ -static const int64_t alignment = 8; - -/// Map global data and execute pending ctors -static int InitLibrary(DeviceTy& Device) { - /* - * Map global data - */ - int32_t device_id = Device.DeviceID; - int rc = OFFLOAD_SUCCESS; - - Device.PendingGlobalsMtx.lock(); - TrlTblMtx->lock(); - for (HostEntriesBeginToTransTableTy::iterator - ii = HostEntriesBeginToTransTable->begin(); - ii != HostEntriesBeginToTransTable->end(); ++ii) { - TranslationTable *TransTable = &ii->second; - if (TransTable->HostTable.EntriesBegin == - TransTable->HostTable.EntriesEnd) { - // No host entry so no need to proceed - continue; - } - if (TransTable->TargetsTable[device_id] != 0) { - // Library entries have already been processed - continue; - } - - // 1) get image. - assert(TransTable->TargetsImages.size() > (size_t)device_id && - "Not expecting a device ID outside the table's bounds!"); - __tgt_device_image *img = TransTable->TargetsImages[device_id]; - if (!img) { - DP("No image loaded for device id %d.\n", device_id); - rc = OFFLOAD_FAIL; - break; - } - // 2) load image into the target table. - __tgt_target_table *TargetTable = - TransTable->TargetsTable[device_id] = Device.load_binary(img); - // Unable to get table for this image: invalidate image and fail. - if (!TargetTable) { - DP("Unable to generate entries table for device id %d.\n", device_id); - TransTable->TargetsImages[device_id] = 0; - rc = OFFLOAD_FAIL; - break; - } - - // Verify whether the two table sizes match. - size_t hsize = - TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; - size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; - - // Invalid image for these host entries! - if (hsize != tsize) { - DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n", - device_id, hsize, tsize); - TransTable->TargetsImages[device_id] = 0; - TransTable->TargetsTable[device_id] = 0; - rc = OFFLOAD_FAIL; - break; - } - - // process global data that needs to be mapped. - Device.DataMapMtx.lock(); - __tgt_target_table *HostTable = &TransTable->HostTable; - for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, - *CurrHostEntry = HostTable->EntriesBegin, - *EntryDeviceEnd = TargetTable->EntriesEnd; - CurrDeviceEntry != EntryDeviceEnd; - CurrDeviceEntry++, CurrHostEntry++) { - if (CurrDeviceEntry->size != 0) { - // has data. - assert(CurrDeviceEntry->size == CurrHostEntry->size && - "data size mismatch"); - - // Fortran may use multiple weak declarations for the same symbol, - // therefore we must allow for multiple weak symbols to be loaded from - // the fat binary. Treat these mappings as any other "regular" mapping. - // Add entry to map. 
- if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) - continue; - DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" - "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), - CurrDeviceEntry->size); - Device.HostDataToTargetMap.push_front(HostDataToTargetTy( - (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, - (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, - (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, - (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, - true /*IsRefCountINF*/)); - } - } - Device.DataMapMtx.unlock(); - } - TrlTblMtx->unlock(); - - if (rc != OFFLOAD_SUCCESS) { - Device.PendingGlobalsMtx.unlock(); - return rc; - } - - /* - * Run ctors for static objects - */ - if (!Device.PendingCtorsDtors.empty()) { - // Call all ctors for all libraries registered so far - for (auto &lib : Device.PendingCtorsDtors) { - if (!lib.second.PendingCtors.empty()) { - DP("Has pending ctors... call now\n"); - for (auto &entry : lib.second.PendingCtors) { - void *ctor = entry; - int rc = target(device_id, ctor, 0, NULL, NULL, NULL, - NULL, 1, 1, true /*team*/); - if (rc != OFFLOAD_SUCCESS) { - DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); - Device.PendingGlobalsMtx.unlock(); - return OFFLOAD_FAIL; - } - } - // Clear the list to indicate that this device has been used - lib.second.PendingCtors.clear(); - DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); - } - } - } - Device.HasPendingGlobals = false; - Device.PendingGlobalsMtx.unlock(); - - return OFFLOAD_SUCCESS; -} - -// Check whether a device has been initialized, global ctors have been -// executed and global data has been mapped; do so if not already done. -int CheckDeviceAndCtors(int64_t device_id) { - // Is device ready? - if (!device_is_ready(device_id)) { - DP("Device %" PRId64 " is not ready.\n", device_id); - return OFFLOAD_FAIL; - } - - // Get device info. - DeviceTy &Device = Devices[device_id]; - - // Check whether global data has been mapped for this device - Device.PendingGlobalsMtx.lock(); - bool hasPendingGlobals = Device.HasPendingGlobals; - Device.PendingGlobalsMtx.unlock(); - if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { - DP("Failed to init globals on device %" PRId64 "\n", device_id); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; -} - -static int32_t member_of(int64_t type) { - return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; -} - -/// Internal function to do the mapping and transfer the data to the device -int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - __tgt_async_info *async_info_ptr) { - // process each input. - for (int32_t i = 0; i < arg_num; ++i) { - // Ignore private variables and arrays - there is no mapping for them. - if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || - (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) - continue; - - void *HstPtrBegin = args[i]; - void *HstPtrBase = args_base[i]; - int64_t data_size = arg_sizes[i]; - - // Adjust for proper alignment if this is a combined entry (for structs). - // Look at the next argument - if that is MEMBER_OF this one, then this one - // is a combined entry. 
- int64_t padding = 0; - const int next_i = i+1; - if (member_of(arg_types[i]) < 0 && next_i < arg_num && - member_of(arg_types[next_i]) == i) { - padding = (int64_t)HstPtrBegin % alignment; - if (padding) { - DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD - "\n", padding, DPxPTR(HstPtrBegin)); - HstPtrBegin = (char *) HstPtrBegin - padding; - data_size += padding; - } - } - - // Address of pointer on the host and device, respectively. - void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; - bool IsNew, Pointer_IsNew; - bool IsHostPtr = false; - bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; - // Force the creation of a device side copy of the data when: - // a close map modifier was associated with a map that contained a to. - bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; - // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we - // have reached this point via __tgt_target_data_begin and not __tgt_target - // then no argument is marked as TARGET_PARAM ("omp target data map" is not - // associated with a target region, so there are no target parameters). This - // may be considered a hack, we could revise the scheme in the future. - bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); - if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { - DP("Has a pointer entry: \n"); - // base is address of pointer. - Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, - sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef, - HasCloseModifier); - if (!Pointer_TgtPtrBegin) { - DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " - "illegal mapping).\n"); - return OFFLOAD_FAIL; - } - DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" - "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin), - (Pointer_IsNew ? "" : " not")); - Pointer_HstPtrBegin = HstPtrBase; - // modify current entry. - HstPtrBase = *(void **)HstPtrBase; - UpdateRef = true; // subsequently update ref count of pointee - } - - void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, - data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier); - if (!TgtPtrBegin && data_size) { - // If data_size==0, then the argument could be a zero-length pointer to - // NULL, so getOrAlloc() returning NULL is not an error. - DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " - "illegal mapping).\n"); - } - DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), - (IsNew ? "" : " not")); - - if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { - uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; - void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); - DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); - args_base[i] = TgtPtrBase; - } - - if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - bool copy = false; - if (!(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || - HasCloseModifier) { - if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { - copy = true; - } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { - // Copy data only if the "parent" struct has RefCount==1. 
- int32_t parent_idx = member_of(arg_types[i]); - uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); - assert(parent_rc > 0 && "parent struct not found"); - if (parent_rc == 1) { - copy = true; - } - } - } - - if (copy && !IsHostPtr) { - DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size, - async_info_ptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); - return OFFLOAD_FAIL; - } - } - } - - if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { - DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", - DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); - uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; - void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); - int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, - sizeof(void *), async_info_ptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); - return OFFLOAD_FAIL; - } - // create shadow pointers for this entry - Device.ShadowMtx.lock(); - Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase, - Pointer_TgtPtrBegin, TgtPtrBase}; - Device.ShadowMtx.unlock(); - } - } - - return OFFLOAD_SUCCESS; -} - -/// Internal function to undo the mapping and retrieve the data from the device. -int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - __tgt_async_info *async_info_ptr) { - // process each input. - for (int32_t i = arg_num - 1; i >= 0; --i) { - // Ignore private variables and arrays - there is no mapping for them. - // Also, ignore the use_device_ptr directive, it has no effect here. - if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || - (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) - continue; - - void *HstPtrBegin = args[i]; - int64_t data_size = arg_sizes[i]; - // Adjust for proper alignment if this is a combined entry (for structs). - // Look at the next argument - if that is MEMBER_OF this one, then this one - // is a combined entry. - int64_t padding = 0; - const int next_i = i+1; - if (member_of(arg_types[i]) < 0 && next_i < arg_num && - member_of(arg_types[next_i]) == i) { - padding = (int64_t)HstPtrBegin % alignment; - if (padding) { - DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD - "\n", padding, DPxPTR(HstPtrBegin)); - HstPtrBegin = (char *) HstPtrBegin - padding; - data_size += padding; - } - } - - bool IsLast, IsHostPtr; - bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || - (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); - bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; - bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; - - // If PTR_AND_OBJ, HstPtrBegin is address of pointee - void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, - UpdateRef, IsHostPtr); - DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s last\n", data_size, DPxPTR(TgtPtrBegin), - (IsLast ? 
"" : " not")); - - bool DelEntry = IsLast || ForceDelete; - - if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && - !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { - DelEntry = false; // protect parent struct from being deallocated - } - - if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { - // Move data back to the host - if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { - bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; - bool CopyMember = false; - if (!(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || - HasCloseModifier) { - if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && - !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { - // Copy data only if the "parent" struct has RefCount==1. - int32_t parent_idx = member_of(arg_types[i]); - uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); - assert(parent_rc > 0 && "parent struct not found"); - if (parent_rc == 1) { - CopyMember = true; - } - } - } - - if ((DelEntry || Always || CopyMember) && - !(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - TgtPtrBegin == HstPtrBegin)) { - DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size, - async_info_ptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data from device failed.\n"); - return OFFLOAD_FAIL; - } - } - } - - // If we copied back to the host a struct/array containing pointers, we - // need to restore the original host pointer values from their shadow - // copies. If the struct is going to be deallocated, remove any remaining - // shadow pointer entries for this struct. - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + data_size; - Device.ShadowMtx.lock(); - for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end();) { - void **ShadowHstPtrAddr = (void**) it->first; - - // An STL map is sorted on its keys; use this property - // to quickly determine when to break out of the loop. - if ((uintptr_t) ShadowHstPtrAddr < lb) { - ++it; - continue; - } - if ((uintptr_t) ShadowHstPtrAddr >= ub) - break; - - // If we copied the struct to the host, we need to restore the pointer. - if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { - DP("Restoring original host pointer value " DPxMOD " for host " - "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal), - DPxPTR(ShadowHstPtrAddr)); - *ShadowHstPtrAddr = it->second.HstPtrVal; - } - // If the struct is to be deallocated, remove the shadow entry. - if (DelEntry) { - DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr)); - it = Device.ShadowPtrMap.erase(it); - } else { - ++it; - } - } - Device.ShadowMtx.unlock(); - - // Deallocate map - if (DelEntry) { - int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete, - HasCloseModifier); - if (rt != OFFLOAD_SUCCESS) { - DP("Deallocating data from device failed.\n"); - return OFFLOAD_FAIL; - } - } - } - } - - return OFFLOAD_SUCCESS; -} - -/// Internal function to pass data to/from the target. -int target_data_update(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - // process each input. 
- for (int32_t i = 0; i < arg_num; ++i) { - if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || - (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) - continue; - - void *HstPtrBegin = args[i]; - int64_t MapSize = arg_sizes[i]; - bool IsLast, IsHostPtr; - void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, - false, IsHostPtr); - if (!TgtPtrBegin) { - DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); - continue; - } - - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - TgtPtrBegin == HstPtrBegin) { - DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", - DPxPTR(HstPtrBegin)); - continue; - } - - if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { - DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize, nullptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data from device failed.\n"); - return OFFLOAD_FAIL; - } - - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; - Device.ShadowMtx.lock(); - for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void**) it->first; - if ((uintptr_t) ShadowHstPtrAddr < lb) - continue; - if ((uintptr_t) ShadowHstPtrAddr >= ub) - break; - DP("Restoring original host pointer value " DPxMOD " for host pointer " - DPxMOD "\n", DPxPTR(it->second.HstPtrVal), - DPxPTR(ShadowHstPtrAddr)); - *ShadowHstPtrAddr = it->second.HstPtrVal; - } - Device.ShadowMtx.unlock(); - } - - if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize, nullptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); - return OFFLOAD_FAIL; - } - - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; - Device.ShadowMtx.lock(); - for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void**) it->first; - if ((uintptr_t) ShadowHstPtrAddr < lb) - continue; - if ((uintptr_t) ShadowHstPtrAddr >= ub) - break; - DP("Restoring original target pointer value " DPxMOD " for target " - "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), - DPxPTR(it->second.TgtPtrAddr)); - rt = Device.data_submit(it->second.TgtPtrAddr, - &it->second.TgtPtrVal, sizeof(void *), nullptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); - Device.ShadowMtx.unlock(); - return OFFLOAD_FAIL; - } - } - Device.ShadowMtx.unlock(); - } - } - return OFFLOAD_SUCCESS; -} - -static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | - OMP_TGT_MAPTYPE_LITERAL | - OMP_TGT_MAPTYPE_IMPLICIT; -static bool isLambdaMapping(int64_t Mapping) { - return (Mapping & LambdaMapping) == LambdaMapping; -} - -/// performs the same actions as data_begin in case arg_num is -/// non-zero and initiates run of the offloaded region on the target platform; -/// if arg_num is non-zero after the region execution is done it also -/// performs the same action as data_update and data_end above. This function -/// returns 0 if it was able to transfer the execution to a target and an -/// integer different from zero otherwise. 
-int target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t team_num, int32_t thread_limit, int IsTeamConstruct) { - DeviceTy &Device = Devices[device_id]; - - // Find the table information in the map or look it up in the translation - // tables. - TableMap *TM = 0; - TblMapMtx->lock(); - HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap->find(host_ptr); - if (TableMapIt == HostPtrToTableMap->end()) { - // We don't have a map. So search all the registered libraries. - TrlTblMtx->lock(); - for (HostEntriesBeginToTransTableTy::iterator - ii = HostEntriesBeginToTransTable->begin(), - ie = HostEntriesBeginToTransTable->end(); - !TM && ii != ie; ++ii) { - // get the translation table (which contains all the good info). - TranslationTable *TransTable = &ii->second; - // iterate over all the host table entries to see if we can locate the - // host_ptr. - __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin; - __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd; - __tgt_offload_entry *cur = begin; - for (uint32_t i = 0; cur < end; ++cur, ++i) { - if (cur->addr != host_ptr) - continue; - // we got a match, now fill the HostPtrToTableMap so that we - // may avoid this search next time. - TM = &(*HostPtrToTableMap)[host_ptr]; - TM->Table = TransTable; - TM->Index = i; - break; - } - } - TrlTblMtx->unlock(); - } else { - TM = &TableMapIt->second; - } - TblMapMtx->unlock(); - - // No map for this host pointer found! - if (!TM) { - DP("Host ptr " DPxMOD " does not have a matching target pointer.\n", - DPxPTR(host_ptr)); - return OFFLOAD_FAIL; - } - - // get target table. - TrlTblMtx->lock(); - assert(TM->Table->TargetsTable.size() > (size_t)device_id && - "Not expecting a device ID outside the table's bounds!"); - __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id]; - TrlTblMtx->unlock(); - assert(TargetTable && "Global data has not been mapped\n"); - - __tgt_async_info AsyncInfo; - - // Move data to device. - int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, - arg_types, &AsyncInfo); - if (rc != OFFLOAD_SUCCESS) { - DP("Call to target_data_begin failed, abort target.\n"); - return OFFLOAD_FAIL; - } - - std::vector tgt_args; - std::vector tgt_offsets; - - // List of (first-)private arrays allocated for this target region - std::vector fpArrays; - std::vector tgtArgsPositions(arg_num, -1); - - for (int32_t i = 0; i < arg_num; ++i) { - if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { - // This is not a target parameter, do not push it into tgt_args. - // Check for lambda mapping. - if (isLambdaMapping(arg_types[i])) { - assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && - "PTR_AND_OBJ must be also MEMBER_OF."); - unsigned idx = member_of(arg_types[i]); - int tgtIdx = tgtArgsPositions[idx]; - assert(tgtIdx != -1 && "Base address must be translated already."); - // The parent lambda must be processed already and it must be the last - // in tgt_args and tgt_offsets arrays. - void *HstPtrVal = args[i]; - void *HstPtrBegin = args_base[i]; - void *HstPtrBase = args[idx]; - bool IsLast, IsHostPtr; // unused. 
- void *TgtPtrBase = - (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]); - DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); - uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; - void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); - void *Pointer_TgtPtrBegin = - Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false, - IsHostPtr); - if (!Pointer_TgtPtrBegin) { - DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", - DPxPTR(HstPtrVal)); - continue; - } - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - TgtPtrBegin == HstPtrBegin) { - DP("Unified memory is active, no need to map lambda captured" - "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal)); - continue; - } - DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", - DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, - sizeof(void *), &AsyncInfo); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); - return OFFLOAD_FAIL; - } - } - continue; - } - void *HstPtrBegin = args[i]; - void *HstPtrBase = args_base[i]; - void *TgtPtrBegin; - ptrdiff_t TgtBaseOffset; - bool IsLast, IsHostPtr; // unused. - if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) { - DP("Forwarding first-private value " DPxMOD " to the target construct\n", - DPxPTR(HstPtrBase)); - TgtPtrBegin = HstPtrBase; - TgtBaseOffset = 0; - } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { - // Allocate memory for (first-)private array - TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, - arg_sizes[i], HstPtrBegin); - if (!TgtPtrBegin) { - DP ("Data allocation for %sprivate array " DPxMOD " failed, " - "abort target.\n", - (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin)); - return OFFLOAD_FAIL; - } - fpArrays.push_back(TgtPtrBegin); - TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; -#ifdef OMPTARGET_DEBUG - void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); - DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " - "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), - (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); -#endif - // If first-private, copy data from host - if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i], - &AsyncInfo); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed, failed.\n"); - return OFFLOAD_FAIL; - } - } - } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { - TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast, - false, IsHostPtr); - TgtBaseOffset = 0; // no offset for ptrs. 
- DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " - "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), - DPxPTR(HstPtrBase)); - } else { - TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, - false, IsHostPtr); - TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; -#ifdef OMPTARGET_DEBUG - void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); - DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", - DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); -#endif - } - tgtArgsPositions[i] = tgt_args.size(); - tgt_args.push_back(TgtPtrBegin); - tgt_offsets.push_back(TgtBaseOffset); - } - - assert(tgt_args.size() == tgt_offsets.size() && - "Size mismatch in arguments and offsets"); - - // Pop loop trip count - uint64_t ltc = 0; - TblMapMtx->lock(); - auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); - if (I != Device.LoopTripCnt.end()) { - ltc = I->second; - Device.LoopTripCnt.erase(I); - DP("loop trip count is %lu.\n", ltc); - } - TblMapMtx->unlock(); - - // Launch device execution. - DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", - TargetTable->EntriesBegin[TM->Index].name, - DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); - if (IsTeamConstruct) { - rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size(), - team_num, thread_limit, ltc, &AsyncInfo); - } else { - rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size(), - &AsyncInfo); - } - if (rc != OFFLOAD_SUCCESS) { - DP ("Executing target region abort target.\n"); - return OFFLOAD_FAIL; - } - - // Deallocate (first-)private arrays - for (auto it : fpArrays) { - int rt = Device.RTL->data_delete(Device.RTLDeviceID, it); - if (rt != OFFLOAD_SUCCESS) { - DP("Deallocation of (first-)private arrays failed.\n"); - return OFFLOAD_FAIL; - } - } - - // Move data from device. - int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, - arg_types, &AsyncInfo); - if (rt != OFFLOAD_SUCCESS) { - DP("Call to target_data_end failed, abort targe.\n"); - return OFFLOAD_FAIL; - } - - if (Device.RTL->synchronize) - return Device.RTL->synchronize(device_id, &AsyncInfo); - - return OFFLOAD_SUCCESS; -} +//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of the interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#include + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include +#include + +#ifdef OMPTARGET_DEBUG +int DebugLevel = 0; +#endif // OMPTARGET_DEBUG + + + +/* All begin addresses for partially mapped structs must be 8-aligned in order + * to ensure proper alignment of members. E.g. + * + * struct S { + * int a; // 4-aligned + * int b; // 4-aligned + * int *p; // 8-aligned + * } s1; + * ... + * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) + * { + * s1.b = 5; + * for (int i...) s1.p[i] = ...; + * } + * + * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and + * BeginAddress=&s1.b. 
Let's assume that the struct begins at address 0x100, + * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment + * requirements for its type. Now, when we allocate memory on the device, in + * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned. + * This means that the chunk of the struct on the device will start at a + * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and + * address of p will be a misaligned 0x204 (on the host there was no need to add + * padding between b and p, so p comes exactly 4 bytes after b). If the device + * kernel tries to access s1.p, a misaligned address error occurs (as reported + * by the CUDA plugin). By padding the begin address down to a multiple of 8 and + * extending the size of the allocated chuck accordingly, the chuck on the + * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and + * &s1.p=0x208, as they should be to satisfy the alignment requirements. + */ +static const int64_t alignment = 8; + +/// Map global data and execute pending ctors +static int InitLibrary(DeviceTy& Device) { + /* + * Map global data + */ + int32_t device_id = Device.DeviceID; + int rc = OFFLOAD_SUCCESS; + + Device.PendingGlobalsMtx.lock(); + TrlTblMtx->lock(); + for (HostEntriesBeginToTransTableTy::iterator + ii = HostEntriesBeginToTransTable->begin(); + ii != HostEntriesBeginToTransTable->end(); ++ii) { + TranslationTable *TransTable = &ii->second; + if (TransTable->HostTable.EntriesBegin == + TransTable->HostTable.EntriesEnd) { + // No host entry so no need to proceed + continue; + } + if (TransTable->TargetsTable[device_id] != 0) { + // Library entries have already been processed + continue; + } + + // 1) get image. + assert(TransTable->TargetsImages.size() > (size_t)device_id && + "Not expecting a device ID outside the table's bounds!"); + __tgt_device_image *img = TransTable->TargetsImages[device_id]; + if (!img) { + DP("No image loaded for device id %d.\n", device_id); + rc = OFFLOAD_FAIL; + break; + } + // 2) load image into the target table. + __tgt_target_table *TargetTable = + TransTable->TargetsTable[device_id] = Device.load_binary(img); + // Unable to get table for this image: invalidate image and fail. + if (!TargetTable) { + DP("Unable to generate entries table for device id %d.\n", device_id); + TransTable->TargetsImages[device_id] = 0; + rc = OFFLOAD_FAIL; + break; + } + + // Verify whether the two table sizes match. + size_t hsize = + TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; + size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; + + // Invalid image for these host entries! + if (hsize != tsize) { + DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n", + device_id, hsize, tsize); + TransTable->TargetsImages[device_id] = 0; + TransTable->TargetsTable[device_id] = 0; + rc = OFFLOAD_FAIL; + break; + } + + // process global data that needs to be mapped. + Device.DataMapMtx.lock(); + __tgt_target_table *HostTable = &TransTable->HostTable; + for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, + *CurrHostEntry = HostTable->EntriesBegin, + *EntryDeviceEnd = TargetTable->EntriesEnd; + CurrDeviceEntry != EntryDeviceEnd; + CurrDeviceEntry++, CurrHostEntry++) { + if (CurrDeviceEntry->size != 0) { + // has data. 
+ assert(CurrDeviceEntry->size == CurrHostEntry->size && + "data size mismatch"); + + // Fortran may use multiple weak declarations for the same symbol, + // therefore we must allow for multiple weak symbols to be loaded from + // the fat binary. Treat these mappings as any other "regular" mapping. + // Add entry to map. + if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) + continue; + DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" + "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), + CurrDeviceEntry->size); + Device.HostDataToTargetMap.push_front(HostDataToTargetTy( + (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, + (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, + (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, + (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, + true /*IsRefCountINF*/)); + } + } + Device.DataMapMtx.unlock(); + } + TrlTblMtx->unlock(); + + if (rc != OFFLOAD_SUCCESS) { + Device.PendingGlobalsMtx.unlock(); + return rc; + } + + /* + * Run ctors for static objects + */ + if (!Device.PendingCtorsDtors.empty()) { + // Call all ctors for all libraries registered so far + for (auto &lib : Device.PendingCtorsDtors) { + if (!lib.second.PendingCtors.empty()) { + DP("Has pending ctors... call now\n"); + for (auto &entry : lib.second.PendingCtors) { + void *ctor = entry; + int rc = target(device_id, ctor, 0, NULL, NULL, NULL, + NULL, 1, 1, true /*team*/); + if (rc != OFFLOAD_SUCCESS) { + DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); + Device.PendingGlobalsMtx.unlock(); + return OFFLOAD_FAIL; + } + } + // Clear the list to indicate that this device has been used + lib.second.PendingCtors.clear(); + DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); + } + } + } + Device.HasPendingGlobals = false; + Device.PendingGlobalsMtx.unlock(); + + return OFFLOAD_SUCCESS; +} + +// Check whether a device has been initialized, global ctors have been +// executed and global data has been mapped; do so if not already done. +int CheckDeviceAndCtors(int64_t device_id) { + // Is device ready? + if (!device_is_ready(device_id)) { + DP("Device %" PRId64 " is not ready.\n", device_id); + return OFFLOAD_FAIL; + } + + // Get device info. + DeviceTy &Device = Devices[device_id]; + + // Check whether global data has been mapped for this device + Device.PendingGlobalsMtx.lock(); + bool hasPendingGlobals = Device.HasPendingGlobals; + Device.PendingGlobalsMtx.unlock(); + if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { + DP("Failed to init globals on device %" PRId64 "\n", device_id); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + +static int32_t member_of(int64_t type) { + return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; +} + +/// Internal function to do the mapping and transfer the data to the device +int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr) { + // process each input. + for (int32_t i = 0; i < arg_num; ++i) { + // Ignore private variables and arrays - there is no mapping for them. + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + void *HstPtrBase = args_base[i]; + int64_t data_size = arg_sizes[i]; + + // Adjust for proper alignment if this is a combined entry (for structs). 
+ // Look at the next argument - if that is MEMBER_OF this one, then this one + // is a combined entry. + int64_t padding = 0; + const int next_i = i+1; + if (member_of(arg_types[i]) < 0 && next_i < arg_num && + member_of(arg_types[next_i]) == i) { + padding = (int64_t)HstPtrBegin % alignment; + if (padding) { + DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD + "\n", padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *) HstPtrBegin - padding; + data_size += padding; + } + } + + // Address of pointer on the host and device, respectively. + void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; + bool IsNew, Pointer_IsNew; + bool IsHostPtr = false; + bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; + // Force the creation of a device side copy of the data when: + // a close map modifier was associated with a map that contained a to. + bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; + // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we + // have reached this point via __tgt_target_data_begin and not __tgt_target + // then no argument is marked as TARGET_PARAM ("omp target data map" is not + // associated with a target region, so there are no target parameters). This + // may be considered a hack, we could revise the scheme in the future. + bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); + if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + DP("Has a pointer entry: \n"); + // base is address of pointer. + Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, + sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef, + HasCloseModifier); + if (!Pointer_TgtPtrBegin) { + DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " + "illegal mapping).\n"); + return OFFLOAD_FAIL; + } + DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" + "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin), + (Pointer_IsNew ? "" : " not")); + Pointer_HstPtrBegin = HstPtrBase; + // modify current entry. + HstPtrBase = *(void **)HstPtrBase; + UpdateRef = true; // subsequently update ref count of pointee + } + + void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, + data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier); + if (!TgtPtrBegin && data_size) { + // If data_size==0, then the argument could be a zero-length pointer to + // NULL, so getOrAlloc() returning NULL is not an error. + DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " + "illegal mapping).\n"); + } + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD + " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), + (IsNew ? "" : " not")); + + if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { + uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; + void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); + DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); + args_base[i] = TgtPtrBase; + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + bool copy = false; + if (!(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || + HasCloseModifier) { + if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { + copy = true; + } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { + // Copy data only if the "parent" struct has RefCount==1. 
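+          // member_of() decodes the parent argument's index from the upper 16
+          // bits of the map type (biased by one, so a zero field means "no
+          // parent"). Worked example with a made-up value: a MEMBER_OF field
+          // of 0x0003 decodes to index 2, i.e. the parent struct is args[2].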
+ int32_t parent_idx = member_of(arg_types[i]); + uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); + assert(parent_rc > 0 && "parent struct not found"); + if (parent_rc == 1) { + copy = true; + } + } + } + + if (copy && !IsHostPtr) { + DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size, + async_info_ptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + } + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { + DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", + DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); + uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; + void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); + int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, + sizeof(void *), async_info_ptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + // create shadow pointers for this entry + Device.ShadowMtx.lock(); + Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase, + Pointer_TgtPtrBegin, TgtPtrBase}; + Device.ShadowMtx.unlock(); + } + } + + return OFFLOAD_SUCCESS; +} + +/// Internal function to undo the mapping and retrieve the data from the device. +int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr) { + // process each input. + for (int32_t i = arg_num - 1; i >= 0; --i) { + // Ignore private variables and arrays - there is no mapping for them. + // Also, ignore the use_device_ptr directive, it has no effect here. + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + int64_t data_size = arg_sizes[i]; + // Adjust for proper alignment if this is a combined entry (for structs). + // Look at the next argument - if that is MEMBER_OF this one, then this one + // is a combined entry. + int64_t padding = 0; + const int next_i = i+1; + if (member_of(arg_types[i]) < 0 && next_i < arg_num && + member_of(arg_types[next_i]) == i) { + padding = (int64_t)HstPtrBegin % alignment; + if (padding) { + DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD + "\n", padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *) HstPtrBegin - padding; + data_size += padding; + } + } + + bool IsLast, IsHostPtr; + bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || + (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); + bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; + bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; + + // If PTR_AND_OBJ, HstPtrBegin is address of pointee + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, + UpdateRef, IsHostPtr); + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD + " - is%s last\n", data_size, DPxPTR(TgtPtrBegin), + (IsLast ? 
"" : " not")); + + bool DelEntry = IsLast || ForceDelete; + + if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + DelEntry = false; // protect parent struct from being deallocated + } + + if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { + // Move data back to the host + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; + bool CopyMember = false; + if (!(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || + HasCloseModifier) { + if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + // Copy data only if the "parent" struct has RefCount==1. + int32_t parent_idx = member_of(arg_types[i]); + uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); + assert(parent_rc > 0 && "parent struct not found"); + if (parent_rc == 1) { + CopyMember = true; + } + } + } + + if ((DelEntry || Always || CopyMember) && + !(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + TgtPtrBegin == HstPtrBegin)) { + DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size, + async_info_ptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data from device failed.\n"); + return OFFLOAD_FAIL; + } + } + } + + // If we copied back to the host a struct/array containing pointers, we + // need to restore the original host pointer values from their shadow + // copies. If the struct is going to be deallocated, remove any remaining + // shadow pointer entries for this struct. + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + data_size; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end();) { + void **ShadowHstPtrAddr = (void**) it->first; + + // An STL map is sorted on its keys; use this property + // to quickly determine when to break out of the loop. + if ((uintptr_t) ShadowHstPtrAddr < lb) { + ++it; + continue; + } + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + + // If we copied the struct to the host, we need to restore the pointer. + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + DP("Restoring original host pointer value " DPxMOD " for host " + "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal), + DPxPTR(ShadowHstPtrAddr)); + *ShadowHstPtrAddr = it->second.HstPtrVal; + } + // If the struct is to be deallocated, remove the shadow entry. + if (DelEntry) { + DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr)); + it = Device.ShadowPtrMap.erase(it); + } else { + ++it; + } + } + Device.ShadowMtx.unlock(); + + // Deallocate map + if (DelEntry) { + int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete, + HasCloseModifier); + if (rt != OFFLOAD_SUCCESS) { + DP("Deallocating data from device failed.\n"); + return OFFLOAD_FAIL; + } + } + } + } + + return OFFLOAD_SUCCESS; +} + +/// Internal function to pass data to/from the target. +int target_data_update(DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + // process each input. 
+ for (int32_t i = 0; i < arg_num; ++i) { + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + int64_t MapSize = arg_sizes[i]; + bool IsLast, IsHostPtr; + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, + false, IsHostPtr); + if (!TgtPtrBegin) { + DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); + continue; + } + + if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + TgtPtrBegin == HstPtrBegin) { + DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", + DPxPTR(HstPtrBegin)); + continue; + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize, nullptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data from device failed.\n"); + return OFFLOAD_FAIL; + } + + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void**) it->first; + if ((uintptr_t) ShadowHstPtrAddr < lb) + continue; + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + DP("Restoring original host pointer value " DPxMOD " for host pointer " + DPxMOD "\n", DPxPTR(it->second.HstPtrVal), + DPxPTR(ShadowHstPtrAddr)); + *ShadowHstPtrAddr = it->second.HstPtrVal; + } + Device.ShadowMtx.unlock(); + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize, nullptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void**) it->first; + if ((uintptr_t) ShadowHstPtrAddr < lb) + continue; + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + DP("Restoring original target pointer value " DPxMOD " for target " + "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), + DPxPTR(it->second.TgtPtrAddr)); + rt = Device.data_submit(it->second.TgtPtrAddr, + &it->second.TgtPtrVal, sizeof(void *), nullptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + Device.ShadowMtx.unlock(); + return OFFLOAD_FAIL; + } + } + Device.ShadowMtx.unlock(); + } + } + return OFFLOAD_SUCCESS; +} + +static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | + OMP_TGT_MAPTYPE_LITERAL | + OMP_TGT_MAPTYPE_IMPLICIT; +static bool isLambdaMapping(int64_t Mapping) { + return (Mapping & LambdaMapping) == LambdaMapping; +} + +/// performs the same actions as data_begin in case arg_num is +/// non-zero and initiates run of the offloaded region on the target platform; +/// if arg_num is non-zero after the region execution is done it also +/// performs the same action as data_update and data_end above. This function +/// returns 0 if it was able to transfer the execution to a target and an +/// integer different from zero otherwise. 
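+/// The caller is expected to have ensured that the device is ready and that
+/// its global data has been initialized (e.g. via CheckDeviceAndCtors above)
+/// before invoking this entry point; device_id is used to index Devices
+/// directly.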
+int target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t team_num, int32_t thread_limit, int IsTeamConstruct) { + DeviceTy &Device = Devices[device_id]; + + // Find the table information in the map or look it up in the translation + // tables. + TableMap *TM = 0; + TblMapMtx->lock(); + HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap->find(host_ptr); + if (TableMapIt == HostPtrToTableMap->end()) { + // We don't have a map. So search all the registered libraries. + TrlTblMtx->lock(); + for (HostEntriesBeginToTransTableTy::iterator + ii = HostEntriesBeginToTransTable->begin(), + ie = HostEntriesBeginToTransTable->end(); + !TM && ii != ie; ++ii) { + // get the translation table (which contains all the good info). + TranslationTable *TransTable = &ii->second; + // iterate over all the host table entries to see if we can locate the + // host_ptr. + __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin; + __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd; + __tgt_offload_entry *cur = begin; + for (uint32_t i = 0; cur < end; ++cur, ++i) { + if (cur->addr != host_ptr) + continue; + // we got a match, now fill the HostPtrToTableMap so that we + // may avoid this search next time. + TM = &(*HostPtrToTableMap)[host_ptr]; + TM->Table = TransTable; + TM->Index = i; + break; + } + } + TrlTblMtx->unlock(); + } else { + TM = &TableMapIt->second; + } + TblMapMtx->unlock(); + + // No map for this host pointer found! + if (!TM) { + DP("Host ptr " DPxMOD " does not have a matching target pointer.\n", + DPxPTR(host_ptr)); + return OFFLOAD_FAIL; + } + + // get target table. + TrlTblMtx->lock(); + assert(TM->Table->TargetsTable.size() > (size_t)device_id && + "Not expecting a device ID outside the table's bounds!"); + __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id]; + TrlTblMtx->unlock(); + assert(TargetTable && "Global data has not been mapped\n"); + + __tgt_async_info AsyncInfo; + + // Move data to device. + int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, + arg_types, &AsyncInfo); + if (rc != OFFLOAD_SUCCESS) { + DP("Call to target_data_begin failed, abort target.\n"); + return OFFLOAD_FAIL; + } + + std::vector tgt_args; + std::vector tgt_offsets; + + // List of (first-)private arrays allocated for this target region + std::vector fpArrays; + std::vector tgtArgsPositions(arg_num, -1); + + for (int32_t i = 0; i < arg_num; ++i) { + if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { + // This is not a target parameter, do not push it into tgt_args. + // Check for lambda mapping. + if (isLambdaMapping(arg_types[i])) { + assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + "PTR_AND_OBJ must be also MEMBER_OF."); + unsigned idx = member_of(arg_types[i]); + int tgtIdx = tgtArgsPositions[idx]; + assert(tgtIdx != -1 && "Base address must be translated already."); + // The parent lambda must be processed already and it must be the last + // in tgt_args and tgt_offsets arrays. + void *HstPtrVal = args[i]; + void *HstPtrBegin = args_base[i]; + void *HstPtrBase = args[idx]; + bool IsLast, IsHostPtr; // unused. 
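+        // Reconstruct the device address of this capture: start from the
+        // parent lambda's translated base (tgt_args[tgtIdx] plus its offset)
+        // and add the same delta the captured field has from the lambda
+        // object on the host.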
+ void *TgtPtrBase = + (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]); + DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); + uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; + void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); + void *Pointer_TgtPtrBegin = + Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false, + IsHostPtr); + if (!Pointer_TgtPtrBegin) { + DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", + DPxPTR(HstPtrVal)); + continue; + } + if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + TgtPtrBegin == HstPtrBegin) { + DP("Unified memory is active, no need to map lambda captured" + "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal)); + continue; + } + DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", + DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, + sizeof(void *), &AsyncInfo); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + } + continue; + } + void *HstPtrBegin = args[i]; + void *HstPtrBase = args_base[i]; + void *TgtPtrBegin; + ptrdiff_t TgtBaseOffset; + bool IsLast, IsHostPtr; // unused. + if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) { + DP("Forwarding first-private value " DPxMOD " to the target construct\n", + DPxPTR(HstPtrBase)); + TgtPtrBegin = HstPtrBase; + TgtBaseOffset = 0; + } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { + // Allocate memory for (first-)private array + TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, + arg_sizes[i], HstPtrBegin); + if (!TgtPtrBegin) { + DP ("Data allocation for %sprivate array " DPxMOD " failed, " + "abort target.\n", + (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin)); + return OFFLOAD_FAIL; + } + fpArrays.push_back(TgtPtrBegin); + TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; +#ifdef OMPTARGET_DEBUG + void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); + DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " + "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", + arg_sizes[i], DPxPTR(TgtPtrBegin), + (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); +#endif + // If first-private, copy data from host + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i], + &AsyncInfo); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed, failed.\n"); + return OFFLOAD_FAIL; + } + } + } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast, + false, IsHostPtr); + TgtBaseOffset = 0; // no offset for ptrs. 
+ DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " + "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), + DPxPTR(HstPtrBase)); + } else { + TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, + false, IsHostPtr); + TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; +#ifdef OMPTARGET_DEBUG + void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); + DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", + DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); +#endif + } + tgtArgsPositions[i] = tgt_args.size(); + tgt_args.push_back(TgtPtrBegin); + tgt_offsets.push_back(TgtBaseOffset); + } + + assert(tgt_args.size() == tgt_offsets.size() && + "Size mismatch in arguments and offsets"); + + // Pop loop trip count + uint64_t ltc = 0; + TblMapMtx->lock(); + auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); + if (I != Device.LoopTripCnt.end()) { + ltc = I->second; + Device.LoopTripCnt.erase(I); + DP("loop trip count is %lu.\n", ltc); + } + TblMapMtx->unlock(); + + // Launch device execution. + DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", + TargetTable->EntriesBegin[TM->Index].name, + DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); + if (IsTeamConstruct) { + rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), + team_num, thread_limit, ltc, &AsyncInfo); + } else { + rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), + &AsyncInfo); + } + if (rc != OFFLOAD_SUCCESS) { + DP ("Executing target region abort target.\n"); + return OFFLOAD_FAIL; + } + + // Deallocate (first-)private arrays + for (auto it : fpArrays) { + int rt = Device.RTL->data_delete(Device.RTLDeviceID, it); + if (rt != OFFLOAD_SUCCESS) { + DP("Deallocation of (first-)private arrays failed.\n"); + return OFFLOAD_FAIL; + } + } + + // Move data from device. + int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, + arg_types, &AsyncInfo); + if (rt != OFFLOAD_SUCCESS) { + DP("Call to target_data_end failed, abort targe.\n"); + return OFFLOAD_FAIL; + } + + if (Device.RTL->synchronize) + return Device.RTL->synchronize(device_id, &AsyncInfo); + + return OFFLOAD_SUCCESS; +} diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index dbc5bafbab5bf..866c2e54413ac 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -1,108 +1,108 @@ -//===---------- private.h - Target independent OpenMP target RTL ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Private function declarations and helper macros for debugging output. 
-// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_PRIVATE_H -#define _OMPTARGET_PRIVATE_H - -#include - -#include - -extern int target_data_begin(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, - __tgt_async_info *async_info_ptr); - -extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - __tgt_async_info *async_info_ptr); - -extern int target_data_update(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); - -extern int target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t team_num, int32_t thread_limit, int IsTeamConstruct); - -extern int CheckDeviceAndCtors(int64_t device_id); - -// enum for OMP_TARGET_OFFLOAD; keep in sync with kmp.h definition -enum kmp_target_offload_kind { - tgt_disabled = 0, - tgt_default = 1, - tgt_mandatory = 2 -}; -typedef enum kmp_target_offload_kind kmp_target_offload_kind_t; -extern kmp_target_offload_kind_t TargetOffloadPolicy; - -// This structure stores information of a mapped memory region. -struct MapComponentInfoTy { - void *Base; - void *Begin; - int64_t Size; - int64_t Type; - MapComponentInfoTy() = default; - MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type) - : Base(Base), Begin(Begin), Size(Size), Type(Type) {} -}; - -// This structure stores all components of a user-defined mapper. The number of -// components are dynamically decided, so we utilize C++ STL vector -// implementation here. -struct MapperComponentsTy { - std::vector Components; -}; - -//////////////////////////////////////////////////////////////////////////////// -// implementation for fatal messages -//////////////////////////////////////////////////////////////////////////////// - -#define FATAL_MESSAGE0(_num, _str) \ - do { \ - fprintf(stderr, "Libomptarget fatal error %d: %s\n", _num, _str); \ - exit(1); \ - } while (0) - -#define FATAL_MESSAGE(_num, _str, ...) \ - do { \ - fprintf(stderr, "Libomptarget fatal error %d:" _str "\n", _num, \ - __VA_ARGS__); \ - exit(1); \ - } while (0) - -// Implemented in libomp, they are called from within __tgt_* functions. -#ifdef __cplusplus -extern "C" { -#endif -// functions that extract info from libomp; keep in sync -int omp_get_default_device(void) __attribute__((weak)); -int32_t __kmpc_omp_taskwait(void *loc_ref, int32_t gtid) __attribute__((weak)); -int32_t __kmpc_global_thread_num(void *) __attribute__((weak)); -int __kmpc_get_target_offload(void) __attribute__((weak)); -#ifdef __cplusplus -} -#endif - -#ifdef OMPTARGET_DEBUG -extern int DebugLevel; - -#define DP(...) \ - do { \ - if (DebugLevel > 0) { \ - DEBUGP("Libomptarget", __VA_ARGS__); \ - } \ - } while (false) -#else // OMPTARGET_DEBUG -#define DP(...) {} -#endif // OMPTARGET_DEBUG - -#endif +//===---------- private.h - Target independent OpenMP target RTL ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Private function declarations and helper macros for debugging output. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_PRIVATE_H +#define _OMPTARGET_PRIVATE_H + +#include + +#include + +extern int target_data_begin(DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, + __tgt_async_info *async_info_ptr); + +extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr); + +extern int target_data_update(DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); + +extern int target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t team_num, int32_t thread_limit, int IsTeamConstruct); + +extern int CheckDeviceAndCtors(int64_t device_id); + +// enum for OMP_TARGET_OFFLOAD; keep in sync with kmp.h definition +enum kmp_target_offload_kind { + tgt_disabled = 0, + tgt_default = 1, + tgt_mandatory = 2 +}; +typedef enum kmp_target_offload_kind kmp_target_offload_kind_t; +extern kmp_target_offload_kind_t TargetOffloadPolicy; + +// This structure stores information of a mapped memory region. +struct MapComponentInfoTy { + void *Base; + void *Begin; + int64_t Size; + int64_t Type; + MapComponentInfoTy() = default; + MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type) + : Base(Base), Begin(Begin), Size(Size), Type(Type) {} +}; + +// This structure stores all components of a user-defined mapper. The number of +// components are dynamically decided, so we utilize C++ STL vector +// implementation here. +struct MapperComponentsTy { + std::vector Components; +}; + +//////////////////////////////////////////////////////////////////////////////// +// implementation for fatal messages +//////////////////////////////////////////////////////////////////////////////// + +#define FATAL_MESSAGE0(_num, _str) \ + do { \ + fprintf(stderr, "Libomptarget fatal error %d: %s\n", _num, _str); \ + exit(1); \ + } while (0) + +#define FATAL_MESSAGE(_num, _str, ...) \ + do { \ + fprintf(stderr, "Libomptarget fatal error %d:" _str "\n", _num, \ + __VA_ARGS__); \ + exit(1); \ + } while (0) + +// Implemented in libomp, they are called from within __tgt_* functions. +#ifdef __cplusplus +extern "C" { +#endif +// functions that extract info from libomp; keep in sync +int omp_get_default_device(void) __attribute__((weak)); +int32_t __kmpc_omp_taskwait(void *loc_ref, int32_t gtid) __attribute__((weak)); +int32_t __kmpc_global_thread_num(void *) __attribute__((weak)); +int __kmpc_get_target_offload(void) __attribute__((weak)); +#ifdef __cplusplus +} +#endif + +#ifdef OMPTARGET_DEBUG +extern int DebugLevel; + +#define DP(...) \ + do { \ + if (DebugLevel > 0) { \ + DEBUGP("Libomptarget", __VA_ARGS__); \ + } \ + } while (false) +#else // OMPTARGET_DEBUG +#define DP(...) {} +#endif // OMPTARGET_DEBUG + +#endif diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp index 1439f67e7c648..7ee8377d33399 100644 --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -1,434 +1,438 @@ -//===----------- rtl.cpp - Target independent OpenMP target RTL -----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Functionality for handling RTL plugins. -// -//===----------------------------------------------------------------------===// - -#include "device.h" -#include "private.h" -#include "rtl.h" - -#include -#include -#include -#include -#include -#include - -// List of all plugins that can support offloading. -static const char *RTLNames[] = { - /* PowerPC target */ "libomptarget.rtl.ppc64.so", - /* x86_64 target */ "libomptarget.rtl.x86_64.so", - /* CUDA target */ "libomptarget.rtl.cuda.so", - /* AArch64 target */ "libomptarget.rtl.aarch64.so"}; - -RTLsTy *RTLs; -std::mutex *RTLsMtx; - -HostEntriesBeginToTransTableTy *HostEntriesBeginToTransTable; -std::mutex *TrlTblMtx; - -HostPtrToTableMapTy *HostPtrToTableMap; -std::mutex *TblMapMtx; - -__attribute__((constructor(101))) void init() { - DP("Init target library!\n"); - RTLs = new RTLsTy(); - RTLsMtx = new std::mutex(); - HostEntriesBeginToTransTable = new HostEntriesBeginToTransTableTy(); - TrlTblMtx = new std::mutex(); - HostPtrToTableMap = new HostPtrToTableMapTy(); - TblMapMtx = new std::mutex(); -} - -__attribute__((destructor(101))) void deinit() { - DP("Deinit target library!\n"); - delete RTLs; - delete RTLsMtx; - delete HostEntriesBeginToTransTable; - delete TrlTblMtx; - delete HostPtrToTableMap; - delete TblMapMtx; -} - -void RTLsTy::LoadRTLs() { -#ifdef OMPTARGET_DEBUG - if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { - DebugLevel = std::stoi(envStr); - } -#endif // OMPTARGET_DEBUG - - // Parse environment variable OMP_TARGET_OFFLOAD (if set) - TargetOffloadPolicy = (kmp_target_offload_kind_t) __kmpc_get_target_offload(); - if (TargetOffloadPolicy == tgt_disabled) { - return; - } - - DP("Loading RTLs...\n"); - - // Attempt to open all the plugins and, if they exist, check if the interface - // is correct and if they are supporting any devices. - for (auto *Name : RTLNames) { - DP("Loading library '%s'...\n", Name); - void *dynlib_handle = dlopen(Name, RTLD_NOW); - - if (!dynlib_handle) { - // Library does not exist or cannot be found. - DP("Unable to load library '%s': %s!\n", Name, dlerror()); - continue; - } - - DP("Successfully loaded library '%s'!\n", Name); - - // Retrieve the RTL information from the runtime library. 
- RTLInfoTy R; - - R.LibraryHandler = dynlib_handle; - R.isUsed = false; - -#ifdef OMPTARGET_DEBUG - R.RTLName = Name; -#endif - - if (!(*((void **)&R.is_valid_binary) = - dlsym(dynlib_handle, "__tgt_rtl_is_valid_binary"))) - continue; - if (!(*((void **)&R.number_of_devices) = - dlsym(dynlib_handle, "__tgt_rtl_number_of_devices"))) - continue; - if (!(*((void **)&R.init_device) = - dlsym(dynlib_handle, "__tgt_rtl_init_device"))) - continue; - if (!(*((void **)&R.load_binary) = - dlsym(dynlib_handle, "__tgt_rtl_load_binary"))) - continue; - if (!(*((void **)&R.data_alloc) = - dlsym(dynlib_handle, "__tgt_rtl_data_alloc"))) - continue; - if (!(*((void **)&R.data_submit) = - dlsym(dynlib_handle, "__tgt_rtl_data_submit"))) - continue; - if (!(*((void **)&R.data_retrieve) = - dlsym(dynlib_handle, "__tgt_rtl_data_retrieve"))) - continue; - if (!(*((void **)&R.data_delete) = - dlsym(dynlib_handle, "__tgt_rtl_data_delete"))) - continue; - if (!(*((void **)&R.run_region) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_region"))) - continue; - if (!(*((void **)&R.run_team_region) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region"))) - continue; - - // Optional functions - *((void **)&R.init_requires) = - dlsym(dynlib_handle, "__tgt_rtl_init_requires"); - *((void **)&R.data_submit_async) = - dlsym(dynlib_handle, "__tgt_rtl_data_submit_async"); - *((void **)&R.data_retrieve_async) = - dlsym(dynlib_handle, "__tgt_rtl_data_retrieve_async"); - *((void **)&R.run_region_async) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_region_async"); - *((void **)&R.run_team_region_async) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async"); - *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize"); - - // No devices are supported by this RTL? - if (!(R.NumberOfDevices = R.number_of_devices())) { - DP("No devices supported in this RTL\n"); - continue; - } - - DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(), - R.NumberOfDevices); - - // The RTL is valid! Will save the information in the RTLs list. - AllRTLs.push_back(R); - } - - DP("RTLs loaded!\n"); - - return; -} - -//////////////////////////////////////////////////////////////////////////////// -// Functionality for registering libs - -static void RegisterImageIntoTranslationTable(TranslationTable &TT, - RTLInfoTy &RTL, __tgt_device_image *image) { - - // same size, as when we increase one, we also increase the other. - assert(TT.TargetsTable.size() == TT.TargetsImages.size() && - "We should have as many images as we have tables!"); - - // Resize the Targets Table and Images to accommodate the new targets if - // required - unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices; - - if (TT.TargetsTable.size() < TargetsTableMinimumSize) { - TT.TargetsImages.resize(TargetsTableMinimumSize, 0); - TT.TargetsTable.resize(TargetsTableMinimumSize, 0); - } - - // Register the image in all devices for this target type. - for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) { - // If we are changing the image we are also invalidating the target table. - if (TT.TargetsImages[RTL.Idx + i] != image) { - TT.TargetsImages[RTL.Idx + i] = image; - TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table. 
- } - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Functionality for registering Ctors/Dtors - -static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc, - __tgt_device_image *img, RTLInfoTy *RTL) { - - for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) { - DeviceTy &Device = Devices[RTL->Idx + i]; - Device.PendingGlobalsMtx.lock(); - Device.HasPendingGlobals = true; - for (__tgt_offload_entry *entry = img->EntriesBegin; - entry != img->EntriesEnd; ++entry) { - if (entry->flags & OMP_DECLARE_TARGET_CTOR) { - DP("Adding ctor " DPxMOD " to the pending list.\n", - DPxPTR(entry->addr)); - Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr); - } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) { - // Dtors are pushed in reverse order so they are executed from end - // to beginning when unregistering the library! - DP("Adding dtor " DPxMOD " to the pending list.\n", - DPxPTR(entry->addr)); - Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr); - } - - if (entry->flags & OMP_DECLARE_TARGET_LINK) { - DP("The \"link\" attribute is not yet supported!\n"); - } - } - Device.PendingGlobalsMtx.unlock(); - } -} - -void RTLsTy::RegisterRequires(int64_t flags) { - // TODO: add more elaborate check. - // Minimal check: only set requires flags if previous value - // is undefined. This ensures that only the first call to this - // function will set the requires flags. All subsequent calls - // will be checked for compatibility. - assert(flags != OMP_REQ_UNDEFINED && - "illegal undefined flag for requires directive!"); - if (RequiresFlags == OMP_REQ_UNDEFINED) { - RequiresFlags = flags; - return; - } - - // If multiple compilation units are present enforce - // consistency across all of them for require clauses: - // - reverse_offload - // - unified_address - // - unified_shared_memory - if ((RequiresFlags & OMP_REQ_REVERSE_OFFLOAD) != - (flags & OMP_REQ_REVERSE_OFFLOAD)) { - FATAL_MESSAGE0(1, - "'#pragma omp requires reverse_offload' not used consistently!"); - } - if ((RequiresFlags & OMP_REQ_UNIFIED_ADDRESS) != - (flags & OMP_REQ_UNIFIED_ADDRESS)) { - FATAL_MESSAGE0(1, - "'#pragma omp requires unified_address' not used consistently!"); - } - if ((RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) != - (flags & OMP_REQ_UNIFIED_SHARED_MEMORY)) { - FATAL_MESSAGE0(1, - "'#pragma omp requires unified_shared_memory' not used consistently!"); - } - - // TODO: insert any other missing checks - - DP("New requires flags %ld compatible with existing %ld!\n", - flags, RequiresFlags); -} - -void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { - // Attempt to load all plugins available in the system. - std::call_once(initFlag, &RTLsTy::LoadRTLs, this); - - RTLsMtx->lock(); - // Register the images with the RTLs that understand them, if any. - for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { - // Obtain the image. - __tgt_device_image *img = &desc->DeviceImages[i]; - - RTLInfoTy *FoundRTL = NULL; - - // Scan the RTLs that have associated images until we find one that supports - // the current image. - for (auto &R : AllRTLs) { - if (!R.is_valid_binary(img)) { - DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", - DPxPTR(img->ImageStart), R.RTLName.c_str()); - continue; - } - - DP("Image " DPxMOD " is compatible with RTL %s!\n", - DPxPTR(img->ImageStart), R.RTLName.c_str()); - - // If this RTL is not already in use, initialize it. - if (!R.isUsed) { - // Initialize the device information for the RTL we are about to use. 
- DeviceTy device(&R); - size_t start = Devices.size(); - Devices.resize(start + R.NumberOfDevices, device); - for (int32_t device_id = 0; device_id < R.NumberOfDevices; - device_id++) { - // global device ID - Devices[start + device_id].DeviceID = start + device_id; - // RTL local device ID - Devices[start + device_id].RTLDeviceID = device_id; - } - - // Initialize the index of this RTL and save it in the used RTLs. - R.Idx = (UsedRTLs.empty()) - ? 0 - : UsedRTLs.back()->Idx + UsedRTLs.back()->NumberOfDevices; - assert((size_t) R.Idx == start && - "RTL index should equal the number of devices used so far."); - R.isUsed = true; - UsedRTLs.push_back(&R); - - DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx); - } - - // Initialize (if necessary) translation table for this library. - TrlTblMtx->lock(); - if(!HostEntriesBeginToTransTable->count(desc->HostEntriesBegin)){ - TranslationTable &tt = - (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; - tt.HostTable.EntriesBegin = desc->HostEntriesBegin; - tt.HostTable.EntriesEnd = desc->HostEntriesEnd; - } - - // Retrieve translation table for this library. - TranslationTable &TransTable = - (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; - - DP("Registering image " DPxMOD " with RTL %s!\n", - DPxPTR(img->ImageStart), R.RTLName.c_str()); - RegisterImageIntoTranslationTable(TransTable, R, img); - TrlTblMtx->unlock(); - FoundRTL = &R; - - // Load ctors/dtors for static objects - RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL); - - // if an RTL was found we are done - proceed to register the next image - break; - } - - if (!FoundRTL) { - DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart)); - } - } - RTLsMtx->unlock(); - - - DP("Done registering entries!\n"); -} - -void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) { - DP("Unloading target library!\n"); - - RTLsMtx->lock(); - // Find which RTL understands each image, if any. - for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { - // Obtain the image. - __tgt_device_image *img = &desc->DeviceImages[i]; - - RTLInfoTy *FoundRTL = NULL; - - // Scan the RTLs that have associated images until we find one that supports - // the current image. We only need to scan RTLs that are already being used. - for (auto *R : UsedRTLs) { - - assert(R->isUsed && "Expecting used RTLs."); - - if (!R->is_valid_binary(img)) { - DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n", - DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); - continue; - } - - DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n", - DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); - - FoundRTL = R; - - // Execute dtors for static objects if the device has been used, i.e. - // if its PendingCtors list has been emptied. 
- for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) { - DeviceTy &Device = Devices[FoundRTL->Idx + i]; - Device.PendingGlobalsMtx.lock(); - if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { - for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { - int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1, - 1, true /*team*/); - if (rc != OFFLOAD_SUCCESS) { - DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); - } - } - // Remove this library's entry from PendingCtorsDtors - Device.PendingCtorsDtors.erase(desc); - } - Device.PendingGlobalsMtx.unlock(); - } - - DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n", - DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); - - break; - } - - // if no RTL was found proceed to unregister the next image - if (!FoundRTL){ - DP("No RTLs in use support the image " DPxMOD "!\n", - DPxPTR(img->ImageStart)); - } - } - RTLsMtx->unlock(); - DP("Done unregistering images!\n"); - - // Remove entries from HostPtrToTableMap - TblMapMtx->lock(); - for (__tgt_offload_entry *cur = desc->HostEntriesBegin; - cur < desc->HostEntriesEnd; ++cur) { - HostPtrToTableMap->erase(cur->addr); - } - - // Remove translation table for this descriptor. - auto tt = HostEntriesBeginToTransTable->find(desc->HostEntriesBegin); - if (tt != HostEntriesBeginToTransTable->end()) { - DP("Removing translation table for descriptor " DPxMOD "\n", - DPxPTR(desc->HostEntriesBegin)); - HostEntriesBeginToTransTable->erase(tt); - } else { - DP("Translation table for descriptor " DPxMOD " cannot be found, probably " - "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin)); - } - - TblMapMtx->unlock(); - - // TODO: Remove RTL and the devices it manages if it's not used anymore? - // TODO: Write some RTL->unload_image(...) function? - - DP("Done unregistering library!\n"); -} +//===----------- rtl.cpp - Target independent OpenMP target RTL -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functionality for handling RTL plugins. +// +//===----------------------------------------------------------------------===// + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include +#include +#include +#include +#include +#include + +// List of all plugins that can support offloading. 
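+// Each name is dlopen()ed from the regular dynamic linker search path when
+// the first library is registered; a plugin that is missing or exposes an
+// incomplete interface is simply skipped (see LoadRTLs below).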
+static const char *RTLNames[] = { + /* PowerPC target */ "libomptarget.rtl.ppc64.so", + /* x86_64 target */ "libomptarget.rtl.x86_64.so", + /* CUDA target */ "libomptarget.rtl.cuda.so", + /* AArch64 target */ "libomptarget.rtl.aarch64.so"}; + +RTLsTy *RTLs; +std::mutex *RTLsMtx; + +HostEntriesBeginToTransTableTy *HostEntriesBeginToTransTable; +std::mutex *TrlTblMtx; + +HostPtrToTableMapTy *HostPtrToTableMap; +std::mutex *TblMapMtx; + +__attribute__((constructor(101))) void init() { + DP("Init target library!\n"); + RTLs = new RTLsTy(); + RTLsMtx = new std::mutex(); + HostEntriesBeginToTransTable = new HostEntriesBeginToTransTableTy(); + TrlTblMtx = new std::mutex(); + HostPtrToTableMap = new HostPtrToTableMapTy(); + TblMapMtx = new std::mutex(); +} + +__attribute__((destructor(101))) void deinit() { + DP("Deinit target library!\n"); + delete RTLs; + delete RTLsMtx; + delete HostEntriesBeginToTransTable; + delete TrlTblMtx; + delete HostPtrToTableMap; + delete TblMapMtx; +} + +void RTLsTy::LoadRTLs() { +#ifdef OMPTARGET_DEBUG + if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { + DebugLevel = std::stoi(envStr); + } +#endif // OMPTARGET_DEBUG + + // Parse environment variable OMP_TARGET_OFFLOAD (if set) + TargetOffloadPolicy = (kmp_target_offload_kind_t) __kmpc_get_target_offload(); + if (TargetOffloadPolicy == tgt_disabled) { + return; + } + + DP("Loading RTLs...\n"); + + // Attempt to open all the plugins and, if they exist, check if the interface + // is correct and if they are supporting any devices. + for (auto *Name : RTLNames) { + DP("Loading library '%s'...\n", Name); + void *dynlib_handle = dlopen(Name, RTLD_NOW); + + if (!dynlib_handle) { + // Library does not exist or cannot be found. + DP("Unable to load library '%s': %s!\n", Name, dlerror()); + continue; + } + + DP("Successfully loaded library '%s'!\n", Name); + + // Retrieve the RTL information from the runtime library. 
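+    // The entry points below are looked up with dlsym(): the core ones
+    // (is_valid_binary, number_of_devices, data_alloc, run_region, ...) are
+    // mandatory and their absence disqualifies the plugin, while the others
+    // (init_requires, the async variants, synchronize) are optional.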
+ RTLInfoTy R; + + R.LibraryHandler = dynlib_handle; + R.isUsed = false; + + + R.RTLName = Name; + + + if (!(*((void **)&R.is_valid_binary) = + dlsym(dynlib_handle, "__tgt_rtl_is_valid_binary"))) + continue; + if (!(*((void **)&R.number_of_devices) = + dlsym(dynlib_handle, "__tgt_rtl_number_of_devices"))) + continue; + if (!(*((void **)&R.init_device) = + dlsym(dynlib_handle, "__tgt_rtl_init_device"))) + continue; + if (!(*((void **)&R.load_binary) = + dlsym(dynlib_handle, "__tgt_rtl_load_binary"))) + continue; + if (!(*((void **)&R.data_alloc) = + dlsym(dynlib_handle, "__tgt_rtl_data_alloc"))) + continue; + if (!(*((void **)&R.data_submit) = + dlsym(dynlib_handle, "__tgt_rtl_data_submit"))) + continue; + if (!(*((void **)&R.data_retrieve) = + dlsym(dynlib_handle, "__tgt_rtl_data_retrieve"))) + continue; + if (!(*((void **)&R.data_delete) = + dlsym(dynlib_handle, "__tgt_rtl_data_delete"))) + continue; + if (!(*((void **)&R.run_region) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_region"))) + continue; + if (!(*((void **)&R.run_team_region) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region"))) + continue; + + // Optional functions + *((void **)&R.init_requires) = + dlsym(dynlib_handle, "__tgt_rtl_init_requires"); + *((void **)&R.data_submit_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_submit_async"); + *((void **)&R.data_retrieve_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_retrieve_async"); + *((void **)&R.data_transfer) = + dlsym(dynlib_handle, "__tgt_rtl_data_transfer"); + *((void **)&R.data_transfer_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_transfer_async"); + *((void **)&R.run_region_async) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_region_async"); + *((void **)&R.run_team_region_async) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async"); + *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize"); + + // No devices are supported by this RTL? + if (!(R.NumberOfDevices = R.number_of_devices())) { + DP("No devices supported in this RTL\n"); + continue; + } + + DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(), + R.NumberOfDevices); + + // The RTL is valid! Will save the information in the RTLs list. + AllRTLs.push_back(R); + } + + DP("RTLs loaded!\n"); + + return; +} + +//////////////////////////////////////////////////////////////////////////////// +// Functionality for registering libs + +static void RegisterImageIntoTranslationTable(TranslationTable &TT, + RTLInfoTy &RTL, __tgt_device_image *image) { + + // same size, as when we increase one, we also increase the other. + assert(TT.TargetsTable.size() == TT.TargetsImages.size() && + "We should have as many images as we have tables!"); + + // Resize the Targets Table and Images to accommodate the new targets if + // required + unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices; + + if (TT.TargetsTable.size() < TargetsTableMinimumSize) { + TT.TargetsImages.resize(TargetsTableMinimumSize, 0); + TT.TargetsTable.resize(TargetsTableMinimumSize, 0); + } + + // Register the image in all devices for this target type. + for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) { + // If we are changing the image we are also invalidating the target table. + if (TT.TargetsImages[RTL.Idx + i] != image) { + TT.TargetsImages[RTL.Idx + i] = image; + TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table. 
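+      // The cleared entry is filled in again on first use, when InitLibrary
+      // loads the image for that device via Device.load_binary().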
+ } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Functionality for registering Ctors/Dtors + +static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc, + __tgt_device_image *img, RTLInfoTy *RTL) { + + for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) { + DeviceTy &Device = Devices[RTL->Idx + i]; + Device.PendingGlobalsMtx.lock(); + Device.HasPendingGlobals = true; + for (__tgt_offload_entry *entry = img->EntriesBegin; + entry != img->EntriesEnd; ++entry) { + if (entry->flags & OMP_DECLARE_TARGET_CTOR) { + DP("Adding ctor " DPxMOD " to the pending list.\n", + DPxPTR(entry->addr)); + Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr); + } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) { + // Dtors are pushed in reverse order so they are executed from end + // to beginning when unregistering the library! + DP("Adding dtor " DPxMOD " to the pending list.\n", + DPxPTR(entry->addr)); + Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr); + } + + if (entry->flags & OMP_DECLARE_TARGET_LINK) { + DP("The \"link\" attribute is not yet supported!\n"); + } + } + Device.PendingGlobalsMtx.unlock(); + } +} + +void RTLsTy::RegisterRequires(int64_t flags) { + // TODO: add more elaborate check. + // Minimal check: only set requires flags if previous value + // is undefined. This ensures that only the first call to this + // function will set the requires flags. All subsequent calls + // will be checked for compatibility. + assert(flags != OMP_REQ_UNDEFINED && + "illegal undefined flag for requires directive!"); + if (RequiresFlags == OMP_REQ_UNDEFINED) { + RequiresFlags = flags; + return; + } + + // If multiple compilation units are present enforce + // consistency across all of them for require clauses: + // - reverse_offload + // - unified_address + // - unified_shared_memory + if ((RequiresFlags & OMP_REQ_REVERSE_OFFLOAD) != + (flags & OMP_REQ_REVERSE_OFFLOAD)) { + FATAL_MESSAGE0(1, + "'#pragma omp requires reverse_offload' not used consistently!"); + } + if ((RequiresFlags & OMP_REQ_UNIFIED_ADDRESS) != + (flags & OMP_REQ_UNIFIED_ADDRESS)) { + FATAL_MESSAGE0(1, + "'#pragma omp requires unified_address' not used consistently!"); + } + if ((RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) != + (flags & OMP_REQ_UNIFIED_SHARED_MEMORY)) { + FATAL_MESSAGE0(1, + "'#pragma omp requires unified_shared_memory' not used consistently!"); + } + + // TODO: insert any other missing checks + + DP("New requires flags %ld compatible with existing %ld!\n", + flags, RequiresFlags); +} + +void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { + // Attempt to load all plugins available in the system. + std::call_once(initFlag, &RTLsTy::LoadRTLs, this); + + RTLsMtx->lock(); + // Register the images with the RTLs that understand them, if any. + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + // Obtain the image. + __tgt_device_image *img = &desc->DeviceImages[i]; + + RTLInfoTy *FoundRTL = NULL; + + // Scan the RTLs that have associated images until we find one that supports + // the current image. + for (auto &R : AllRTLs) { + if (!R.is_valid_binary(img)) { + DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + continue; + } + + DP("Image " DPxMOD " is compatible with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + + // If this RTL is not already in use, initialize it. + if (!R.isUsed) { + // Initialize the device information for the RTL we are about to use. 
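+        // Devices are appended to the global Devices vector; each records
+        // both its global ID (its index in that vector) and its RTL-local ID
+        // (its index within this plugin), which is the ID the plugin API is
+        // called with.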
+ DeviceTy device(&R); + size_t start = Devices.size(); + Devices.resize(start + R.NumberOfDevices, device); + for (int32_t device_id = 0; device_id < R.NumberOfDevices; + device_id++) { + // global device ID + Devices[start + device_id].DeviceID = start + device_id; + // RTL local device ID + Devices[start + device_id].RTLDeviceID = device_id; + } + + // Initialize the index of this RTL and save it in the used RTLs. + R.Idx = (UsedRTLs.empty()) + ? 0 + : UsedRTLs.back()->Idx + UsedRTLs.back()->NumberOfDevices; + assert((size_t) R.Idx == start && + "RTL index should equal the number of devices used so far."); + R.isUsed = true; + UsedRTLs.push_back(&R); + + DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx); + } + + // Initialize (if necessary) translation table for this library. + TrlTblMtx->lock(); + if(!HostEntriesBeginToTransTable->count(desc->HostEntriesBegin)){ + TranslationTable &tt = + (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; + tt.HostTable.EntriesBegin = desc->HostEntriesBegin; + tt.HostTable.EntriesEnd = desc->HostEntriesEnd; + } + + // Retrieve translation table for this library. + TranslationTable &TransTable = + (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; + + DP("Registering image " DPxMOD " with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + RegisterImageIntoTranslationTable(TransTable, R, img); + TrlTblMtx->unlock(); + FoundRTL = &R; + + // Load ctors/dtors for static objects + RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL); + + // if an RTL was found we are done - proceed to register the next image + break; + } + + if (!FoundRTL) { + DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart)); + } + } + RTLsMtx->unlock(); + + + DP("Done registering entries!\n"); +} + +void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) { + DP("Unloading target library!\n"); + + RTLsMtx->lock(); + // Find which RTL understands each image, if any. + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + // Obtain the image. + __tgt_device_image *img = &desc->DeviceImages[i]; + + RTLInfoTy *FoundRTL = NULL; + + // Scan the RTLs that have associated images until we find one that supports + // the current image. We only need to scan RTLs that are already being used. + for (auto *R : UsedRTLs) { + + assert(R->isUsed && "Expecting used RTLs."); + + if (!R->is_valid_binary(img)) { + DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + continue; + } + + DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + + FoundRTL = R; + + // Execute dtors for static objects if the device has been used, i.e. + // if its PendingCtors list has been emptied. 
+ for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) { + DeviceTy &Device = Devices[FoundRTL->Idx + i]; + Device.PendingGlobalsMtx.lock(); + if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { + for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { + int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1, + 1, true /*team*/); + if (rc != OFFLOAD_SUCCESS) { + DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); + } + } + // Remove this library's entry from PendingCtorsDtors + Device.PendingCtorsDtors.erase(desc); + } + Device.PendingGlobalsMtx.unlock(); + } + + DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + + break; + } + + // if no RTL was found proceed to unregister the next image + if (!FoundRTL){ + DP("No RTLs in use support the image " DPxMOD "!\n", + DPxPTR(img->ImageStart)); + } + } + RTLsMtx->unlock(); + DP("Done unregistering images!\n"); + + // Remove entries from HostPtrToTableMap + TblMapMtx->lock(); + for (__tgt_offload_entry *cur = desc->HostEntriesBegin; + cur < desc->HostEntriesEnd; ++cur) { + HostPtrToTableMap->erase(cur->addr); + } + + // Remove translation table for this descriptor. + auto tt = HostEntriesBeginToTransTable->find(desc->HostEntriesBegin); + if (tt != HostEntriesBeginToTransTable->end()) { + DP("Removing translation table for descriptor " DPxMOD "\n", + DPxPTR(desc->HostEntriesBegin)); + HostEntriesBeginToTransTable->erase(tt); + } else { + DP("Translation table for descriptor " DPxMOD " cannot be found, probably " + "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin)); + } + + TblMapMtx->unlock(); + + // TODO: Remove RTL and the devices it manages if it's not used anymore? + // TODO: Write some RTL->unload_image(...) function? + + DP("Done unregistering library!\n"); +} diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h index 86ecd6724a8df..62f6f466e6cee 100644 --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -1,185 +1,192 @@ -//===------------ rtl.h - Target independent OpenMP target RTL ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Declarations for handling RTL plugins. -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_RTL_H -#define _OMPTARGET_RTL_H - -#include "omptarget.h" -#include -#include -#include -#include -#include - -// Forward declarations. 
-struct DeviceTy; -struct __tgt_bin_desc; - -struct RTLInfoTy { - typedef int32_t(is_valid_binary_ty)(void *); - typedef int32_t(number_of_devices_ty)(); - typedef int32_t(init_device_ty)(int32_t); - typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); - typedef void *(data_alloc_ty)(int32_t, int64_t, void *); - typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); - typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t, - __tgt_async_info *); - typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); - typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t, - __tgt_async_info *); - typedef int32_t(data_delete_ty)(int32_t, void *); - typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t); - typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t, __tgt_async_info *); - typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t, int32_t, int32_t, uint64_t); - typedef int32_t(run_team_region_async_ty)(int32_t, void *, void **, - ptrdiff_t *, int32_t, int32_t, - int32_t, uint64_t, - __tgt_async_info *); - typedef int64_t(init_requires_ty)(int64_t); - typedef int64_t(synchronize_ty)(int64_t, __tgt_async_info *); - - int32_t Idx = -1; // RTL index, index is the number of devices - // of other RTLs that were registered before, - // i.e. the OpenMP index of the first device - // to be registered with this RTL. - int32_t NumberOfDevices = -1; // Number of devices this RTL deals with. - - void *LibraryHandler = nullptr; - -#ifdef OMPTARGET_DEBUG - std::string RTLName; -#endif - - // Functions implemented in the RTL. - is_valid_binary_ty *is_valid_binary = nullptr; - number_of_devices_ty *number_of_devices = nullptr; - init_device_ty *init_device = nullptr; - load_binary_ty *load_binary = nullptr; - data_alloc_ty *data_alloc = nullptr; - data_submit_ty *data_submit = nullptr; - data_submit_async_ty *data_submit_async = nullptr; - data_retrieve_ty *data_retrieve = nullptr; - data_retrieve_async_ty *data_retrieve_async = nullptr; - data_delete_ty *data_delete = nullptr; - run_region_ty *run_region = nullptr; - run_region_async_ty *run_region_async = nullptr; - run_team_region_ty *run_team_region = nullptr; - run_team_region_async_ty *run_team_region_async = nullptr; - init_requires_ty *init_requires = nullptr; - synchronize_ty *synchronize = nullptr; - - // Are there images associated with this RTL. - bool isUsed = false; - - // Mutex for thread-safety when calling RTL interface functions. - // It is easier to enforce thread-safety at the libomptarget level, - // so that developers of new RTLs do not have to worry about it. - std::mutex Mtx; - - // The existence of the mutex above makes RTLInfoTy non-copyable. - // We need to provide a copy constructor explicitly. 
- RTLInfoTy() = default; - - RTLInfoTy(const RTLInfoTy &r) { - Idx = r.Idx; - NumberOfDevices = r.NumberOfDevices; - LibraryHandler = r.LibraryHandler; -#ifdef OMPTARGET_DEBUG - RTLName = r.RTLName; -#endif - is_valid_binary = r.is_valid_binary; - number_of_devices = r.number_of_devices; - init_device = r.init_device; - load_binary = r.load_binary; - data_alloc = r.data_alloc; - data_submit = r.data_submit; - data_submit_async = r.data_submit_async; - data_retrieve = r.data_retrieve; - data_retrieve_async = r.data_retrieve_async; - data_delete = r.data_delete; - run_region = r.run_region; - run_region_async = r.run_region_async; - run_team_region = r.run_team_region; - run_team_region_async = r.run_team_region_async; - init_requires = r.init_requires; - isUsed = r.isUsed; - synchronize = r.synchronize; - } -}; - -/// RTLs identified in the system. -class RTLsTy { -private: - // Mutex-like object to guarantee thread-safety and unique initialization - // (i.e. the library attempts to load the RTLs (plugins) only once). - std::once_flag initFlag; - void LoadRTLs(); // not thread-safe - -public: - // List of the detected runtime libraries. - std::list AllRTLs; - - // Array of pointers to the detected runtime libraries that have compatible - // binaries. - std::vector UsedRTLs; - - int64_t RequiresFlags = OMP_REQ_UNDEFINED; - - explicit RTLsTy() = default; - - // Register the clauses of the requires directive. - void RegisterRequires(int64_t flags); - - // Register a shared library with all (compatible) RTLs. - void RegisterLib(__tgt_bin_desc *desc); - - // Unregister a shared library from all RTLs. - void UnregisterLib(__tgt_bin_desc *desc); -}; -extern RTLsTy *RTLs; -extern std::mutex *RTLsMtx; - - -/// Map between the host entry begin and the translation table. Each -/// registered library gets one TranslationTable. Use the map from -/// __tgt_offload_entry so that we may quickly determine whether we -/// are trying to (re)register an existing lib or really have a new one. -struct TranslationTable { - __tgt_target_table HostTable; - - // Image assigned to a given device. - std::vector<__tgt_device_image *> TargetsImages; // One image per device ID. - - // Table of entry points or NULL if it was not already computed. - std::vector<__tgt_target_table *> TargetsTable; // One table per device ID. -}; -typedef std::map<__tgt_offload_entry *, TranslationTable> - HostEntriesBeginToTransTableTy; -extern HostEntriesBeginToTransTableTy *HostEntriesBeginToTransTable; -extern std::mutex *TrlTblMtx; - -/// Map between the host ptr and a table index -struct TableMap { - TranslationTable *Table = nullptr; // table associated with the host ptr. - uint32_t Index = 0; // index in which the host ptr translated entry is found. - TableMap() = default; - TableMap(TranslationTable *table, uint32_t index) - : Table(table), Index(index) {} -}; -typedef std::map HostPtrToTableMapTy; -extern HostPtrToTableMapTy *HostPtrToTableMap; -extern std::mutex *TblMapMtx; - -#endif +//===------------ rtl.h - Target independent OpenMP target RTL ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declarations for handling RTL plugins. 
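+// Each plugin is loaded as a shared library; RTLInfoTy below caches one
+// function pointer per plugin entry point.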
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_RTL_H
+#define _OMPTARGET_RTL_H
+
+#include "omptarget.h"
+#include <list>
+#include <map>
+#include <mutex>
+#include <string>
+#include <vector>
+
+// Forward declarations.
+struct DeviceTy;
+struct __tgt_bin_desc;
+
+struct RTLInfoTy {
+  typedef int32_t(is_valid_binary_ty)(void *);
+  typedef int32_t(number_of_devices_ty)();
+  typedef int32_t(init_device_ty)(int32_t);
+  typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
+  typedef void *(data_alloc_ty)(int32_t, int64_t, void *);
+  typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
+  typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t,
+                                        __tgt_async_info *);
+  typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
+  typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t,
+                                          __tgt_async_info *);
+  typedef int32_t(data_transfer_ty)(int32_t, void *, void *, int64_t);
+  typedef int32_t(data_transfer_async_ty)(int32_t, void *, void *, int64_t,
+                                          __tgt_async_info *);
+  typedef int32_t(data_delete_ty)(int32_t, void *);
+  typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *,
+                                 int32_t);
+  typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *,
+                                       int32_t, __tgt_async_info *);
+  typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *,
+                                      int32_t, int32_t, int32_t, uint64_t);
+  typedef int32_t(run_team_region_async_ty)(int32_t, void *, void **,
+                                            ptrdiff_t *, int32_t, int32_t,
+                                            int32_t, uint64_t,
+                                            __tgt_async_info *);
+  typedef int64_t(init_requires_ty)(int64_t);
+  typedef int64_t(synchronize_ty)(int64_t, __tgt_async_info *);
+
+  int32_t Idx = -1;             // RTL index, index is the number of devices
+                                // of other RTLs that were registered before,
+                                // i.e. the OpenMP index of the first device
+                                // to be registered with this RTL.
+  int32_t NumberOfDevices = -1; // Number of devices this RTL deals with.
+
+  void *LibraryHandler = nullptr;
+
+#ifdef OMPTARGET_DEBUG
+  std::string RTLName;
+#endif
+
+  // Functions implemented in the RTL.
+  is_valid_binary_ty *is_valid_binary = nullptr;
+  number_of_devices_ty *number_of_devices = nullptr;
+  init_device_ty *init_device = nullptr;
+  load_binary_ty *load_binary = nullptr;
+  data_alloc_ty *data_alloc = nullptr;
+  data_submit_ty *data_submit = nullptr;
+  data_submit_async_ty *data_submit_async = nullptr;
+  data_retrieve_ty *data_retrieve = nullptr;
+  data_retrieve_async_ty *data_retrieve_async = nullptr;
+  data_transfer_ty *data_transfer = nullptr;
+  data_transfer_async_ty *data_transfer_async = nullptr;
+  data_delete_ty *data_delete = nullptr;
+  run_region_ty *run_region = nullptr;
+  run_region_async_ty *run_region_async = nullptr;
+  run_team_region_ty *run_team_region = nullptr;
+  run_team_region_async_ty *run_team_region_async = nullptr;
+  init_requires_ty *init_requires = nullptr;
+  synchronize_ty *synchronize = nullptr;
+
+  // Are there images associated with this RTL.
+  bool isUsed = false;
+
+  // Mutex for thread-safety when calling RTL interface functions.
+  // It is easier to enforce thread-safety at the libomptarget level,
+  // so that developers of new RTLs do not have to worry about it.
+  std::mutex Mtx;
+
+  // The existence of the mutex above makes RTLInfoTy non-copyable.
+  // We need to provide a copy constructor explicitly.
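+  // Note that the copy below leaves Mtx default-constructed; only the plain
+  // data members and the function pointers are copied.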
+ RTLInfoTy() = default; + + RTLInfoTy(const RTLInfoTy &r) { + Idx = r.Idx; + NumberOfDevices = r.NumberOfDevices; + LibraryHandler = r.LibraryHandler; +#ifdef OMPTARGET_DEBUG + RTLName = r.RTLName; +#endif + is_valid_binary = r.is_valid_binary; + number_of_devices = r.number_of_devices; + init_device = r.init_device; + load_binary = r.load_binary; + data_alloc = r.data_alloc; + data_submit = r.data_submit; + data_submit_async = r.data_submit_async; + data_retrieve = r.data_retrieve; + data_retrieve_async = r.data_retrieve_async; + data_transfer = r.data_transfer; + data_transfer_async = r.data_transfer_async; + data_delete = r.data_delete; + run_region = r.run_region; + run_region_async = r.run_region_async; + run_team_region = r.run_team_region; + run_team_region_async = r.run_team_region_async; + init_requires = r.init_requires; + isUsed = r.isUsed; + synchronize = r.synchronize; + } +}; + +/// RTLs identified in the system. +class RTLsTy { +private: + // Mutex-like object to guarantee thread-safety and unique initialization + // (i.e. the library attempts to load the RTLs (plugins) only once). + std::once_flag initFlag; + void LoadRTLs(); // not thread-safe + +public: + // List of the detected runtime libraries. + std::list AllRTLs; + + // Array of pointers to the detected runtime libraries that have compatible + // binaries. + std::vector UsedRTLs; + + int64_t RequiresFlags = OMP_REQ_UNDEFINED; + + explicit RTLsTy() = default; + + // Register the clauses of the requires directive. + void RegisterRequires(int64_t flags); + + // Register a shared library with all (compatible) RTLs. + void RegisterLib(__tgt_bin_desc *desc); + + // Unregister a shared library from all RTLs. + void UnregisterLib(__tgt_bin_desc *desc); +}; +extern RTLsTy *RTLs; +extern std::mutex *RTLsMtx; + + +/// Map between the host entry begin and the translation table. Each +/// registered library gets one TranslationTable. Use the map from +/// __tgt_offload_entry so that we may quickly determine whether we +/// are trying to (re)register an existing lib or really have a new one. +struct TranslationTable { + __tgt_target_table HostTable; + + // Image assigned to a given device. + std::vector<__tgt_device_image *> TargetsImages; // One image per device ID. + + // Table of entry points or NULL if it was not already computed. + std::vector<__tgt_target_table *> TargetsTable; // One table per device ID. +}; +typedef std::map<__tgt_offload_entry *, TranslationTable> + HostEntriesBeginToTransTableTy; +extern HostEntriesBeginToTransTableTy *HostEntriesBeginToTransTable; +extern std::mutex *TrlTblMtx; + +/// Map between the host ptr and a table index +struct TableMap { + TranslationTable *Table = nullptr; // table associated with the host ptr. + uint32_t Index = 0; // index in which the host ptr translated entry is found. + TableMap() = default; + TableMap(TranslationTable *table, uint32_t index) + : Table(table), Index(index) {} +}; +typedef std::map HostPtrToTableMapTy; +extern HostPtrToTableMapTy *HostPtrToTableMap; +extern std::mutex *TblMapMtx; + +#endif diff --git a/openmp/libomptarget/test/CMakeLists.txt b/openmp/libomptarget/test/CMakeLists.txt index aa3fffcfe60ef..de6a6308740f3 100644 --- a/openmp/libomptarget/test/CMakeLists.txt +++ b/openmp/libomptarget/test/CMakeLists.txt @@ -1,19 +1,19 @@ -# CMakeLists.txt file for unit testing OpenMP offloading runtime library. 
-if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang" OR - OPENMP_TEST_COMPILER_VERSION VERSION_LESS 6.0.0) - libomptarget_say("Can only test with Clang compiler in version 6.0.0 or later.") - libomptarget_warning_say("The check-libomptarget target will not be available!") - return() -endif() - -if(LIBOMPTARGET_ENABLE_DEBUG) - set(LIBOMPTARGET_DEBUG True) -else() - set(LIBOMPTARGET_DEBUG False) -endif() - -add_openmp_testsuite(check-libomptarget "Running libomptarget tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS omptarget omp ${LIBOMPTARGET_TESTED_PLUGINS}) - -# Configure the lit.site.cfg.in file -set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!") -configure_file(lit.site.cfg.in lit.site.cfg @ONLY) +# CMakeLists.txt file for unit testing OpenMP offloading runtime library. +if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang" OR + OPENMP_TEST_COMPILER_VERSION VERSION_LESS 6.0.0) + libomptarget_say("Can only test with Clang compiler in version 6.0.0 or later.") + libomptarget_warning_say("The check-libomptarget target will not be available!") + return() +endif() + +if(LIBOMPTARGET_ENABLE_DEBUG) + set(LIBOMPTARGET_DEBUG True) +else() + set(LIBOMPTARGET_DEBUG False) +endif() + +add_openmp_testsuite(check-libomptarget "Running libomptarget tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS omptarget omp ${LIBOMPTARGET_TESTED_PLUGINS}) + +# Configure the lit.site.cfg.in file +set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!") +configure_file(lit.site.cfg.in lit.site.cfg @ONLY) diff --git a/openmp/libomptarget/test/api/omp_get_num_devices.c b/openmp/libomptarget/test/api/omp_get_num_devices.c index d0e84db6b1081..b121847151226 100644 --- a/openmp/libomptarget/test/api/omp_get_num_devices.c +++ b/openmp/libomptarget/test/api/omp_get_num_devices.c @@ -1,36 +1,36 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -int test_omp_get_num_devices() -{ - /* checks that omp_get_num_devices() > 0 */ - int num_devices = omp_get_num_devices(); - printf("num_devices = %d\n", num_devices); - - #pragma omp target - {} - - return (num_devices > 0); -} - -int main() -{ - int i; - int failed=0; - - if (!test_omp_get_num_devices()) { - failed++; - } - if (failed) - printf("FAIL\n"); - else - printf("PASS\n"); - return failed; -} - -// CHECK: PASS +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +int test_omp_get_num_devices() +{ + /* checks that omp_get_num_devices() > 0 */ + int num_devices = omp_get_num_devices(); + printf("num_devices = %d\n", num_devices); + + #pragma omp target + {} + + return (num_devices > 0); +} + +int main() +{ + int i; + int failed=0; + + if (!test_omp_get_num_devices()) { + failed++; + } + if (failed) + printf("FAIL\n"); + else + printf("PASS\n"); + return failed; +} + +// CHECK: PASS diff --git a/openmp/libomptarget/test/api/omp_get_num_devices_with_empty_target.c b/openmp/libomptarget/test/api/omp_get_num_devices_with_empty_target.c index 85dcb73f11490..fb4d7036c417c 100644 --- 
a/openmp/libomptarget/test/api/omp_get_num_devices_with_empty_target.c +++ b/openmp/libomptarget/test/api/omp_get_num_devices_with_empty_target.c @@ -1,30 +1,30 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -static int test_omp_get_num_devices_with_empty_target() { - /* checks that omp_get_num_devices() > 0 */ - return omp_get_num_devices() > 0; -} - -int main() { - int failed = 0; - - if (!test_omp_get_num_devices_with_empty_target()) { - ++failed; - } - - if (failed) { - printf("FAIL\n"); - } else { - printf("PASS\n"); - } - - return failed; -} - -// CHECK: PASS +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +static int test_omp_get_num_devices_with_empty_target() { + /* checks that omp_get_num_devices() > 0 */ + return omp_get_num_devices() > 0; +} + +int main() { + int failed = 0; + + if (!test_omp_get_num_devices_with_empty_target()) { + ++failed; + } + + if (failed) { + printf("FAIL\n"); + } else { + printf("PASS\n"); + } + + return failed; +} + +// CHECK: PASS diff --git a/openmp/libomptarget/test/env/omp_target_debug.c b/openmp/libomptarget/test/env/omp_target_debug.c index ce84c9842f64f..4ad503f258e2c 100644 --- a/openmp/libomptarget/test/env/omp_target_debug.c +++ b/openmp/libomptarget/test/env/omp_target_debug.c @@ -1,20 +1,20 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=NDEBUG -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=NDEBUG -// REQUIRES: libomptarget-debug - -int main(void) { -#pragma omp target - {} - return 0; -} - -// DEBUG: Libomptarget -// NDEBUG-NOT: Libomptarget -// NDEBUG-NOT: Target - +// RUN: 
%libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=NDEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=NDEBUG +// REQUIRES: libomptarget-debug + +int main(void) { +#pragma omp target + {} + return 0; +} + +// DEBUG: Libomptarget +// NDEBUG-NOT: Libomptarget +// NDEBUG-NOT: Target + diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index 43116055c82b0..d6ba85080d963 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -1,142 +1,142 @@ -# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: -# Configuration file for the 'lit' test runner. - -import os -import lit.formats - -# Tell pylint that we know config and lit_config exist somewhere. -if 'PYLINT_IMPORT' in os.environ: - config = object() - lit_config = object() - -def append_dynamic_library_path(name, value, sep): - if name in config.environment: - config.environment[name] = value + sep + config.environment[name] - else: - config.environment[name] = value - -# name: The name of this test suite. -config.name = 'libomptarget' - -# suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.cc'] - -# test_source_root: The root path where tests are located. 
-config.test_source_root = os.path.dirname(__file__) - -# test_exec_root: The root object directory where output is placed -config.test_exec_root = config.libomptarget_obj_root - -# test format -config.test_format = lit.formats.ShTest() - -# compiler flags -config.test_flags = " -I " + config.test_source_root + \ - " -I " + config.omp_header_directory + \ - " -L " + config.library_dir; - -if config.omp_host_rtl_directory: - config.test_flags = config.test_flags + " -L " + \ - config.omp_host_rtl_directory - -config.test_flags = config.test_flags + " " + config.test_extra_flags - -# Allow REQUIRES / UNSUPPORTED / XFAIL to work -config.target_triple = [ ] -for feature in config.test_compiler_features: - config.available_features.add(feature) - -if config.libomptarget_debug: - config.available_features.add('libomptarget-debug') - -# Setup environment to find dynamic library at runtime -if config.operating_system == 'Windows': - append_dynamic_library_path('PATH', config.library_dir, ";") - append_dynamic_library_path('PATH', config.omp_host_rtl_directory, ";") -elif config.operating_system == 'Darwin': - append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":") - append_dynamic_library_path('DYLD_LIBRARY_PATH', \ - config.omp_host_rtl_directory, ";") - config.test_flags += " -Wl,-rpath," + config.library_dir - config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory -else: # Unices - append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":") - append_dynamic_library_path('LD_LIBRARY_PATH', \ - config.omp_host_rtl_directory, ":") - -# substitutions -# - for targets that exist in the system create the actual command. -# - for valid targets that do not exist in the system, return false, so that the -# same test can be used for different targets. - -# Scan all the valid targets. -for libomptarget_target in config.libomptarget_all_targets: - # Is this target in the current system? If so create a compile, run and test - # command. Otherwise create command that return false. 
- if libomptarget_target in config.libomptarget_system_targets: - config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ - libomptarget_target, \ - "%libomptarget-compilexx-and-run-" + libomptarget_target + \ - " | " + config.libomptarget_filecheck + " %s")) - config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ - libomptarget_target, \ - "%libomptarget-compile-and-run-" + libomptarget_target + \ - " | " + config.libomptarget_filecheck + " %s")) - config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ - libomptarget_target, \ - "%libomptarget-compilexx-" + libomptarget_target + " && " + \ - "%libomptarget-run-" + libomptarget_target)) - config.substitutions.append(("%libomptarget-compile-and-run-" + \ - libomptarget_target, \ - "%libomptarget-compile-" + libomptarget_target + " && " + \ - "%libomptarget-run-" + libomptarget_target)) - config.substitutions.append(("%libomptarget-compilexx-" + \ - libomptarget_target, \ - "%clangxx-" + libomptarget_target + " %s -o %t-" + \ - libomptarget_target)) - config.substitutions.append(("%libomptarget-compile-" + \ - libomptarget_target, \ - "%clang-" + libomptarget_target + " %s -o %t-" + \ - libomptarget_target)) - config.substitutions.append(("%libomptarget-run-" + \ - libomptarget_target, \ - "%t-" + libomptarget_target)) - config.substitutions.append(("%clangxx-" + libomptarget_target, \ - "%clangxx %openmp_flags %flags -fopenmp-targets=" + libomptarget_target)) - config.substitutions.append(("%clang-" + libomptarget_target, \ - "%clang %openmp_flags %flags -fopenmp-targets=" + libomptarget_target)) - config.substitutions.append(("%fcheck-" + libomptarget_target, \ - config.libomptarget_filecheck + " %s")) - else: - config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-compile-and-run-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-compilexx-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-compile-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-run-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%clang-" + libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%clangxx-" + libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%fcheck-" + libomptarget_target, \ - "echo ignored-command")) - -config.substitutions.append(("%clangxx", config.test_cxx_compiler)) -config.substitutions.append(("%clang", config.test_c_compiler)) -config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) -config.substitutions.append(("%flags", config.test_flags)) +# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: +# Configuration file for the 'lit' test runner. + +import os +import lit.formats + +# Tell pylint that we know config and lit_config exist somewhere. 
+if 'PYLINT_IMPORT' in os.environ: + config = object() + lit_config = object() + +def append_dynamic_library_path(name, value, sep): + if name in config.environment: + config.environment[name] = value + sep + config.environment[name] + else: + config.environment[name] = value + +# name: The name of this test suite. +config.name = 'libomptarget' + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.c', '.cpp', '.cc'] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root object directory where output is placed +config.test_exec_root = config.libomptarget_obj_root + +# test format +config.test_format = lit.formats.ShTest() + +# compiler flags +config.test_flags = " -I " + config.test_source_root + \ + " -I " + config.omp_header_directory + \ + " -L " + config.library_dir; + +if config.omp_host_rtl_directory: + config.test_flags = config.test_flags + " -L " + \ + config.omp_host_rtl_directory + +config.test_flags = config.test_flags + " " + config.test_extra_flags + +# Allow REQUIRES / UNSUPPORTED / XFAIL to work +config.target_triple = [ ] +for feature in config.test_compiler_features: + config.available_features.add(feature) + +if config.libomptarget_debug: + config.available_features.add('libomptarget-debug') + +# Setup environment to find dynamic library at runtime +if config.operating_system == 'Windows': + append_dynamic_library_path('PATH', config.library_dir, ";") + append_dynamic_library_path('PATH', config.omp_host_rtl_directory, ";") +elif config.operating_system == 'Darwin': + append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":") + append_dynamic_library_path('DYLD_LIBRARY_PATH', \ + config.omp_host_rtl_directory, ";") + config.test_flags += " -Wl,-rpath," + config.library_dir + config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory +else: # Unices + append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":") + append_dynamic_library_path('LD_LIBRARY_PATH', \ + config.omp_host_rtl_directory, ":") + +# substitutions +# - for targets that exist in the system create the actual command. +# - for valid targets that do not exist in the system, return false, so that the +# same test can be used for different targets. + +# Scan all the valid targets. +for libomptarget_target in config.libomptarget_all_targets: + # Is this target in the current system? If so create a compile, run and test + # command. Otherwise create command that return false. 
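+    # e.g. %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu chains
+    # %libomptarget-compile-and-run-... into FileCheck against %s, which in
+    # turn expands to a %clang-... compile of %s followed by running the
+    # produced binary %t-....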
+ if libomptarget_target in config.libomptarget_system_targets: + config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ + libomptarget_target, \ + "%libomptarget-compilexx-and-run-" + libomptarget_target + \ + " | " + config.libomptarget_filecheck + " %s")) + config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ + libomptarget_target, \ + "%libomptarget-compile-and-run-" + libomptarget_target + \ + " | " + config.libomptarget_filecheck + " %s")) + config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ + libomptarget_target, \ + "%libomptarget-compilexx-" + libomptarget_target + " && " + \ + "%libomptarget-run-" + libomptarget_target)) + config.substitutions.append(("%libomptarget-compile-and-run-" + \ + libomptarget_target, \ + "%libomptarget-compile-" + libomptarget_target + " && " + \ + "%libomptarget-run-" + libomptarget_target)) + config.substitutions.append(("%libomptarget-compilexx-" + \ + libomptarget_target, \ + "%clangxx-" + libomptarget_target + " %s -o %t-" + \ + libomptarget_target)) + config.substitutions.append(("%libomptarget-compile-" + \ + libomptarget_target, \ + "%clang-" + libomptarget_target + " %s -o %t-" + \ + libomptarget_target)) + config.substitutions.append(("%libomptarget-run-" + \ + libomptarget_target, \ + "%t-" + libomptarget_target)) + config.substitutions.append(("%clangxx-" + libomptarget_target, \ + "%clangxx %openmp_flags %flags -fopenmp-targets=" + libomptarget_target)) + config.substitutions.append(("%clang-" + libomptarget_target, \ + "%clang %openmp_flags %flags -fopenmp-targets=" + libomptarget_target)) + config.substitutions.append(("%fcheck-" + libomptarget_target, \ + config.libomptarget_filecheck + " %s")) + else: + config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compile-and-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compile-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%clang-" + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%clangxx-" + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%fcheck-" + libomptarget_target, \ + "echo ignored-command")) + +config.substitutions.append(("%clangxx", config.test_cxx_compiler)) +config.substitutions.append(("%clang", config.test_c_compiler)) +config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) +config.substitutions.append(("%flags", config.test_flags)) diff --git a/openmp/libomptarget/test/lit.site.cfg.in b/openmp/libomptarget/test/lit.site.cfg.in index 26ef4920d91ee..c8aff49aa6a90 100644 --- a/openmp/libomptarget/test/lit.site.cfg.in +++ b/openmp/libomptarget/test/lit.site.cfg.in @@ -1,19 +1,19 @@ -@AUTO_GEN_COMMENT@ - -config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" -config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" -config.test_compiler_features = 
@OPENMP_TEST_COMPILER_FEATURES@ -config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@" -config.test_extra_flags = "@OPENMP_TEST_FLAGS@" -config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" -config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" -config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" -config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" -config.operating_system = "@CMAKE_SYSTEM_NAME@" -config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split() -config.libomptarget_system_targets = "@LIBOMPTARGET_SYSTEM_TARGETS@".split() -config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" -config.libomptarget_debug = @LIBOMPTARGET_DEBUG@ - -# Let the main config do the real work. -lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") +@AUTO_GEN_COMMENT@ + +config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" +config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" +config.test_compiler_features = @OPENMP_TEST_COMPILER_FEATURES@ +config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@" +config.test_extra_flags = "@OPENMP_TEST_FLAGS@" +config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" +config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" +config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" +config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" +config.operating_system = "@CMAKE_SYSTEM_NAME@" +config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split() +config.libomptarget_system_targets = "@LIBOMPTARGET_SYSTEM_TARGETS@".split() +config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" +config.libomptarget_debug = @LIBOMPTARGET_DEBUG@ + +# Let the main config do the real work. +lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/openmp/libomptarget/test/mapping/declare_mapper_api.cpp b/openmp/libomptarget/test/mapping/declare_mapper_api.cpp index 275b6c3c57025..9e4447ce2d4b8 100644 --- a/openmp/libomptarget/test/mapping/declare_mapper_api.cpp +++ b/openmp/libomptarget/test/mapping/declare_mapper_api.cpp @@ -1,47 +1,47 @@ -// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu - -#include -#include -#include - -// Data structure definitions copied from OpenMP RTL. -struct MapComponentInfoTy { - void *Base; - void *Begin; - int64_t Size; - int64_t Type; - MapComponentInfoTy() = default; - MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type) - : Base(Base), Begin(Begin), Size(Size), Type(Type) {} -}; - -struct MapperComponentsTy { - std::vector Components; -}; - -// OpenMP RTL interfaces -#ifdef __cplusplus -extern "C" { -#endif -int64_t __tgt_mapper_num_components(void *rt_mapper_handle); -void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, - void *begin, int64_t size, int64_t type); -#ifdef __cplusplus -} -#endif - -int main(int argc, char *argv[]) { - MapperComponentsTy MC; - void *base, *begin; - int64_t size, type; - // Push 2 elements into MC. 
- __tgt_push_mapper_component((void *)&MC, base, begin, size, type); - __tgt_push_mapper_component((void *)&MC, base, begin, size, type); - int64_t num = __tgt_mapper_num_components((void *)&MC); - // CHECK: num=2 - printf("num=%lld\n", num); - return 0; -} +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu + +#include +#include +#include + +// Data structure definitions copied from OpenMP RTL. +struct MapComponentInfoTy { + void *Base; + void *Begin; + int64_t Size; + int64_t Type; + MapComponentInfoTy() = default; + MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type) + : Base(Base), Begin(Begin), Size(Size), Type(Type) {} +}; + +struct MapperComponentsTy { + std::vector Components; +}; + +// OpenMP RTL interfaces +#ifdef __cplusplus +extern "C" { +#endif +int64_t __tgt_mapper_num_components(void *rt_mapper_handle); +void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, + void *begin, int64_t size, int64_t type); +#ifdef __cplusplus +} +#endif + +int main(int argc, char *argv[]) { + MapperComponentsTy MC; + void *base, *begin; + int64_t size, type; + // Push 2 elements into MC. + __tgt_push_mapper_component((void *)&MC, base, begin, size, type); + __tgt_push_mapper_component((void *)&MC, base, begin, size, type); + int64_t num = __tgt_mapper_num_components((void *)&MC); + // CHECK: num=2 + printf("num=%lld\n", num); + return 0; +} diff --git a/openmp/libomptarget/test/mapping/delete_inf_refcount.c b/openmp/libomptarget/test/mapping/delete_inf_refcount.c index b4106be04ab73..781ece71eb987 100644 --- a/openmp/libomptarget/test/mapping/delete_inf_refcount.c +++ b/openmp/libomptarget/test/mapping/delete_inf_refcount.c @@ -1,32 +1,32 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -#pragma omp declare target -int isHost; -#pragma omp end declare target - -int main(void) { - isHost = -1; - -#pragma omp target enter data map(to: isHost) - -#pragma omp target - { isHost = omp_is_initial_device(); } -#pragma omp target update from(isHost) - - if (isHost < 0) { - printf("Runtime error, isHost=%d\n", isHost); - } - -#pragma omp target exit data map(delete: isHost) - - // CHECK: Target region executed on the device - printf("Target region executed on the %s\n", isHost ? "host" : "device"); - - return isHost; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +#pragma omp declare target +int isHost; +#pragma omp end declare target + +int main(void) { + isHost = -1; + +#pragma omp target enter data map(to: isHost) + +#pragma omp target + { isHost = omp_is_initial_device(); } +#pragma omp target update from(isHost) + + if (isHost < 0) { + printf("Runtime error, isHost=%d\n", isHost); + } + +#pragma omp target exit data map(delete: isHost) + + // CHECK: Target region executed on the device + printf("Target region executed on the %s\n", isHost ? 
"host" : "device"); + + return isHost; +} diff --git a/openmp/libomptarget/test/mapping/pr38704.c b/openmp/libomptarget/test/mapping/pr38704.c index 3e7135e284114..fcb4afee9530e 100644 --- a/openmp/libomptarget/test/mapping/pr38704.c +++ b/openmp/libomptarget/test/mapping/pr38704.c @@ -1,47 +1,47 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -// Clang 6.0 doesn't use the new map interface, undefined behavior when -// the compiler emits "old" interface code for structures. -// UNSUPPORTED: clang-6 - -#include -#include - -typedef struct { - int *ptr1; - int *ptr2; -} StructWithPtrs; - -int main(int argc, char *argv[]) { - StructWithPtrs s, s2; - s.ptr1 = malloc(sizeof(int)); - s.ptr2 = malloc(2 * sizeof(int)); - s2.ptr1 = malloc(sizeof(int)); - s2.ptr2 = malloc(2 * sizeof(int)); - -#pragma omp target enter data map(to: s2.ptr2[0:1]) -#pragma omp target map(s.ptr1[0:1], s.ptr2[0:2]) - { - s.ptr1[0] = 1; - s.ptr2[0] = 2; - s.ptr2[1] = 3; - } -#pragma omp target exit data map(from: s2.ptr1[0:1], s2.ptr2[0:1]) - - // CHECK: s.ptr1[0] = 1 - // CHECK: s.ptr2[0] = 2 - // CHECK: s.ptr2[1] = 3 - printf("s.ptr1[0] = %d\n", s.ptr1[0]); - printf("s.ptr2[0] = %d\n", s.ptr2[0]); - printf("s.ptr2[1] = %d\n", s.ptr2[1]); - - free(s.ptr1); - free(s.ptr2); - free(s2.ptr1); - free(s2.ptr2); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +// Clang 6.0 doesn't use the new map interface, undefined behavior when +// the compiler emits "old" interface code for structures. 
+// UNSUPPORTED: clang-6 + +#include +#include + +typedef struct { + int *ptr1; + int *ptr2; +} StructWithPtrs; + +int main(int argc, char *argv[]) { + StructWithPtrs s, s2; + s.ptr1 = malloc(sizeof(int)); + s.ptr2 = malloc(2 * sizeof(int)); + s2.ptr1 = malloc(sizeof(int)); + s2.ptr2 = malloc(2 * sizeof(int)); + +#pragma omp target enter data map(to: s2.ptr2[0:1]) +#pragma omp target map(s.ptr1[0:1], s.ptr2[0:2]) + { + s.ptr1[0] = 1; + s.ptr2[0] = 2; + s.ptr2[1] = 3; + } +#pragma omp target exit data map(from: s2.ptr1[0:1], s2.ptr2[0:1]) + + // CHECK: s.ptr1[0] = 1 + // CHECK: s.ptr2[0] = 2 + // CHECK: s.ptr2[1] = 3 + printf("s.ptr1[0] = %d\n", s.ptr1[0]); + printf("s.ptr2[0] = %d\n", s.ptr2[0]); + printf("s.ptr2[1] = %d\n", s.ptr2[1]); + + free(s.ptr1); + free(s.ptr2); + free(s2.ptr1); + free(s2.ptr2); + + return 0; +} diff --git a/openmp/libomptarget/test/offloading/dynamic_module.c b/openmp/libomptarget/test/offloading/dynamic_module.c index 7f062b6d752c0..ae58ec2c9d07c 100644 --- a/openmp/libomptarget/test/offloading/dynamic_module.c +++ b/openmp/libomptarget/test/offloading/dynamic_module.c @@ -1,17 +1,17 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-aarch64-unknown-linux-gnu %t.so && %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-powerpc64-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-powerpc64le-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-x86_64-pc-linux-gnu %t.so && %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu - -#ifdef SHARED -void foo() {} -#else -#include -int main() { -#pragma omp target - ; - // CHECK: DONE. - printf("%s\n", "DONE."); - return 0; -} -#endif +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-aarch64-unknown-linux-gnu %t.so && %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-powerpc64-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-powerpc64le-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-x86_64-pc-linux-gnu %t.so && %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu + +#ifdef SHARED +void foo() {} +#else +#include +int main() { +#pragma omp target + ; + // CHECK: DONE. 
+ printf("%s\n", "DONE."); + return 0; +} +#endif diff --git a/openmp/libomptarget/test/offloading/dynamic_module_load.c b/openmp/libomptarget/test/offloading/dynamic_module_load.c index fe917e4fe1cfb..8c61464929963 100644 --- a/openmp/libomptarget/test/offloading/dynamic_module_load.c +++ b/openmp/libomptarget/test/offloading/dynamic_module_load.c @@ -1,34 +1,34 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-aarch64-unknown-linux-gnu -ldl && %libomptarget-run-aarch64-unknown-linux-gnu %t.so 2>&1 | %fcheck-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-powerpc64-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-powerpc64le-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64le-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-x86_64-pc-linux-gnu -ldl && %libomptarget-run-x86_64-pc-linux-gnu %t.so 2>&1 | %fcheck-x86_64-pc-linux-gnu - -#ifdef SHARED -#include -int foo() { -#pragma omp target - ; - printf("%s\n", "DONE."); - return 0; -} -#else -#include -#include -int main(int argc, char **argv) { - void *Handle = dlopen(argv[1], RTLD_NOW); - int (*Foo)(void); - - if (Handle == NULL) { - printf("dlopen() failed: %s\n", dlerror()); - return 1; - } - Foo = (int (*)(void)) dlsym(Handle, "foo"); - if (Handle == NULL) { - printf("dlsym() failed: %s\n", dlerror()); - return 1; - } - // CHECK: DONE. - // CHECK-NOT: {{abort|fault}} - return Foo(); -} -#endif +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-aarch64-unknown-linux-gnu -ldl && %libomptarget-run-aarch64-unknown-linux-gnu %t.so 2>&1 | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-powerpc64-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-powerpc64le-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64le-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-x86_64-pc-linux-gnu -ldl && %libomptarget-run-x86_64-pc-linux-gnu %t.so 2>&1 | %fcheck-x86_64-pc-linux-gnu + +#ifdef SHARED +#include +int foo() { +#pragma omp target + ; + printf("%s\n", "DONE."); + return 0; +} +#else +#include +#include +int main(int argc, char **argv) { + void *Handle = dlopen(argv[1], RTLD_NOW); + int (*Foo)(void); + + if (Handle == NULL) { + printf("dlopen() failed: %s\n", dlerror()); + return 1; + } + Foo = (int (*)(void)) dlsym(Handle, "foo"); + if (Handle == NULL) { + printf("dlsym() failed: %s\n", dlerror()); + return 1; + } + // CHECK: DONE. 
+ // CHECK-NOT: {{abort|fault}} + return Foo(); +} +#endif diff --git a/openmp/libomptarget/test/offloading/looptripcnt.c b/openmp/libomptarget/test/offloading/looptripcnt.c index 025231b0c6d32..855f47468c3e3 100644 --- a/openmp/libomptarget/test/offloading/looptripcnt.c +++ b/openmp/libomptarget/test/offloading/looptripcnt.c @@ -1,36 +1,36 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG -// REQUIRES: libomptarget-debug - -/* - Test for looptripcount being popped from runtime stack. -*/ -#include -#include -int main() -{ - int N = 128; - int NN = 1024; - int num_teams[NN]; - int num_threads[NN]; - - printf("#pragma omp target teams distribute parallel for thread_limit(4)\n"); -#pragma omp target teams distribute parallel for thread_limit(4) - for (int j = 0; j< N; j++) { - num_threads[j] = omp_get_num_threads(); - num_teams[j] = omp_get_num_teams(); - } - printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]); -// DEBUG: loop trip count is 128 - printf("#pragma omp target teams distribute parallel for\n"); -#pragma omp target teams distribute parallel for - for (int j = 0; j< N; j++) { - num_threads[j] = omp_get_num_threads(); - num_teams[j] = omp_get_num_teams(); - } - printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]); -// DEBUG: loop trip count is 128 - return 0; -} +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// REQUIRES: libomptarget-debug + +/* + Test for looptripcount being popped from runtime stack. 
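+  Both target regions below have a compile-time trip count of 128, and the
+  DEBUG run lines expect "loop trip count is 128" to be reported once per
+  region, i.e. each pushed trip count is consumed by its own region rather
+  than leaking into the next one.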
+*/
+#include <stdio.h>
+#include <omp.h>
+int main()
+{
+  int N = 128;
+  int NN = 1024;
+  int num_teams[NN];
+  int num_threads[NN];
+
+  printf("#pragma omp target teams distribute parallel for thread_limit(4)\n");
+#pragma omp target teams distribute parallel for thread_limit(4)
+  for (int j = 0; j < N; j++) {
+    num_threads[j] = omp_get_num_threads();
+    num_teams[j] = omp_get_num_teams();
+  }
+  printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]);
+// DEBUG: loop trip count is 128
+  printf("#pragma omp target teams distribute parallel for\n");
+#pragma omp target teams distribute parallel for
+  for (int j = 0; j < N; j++) {
+    num_threads[j] = omp_get_num_threads();
+    num_teams[j] = omp_get_num_teams();
+  }
+  printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]);
+// DEBUG: loop trip count is 128
+  return 0;
+}
diff --git a/openmp/libomptarget/test/offloading/offloading_ext_success.c b/openmp/libomptarget/test/offloading/offloading_ext_success.c
new file mode 100644
index 0000000000000..86e294feba5ad
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/offloading_ext_success.c
@@ -0,0 +1,60 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define N 128
+
+int main(void) {
+  int num_d = omp_get_num_devices();
+  int h = omp_get_initial_device();
+  double host_buf[N];
+  double *mem_dev_src;
+  double *mem_dev_dst;
+  int rc = 0;
+
+  if (num_d < 1) {
+    printf("no device in system\n");
+  } else if (num_d == 1) {
+    printf("only one device in system\n");
+  } else {
+    // Devices are numbered 0 .. num_d-1; use the first two for the
+    // device-to-device copy.
+    int src_dev = 0;
+    int dst_dev = 1;
+
+    // Memory allocation on both devices.
+    mem_dev_src = (double *)omp_target_alloc(sizeof(double) * N, src_dev);
+    if (mem_dev_src == NULL) {
+      printf("mem allocation on src device failed\n");
+      return -1;
+    }
+    mem_dev_dst = (double *)omp_target_alloc(sizeof(double) * N, dst_dev);
+    if (mem_dev_dst == NULL) {
+      printf("mem allocation on dst device failed\n");
+      return -1;
+    }
+
+    // Device memory cannot be dereferenced on the host, so initialize a
+    // host buffer and copy it to the source device first.
+    for (int i = 0; i < N; i++)
+      host_buf[i] = (double)rand();
+    rc = omp_target_memcpy(mem_dev_src, host_buf, sizeof(double) * N, 0, 0,
+                           src_dev, h);
+
+    // Device-to-device copy of the whole buffer.
+    if (rc == 0)
+      rc = omp_target_memcpy(mem_dev_dst, mem_dev_src, sizeof(double) * N, 0,
+                             0, dst_dev, src_dev);
+
+    omp_target_free(mem_dev_src, src_dev);
+    omp_target_free(mem_dev_dst, dst_dev);
+  }
+
+  // CHECK: PASS
+  printf("%s\n", rc == 0 ? "PASS" : "FAIL");
+  return rc;
+}
diff --git a/openmp/libomptarget/test/offloading/offloading_success.c b/openmp/libomptarget/test/offloading/offloading_success.c
index 12e78fac1f5a3..e5e108ea5d84f 100644
--- a/openmp/libomptarget/test/offloading/offloading_success.c
+++ b/openmp/libomptarget/test/offloading/offloading_success.c
@@ -1,23 +1,23 @@
-// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
-
-#include
-#include
-
-int main(void) {
-  int isHost = -1;
-
-#pragma omp target map(from: isHost)
-  { isHost = omp_is_initial_device(); }
-
-  if (isHost < 0) {
-    printf("Runtime error, isHost=%d\n", isHost);
-  }
-
-  // CHECK: Target region executed on the device
-  printf("Target region executed on the %s\n", isHost ?
"host" : "device"); - - return isHost; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +int main(void) { + int isHost = -1; + +#pragma omp target map(from: isHost) + { isHost = omp_is_initial_device(); } + + if (isHost < 0) { + printf("Runtime error, isHost=%d\n", isHost); + } + + // CHECK: Target region executed on the device + printf("Target region executed on the %s\n", isHost ? "host" : "device"); + + return isHost; +} diff --git a/openmp/libomptarget/test/offloading/offloading_success.cpp b/openmp/libomptarget/test/offloading/offloading_success.cpp index eecd97a3f317d..1b84fa86e93b4 100644 --- a/openmp/libomptarget/test/offloading/offloading_success.cpp +++ b/openmp/libomptarget/test/offloading/offloading_success.cpp @@ -1,23 +1,23 @@ -// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -int main(void) { - int isHost = 0; - -#pragma omp target map(from: isHost) - { isHost = omp_is_initial_device(); } - - if (isHost < 0) { - printf("Runtime error, isHost=%d\n", isHost); - } - - // CHECK: Target region executed on the device - printf("Target region executed on the %s\n", isHost ? "host" : "device"); - - return isHost; -} +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +int main(void) { + int isHost = 0; + +#pragma omp target map(from: isHost) + { isHost = omp_is_initial_device(); } + + if (isHost < 0) { + printf("Runtime error, isHost=%d\n", isHost); + } + + // CHECK: Target region executed on the device + printf("Target region executed on the %s\n", isHost ? 
"host" : "device"); + + return isHost; +} diff --git a/openmp/libomptarget/test/offloading/parallel_offloading_map.c b/openmp/libomptarget/test/offloading/parallel_offloading_map.c index 3bd59574747d5..c4a766b95defa 100644 --- a/openmp/libomptarget/test/offloading/parallel_offloading_map.c +++ b/openmp/libomptarget/test/offloading/parallel_offloading_map.c @@ -1,41 +1,41 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-aarch64-unknown-linux-gnu | %fcheck-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64-ibm-linux-gnu | %fcheck-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64le-ibm-linux-gnu | %fcheck-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-x86_64-pc-linux-gnu | %fcheck-x86_64-pc-linux-gnu -allow-empty -#include -#include - -int main(int argc, char *argv[]) { - const int num_threads = 64, N = 128; - int array[num_threads] = {0}; - -#pragma omp parallel for - for (int i = 0; i < num_threads; ++i) { - int tmp[N]; - - for (int j = 0; j < N; ++j) { - tmp[j] = i; - } - -#pragma omp target teams distribute parallel for map(tofrom : tmp) - for (int j = 0; j < N; ++j) { - tmp[j] += j; - } - - for (int j = 0; j < N; ++j) { - array[i] += tmp[j]; - } - } - - // Verify - for (int i = 0; i < num_threads; ++i) { - const int ref = (0 + N - 1) * N / 2 + i * N; - assert(array[i] == ref); - } - - printf("PASS\n"); - - return 0; -} - -// CHECK: PASS +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-aarch64-unknown-linux-gnu | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64-ibm-linux-gnu | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64le-ibm-linux-gnu | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-x86_64-pc-linux-gnu | %fcheck-x86_64-pc-linux-gnu -allow-empty +#include +#include + +int main(int argc, char *argv[]) { + const int num_threads = 64, N = 128; + int array[num_threads] = {0}; + +#pragma omp parallel for + for (int i = 0; i < num_threads; ++i) { + int tmp[N]; + + for (int j = 0; j < N; ++j) { + tmp[j] = i; + } + +#pragma omp target teams distribute parallel for map(tofrom : tmp) + for (int j = 0; j < N; ++j) { + tmp[j] += j; + } + + for (int j = 0; j < N; ++j) { + array[i] += tmp[j]; + } + } + + // Verify + for (int i = 0; i < num_threads; ++i) { + const int ref = (0 + N - 1) * N / 2 + i * N; + assert(array[i] == ref); + } + + printf("PASS\n"); + + return 0; +} + +// CHECK: PASS diff --git a/openmp/libomptarget/test/offloading/requires.c b/openmp/libomptarget/test/offloading/requires.c index 079ce5cb9348c..6ebf22db97ecb 100644 --- a/openmp/libomptarget/test/offloading/requires.c +++ b/openmp/libomptarget/test/offloading/requires.c @@ -1,46 +1,46 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 
%libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG -// REQUIRES: libomptarget-debug - -/* - Test for the 'requires' clause check. - When a target region is used, the requires flags are set in the - runtime for the entire compilation unit. If the flags are set again, - (for whatever reason) the set must be consistent with previously - set values. -*/ -#include -#include - -// --------------------------------------------------------------------------- -// Various definitions copied from OpenMP RTL - -extern void __tgt_register_requires(int64_t); - -// End of definitions copied from OpenMP RTL. -// --------------------------------------------------------------------------- - -void run_reg_requires() { - // Before the target region is registered, the requires registers the status - // of the requires clauses. Since there are no requires clauses in this file - // the flags state can only be OMP_REQ_NONE i.e. 1. - - // This is the 2nd time this function is called so it should print the debug - // info belonging to the check. - __tgt_register_requires(1); - __tgt_register_requires(1); - // DEBUG: New requires flags 1 compatible with existing 1! -} - -// --------------------------------------------------------------------------- -int main() { - run_reg_requires(); - -// This also runs reg requires for the first time. -#pragma omp target - {} - - return 0; -} +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// REQUIRES: libomptarget-debug + +/* + Test for the 'requires' clause check. + When a target region is used, the requires flags are set in the + runtime for the entire compilation unit. If the flags are set again, + (for whatever reason) the set must be consistent with previously + set values. +*/ +#include +#include + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL + +extern void __tgt_register_requires(int64_t); + +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- + +void run_reg_requires() { + // Before the target region is registered, the requires registers the status + // of the requires clauses. Since there are no requires clauses in this file + // the flags state can only be OMP_REQ_NONE i.e. 1. 
+ + // This is the 2nd time this function is called so it should print the debug + // info belonging to the check. + __tgt_register_requires(1); + __tgt_register_requires(1); + // DEBUG: New requires flags 1 compatible with existing 1! +} + +// --------------------------------------------------------------------------- +int main() { + run_reg_requires(); + +// This also runs reg requires for the first time. +#pragma omp target + {} + + return 0; +} diff --git a/openmp/libomptarget/test/offloading/target_depend_nowait.cpp b/openmp/libomptarget/test/offloading/target_depend_nowait.cpp index 2c1c7e7191882..636d076c59815 100644 --- a/openmp/libomptarget/test/offloading/target_depend_nowait.cpp +++ b/openmp/libomptarget/test/offloading/target_depend_nowait.cpp @@ -1,62 +1,62 @@ -// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -#define N 1024 - -int A[N]; -int B[N]; -int C[N]; -int main() { - for (int i = 0; i < N; i++) - A[i] = B[i] = i; - -#pragma omp parallel num_threads(2) - { - if (omp_get_thread_num() == 1) { -// map data A & B and move to -#pragma omp target enter data map(to : A, B) depend(out : A[0]) nowait - -// no data move since already mapped -#pragma omp target map(A, B) depend(out : A[0]) nowait - { - for (int i = 0; i < N; i++) - ++A[i]; - for (int i = 0; i < N; i++) - ++B[i]; - } - -// no data move since already mapped -#pragma omp target teams num_teams(1) map(A, B) depend(out : A[0]) nowait - { - for (int i = 0; i < N; i++) - ++A[i]; - for (int i = 0; i < N; i++) - ++B[i]; - } - -// A updated via update -#pragma omp target update from(A) depend(out : A[0]) nowait - -// B updated via exit, A just released -#pragma omp target exit data map(release \ - : A) map(from \ - : B) depend(out \ - : A[0]) nowait - } // if - } // parallel - - int Sum = 0; - for (int i = 0; i < N; i++) - Sum += A[i] + B[i]; - // Sum is 2 * N * (2 + N - 1 + 2) / 2 - // CHECK: Sum = 1051648. 
- printf("Sum = %d.\n", Sum); - - return Sum != 2 * N * (2 + N - 1 + 2) / 2; -} - +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +#define N 1024 + +int A[N]; +int B[N]; +int C[N]; +int main() { + for (int i = 0; i < N; i++) + A[i] = B[i] = i; + +#pragma omp parallel num_threads(2) + { + if (omp_get_thread_num() == 1) { +// map data A & B and move to +#pragma omp target enter data map(to : A, B) depend(out : A[0]) nowait + +// no data move since already mapped +#pragma omp target map(A, B) depend(out : A[0]) nowait + { + for (int i = 0; i < N; i++) + ++A[i]; + for (int i = 0; i < N; i++) + ++B[i]; + } + +// no data move since already mapped +#pragma omp target teams num_teams(1) map(A, B) depend(out : A[0]) nowait + { + for (int i = 0; i < N; i++) + ++A[i]; + for (int i = 0; i < N; i++) + ++B[i]; + } + +// A updated via update +#pragma omp target update from(A) depend(out : A[0]) nowait + +// B updated via exit, A just released +#pragma omp target exit data map(release \ + : A) map(from \ + : B) depend(out \ + : A[0]) nowait + } // if + } // parallel + + int Sum = 0; + for (int i = 0; i < N; i++) + Sum += A[i] + B[i]; + // Sum is 2 * N * (2 + N - 1 + 2) / 2 + // CHECK: Sum = 1051648. + printf("Sum = %d.\n", Sum); + + return Sum != 2 * N * (2 + N - 1 + 2) / 2; +} + diff --git a/openmp/libomptarget/test/unified_shared_memory/api.c b/openmp/libomptarget/test/unified_shared_memory/api.c index b0a71ad358017..4a6af5eb4b903 100644 --- a/openmp/libomptarget/test/unified_shared_memory/api.c +++ b/openmp/libomptarget/test/unified_shared_memory/api.c @@ -1,164 +1,164 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -// --------------------------------------------------------------------------- -// Various definitions copied from OpenMP RTL - -extern void __tgt_register_requires(int64_t); - -// End of definitions copied from OpenMP RTL. -// --------------------------------------------------------------------------- - -#pragma omp requires unified_shared_memory - -#define N 1024 - -void init(int A[], int B[], int C[]) { - for (int i = 0; i < N; ++i) { - A[i] = 0; - B[i] = 1; - C[i] = i; - } -} - -int main(int argc, char *argv[]) { - const int device = omp_get_default_device(); - - // Manual registration of requires flags for Clang versions - // that do not support requires. - __tgt_register_requires(8); - - // CHECK: Initial device: -10 - printf("Initial device: %d\n", omp_get_initial_device()); - - // - // Target alloc & target memcpy - // - int A[N], B[N], C[N]; - - // Init - init(A, B, C); - - int *pA, *pB, *pC; - - // map ptrs - pA = &A[0]; - pB = &B[0]; - pC = &C[0]; - - int *d_A = (int *)omp_target_alloc(N * sizeof(int), device); - int *d_B = (int *)omp_target_alloc(N * sizeof(int), device); - int *d_C = (int *)omp_target_alloc(N * sizeof(int), device); - - // CHECK: omp_target_alloc succeeded - printf("omp_target_alloc %s\n", d_A && d_B && d_C ? 
"succeeded" : "failed"); - - omp_target_memcpy(d_B, pB, N * sizeof(int), 0, 0, device, - omp_get_initial_device()); - omp_target_memcpy(d_C, pC, N * sizeof(int), 0, 0, device, - omp_get_initial_device()); - -#pragma omp target is_device_ptr(d_A, d_B, d_C) device(device) - { -#pragma omp parallel for schedule(static, 1) - for (int i = 0; i < N; i++) { - d_A[i] = d_B[i] + d_C[i] + 1; - } - } - - omp_target_memcpy(pA, d_A, N * sizeof(int), 0, 0, omp_get_initial_device(), - device); - - // CHECK: Test omp_target_memcpy: Succeeded - int fail = 0; - for (int i = 0; i < N; ++i) { - if (A[i] != i + 2) - fail++; - } - if (fail) { - printf("Test omp_target_memcpy: Failed\n"); - } else { - printf("Test omp_target_memcpy: Succeeded\n"); - } - - // - // target_is_present and target_associate/disassociate_ptr - // - init(A, B, C); - - // CHECK: B is not present, associating it... - // CHECK: omp_target_associate_ptr B succeeded - if (!omp_target_is_present(B, device)) { - printf("B is not present, associating it...\n"); - int rc = omp_target_associate_ptr(B, d_B, N * sizeof(int), 0, device); - printf("omp_target_associate_ptr B %s\n", !rc ? "succeeded" : "failed"); - } - - // CHECK: C is not present, associating it... - // CHECK: omp_target_associate_ptr C succeeded - if (!omp_target_is_present(C, device)) { - printf("C is not present, associating it...\n"); - int rc = omp_target_associate_ptr(C, d_C, N * sizeof(int), 0, device); - printf("omp_target_associate_ptr C %s\n", !rc ? "succeeded" : "failed"); - } - -// CHECK: Inside target data: A is not present -// CHECK: Inside target data: B is present -// CHECK: Inside target data: C is present -#pragma omp target data map(from : B, C) device(device) - { - printf("Inside target data: A is%s present\n", - omp_target_is_present(A, device) ? "" : " not"); - printf("Inside target data: B is%s present\n", - omp_target_is_present(B, device) ? "" : " not"); - printf("Inside target data: C is%s present\n", - omp_target_is_present(C, device) ? "" : " not"); - -#pragma omp target map(from : A) device(device) - { -#pragma omp parallel for schedule(static, 1) - for (int i = 0; i < N; i++) - A[i] = B[i] + C[i] + 1; - } - } - - // CHECK: B is present, disassociating it... - // CHECK: omp_target_disassociate_ptr B succeeded - // CHECK: C is present, disassociating it... - // CHECK: omp_target_disassociate_ptr C succeeded - if (omp_target_is_present(B, device)) { - printf("B is present, disassociating it...\n"); - int rc = omp_target_disassociate_ptr(B, device); - printf("omp_target_disassociate_ptr B %s\n", !rc ? "succeeded" : "failed"); - } - if (omp_target_is_present(C, device)) { - printf("C is present, disassociating it...\n"); - int rc = omp_target_disassociate_ptr(C, device); - printf("omp_target_disassociate_ptr C %s\n", !rc ? 
"succeeded" : "failed"); - } - - // CHECK: Test omp_target_associate_ptr: Succeeded - fail = 0; - for (int i = 0; i < N; ++i) { - if (A[i] != i + 2) - fail++; - } - if (fail) { - printf("Test omp_target_associate_ptr: Failed\n"); - } else { - printf("Test omp_target_associate_ptr: Succeeded\n"); - } - - omp_target_free(d_A, device); - omp_target_free(d_B, device); - omp_target_free(d_C, device); - - printf("Done!\n"); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL + +extern void __tgt_register_requires(int64_t); + +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- + +#pragma omp requires unified_shared_memory + +#define N 1024 + +void init(int A[], int B[], int C[]) { + for (int i = 0; i < N; ++i) { + A[i] = 0; + B[i] = 1; + C[i] = i; + } +} + +int main(int argc, char *argv[]) { + const int device = omp_get_default_device(); + + // Manual registration of requires flags for Clang versions + // that do not support requires. + __tgt_register_requires(8); + + // CHECK: Initial device: -10 + printf("Initial device: %d\n", omp_get_initial_device()); + + // + // Target alloc & target memcpy + // + int A[N], B[N], C[N]; + + // Init + init(A, B, C); + + int *pA, *pB, *pC; + + // map ptrs + pA = &A[0]; + pB = &B[0]; + pC = &C[0]; + + int *d_A = (int *)omp_target_alloc(N * sizeof(int), device); + int *d_B = (int *)omp_target_alloc(N * sizeof(int), device); + int *d_C = (int *)omp_target_alloc(N * sizeof(int), device); + + // CHECK: omp_target_alloc succeeded + printf("omp_target_alloc %s\n", d_A && d_B && d_C ? "succeeded" : "failed"); + + omp_target_memcpy(d_B, pB, N * sizeof(int), 0, 0, device, + omp_get_initial_device()); + omp_target_memcpy(d_C, pC, N * sizeof(int), 0, 0, device, + omp_get_initial_device()); + +#pragma omp target is_device_ptr(d_A, d_B, d_C) device(device) + { +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < N; i++) { + d_A[i] = d_B[i] + d_C[i] + 1; + } + } + + omp_target_memcpy(pA, d_A, N * sizeof(int), 0, 0, omp_get_initial_device(), + device); + + // CHECK: Test omp_target_memcpy: Succeeded + int fail = 0; + for (int i = 0; i < N; ++i) { + if (A[i] != i + 2) + fail++; + } + if (fail) { + printf("Test omp_target_memcpy: Failed\n"); + } else { + printf("Test omp_target_memcpy: Succeeded\n"); + } + + // + // target_is_present and target_associate/disassociate_ptr + // + init(A, B, C); + + // CHECK: B is not present, associating it... + // CHECK: omp_target_associate_ptr B succeeded + if (!omp_target_is_present(B, device)) { + printf("B is not present, associating it...\n"); + int rc = omp_target_associate_ptr(B, d_B, N * sizeof(int), 0, device); + printf("omp_target_associate_ptr B %s\n", !rc ? "succeeded" : "failed"); + } + + // CHECK: C is not present, associating it... + // CHECK: omp_target_associate_ptr C succeeded + if (!omp_target_is_present(C, device)) { + printf("C is not present, associating it...\n"); + int rc = omp_target_associate_ptr(C, d_C, N * sizeof(int), 0, device); + printf("omp_target_associate_ptr C %s\n", !rc ? 
"succeeded" : "failed"); + } + +// CHECK: Inside target data: A is not present +// CHECK: Inside target data: B is present +// CHECK: Inside target data: C is present +#pragma omp target data map(from : B, C) device(device) + { + printf("Inside target data: A is%s present\n", + omp_target_is_present(A, device) ? "" : " not"); + printf("Inside target data: B is%s present\n", + omp_target_is_present(B, device) ? "" : " not"); + printf("Inside target data: C is%s present\n", + omp_target_is_present(C, device) ? "" : " not"); + +#pragma omp target map(from : A) device(device) + { +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < N; i++) + A[i] = B[i] + C[i] + 1; + } + } + + // CHECK: B is present, disassociating it... + // CHECK: omp_target_disassociate_ptr B succeeded + // CHECK: C is present, disassociating it... + // CHECK: omp_target_disassociate_ptr C succeeded + if (omp_target_is_present(B, device)) { + printf("B is present, disassociating it...\n"); + int rc = omp_target_disassociate_ptr(B, device); + printf("omp_target_disassociate_ptr B %s\n", !rc ? "succeeded" : "failed"); + } + if (omp_target_is_present(C, device)) { + printf("C is present, disassociating it...\n"); + int rc = omp_target_disassociate_ptr(C, device); + printf("omp_target_disassociate_ptr C %s\n", !rc ? "succeeded" : "failed"); + } + + // CHECK: Test omp_target_associate_ptr: Succeeded + fail = 0; + for (int i = 0; i < N; ++i) { + if (A[i] != i + 2) + fail++; + } + if (fail) { + printf("Test omp_target_associate_ptr: Failed\n"); + } else { + printf("Test omp_target_associate_ptr: Succeeded\n"); + } + + omp_target_free(d_A, device); + omp_target_free(d_B, device); + omp_target_free(d_C, device); + + printf("Done!\n"); + + return 0; +} diff --git a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c index 4cedbae36004b..39d185a7be751 100644 --- a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c @@ -1,95 +1,95 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 - -#include -#include - -#pragma omp requires unified_shared_memory - -#define N 1024 - -int main(int argc, char *argv[]) { - int fails; - void *host_alloc = 0, *device_alloc = 0; - int *a = (int *)malloc(N * sizeof(int)); - - // Init - for (int i = 0; i < N; ++i) { - a[i] = 10; - } - host_alloc = &a[0]; - - // - // map + target no close - // -#pragma omp target data map(tofrom : a[ : N]) map(tofrom : device_alloc) - { -#pragma omp target map(tofrom : device_alloc) - { device_alloc = &a[0]; } - } - - // CHECK: a used from unified memory. - if (device_alloc == host_alloc) - printf("a used from unified memory.\n"); - - // - // map + target with close - // - device_alloc = 0; -#pragma omp target data map(close, tofrom : a[ : N]) map(tofrom : device_alloc) - { -#pragma omp target map(tofrom : device_alloc) - { device_alloc = &a[0]; } - } - // CHECK: a copied to device. 
- if (device_alloc != host_alloc) - printf("a copied to device.\n"); - - // - // map + use_device_ptr no close - // - device_alloc = 0; -#pragma omp target data map(tofrom : a[ : N]) use_device_ptr(a) - { device_alloc = &a[0]; } - - // CHECK: a used from unified memory with use_device_ptr. - if (device_alloc == host_alloc) - printf("a used from unified memory with use_device_ptr.\n"); - - // - // map + use_device_ptr close - // - device_alloc = 0; -#pragma omp target data map(close, tofrom : a[ : N]) use_device_ptr(a) - { device_alloc = &a[0]; } - - // CHECK: a used from device memory with use_device_ptr. - if (device_alloc != host_alloc) - printf("a used from device memory with use_device_ptr.\n"); - - // - // map enter/exit + close - // - device_alloc = 0; -#pragma omp target enter data map(close, to : a[ : N]) - -#pragma omp target map(from : device_alloc) - { device_alloc = &a[0]; } - -#pragma omp target exit data map(from : a[ : N]) - - // CHECK: a has been mapped to the device. - if (device_alloc != host_alloc) - printf("a has been mapped to the device.\n"); - - free(a); - - // CHECK: Done! - printf("Done!\n"); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 + +#include +#include + +#pragma omp requires unified_shared_memory + +#define N 1024 + +int main(int argc, char *argv[]) { + int fails; + void *host_alloc = 0, *device_alloc = 0; + int *a = (int *)malloc(N * sizeof(int)); + + // Init + for (int i = 0; i < N; ++i) { + a[i] = 10; + } + host_alloc = &a[0]; + + // + // map + target no close + // +#pragma omp target data map(tofrom : a[ : N]) map(tofrom : device_alloc) + { +#pragma omp target map(tofrom : device_alloc) + { device_alloc = &a[0]; } + } + + // CHECK: a used from unified memory. + if (device_alloc == host_alloc) + printf("a used from unified memory.\n"); + + // + // map + target with close + // + device_alloc = 0; +#pragma omp target data map(close, tofrom : a[ : N]) map(tofrom : device_alloc) + { +#pragma omp target map(tofrom : device_alloc) + { device_alloc = &a[0]; } + } + // CHECK: a copied to device. + if (device_alloc != host_alloc) + printf("a copied to device.\n"); + + // + // map + use_device_ptr no close + // + device_alloc = 0; +#pragma omp target data map(tofrom : a[ : N]) use_device_ptr(a) + { device_alloc = &a[0]; } + + // CHECK: a used from unified memory with use_device_ptr. + if (device_alloc == host_alloc) + printf("a used from unified memory with use_device_ptr.\n"); + + // + // map + use_device_ptr close + // + device_alloc = 0; +#pragma omp target data map(close, tofrom : a[ : N]) use_device_ptr(a) + { device_alloc = &a[0]; } + + // CHECK: a used from device memory with use_device_ptr. + if (device_alloc != host_alloc) + printf("a used from device memory with use_device_ptr.\n"); + + // + // map enter/exit + close + // + device_alloc = 0; +#pragma omp target enter data map(close, to : a[ : N]) + +#pragma omp target map(from : device_alloc) + { device_alloc = &a[0]; } + +#pragma omp target exit data map(from : a[ : N]) + + // CHECK: a has been mapped to the device. + if (device_alloc != host_alloc) + printf("a has been mapped to the device.\n"); + + free(a); + + // CHECK: Done! 
+ printf("Done!\n"); + + return 0; +} diff --git a/openmp/libomptarget/test/unified_shared_memory/close_manual.c b/openmp/libomptarget/test/unified_shared_memory/close_manual.c index 0417b8bf254e3..37a499cc7a342 100644 --- a/openmp/libomptarget/test/unified_shared_memory/close_manual.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_manual.c @@ -1,86 +1,86 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -// --------------------------------------------------------------------------- -// Various definitions copied from OpenMP RTL - -extern void __tgt_register_requires(int64_t); - -extern void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, - void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types); - -extern void __tgt_target_data_end(int64_t device_id, int32_t arg_num, - void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types); - -// End of definitions copied from OpenMP RTL. -// --------------------------------------------------------------------------- - -#pragma omp requires unified_shared_memory - -#define N 1024 - -int main(int argc, char *argv[]) { - int fails; - void *host_alloc = 0, *device_alloc = 0; - int *a = (int *)malloc(N * sizeof(int)); - - // Manual registration of requires flags for Clang versions - // that do not support requires. - __tgt_register_requires(8); - - // Init - for (int i = 0; i < N; ++i) { - a[i] = 10; - } - host_alloc = &a[0]; - -// Dummy target region that ensures the runtime library is loaded when -// the target data begin/end functions are manually called below. -#pragma omp target - {} - - // Manual calls - int device_id = omp_get_default_device(); - int arg_num = 1; - void **args_base = (void **)&a; - void **args = (void **)&a; - int64_t arg_sizes[arg_num]; - - arg_sizes[0] = sizeof(int) * N; - - int64_t arg_types[arg_num]; - - // Ox400 enables the CLOSE map type in the runtime: - // OMP_TGT_MAPTYPE_CLOSE = 0x400 - // OMP_TGT_MAPTYPE_TO = 0x001 - arg_types[0] = 0x400 | 0x001; - - device_alloc = host_alloc; - - __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, - arg_types); - -#pragma omp target data use_device_ptr(a) - { device_alloc = a; } - - __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, - arg_types); - - // CHECK: a was copied to the device - if (device_alloc != host_alloc) - printf("a was copied to the device\n"); - - free(a); - - // CHECK: Done! - printf("Done!\n"); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL + +extern void __tgt_register_requires(int64_t); + +extern void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types); + +extern void __tgt_target_data_end(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types); + +// End of definitions copied from OpenMP RTL. 
+// --------------------------------------------------------------------------- + +#pragma omp requires unified_shared_memory + +#define N 1024 + +int main(int argc, char *argv[]) { + int fails; + void *host_alloc = 0, *device_alloc = 0; + int *a = (int *)malloc(N * sizeof(int)); + + // Manual registration of requires flags for Clang versions + // that do not support requires. + __tgt_register_requires(8); + + // Init + for (int i = 0; i < N; ++i) { + a[i] = 10; + } + host_alloc = &a[0]; + +// Dummy target region that ensures the runtime library is loaded when +// the target data begin/end functions are manually called below. +#pragma omp target + {} + + // Manual calls + int device_id = omp_get_default_device(); + int arg_num = 1; + void **args_base = (void **)&a; + void **args = (void **)&a; + int64_t arg_sizes[arg_num]; + + arg_sizes[0] = sizeof(int) * N; + + int64_t arg_types[arg_num]; + + // Ox400 enables the CLOSE map type in the runtime: + // OMP_TGT_MAPTYPE_CLOSE = 0x400 + // OMP_TGT_MAPTYPE_TO = 0x001 + arg_types[0] = 0x400 | 0x001; + + device_alloc = host_alloc; + + __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, + arg_types); + +#pragma omp target data use_device_ptr(a) + { device_alloc = a; } + + __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, + arg_types); + + // CHECK: a was copied to the device + if (device_alloc != host_alloc) + printf("a was copied to the device\n"); + + free(a); + + // CHECK: Done! + printf("Done!\n"); + + return 0; +} diff --git a/openmp/libomptarget/test/unified_shared_memory/close_modifier.c b/openmp/libomptarget/test/unified_shared_memory/close_modifier.c index b319c6b69ac29..a3ca71caf022f 100644 --- a/openmp/libomptarget/test/unified_shared_memory/close_modifier.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_modifier.c @@ -1,135 +1,135 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 - -#include -#include - -#pragma omp requires unified_shared_memory - -#define N 1024 - -int main(int argc, char *argv[]) { - int fails; - void *host_alloc, *device_alloc; - void *host_data, *device_data; - int *alloc = (int *)malloc(N * sizeof(int)); - int data[N]; - - for (int i = 0; i < N; ++i) { - alloc[i] = 10; - data[i] = 1; - } - - host_data = &data[0]; - host_alloc = &alloc[0]; - -// -// Test that updates on the device are not visible to host -// when only a TO mapping is used. -// -#pragma omp target map(tofrom \ - : device_data, device_alloc) map(close, to \ - : alloc[:N], data \ - [:N]) - { - device_data = &data[0]; - device_alloc = &alloc[0]; - - for (int i = 0; i < N; i++) { - alloc[i] += 1; - data[i] += 1; - } - } - - // CHECK: Address of alloc on device different from host address. - if (device_alloc != host_alloc) - printf("Address of alloc on device different from host address.\n"); - - // CHECK: Address of data on device different from host address. - if (device_data != host_data) - printf("Address of data on device different from host address.\n"); - - // On the host, check that the arrays have been updated. - // CHECK: Alloc host values not updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (alloc[i] != 10) - fails++; - } - printf("Alloc host values not updated: %s\n", - (fails == 0) ? 
"Succeeded" : "Failed"); - - // CHECK: Data host values not updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (data[i] != 1) - fails++; - } - printf("Data host values not updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - // - // Test that updates on the device are visible on host - // when a from is used. - // - - for (int i = 0; i < N; i++) { - alloc[i] += 1; - data[i] += 1; - } - -#pragma omp target map(close, tofrom : alloc[:N], data[:N]) - { - // CHECK: Alloc device values are correct: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (alloc[i] != 11) - fails++; - } - printf("Alloc device values are correct: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - // CHECK: Data device values are correct: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (data[i] != 2) - fails++; - } - printf("Data device values are correct: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - // Update values on the device - for (int i = 0; i < N; i++) { - alloc[i] += 1; - data[i] += 1; - } - } - - // CHECK: Alloc host values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (alloc[i] != 12) - fails++; - } - printf("Alloc host values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - // CHECK: Data host values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (data[i] != 3) - fails++; - } - printf("Data host values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - free(alloc); - - // CHECK: Done! - printf("Done!\n"); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 + +#include +#include + +#pragma omp requires unified_shared_memory + +#define N 1024 + +int main(int argc, char *argv[]) { + int fails; + void *host_alloc, *device_alloc; + void *host_data, *device_data; + int *alloc = (int *)malloc(N * sizeof(int)); + int data[N]; + + for (int i = 0; i < N; ++i) { + alloc[i] = 10; + data[i] = 1; + } + + host_data = &data[0]; + host_alloc = &alloc[0]; + +// +// Test that updates on the device are not visible to host +// when only a TO mapping is used. +// +#pragma omp target map(tofrom \ + : device_data, device_alloc) map(close, to \ + : alloc[:N], data \ + [:N]) + { + device_data = &data[0]; + device_alloc = &alloc[0]; + + for (int i = 0; i < N; i++) { + alloc[i] += 1; + data[i] += 1; + } + } + + // CHECK: Address of alloc on device different from host address. + if (device_alloc != host_alloc) + printf("Address of alloc on device different from host address.\n"); + + // CHECK: Address of data on device different from host address. + if (device_data != host_data) + printf("Address of data on device different from host address.\n"); + + // On the host, check that the arrays have been updated. + // CHECK: Alloc host values not updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (alloc[i] != 10) + fails++; + } + printf("Alloc host values not updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // CHECK: Data host values not updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (data[i] != 1) + fails++; + } + printf("Data host values not updated: %s\n", + (fails == 0) ? 
"Succeeded" : "Failed"); + + // + // Test that updates on the device are visible on host + // when a from is used. + // + + for (int i = 0; i < N; i++) { + alloc[i] += 1; + data[i] += 1; + } + +#pragma omp target map(close, tofrom : alloc[:N], data[:N]) + { + // CHECK: Alloc device values are correct: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (alloc[i] != 11) + fails++; + } + printf("Alloc device values are correct: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + // CHECK: Data device values are correct: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (data[i] != 2) + fails++; + } + printf("Data device values are correct: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // Update values on the device + for (int i = 0; i < N; i++) { + alloc[i] += 1; + data[i] += 1; + } + } + + // CHECK: Alloc host values updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (alloc[i] != 12) + fails++; + } + printf("Alloc host values updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // CHECK: Data host values updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (data[i] != 3) + fails++; + } + printf("Data host values updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + free(alloc); + + // CHECK: Done! + printf("Done!\n"); + + return 0; +} diff --git a/openmp/libomptarget/test/unified_shared_memory/shared_update.c b/openmp/libomptarget/test/unified_shared_memory/shared_update.c index 8036bc2f0405f..b27c79a1a67fa 100644 --- a/openmp/libomptarget/test/unified_shared_memory/shared_update.c +++ b/openmp/libomptarget/test/unified_shared_memory/shared_update.c @@ -1,114 +1,114 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -// --------------------------------------------------------------------------- -// Various definitions copied from OpenMP RTL - -extern void __tgt_register_requires(int64_t); - -// End of definitions copied from OpenMP RTL. -// --------------------------------------------------------------------------- - -#pragma omp requires unified_shared_memory - -#define N 1024 - -int main(int argc, char *argv[]) { - int fails; - void *host_alloc, *device_alloc; - void *host_data, *device_data; - int *alloc = (int *)malloc(N * sizeof(int)); - int data[N]; - - // Manual registration of requires flags for Clang versions - // that do not support requires. - __tgt_register_requires(8); - - for (int i = 0; i < N; ++i) { - alloc[i] = 10; - data[i] = 1; - } - - host_data = &data[0]; - host_alloc = &alloc[0]; - -// implicit mapping of data -#pragma omp target map(tofrom : device_data, device_alloc) - { - device_data = &data[0]; - device_alloc = &alloc[0]; - - for (int i = 0; i < N; i++) { - alloc[i] += 1; - data[i] += 1; - } - } - - // CHECK: Address of alloc on device matches host address. - if (device_alloc == host_alloc) - printf("Address of alloc on device matches host address.\n"); - - // CHECK: Address of data on device matches host address. - if (device_data == host_data) - printf("Address of data on device matches host address.\n"); - - // On the host, check that the arrays have been updated. 
- // CHECK: Alloc device values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (alloc[i] != 11) - fails++; - } - printf("Alloc device values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - // CHECK: Data device values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (data[i] != 2) - fails++; - } - printf("Data device values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - // - // Test that updates on the host snd on the device are both visible. - // - - // Update on the host. - for (int i = 0; i < N; ++i) { - alloc[i] += 1; - data[i] += 1; - } - -#pragma omp target - { - // CHECK: Alloc host values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (alloc[i] != 12) - fails++; - } - printf("Alloc host values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - // CHECK: Data host values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (data[i] != 3) - fails++; - } - printf("Data host values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - } - - free(alloc); - - printf("Done!\n"); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL + +extern void __tgt_register_requires(int64_t); + +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- + +#pragma omp requires unified_shared_memory + +#define N 1024 + +int main(int argc, char *argv[]) { + int fails; + void *host_alloc, *device_alloc; + void *host_data, *device_data; + int *alloc = (int *)malloc(N * sizeof(int)); + int data[N]; + + // Manual registration of requires flags for Clang versions + // that do not support requires. + __tgt_register_requires(8); + + for (int i = 0; i < N; ++i) { + alloc[i] = 10; + data[i] = 1; + } + + host_data = &data[0]; + host_alloc = &alloc[0]; + +// implicit mapping of data +#pragma omp target map(tofrom : device_data, device_alloc) + { + device_data = &data[0]; + device_alloc = &alloc[0]; + + for (int i = 0; i < N; i++) { + alloc[i] += 1; + data[i] += 1; + } + } + + // CHECK: Address of alloc on device matches host address. + if (device_alloc == host_alloc) + printf("Address of alloc on device matches host address.\n"); + + // CHECK: Address of data on device matches host address. + if (device_data == host_data) + printf("Address of data on device matches host address.\n"); + + // On the host, check that the arrays have been updated. + // CHECK: Alloc device values updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (alloc[i] != 11) + fails++; + } + printf("Alloc device values updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // CHECK: Data device values updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (data[i] != 2) + fails++; + } + printf("Data device values updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // + // Test that updates on the host snd on the device are both visible. + // + + // Update on the host. 
+  for (int i = 0; i < N; ++i) {
+    alloc[i] += 1;
+    data[i] += 1;
+  }
+
+#pragma omp target
+  {
+    // CHECK: Alloc host values updated: Succeeded
+    fails = 0;
+    for (int i = 0; i < N; i++) {
+      if (alloc[i] != 12)
+        fails++;
+    }
+    printf("Alloc host values updated: %s\n",
+           (fails == 0) ? "Succeeded" : "Failed");
+    // CHECK: Data host values updated: Succeeded
+    fails = 0;
+    for (int i = 0; i < N; i++) {
+      if (data[i] != 3)
+        fails++;
+    }
+    printf("Data host values updated: %s\n",
+           (fails == 0) ? "Succeeded" : "Failed");
+  }
+
+  free(alloc);
+
+  printf("Done!\n");
+
+  return 0;
+}
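
The tests above repeatedly use the device memory routines omp_target_alloc, omp_target_memcpy, and omp_target_free. As a reference for the argument order of omp_target_memcpy (destination first, lengths and offsets in bytes, then destination and source device numbers), the following standalone sketch shows a host-to-device-and-back round trip. It is not part of the patch; the file layout, N, and the printed messages are illustrative only.

#include <omp.h>
#include <stdio.h>

#define N 128

int main(void) {
  int dev = omp_get_default_device();
  int host = omp_get_initial_device();
  double src[N], dst[N];

  for (int i = 0; i < N; ++i)
    src[i] = (double)i;

  // Allocate N doubles on the default device; the size argument is in bytes.
  double *d_buf = (double *)omp_target_alloc(sizeof(double) * N, dev);
  if (d_buf == NULL) {
    printf("omp_target_alloc failed\n");
    return 1;
  }

  // omp_target_memcpy(dst, src, length, dst_offset, src_offset,
  //                   dst_device, src_device): copy host -> device ...
  int rc = omp_target_memcpy(d_buf, src, sizeof(double) * N, 0, 0, dev, host);
  // ... and copy the data back, device -> host.
  rc |= omp_target_memcpy(dst, d_buf, sizeof(double) * N, 0, 0, host, dev);

  omp_target_free(d_buf, dev);
  printf("round trip %s\n", rc == 0 ? "succeeded" : "failed");
  return rc;
}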